]> git.donarmstrong.com Git - samtools.git/blob - bam_rmdupse.c
change a parameter. It does nothing
[samtools.git] / bam_rmdupse.c
1 #include <math.h>
2 #include "sam.h"
3 #include "khash.h"
4
5 typedef struct {
6         int n, m;
7         int *a;
8 } listelem_t;
9
10 KHASH_MAP_INIT_INT(32, listelem_t)
11
12 #define BLOCK_SIZE 65536
13
14 typedef struct {
15         bam1_t *b;
16         int rpos, score;
17 } elem_t;
18
19 typedef struct {
20         int n, max, x;
21         elem_t *buf;
22 } buffer_t;
23
24 static int fill_buf(samfile_t *in, buffer_t *buf)
25 {
26         int i, ret, last_tid, min_rpos = 0x7fffffff, capacity;
27         bam1_t *b = bam_init1();
28         bam1_core_t *c = &b->core;
29         // squeeze out the empty cells at the beginning
30         for (i = 0; i < buf->n; ++i)
31                 if (buf->buf[i].b) break;
32         if (i < buf->n) { // squeeze
33                 if (i > 0) {
34                         memmove(buf->buf, buf->buf + i, sizeof(elem_t) * (buf->n - i));
35                         buf->n = buf->n - i;
36                 }
37         } else buf->n = 0;
38         // calculate min_rpos
39         for (i = 0; i < buf->n; ++i) {
40                 elem_t *e = buf->buf + i;
41                 if (e->b && e->rpos >= 0 && e->rpos < min_rpos)
42                         min_rpos = buf->buf[i].rpos;
43         }
44         // fill the buffer
45         buf->x = -1;
46         last_tid = buf->n? buf->buf[0].b->core.tid : -1;
47         capacity = buf->n + BLOCK_SIZE;
48         while ((ret = samread(in, b)) >= 0) {
49                 elem_t *e;
50                 uint8_t *qual = bam1_qual(b);
51                 int is_mapped;
52                 if (last_tid < 0) last_tid = c->tid;
53                 if (c->tid != last_tid) {
54                         if (buf->x < 0) buf->x = buf->n;
55                 }
56                 if (buf->n >= buf->max) { // enlarge
57                         buf->max = buf->max? buf->max<<1 : 8;
58                         buf->buf = (elem_t*)realloc(buf->buf, sizeof(elem_t) * buf->max);
59                 }
60                 e = &buf->buf[buf->n++];
61                 e->b = bam_dup1(b);
62                 e->rpos = -1; e->score = 0;
63                 for (i = 0; i < c->l_qseq; ++i) e->score += qual[i] + 1;
64                 e->score = (double)e->score / sqrt(c->l_qseq + 1);
65                 is_mapped = (c->tid < 0 || c->tid >= in->header->n_targets || (c->flag&BAM_FUNMAP))? 0 : 1;
66                 if (is_mapped && (c->flag & BAM_FREVERSE)) {
67                         e->rpos = b->core.pos + bam_calend(&b->core, bam1_cigar(b));
68                         if (min_rpos > e->rpos) min_rpos = e->rpos;
69                 }
70                 if (buf->n >= capacity) {
71                         if (c->pos <= min_rpos) capacity += BLOCK_SIZE;
72                         else break;
73                 }
74         }
75         if (ret >= 0 && buf->x < 0) buf->x = buf->n;
76         bam_destroy1(b);
77         return buf->n;
78 }
79
80 static void rmdupse_buf(buffer_t *buf)
81 {
82         khash_t(32) *h;
83         uint32_t key;
84         khint_t k;
85         int mpos, i, upper;
86         listelem_t *p;
87         mpos = 0x7fffffff;
88         mpos = (buf->x == buf->n)? buf->buf[buf->x-1].b->core.pos : 0x7fffffff;
89         upper = (buf->x < 0)? buf->n : buf->x;
90         // fill the hash table
91         h = kh_init(32);
92         for (i = 0; i < upper; ++i) {
93                 elem_t *e = buf->buf + i;
94                 int ret;
95                 if (e->score < 0) continue;
96                 if (e->rpos >= 0) {
97                         if (e->rpos <= mpos) key = (uint32_t)e->rpos<<1 | 1;
98                         else continue;
99                 } else {
100                         if (e->b->core.pos < mpos) key = (uint32_t)e->b->core.pos<<1;
101                         else continue;
102                 }
103                 k = kh_put(32, h, key, &ret);
104                 p = &kh_val(h, k);
105                 if (ret == 0) { // present in the hash table
106                         if (p->n == p->m) {
107                                 p->m <<= 1;
108                                 p->a = (int*)realloc(p->a, p->m * sizeof(int));
109                         }
110                         p->a[p->n++] = i;
111                 } else {
112                         p->m = p->n = 1;
113                         p->a = (int*)calloc(p->m, sizeof(int));
114                         p->a[0] = i;
115                 }
116         }
117         // rmdup
118         for (k = kh_begin(h); k < kh_end(h); ++k) {
119                 if (kh_exist(h, k)) {
120                         int max, maxi;
121                         p = &kh_val(h, k);
122                         // get the max
123                         for (i = max = 0, maxi = -1; i < p->n; ++i) {
124                                 if (buf->buf[p->a[i]].score > max) {
125                                         max = buf->buf[p->a[i]].score;
126                                         maxi = i;
127                                 }
128                         }
129                         // mark the elements
130                         for (i = 0; i < p->n; ++i) {
131                                 buf->buf[p->a[i]].score = -1;
132                                 if (i != maxi) {
133                                         bam_destroy1(buf->buf[p->a[i]].b);
134                                         buf->buf[p->a[i]].b = 0;
135                                 }
136                         }
137                         // free
138                         free(p->a);
139                 }
140         }
141         kh_destroy(32, h);
142 }
143
144 static void dump_buf(buffer_t *buf, samfile_t *out)
145 {
146         int i;
147         for (i = 0; i < buf->n; ++i) {
148                 elem_t *e = buf->buf + i;
149                 if (e->score != -1) break;
150                 if (e->b) {
151                         samwrite(out, e->b);
152                         bam_destroy1(e->b);
153                         e->b = 0;
154                 }
155         }
156 }
157
158 int bam_rmdupse(int argc, char *argv[])
159 {
160         samfile_t *in, *out;
161         buffer_t *buf;
162         if (argc < 3) {
163                 fprintf(stderr, "Usage: samtools rmdupse <in.bam> <out.bam>\n");
164                 return 1;
165         }
166         buf = calloc(1, sizeof(buffer_t));
167         in = samopen(argv[1], "rb", 0);
168         out = samopen(argv[2], "wb", in->header);
169         while (fill_buf(in, buf)) {
170                 rmdupse_buf(buf);
171                 dump_buf(buf, out);
172         }
173         samclose(in); samclose(out);
174         free(buf->buf); free(buf);
175         return 0;
176 }