]> git.donarmstrong.com Git - samtools.git/blob - bam_rmdup.c
* samtools-0.1.2-26
[samtools.git] / bam_rmdup.c
1 #include <stdlib.h>
2 #include <string.h>
3 #include <stdio.h>
4 #include <assert.h>
5 #include <zlib.h>
6 #include "bam.h"
7
8 typedef bam1_t *bam1_p;
9 #include "khash.h"
10 KHASH_SET_INIT_STR(name)
11 KHASH_MAP_INIT_INT64(pos, bam1_p)
12
13 #define BUFFER_SIZE 0x40000
14
15 typedef struct {
16         int n, max;
17         bam1_t **a;
18 } tmp_stack_t;
19
20 static inline void stack_insert(tmp_stack_t *stack, bam1_t *b)
21 {
22         if (stack->n == stack->max) {
23                 stack->max = stack->max? stack->max<<1 : 0x10000;
24                 stack->a = (bam1_t**)realloc(stack->a, sizeof(bam1_t*) * stack->max);
25         }
26         stack->a[stack->n++] = b;
27 }
28
29 static inline void dump_best(tmp_stack_t *stack, khash_t(pos) *best_hash, bamFile out)
30 {
31         int i;
32         for (i = 0; i != stack->n; ++i) {
33                 bam_write1(out, stack->a[i]);
34                 bam_destroy1(stack->a[i]);
35         }
36         stack->n = 0;
37         if (kh_size(best_hash) > BUFFER_SIZE) kh_clear(pos, best_hash);
38 }
39
40 static void clear_del_set(khash_t(name) *del_set)
41 {
42         khint_t k;
43         for (k = kh_begin(del_set); k < kh_end(del_set); ++k)
44                 if (kh_exist(del_set, k))
45                         free((char*)kh_key(del_set, k));
46         kh_clear(name, del_set);
47 }
48
49 void bam_rmdup_core(bamFile in, bamFile out)
50 {
51         bam_header_t *header;
52         bam1_t *b;
53         int last_tid = -1, last_pos = -1;
54         uint64_t n_checked = 0, n_removed = 0;
55         tmp_stack_t stack;
56         khint_t k;
57         khash_t(pos) *best_hash;
58         khash_t(name) *del_set;
59
60         best_hash = kh_init(pos);
61         del_set = kh_init(name);
62         b = bam_init1();
63         memset(&stack, 0, sizeof(tmp_stack_t));
64         header = bam_header_read(in);
65         bam_header_write(out, header);
66
67         kh_resize(name, del_set, 4 * BUFFER_SIZE);
68         kh_resize(pos, best_hash, 3 * BUFFER_SIZE);
69         while (bam_read1(in, b) >= 0) {
70                 bam1_core_t *c = &b->core;
71                 if (c->tid != last_tid || last_pos != c->pos) {
72                         dump_best(&stack, best_hash, out); // write the result
73                         if (c->tid != last_tid) {
74                                 kh_clear(pos, best_hash);
75                                 if (kh_size(del_set)) { // check
76                                         fprintf(stderr, "[bam_rmdup_core] %llu unmatched pairs\n", (long long)kh_size(del_set));
77                                         clear_del_set(del_set);
78                                 }
79                                 if ((int)c->tid == -1) { // append unmapped reads
80                                         bam_write1(out, b);
81                                         while (bam_read1(in, b) >= 0) bam_write1(out, b);
82                                         break;
83                                 }
84                                 last_tid = c->tid;
85                                 fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", header->target_name[c->tid]);
86                         }
87                 }
88                 if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) {
89                         bam_write1(out, b);
90                 } else if (c->isize > 0) { // paired, head
91                         uint64_t key = (uint64_t)c->pos<<32 | c->isize;
92                         int ret;
93                         ++n_checked;
94                         k = kh_put(pos, best_hash, key, &ret);
95                         if (ret == 0) { // found in best_hash
96                                 bam1_t *p = kh_val(best_hash, k);
97                                 ++n_removed;
98                                 if (p->core.qual < c->qual) { // the current alignment is better
99                                         kh_put(name, del_set, strdup(bam1_qname(p)), &ret); // p will be removed
100                                         bam_copy1(p, b); // replaced as b
101                                 } else kh_put(name, del_set, strdup(bam1_qname(b)), &ret); // b will be removed
102                                 if (ret == 0)
103                                         fprintf(stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam1_qname(b));
104                         } else { // not found in best_hash
105                                 kh_val(best_hash, k) = bam_dup1(b);
106                                 stack_insert(&stack, kh_val(best_hash, k));
107                         }
108                 } else { // paired, tail
109                         k = kh_get(name, del_set, bam1_qname(b));
110                         if (k != kh_end(del_set)) {
111                                 free((char*)kh_key(del_set, k));
112                                 kh_del(name, del_set, k);
113                         } else bam_write1(out, b);
114                 }
115                 last_pos = c->pos;
116         }
117         dump_best(&stack, best_hash, out);
118
119         bam_header_destroy(header);
120         clear_del_set(del_set);
121         kh_destroy(name, del_set);
122         kh_destroy(pos, best_hash);
123         free(stack.a);
124         bam_destroy1(b);
125         fprintf(stderr, "[bam_rmdup_core] %lld / %lld = %.4lf\n", (long long)n_removed, (long long)n_checked,
126                         (double)n_removed/n_checked);
127 }
128 int bam_rmdup(int argc, char *argv[])
129 {
130         bamFile in, out;
131         if (argc < 3) {
132                 fprintf(stderr, "Usage: samtools rmdup <input.srt.bam> <output.bam>\n");
133                 return 1;
134         }
135         in = (strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r");
136         out = (strcmp(argv[2], "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(argv[2], "w");
137         if (in == 0 || out == 0) {
138                 fprintf(stderr, "[bam_rmdup] fail to read/write input files\n");
139                 return 1;
140         }
141         bam_rmdup_core(in, out);
142         bam_close(in);
143         bam_close(out);
144         return 0;
145 }