]> git.donarmstrong.com Git - samtools.git/blob - bam_rmdup.c
* samtools-0.1.6-14 (r480)
[samtools.git] / bam_rmdup.c
1 #include <stdlib.h>
2 #include <string.h>
3 #include <stdio.h>
4 #include <zlib.h>
5 #include "sam.h"
6
7 typedef bam1_t *bam1_p;
8
9 #include "khash.h"
10 KHASH_SET_INIT_STR(name)
11 KHASH_MAP_INIT_INT64(pos, bam1_p)
12
13 #define BUFFER_SIZE 0x40000
14
15 typedef struct {
16         uint64_t n_checked, n_removed;
17         khash_t(pos) *best_hash;
18 } lib_aux_t;
19 KHASH_MAP_INIT_STR(lib, lib_aux_t)
20
21 typedef struct {
22         int n, max;
23         bam1_t **a;
24 } tmp_stack_t;
25
26 static inline void stack_insert(tmp_stack_t *stack, bam1_t *b)
27 {
28         if (stack->n == stack->max) {
29                 stack->max = stack->max? stack->max<<1 : 0x10000;
30                 stack->a = (bam1_t**)realloc(stack->a, sizeof(bam1_t*) * stack->max);
31         }
32         stack->a[stack->n++] = b;
33 }
34
35 static inline void dump_best(tmp_stack_t *stack, bamFile out)
36 {
37         int i;
38         for (i = 0; i != stack->n; ++i) {
39                 bam_write1(out, stack->a[i]);
40                 bam_destroy1(stack->a[i]);
41         }
42         stack->n = 0;
43 }
44
45 static void clear_del_set(khash_t(name) *del_set)
46 {
47         khint_t k;
48         for (k = kh_begin(del_set); k < kh_end(del_set); ++k)
49                 if (kh_exist(del_set, k))
50                         free((char*)kh_key(del_set, k));
51         kh_clear(name, del_set);
52 }
53
54 static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib)
55 {
56         khint_t k = kh_get(lib, aux, lib);
57         if (k == kh_end(aux)) {
58                 int ret;
59                 char *p = strdup(lib);
60                 lib_aux_t *q;
61                 k = kh_put(lib, aux, p, &ret);
62                 q = &kh_val(aux, k);
63                 q->n_checked = q->n_removed = 0;
64                 q->best_hash = kh_init(pos);
65                 return q;
66         } else return &kh_val(aux, k);
67 }
68
69 static void clear_best(khash_t(lib) *aux, int max)
70 {
71         khint_t k;
72         for (k = kh_begin(aux); k != kh_end(aux); ++k) {
73                 if (kh_exist(aux, k)) {
74                         lib_aux_t *q = &kh_val(aux, k);
75                         if (kh_size(q->best_hash) >= max)
76                                 kh_clear(pos, q->best_hash);
77                 }
78         }
79 }
80
81 static inline int sum_qual(const bam1_t *b)
82 {
83         int i, q;
84         uint8_t *qual = bam1_qual(b);
85         for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i];
86         return q;
87 }
88
89 void bam_rmdup_core(bamFile in, bamFile out)
90 {
91         bam_header_t *header;
92         bam1_t *b;
93         int last_tid = -1, last_pos = -1;
94         tmp_stack_t stack;
95         khint_t k;
96         khash_t(lib) *aux;
97         khash_t(name) *del_set;
98         
99         aux = kh_init(lib);
100         del_set = kh_init(name);
101         b = bam_init1();
102         memset(&stack, 0, sizeof(tmp_stack_t));
103         header = bam_header_read(in);
104         sam_header_parse_rg(header);
105         bam_header_write(out, header);
106
107         kh_resize(name, del_set, 4 * BUFFER_SIZE);
108         while (bam_read1(in, b) >= 0) {
109                 bam1_core_t *c = &b->core;
110                 if (c->tid != last_tid || last_pos != c->pos) {
111                         dump_best(&stack, out); // write the result
112                         clear_best(aux, BUFFER_SIZE);
113                         if (c->tid != last_tid) {
114                                 clear_best(aux, 0);
115                                 if (kh_size(del_set)) { // check
116                                         fprintf(stderr, "[bam_rmdup_core] %llu unmatched pairs\n", (long long)kh_size(del_set));
117                                         clear_del_set(del_set);
118                                 }
119                                 if ((int)c->tid == -1) { // append unmapped reads
120                                         bam_write1(out, b);
121                                         while (bam_read1(in, b) >= 0) bam_write1(out, b);
122                                         break;
123                                 }
124                                 last_tid = c->tid;
125                                 fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", header->target_name[c->tid]);
126                         }
127                 }
128                 if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) {
129                         bam_write1(out, b);
130                 } else if (c->isize > 0) { // paired, head
131                         uint64_t key = (uint64_t)c->pos<<32 | c->isize;
132                         const char *lib;
133                         const uint8_t *rg;
134                         lib_aux_t *q;
135                         int ret;
136                         rg = bam_aux_get(b, "RG");
137                         lib = (rg == 0)? 0 : bam_strmap_get(header->rg2lib, (char*)(rg + 1));
138                         q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
139                         ++q->n_checked;
140                         k = kh_put(pos, q->best_hash, key, &ret);
141                         if (ret == 0) { // found in best_hash
142                                 bam1_t *p = kh_val(q->best_hash, k);
143                                 ++q->n_removed;
144                                 if (sum_qual(p) < sum_qual(b)) { // the current alignment is better; this can be accelerated in principle
145                                         kh_put(name, del_set, strdup(bam1_qname(p)), &ret); // p will be removed
146                                         bam_copy1(p, b); // replaced as b
147                                 } else kh_put(name, del_set, strdup(bam1_qname(b)), &ret); // b will be removed
148                                 if (ret == 0)
149                                         fprintf(stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam1_qname(b));
150                         } else { // not found in best_hash
151                                 kh_val(q->best_hash, k) = bam_dup1(b);
152                                 stack_insert(&stack, kh_val(q->best_hash, k));
153                         }
154                 } else { // paired, tail
155                         k = kh_get(name, del_set, bam1_qname(b));
156                         if (k != kh_end(del_set)) {
157                                 free((char*)kh_key(del_set, k));
158                                 kh_del(name, del_set, k);
159                         } else bam_write1(out, b);
160                 }
161                 last_pos = c->pos;
162         }
163
164         for (k = kh_begin(aux); k != kh_end(aux); ++k) {
165                 if (kh_exist(aux, k)) {
166                         lib_aux_t *q = &kh_val(aux, k);                 
167                         dump_best(&stack, out);
168                         fprintf(stderr, "[bam_rmdup_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed,
169                                         (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k));
170                         kh_destroy(pos, q->best_hash);
171                         free((char*)kh_key(aux, k));
172                 }
173         }
174         kh_destroy(lib, aux);
175
176         bam_header_destroy(header);
177         clear_del_set(del_set);
178         kh_destroy(name, del_set);
179         free(stack.a);
180         bam_destroy1(b);
181 }
182 int bam_rmdup(int argc, char *argv[])
183 {
184         bamFile in, out;
185         if (argc < 3) {
186                 fprintf(stderr, "Usage: samtools rmdup <input.srt.bam> <output.bam>\n\n");
187                 fprintf(stderr, "Note: Picard is recommended for this task.\n");
188                 return 1;
189         }
190         in = (strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r");
191         out = (strcmp(argv[2], "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(argv[2], "w");
192         if (in == 0 || out == 0) {
193                 fprintf(stderr, "[bam_rmdup] fail to read/write input files\n");
194                 return 1;
195         }
196         bam_rmdup_core(in, out);
197         bam_close(in);
198         bam_close(out);
199         return 0;
200 }