]> git.donarmstrong.com Git - samtools.git/blob - bam_rmdupse.c
* samtools-0.1.6-15 (r482)
[samtools.git] / bam_rmdupse.c
1 #include <math.h>
2 #include "sam.h"
3 #include "khash.h"
4 #include "klist.h"
5
6 #define QUEUE_CLEAR_SIZE 0x100000
7 #define MAX_POS 0x7fffffff
8
9 typedef struct {
10         int endpos;
11         uint32_t score:31, discarded:1;
12         bam1_t *b;
13 } elem_t, *elem_p;
14 #define __free_elem(p) bam_destroy1((p)->data.b)
15 KLIST_INIT(q, elem_t, __free_elem)
16 typedef klist_t(q) queue_t;
17
18 KHASH_MAP_INIT_INT(best, elem_p)
19 typedef khash_t(best) besthash_t;
20
21 typedef struct {
22         uint64_t n_checked, n_removed;
23         besthash_t *left, *rght;
24 } lib_aux_t;
25 KHASH_MAP_INIT_STR(lib, lib_aux_t)
26
27 static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib)
28 {
29         khint_t k = kh_get(lib, aux, lib);
30         if (k == kh_end(aux)) {
31                 int ret;
32                 char *p = strdup(lib);
33                 lib_aux_t *q;
34                 k = kh_put(lib, aux, p, &ret);
35                 q = &kh_val(aux, k);
36                 q->left = kh_init(best);
37                 q->rght = kh_init(best);
38                 q->n_checked = q->n_removed = 0;
39                 return q;
40         } else return &kh_val(aux, k);
41 }
42
43 static inline int sum_qual(const bam1_t *b)
44 {
45         int i, q;
46         uint8_t *qual = bam1_qual(b);
47         for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i];
48         return q;
49 }
50
51 static inline elem_t *push_queue(queue_t *queue, const bam1_t *b, int endpos, int score)
52 {
53         elem_t *p = kl_pushp(q, queue);
54         p->discarded = 0;
55         p->endpos = endpos; p->score = score;
56         p->b = bam_dup1(b);
57         return p;
58 }
59
60 static void clear_besthash(besthash_t *h, int32_t pos)
61 {
62         khint_t k;
63         for (k = kh_begin(h); k != kh_end(h); ++k)
64                 if (kh_exist(h, k) && kh_val(h, k)->endpos <= pos)
65                         kh_del(best, h, k);
66 }
67
68 static void dump_alignment(samfile_t *out, queue_t *queue, int32_t pos, khash_t(lib) *h)
69 {
70         if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) {
71                 khint_t k;
72                 while (1) {
73                         elem_t *q;
74                         if (queue->head == queue->tail) break;
75                         q = &kl_val(queue->head);
76                         if (q->discarded) {
77                                 q->b->data_len = 0;
78                                 kl_shift(q, queue, 0);
79                                 continue;
80                         }
81                         if ((q->b->core.flag&BAM_FREVERSE) && q->endpos > pos) break;
82                         samwrite(out, q->b);
83                         q->b->data_len = 0;
84                         kl_shift(q, queue, 0);
85                 }
86                 for (k = kh_begin(h); k != kh_end(h); ++k) {
87                         if (kh_exist(h, k)) {
88                                 clear_besthash(kh_val(h, k).left, pos);
89                                 clear_besthash(kh_val(h, k).rght, pos);
90                         }
91                 }
92         }
93 }
94
95 void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se)
96 {
97         bam1_t *b;
98         queue_t *queue;
99         khint_t k;
100         int last_tid = -2;
101         khash_t(lib) *aux;
102
103         aux = kh_init(lib);
104         b = bam_init1();
105         queue = kl_init(q);
106         while (samread(in, b) >= 0) {
107                 bam1_core_t *c = &b->core;
108                 int endpos = bam_calend(c, bam1_cigar(b));
109                 int score = sum_qual(b);
110                 
111                 if (last_tid != c->tid) {
112                         if (last_tid >= 0) dump_alignment(out, queue, MAX_POS, aux);
113                         last_tid = c->tid;
114                 } else dump_alignment(out, queue, c->pos, aux);
115                 if ((c->flag&BAM_FUNMAP) || ((c->flag&BAM_FPAIRED) && !force_se)) {
116                         push_queue(queue, b, endpos, score);
117                 } else {
118                         const char *lib;
119                         const uint8_t *rg;
120                         lib_aux_t *q;
121                         besthash_t *h;
122                         uint32_t key;
123                         int ret;
124                         rg = bam_aux_get(b, "RG");
125                         lib = (rg == 0)? 0 : bam_strmap_get(in->header->rg2lib, (char*)(rg + 1));
126                         q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
127                         h = (c->flag&BAM_FREVERSE)? q->rght : q->left;
128                         key = (c->flag&BAM_FREVERSE)? endpos : c->pos;
129                         k = kh_put(best, h, key, &ret);
130                         if (ret == 0) { // in the hash table
131                                 elem_t *p = kh_val(h, k);
132                                 if (p->score < score) {
133                                         if (c->flag&BAM_FREVERSE) { // mark "discarded" and push the queue
134                                                 p->discarded = 1;
135                                                 kh_val(h, k) = push_queue(queue, b, endpos, score);
136                                         } else { // replace
137                                                 p->score = score; p->endpos = endpos;
138                                                 bam_copy1(p->b, b);
139                                         }
140                                 } // otherwise, discard the alignment
141                         } else kh_val(h, k) = push_queue(queue, b, endpos, score);
142                 }
143         }
144         dump_alignment(out, queue, MAX_POS, aux);
145
146         for (k = kh_begin(aux); k != kh_end(aux); ++k) {
147                 if (kh_exist(aux, k)) {
148                         kh_destroy(best, kh_val(aux, k).rght);
149                         kh_destroy(best, kh_val(aux, k).left);
150                         free((char*)kh_key(aux, k));
151                 }
152         }
153         kh_destroy(lib, aux);
154         bam_destroy1(b);
155         kl_destroy(q, queue);
156 }
157
158 int bam_rmdupse(int argc, char *argv[])
159 {
160         samfile_t *in, *out;
161         if (argc < 3) {
162                 fprintf(stderr, "Usage: samtools rmdupse <in.bam> <out.bam>\n");
163                 return 1;
164         }
165         in = samopen(argv[1], "rb", 0);
166         out = samopen(argv[2], "wb", in->header);
167         bam_rmdupse_core(in, out, 1);
168         samclose(in); samclose(out);
169         return 0;
170 }