5 #include "bam_endian.h"
10 Alignment indexing. Before indexing, BAM must be sorted based on the
11 leftmost coordinate of alignments. In indexing, BAM uses two indices:
12 a UCSC binning index and a simple linear index. The binning index is
efficient for alignments spanning long distances, while the auxiliary
linear index helps to reduce unnecessary seek calls, especially for
small regions with few alignments.
17 The UCSC binning scheme was suggested by Richard Durbin and Lincoln
18 Stein and is explained by Kent et al. (2002). In this scheme, each bin
19 represents a contiguous genomic region which can be fully contained in
20 another bin; each alignment is associated with a bin which represents
21 the smallest region containing the entire alignment. The binning
scheme is essentially another representation of an R-tree. A distinct bin
uniquely corresponds to a distinct internal node in an R-tree. Bin A is
a child of Bin B if region A is contained in B.
26 In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
27 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
28 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
find the alignments that overlap a region [rbeg,rend), we need to
calculate the list of bins that may overlap the region and test
the alignments in those bins to confirm the overlaps. If the specified
32 region is short, typically only a few alignments in six bins need to
33 be retrieved. The overlapping alignments can be quickly fetched.
// Merge two chunks of a bin when they are closer than this many bytes
// (used only in true-offset mode; see merge_chunks()).
#define BAM_MIN_CHUNK_GAP 32768
// 1<<14 is the size of minimum bin.
#define BAM_LIDX_SHIFT 14
45 #define pair64_lt(a,b) ((a).u < (b).u)
46 KSORT_INIT(off, pair64_t, pair64_lt)
58 KHASH_MAP_INIT_INT(i, bam_binlist_t)
60 struct __bam_index_t {
66 // requirement: len <= LEN_MASK
67 static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
72 k = kh_put(i, h, bin, &ret);
74 if (ret) { // not present
76 l->list = (pair64_t*)calloc(l->m, 16);
80 l->list = (pair64_t*)realloc(l->list, l->m * 16);
82 l->list[l->n].u = beg; l->list[l->n++].v = end;
85 static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
88 beg = b->core.pos >> BAM_LIDX_SHIFT;
89 end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
90 if (index2->m < end + 1) {
91 int old_m = index2->m;
93 kroundup32(index2->m);
94 index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
95 memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
97 for (i = beg + 1; i <= end; ++i)
98 if (index2->offset[i] == 0) index2->offset[i] = offset;
102 static void merge_chunks(bam_index_t *idx)
104 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
108 for (i = 0; i < idx->n; ++i) {
109 index = idx->index[i];
110 for (k = kh_begin(index); k != kh_end(index); ++k) {
112 if (!kh_exist(index, k)) continue;
113 p = &kh_value(index, k);
115 for (l = 1; l < p->n; ++l) {
116 #ifdef BAM_TRUE_OFFSET
117 if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
119 if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
121 else p->list[++m] = p->list[l];
126 #endif // defined(BAM_TRUE_OFFSET) || defined(BAM_BGZF)
129 bam_index_t *bam_index_core(bamFile fp)
135 uint32_t last_bin, save_bin;
136 int32_t last_coor, last_tid, save_tid;
138 uint64_t save_off, last_off;
140 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
141 b = (bam1_t*)calloc(1, sizeof(bam1_t));
142 h = bam_header_read(fp);
145 idx->n = h->n_targets;
146 bam_header_destroy(h);
147 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
148 for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
149 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
151 save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
152 save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
153 while ((ret = bam_read1(fp, b)) >= 0) {
154 if (last_tid != c->tid) { // change of chromosomes
156 last_bin = 0xffffffffu;
157 } else if (last_coor > c->pos) {
158 fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
159 bam1_qname(b), last_coor, c->pos, c->tid+1);
162 if (b->core.tid >= 0 && b->core.bin < 4681) insert_offset2(&idx->index2[b->core.tid], b, last_off);
163 if (c->bin != last_bin) { // then possibly write the binning index
164 if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
165 insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
167 save_bin = last_bin = c->bin;
169 if (save_tid < 0) break;
171 if (bam_tell(fp) <= last_off) {
172 fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
173 (unsigned long long)bam_tell(fp), (unsigned long long)last_off);
176 last_off = bam_tell(fp);
177 last_coor = b->core.pos;
179 if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
181 if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
182 free(b->data); free(b);
186 void bam_index_destroy(bam_index_t *idx)
190 if (idx == 0) return;
191 for (i = 0; i < idx->n; ++i) {
192 khash_t(i) *index = idx->index[i];
193 bam_lidx_t *index2 = idx->index2 + i;
194 for (k = kh_begin(index); k != kh_end(index); ++k) {
195 if (kh_exist(index, k))
196 free(kh_value(index, k).list);
198 kh_destroy(i, index);
199 free(index2->offset);
201 free(idx->index); free(idx->index2);
205 void bam_index_save(const bam_index_t *idx, FILE *fp)
209 fwrite("BAI\1", 1, 4, fp);
212 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
213 } else fwrite(&idx->n, 4, 1, fp);
214 for (i = 0; i < idx->n; ++i) {
215 khash_t(i) *index = idx->index[i];
216 bam_lidx_t *index2 = idx->index2 + i;
217 // write binning index
218 size = kh_size(index);
219 if (bam_is_be) { // big endian
221 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
222 } else fwrite(&size, 4, 1, fp);
223 for (k = kh_begin(index); k != kh_end(index); ++k) {
224 if (kh_exist(index, k)) {
225 bam_binlist_t *p = &kh_value(index, k);
226 if (bam_is_be) { // big endian
228 x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
229 x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
230 for (x = 0; (int)x < p->n; ++x) {
231 bam_swap_endian_8p(&p->list[x].u);
232 bam_swap_endian_8p(&p->list[x].v);
234 fwrite(p->list, 16, p->n, fp);
235 for (x = 0; (int)x < p->n; ++x) {
236 bam_swap_endian_8p(&p->list[x].u);
237 bam_swap_endian_8p(&p->list[x].v);
240 fwrite(&kh_key(index, k), 4, 1, fp);
241 fwrite(&p->n, 4, 1, fp);
242 fwrite(p->list, 16, p->n, fp);
246 // write linear index (index2)
249 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
250 } else fwrite(&index2->n, 4, 1, fp);
251 if (bam_is_be) { // big endian
253 for (x = 0; (int)x < index2->n; ++x)
254 bam_swap_endian_8p(&index2->offset[x]);
255 fwrite(index2->offset, 8, index2->n, fp);
256 for (x = 0; (int)x < index2->n; ++x)
257 bam_swap_endian_8p(&index2->offset[x]);
258 } else fwrite(index2->offset, 8, index2->n, fp);
263 bam_index_t *bam_index_load(const char *fn)
268 char *fnidx, magic[4];
270 fnidx = (char*)calloc(strlen(fn) + 5, 1);
271 strcpy(fnidx, fn); strcat(fnidx, ".bai");
272 fp = fopen(fnidx, "r");
273 if (fp == 0) { // try "{base}.bai"
274 char *s = strstr(fn, "bam");
275 if (s == fn + strlen(fn) - 3) {
277 fnidx[strlen(fn)-1] = 'i';
278 fp = fopen(fnidx, "r");
282 fprintf(stderr, "[bam_index_load] the alignment is not indexed. Please run `index' command first. Abort!\n");
287 fread(magic, 1, 4, fp);
288 if (strncmp(magic, "BAI\1", 4)) {
289 fprintf(stderr, "[bam_index_load] wrong magic number.\n");
293 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
294 fread(&idx->n, 4, 1, fp);
295 if (bam_is_be) bam_swap_endian_4p(&idx->n);
296 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
297 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
298 for (i = 0; i < idx->n; ++i) {
300 bam_lidx_t *index2 = idx->index2 + i;
305 index = idx->index[i] = kh_init(i);
306 // load binning index
307 fread(&size, 4, 1, fp);
308 if (bam_is_be) bam_swap_endian_4p(&size);
309 for (j = 0; j < (int)size; ++j) {
310 fread(&key, 4, 1, fp);
311 if (bam_is_be) bam_swap_endian_4p(&key);
312 k = kh_put(i, index, key, &ret);
313 p = &kh_value(index, k);
314 fread(&p->n, 4, 1, fp);
315 if (bam_is_be) bam_swap_endian_4p(&p->n);
317 p->list = (pair64_t*)malloc(p->m * 16);
318 fread(p->list, 16, p->n, fp);
321 for (x = 0; x < p->n; ++x) {
322 bam_swap_endian_8p(&p->list[x].u);
323 bam_swap_endian_8p(&p->list[x].v);
328 fread(&index2->n, 4, 1, fp);
329 if (bam_is_be) bam_swap_endian_4p(&index2->n);
330 index2->m = index2->n;
331 index2->offset = (uint64_t*)calloc(index2->m, 8);
332 fread(index2->offset, index2->n, 8, fp);
334 for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
340 int bam_index_build2(const char *fn, const char *_fnidx)
346 assert(fp = bam_open(fn, "r"));
347 idx = bam_index_core(fp);
350 fnidx = (char*)calloc(strlen(fn) + 5, 1);
351 strcpy(fnidx, fn); strcat(fnidx, ".bai");
352 } else fnidx = strdup(_fnidx);
353 fpidx = fopen(fnidx, "w");
355 fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
359 bam_index_save(idx, fpidx);
360 bam_index_destroy(idx);
// Build the index for `fn' using the default index name "{fn}.bai".
int bam_index_build(const char *fn)
{
	return bam_index_build2(fn, 0);
}
// Entry point of `samtools index': argv[1] is the input BAM, optional
// argv[2] the output index name. Prints usage and returns 1 when no
// input is given; returns 0 otherwise.
int bam_index(int argc, char *argv[])
{
	if (argc < 2) {
		fprintf(stderr, "Usage: samtools index <in.bam> [<out.index>]\n");
		return 1;
	}
	if (argc >= 3) bam_index_build2(argv[1], argv[2]);
	else bam_index_build(argv[1]);
	return 0;
}
#define MAX_BIN 37450 // =(8^6-1)/7+1: total number of distinct bins
// Fill `list' with every bin number that may contain alignments overlapping
// the zero-based, end-exclusive region [beg,end); return the count written.
// One run of bins per level of the binning hierarchy (512Mbp down to 16Kbp);
// at most MAX_BIN entries are produced.
static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN])
{
	int i = 0, k;
	--end; // make the end coordinate inclusive
	list[i++] = 0; // bin 0 (512Mbp) overlaps everything
	for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
	for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
	for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
	for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
	for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
	return i;
}
397 static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
399 uint32_t rbeg = b->core.pos;
400 uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
401 return (rend > beg && rbeg < end);
404 int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
407 int i, n_bins, n_off;
413 bins = (uint16_t*)calloc(MAX_BIN, 2);
414 n_bins = reg2bins(beg, end, bins);
415 index = idx->index[tid];
416 min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? 0 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
417 for (i = n_off = 0; i < n_bins; ++i) {
418 if ((k = kh_get(i, index, bins[i])) != kh_end(index))
419 n_off += kh_value(index, k).n;
422 free(bins); return 0;
424 off = (pair64_t*)calloc(n_off, 16);
425 for (i = n_off = 0; i < n_bins; ++i) {
426 if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
428 bam_binlist_t *p = &kh_value(index, k);
429 for (j = 0; j < p->n; ++j)
430 if (p->list[j].v > min_off) off[n_off++] = p->list[j];
438 b = (bam1_t*)calloc(1, sizeof(bam1_t));
439 ks_introsort(off, n_off, off);
440 // resolve completely contained adjacent blocks
441 for (i = 1, l = 0; i < n_off; ++i)
442 if (off[l].v < off[i].v)
445 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
446 for (i = 1; i < n_off; ++i)
447 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
448 { // merge adjacent blocks
449 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
450 for (i = 1, l = 0; i < n_off; ++i) {
451 #ifdef BAM_TRUE_OFFSET
452 if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
454 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
456 else off[++l] = off[i];
461 // retrive alignments
462 n_seeks = 0; i = -1; curr_off = 0;
464 if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk
465 if (i == n_off - 1) break; // no more chunks
466 if (i >= 0) assert(curr_off == off[i].v); // otherwise bug
467 if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek
468 bam_seek(fp, off[i+1].u, SEEK_SET);
469 curr_off = bam_tell(fp);
474 if ((ret = bam_read1(fp, b)) > 0) {
475 curr_off = bam_tell(fp);
476 if (b->core.tid != tid || b->core.pos >= end) break; // no need to proceed
477 else if (is_overlap(beg, end, b)) func(b, data);
478 } else break; // end of file
480 // fprintf(stderr, "[bam_fetch] # seek calls: %d\n", n_seeks);