5 #include "bam_endian.h"
10 Alignment indexing. Before indexing, BAM must be sorted based on the
11 leftmost coordinate of alignments. In indexing, BAM uses two indices:
12 a UCSC binning index and a simple linear index. The binning index is
13 efficient for alignments spanning long distances, while the auxiliary
14 linear index helps to reduce unnecessary seek calls especially for
17 The UCSC binning scheme was suggested by Richard Durbin and Lincoln
18 Stein and is explained by Kent et al. (2002). In this scheme, each bin
19 represents a contiguous genomic region which can be fully contained in
20 another bin; each alignment is associated with a bin which represents
21 the smallest region containing the entire alignment. The binning
22 scheme is essentially another representation of R-tree. A distinct bin
23 uniquely corresponds to a distinct internal node in a R-tree. Bin A is
24 a child of Bin B if region A is contained in B.
26 In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
27 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
28 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
29 find the alignments overlapped with a region [rbeg,rend), we need to
30 calculate the list of bins that may overlap the region and test
31 the alignments in the bins to confirm the overlaps. If the specified
32 region is short, typically only a few alignments in six bins need to
33 be retrieved. The overlapping alignments can be quickly fetched.
37 #define BAM_MIN_CHUNK_GAP 32768
38 // 1<<14 is the size of minimum bin.
39 #define BAM_LIDX_SHIFT 14
45 #define pair64_lt(a,b) ((a).u < (b).u)
46 KSORT_INIT(off, pair64_t, pair64_lt)
58 KHASH_MAP_INIT_INT(i, bam_binlist_t)
60 struct __bam_index_t {
66 // requirement: len <= LEN_MASK
// Append the chunk [beg,end) (virtual file offsets) to the chunk list of
// `bin` in the per-reference binning hash `h`, creating the bin's entry and
// its list on first use and growing the list geometrically when full.
// NOTE(review): several lines are elided in this view (declarations of k,
// ret, l and the capacity checks); comments below describe only what is shown.
67 static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
72 	k = kh_put(i, h, bin, &ret); // find or create the slot for this bin
74 	if (ret) { // not present
76 		l->list = (pair64_t*)calloc(l->m, 16); // 16 == sizeof(pair64_t): {u,v} pair of uint64_t offsets
80 		l->list = (pair64_t*)realloc(l->list, l->m * 16); // grow chunk array (NOTE(review): realloc result unchecked)
82 	l->list[l->n].u = beg; l->list[l->n++].v = end; // record the chunk's start/end virtual offsets
// Update the linear index: for every 16Kbp window (BAM_LIDX_SHIFT == 14)
// overlapped by alignment `b`, record `offset` (the file offset of the first
// alignment reaching that window) if the window has no offset yet.
// NOTE(review): lines are elided here (declarations of beg/end/i and the loop
// start at `beg`); only the visible statements are annotated.
85 static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
88 	beg = b->core.pos >> BAM_LIDX_SHIFT; // first 16Kbp window touched
89 	end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT; // last window touched (end from CIGAR)
90 	if (index2->m < end + 1) { // grow the offset array to cover window `end`
91 		int old_m = index2->m;
93 		kroundup32(index2->m); // round capacity up to next power of two
94 		index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8); // NOTE(review): realloc result unchecked
95 		memset(index2->offset + old_m, 0, 8 * (index2->m - old_m)); // zero the newly added windows (0 == "unset")
97 	for (i = beg + 1; i <= end; ++i)
98 		if (index2->offset[i] == 0) index2->offset[i] = offset; // keep only the earliest offset per window
// Post-process the binning index: merge adjacent chunks within each bin when
// the gap between them is small (true-offset mode: < BAM_MIN_CHUNK_GAP bytes;
// virtual-offset mode: same compressed block, i.e. equal offset>>16), so that
// fetching reads through small gaps instead of seeking.
// NOTE(review): declarations of i, k, l, m, index, p and the loop epilogue are
// elided from this view.
102 static void merge_chunks(bam_index_t *idx)
104 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
108 	for (i = 0; i < idx->n; ++i) { // for each reference sequence
109 		index = idx->index[i];
110 		for (k = kh_begin(index); k != kh_end(index); ++k) { // for each bin
112 			if (!kh_exist(index, k)) continue;
113 			p = &kh_value(index, k);
115 			for (l = 1; l < p->n; ++l) { // m is the last kept chunk; l scans forward
116 #ifdef BAM_TRUE_OFFSET
117 				if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v; // gap small: extend kept chunk
119 				if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v; // same BGZF block: extend kept chunk
121 				else p->list[++m] = p->list[l]; // otherwise keep as a separate chunk
126 #endif // defined(BAM_TRUE_OFFSET) || defined(BAM_BGZF)
// Build an in-memory BAM index by a single sequential scan of `fp`.
// Requires the BAM to be coordinate-sorted; emits an error on out-of-order
// records. Maintains, per reference: a binning index (chunks of records
// sharing a bin, via insert_offset) and a linear index (via insert_offset2).
// Returns the newly allocated index; caller frees with bam_index_destroy.
// NOTE(review): declarations of c, i, ret, idx, b, h and several braces are
// elided from this view; `c` presumably aliases &b->core — confirm upstream.
129 bam_index_t *bam_index_core(bamFile fp)
135 	uint32_t last_bin, save_bin;
136 	int32_t last_coor, last_tid, save_tid;
138 	uint64_t save_off, last_off;
140 	idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
141 	b = (bam1_t*)calloc(1, sizeof(bam1_t));
142 	h = bam_header_read(fp); // header only needed for n_targets
145 	idx->n = h->n_targets;
146 	bam_header_destroy(h);
147 	idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
148 	for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i); // one binning hash per reference
149 	idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t)); // one linear index per reference
	// 0xffffffffu sentinels mean "no record seen yet"
151 	save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
152 	save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
153 	while ((ret = bam_read1(fp, b)) >= 0) {
154 		if (last_tid != c->tid) { // change of chromosomes
156 			last_bin = 0xffffffffu;
157 		} else if (last_coor > c->pos) { // sort-order violation: refuse to index
158 			fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
159 					bam1_qname(b), last_coor, c->pos, c->tid+1);
		// only records in the smallest (16Kbp) bins feed the linear index
162 		if (b->core.tid >= 0 && b->core.bin < 4681) insert_offset2(&idx->index2[b->core.tid], b, last_off);
163 		if (c->bin != last_bin) { // then possibly write the binning index
164 			if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
165 				insert_offset(idx->index[save_tid], save_bin, save_off, last_off); // flush the finished chunk
167 			save_bin = last_bin = c->bin;
169 			if (save_tid < 0) break; // unmapped tail (tid == -1): stop indexing
171 		if (bam_tell(fp) <= last_off) { // file offset must strictly advance
172 			fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
173 					(unsigned long long)bam_tell(fp), (unsigned long long)last_off);
176 		last_off = bam_tell(fp);
177 		last_coor = b->core.pos;
	// flush the final chunk, if any record was mapped
179 	if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
181 	if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
182 	free(b->data); free(b);
// Free an index created by bam_index_core() or bam_index_load_core():
// every bin's chunk list, each per-reference hash, each linear-index array,
// then the top-level arrays. Safe to call with a NULL index.
// NOTE(review): declarations of i and k, some braces, and the final
// free(idx) are elided from this view.
186 void bam_index_destroy(bam_index_t *idx)
190 	if (idx == 0) return; // no-op on NULL, mirroring free()
191 	for (i = 0; i < idx->n; ++i) {
192 		khash_t(i) *index = idx->index[i];
193 		bam_lidx_t *index2 = idx->index2 + i;
194 		for (k = kh_begin(index); k != kh_end(index); ++k) {
195 			if (kh_exist(index, k))
196 				free(kh_value(index, k).list); // chunk array owned by the bin entry
198 		kh_destroy(i, index);
199 		free(index2->offset);
201 	free(idx->index); free(idx->index2);
// Serialize `idx` to `fp` in BAI format: magic "BAI\1", n_ref, then per
// reference the binning index (n_bin, then per bin: bin number, n_chunk,
// chunk pairs) followed by the linear index (n_intv, offsets). On big-endian
// hosts every integer is byte-swapped before writing; 8-byte fields are
// swapped in place and swapped back so the in-memory index stays usable.
// NOTE(review): declarations of i, k, x, size and some braces are elided;
// fwrite return values are not checked in the visible lines.
205 void bam_index_save(const bam_index_t *idx, FILE *fp)
209 	fwrite("BAI\1", 1, 4, fp); // 4-byte magic
212 		fwrite(bam_swap_endian_4p(&x), 4, 1, fp); // n_ref, byte-swapped
213 	} else fwrite(&idx->n, 4, 1, fp); // n_ref, native little-endian
214 	for (i = 0; i < idx->n; ++i) {
215 		khash_t(i) *index = idx->index[i];
216 		bam_lidx_t *index2 = idx->index2 + i;
217 		// write binning index
218 		size = kh_size(index);
219 		if (bam_is_be) { // big endian
221 			fwrite(bam_swap_endian_4p(&x), 4, 1, fp); // n_bin
222 		} else fwrite(&size, 4, 1, fp);
223 		for (k = kh_begin(index); k != kh_end(index); ++k) {
224 			if (kh_exist(index, k)) {
225 				bam_binlist_t *p = &kh_value(index, k);
226 				if (bam_is_be) { // big endian
228 					x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp); // bin number
229 					x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp); // n_chunk
					// swap chunk pairs in place, write, then swap back to restore memory
230 					for (x = 0; (int)x < p->n; ++x) {
231 						bam_swap_endian_8p(&p->list[x].u);
232 						bam_swap_endian_8p(&p->list[x].v);
234 					fwrite(p->list, 16, p->n, fp);
235 					for (x = 0; (int)x < p->n; ++x) {
236 						bam_swap_endian_8p(&p->list[x].u);
237 						bam_swap_endian_8p(&p->list[x].v);
					// little-endian host: write fields directly
240 					fwrite(&kh_key(index, k), 4, 1, fp);
241 					fwrite(&p->n, 4, 1, fp);
242 					fwrite(p->list, 16, p->n, fp);
246 		// write linear index (index2)
249 			fwrite(bam_swap_endian_4p(&x), 4, 1, fp); // n_intv, byte-swapped
250 		} else fwrite(&index2->n, 4, 1, fp);
251 		if (bam_is_be) { // big endian
			// same swap/write/swap-back dance for the 64-bit linear offsets
253 			for (x = 0; (int)x < index2->n; ++x)
254 				bam_swap_endian_8p(&index2->offset[x]);
255 			fwrite(index2->offset, 8, index2->n, fp);
256 			for (x = 0; (int)x < index2->n; ++x)
257 				bam_swap_endian_8p(&index2->offset[x]);
258 		} else fwrite(index2->offset, 8, index2->n, fp);
// Deserialize a BAI index from `fp` (inverse of bam_index_save): validate the
// magic, read n_ref, then for each reference rebuild the per-bin chunk hash
// and the linear index, byte-swapping on big-endian hosts.
// NOTE(review): declarations (idx, magic, i, j, k, key, size, ret, x, p,
// index) and the NULL-fp / bad-magic early returns are elided from this view.
// NOTE(review): fread return values are unchecked in the visible lines, so a
// truncated index file would be silently misread.
263 static bam_index_t *bam_index_load_core(FILE *fp)
269 		fprintf(stderr, "[bam_index_load_core] fail to load index.\n");
272 	fread(magic, 1, 4, fp);
273 	if (strncmp(magic, "BAI\1", 4)) { // magic mismatch: not a BAI file
274 		fprintf(stderr, "[bam_index_load] wrong magic number.\n");
278 	idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
279 	fread(&idx->n, 4, 1, fp); // n_ref
280 	if (bam_is_be) bam_swap_endian_4p(&idx->n);
281 	idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
282 	idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
283 	for (i = 0; i < idx->n; ++i) {
285 		bam_lidx_t *index2 = idx->index2 + i;
290 		index = idx->index[i] = kh_init(i);
291 		// load binning index
292 		fread(&size, 4, 1, fp); // n_bin for this reference
293 		if (bam_is_be) bam_swap_endian_4p(&size);
294 		for (j = 0; j < (int)size; ++j) {
295 			fread(&key, 4, 1, fp); // bin number
296 			if (bam_is_be) bam_swap_endian_4p(&key);
297 			k = kh_put(i, index, key, &ret);
298 			p = &kh_value(index, k);
299 			fread(&p->n, 4, 1, fp); // n_chunk
300 			if (bam_is_be) bam_swap_endian_4p(&p->n);
302 			p->list = (pair64_t*)malloc(p->m * 16); // 16 == sizeof(pair64_t)
303 			fread(p->list, 16, p->n, fp);
			// big-endian host: swap each chunk pair after reading
306 				for (x = 0; x < p->n; ++x) {
307 					bam_swap_endian_8p(&p->list[x].u);
308 					bam_swap_endian_8p(&p->list[x].v);
		// load linear index
313 		fread(&index2->n, 4, 1, fp); // n_intv
314 		if (bam_is_be) bam_swap_endian_4p(&index2->n);
315 		index2->m = index2->n;
316 		index2->offset = (uint64_t*)calloc(index2->m, 8);
317 		fread(index2->offset, index2->n, 8, fp); // NOTE(review): size/count arguments swapped vs. the writes elsewhere (8, n) — same byte total, but fread's return is then a byte-ish count; confirm intent
319 			for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
// Load a BAI index directly from an explicit index filename `fnidx`.
// NOTE(review): the fopen-failure path and the fclose/return lines are
// elided from this view; presumably fp==NULL is handled before use — confirm.
324 bam_index_t *bam_index_load2(const char *fnidx)
327 	FILE *fp = fopen(fnidx, "r");
328 	idx = bam_index_load_core(fp);
// Load the index for BAM file `_fn`: for FTP URLs keep only the basename,
// then try "<fn>.bai"; if that file is absent and fn ends in "bam", also try
// the "<base>.bai" spelling (replacing the trailing 'm' with 'i').
// NOTE(review): declarations (idx, fn, fnidx, fp, l, p) and several lines,
// including the still-not-found error path, are elided from this view.
333 bam_index_t *bam_index_load(const char *_fn)
339 	if (strstr(_fn, "ftp://") == _fn) { // remote URL: index lives next to local basename
342 		for (p = _fn + l - 1; p >= _fn; --p)
343 			if (*p == '/') break; // find last path separator
345 	} else fn = strdup(_fn);
346 	fnidx = (char*)calloc(strlen(fn) + 5, 1); // room for ".bai" + NUL
347 	strcpy(fnidx, fn); strcat(fnidx, ".bai"); // first candidate: "<fn>.bai"
348 	fp = fopen(fnidx, "r");
349 	if (fp == 0) { // try "{base}.bai"
350 		char *s = strstr(fn, "bam");
351 		if (s == fn + strlen(fn) - 3) { // fn really ends with "bam"
353 			fnidx[strlen(fn)-1] = 'i'; // "...bam.bai" -> "...bai.bai"? no: truncates to "...bai" — see elided line 352
354 			fp = fopen(fnidx, "r");
357 	free(fnidx); free(fn);
358 	idx = bam_index_load_core(fp);
// Build the index for BAM file `fn` and write it to `_fnidx`, or to
// "<fn>.bai" when `_fnidx` is NULL. Returns per the elided tail (not visible
// here). NOTE(review): declarations (fp, fpidx, idx, fnidx), bam_close, and
// the cleanup/return lines are elided from this view.
363 int bam_index_build2(const char *fn, const char *_fnidx)
369 	assert(fp = bam_open(fn, "r")); // NOTE(review): side effect inside assert — vanishes under NDEBUG
370 	idx = bam_index_core(fp);
373 		fnidx = (char*)calloc(strlen(fn) + 5, 1); // default name: "<fn>.bai"
374 		strcpy(fnidx, fn); strcat(fnidx, ".bai");
375 	} else fnidx = strdup(_fnidx);
376 	fpidx = fopen(fnidx, "w");
378 		fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
382 	bam_index_save(idx, fpidx);
383 	bam_index_destroy(idx);
// Convenience wrapper: build the index for `fn` with the default output
// name "<fn>.bai" (NULL index filename delegates naming to bam_index_build2).
389 int bam_index_build(const char *fn)
391 	return bam_index_build2(fn, 0);
// Command-line entry point for "samtools index": argv[1] is the input BAM,
// optional argv[2] the output index name. NOTE(review): the argc check
// guarding the usage message and the return statements are elided here.
394 int bam_index(int argc, char *argv[])
397 		fprintf(stderr, "Usage: samtools index <in.bam> [<out.index>]\n");
400 	if (argc >= 3) bam_index_build2(argv[1], argv[2]);
401 	else bam_index_build(argv[1]);
405 #define MAX_BIN 37450 // =(8^6-1)/7+1
// Fill `list` with every bin number that may contain alignments overlapping
// [beg,end) and return the count. One candidate run per bin level: shifts 26,
// 23, 20, 17, 14 select the 64Mbp, 8Mbp, 1Mbp, 128Kbp and 16Kbp levels, with
// offsets 1, 9, 73, 585, 4681 being the first bin number at each level (bin 0,
// the 512Mbp root, is added in an elided line). NOTE(review): the i/k
// declarations, bin-0 append, end-clamp and return are elided from this view.
407 static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN])
412 	for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
413 	for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
414 	for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
415 	for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
416 	for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
// Return nonzero iff alignment `b` overlaps the zero-based half-open region
// [beg,end). The reference end is computed from the CIGAR when present;
// records with no CIGAR are treated as covering a single base at pos.
420 static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
422 	uint32_t rbeg = b->core.pos;
423 	uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
424 	return (rend > beg && rbeg < end); // standard half-open interval overlap test
// Retrieve all alignments on reference `tid` overlapping [beg,end) and invoke
// `func(b, data)` on each. Strategy: (1) compute candidate bins with
// reg2bins; (2) collect their chunks, pruning those ending at or before the
// linear index's lower bound `min_off`; (3) sort, dedupe and merge the chunk
// list to minimize seeks; (4) stream records chunk by chunk, stopping as soon
// as a record starts at or beyond `end` (valid because the BAM is
// coordinate-sorted). NOTE(review): many lines (declarations, several closing
// braces, the read loop header, and the function tail freeing b/off) are
// elided from this view; the function continues past the last visible line.
427 int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
430 	int i, n_bins, n_off;
436 	bins = (uint16_t*)calloc(MAX_BIN, 2);
437 	n_bins = reg2bins(beg, end, bins); // candidate bins overlapping the region
438 	index = idx->index[tid];
	// linear-index lower bound: no chunk ending before this offset can
	// contain a record overlapping [beg,end); 0 disables the pruning when
	// beg's window is beyond the linear index.
439 	min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? 0 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
	// first pass: count chunks to size the `off` array
440 	for (i = n_off = 0; i < n_bins; ++i) {
441 		if ((k = kh_get(i, index, bins[i])) != kh_end(index))
442 			n_off += kh_value(index, k).n;
445 		free(bins); return 0; // no candidate chunks: nothing to fetch
447 	off = (pair64_t*)calloc(n_off, 16); // 16 == sizeof(pair64_t)
	// second pass: copy chunks, skipping those wholly before min_off
448 	for (i = n_off = 0; i < n_bins; ++i) {
449 		if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
451 			bam_binlist_t *p = &kh_value(index, k);
452 			for (j = 0; j < p->n; ++j)
453 				if (p->list[j].v > min_off) off[n_off++] = p->list[j];
461 	b = (bam1_t*)calloc(1, sizeof(bam1_t));
462 	ks_introsort(off, n_off, off); // sort chunks by start offset (pair64_lt on .u)
463 	// resolve completely contained adjacent blocks
464 	for (i = 1, l = 0; i < n_off; ++i)
465 		if (off[l].v < off[i].v) // keep only chunks extending past the previous kept one
468 	// resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
469 	for (i = 1; i < n_off; ++i)
470 		if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u; // truncate to make chunks disjoint
471 	{ // merge adjacent blocks
472 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
473 		for (i = 1, l = 0; i < n_off; ++i) {
474 #ifdef BAM_TRUE_OFFSET
475 			if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v; // small gap: coalesce
477 			if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v; // same BGZF block: coalesce
479 			else off[++l] = off[i];
484 	// retrive alignments
485 	n_seeks = 0; i = -1; curr_off = 0; // i indexes the current chunk; -1 = before the first
487 		if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk
488 			if (i == n_off - 1) break; // no more chunks
489 			if (i >= 0) assert(curr_off == off[i].v); // otherwise bug
490 			if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek
491 				bam_seek(fp, off[i+1].u, SEEK_SET);
492 				curr_off = bam_tell(fp);
497 		if ((ret = bam_read1(fp, b)) > 0) {
498 			curr_off = bam_tell(fp);
499 			if (b->core.tid != tid || b->core.pos >= end) break; // no need to proceed
500 			else if (is_overlap(beg, end, b)) func(b, data); // deliver overlapping record to callback
501 		} else break; // end of file
503 	// fprintf(stderr, "[bam_fetch] # seek calls: %d\n", n_seeks);