5 #include "bam_endian.h"
10 Alignment indexing. Before indexing, BAM must be sorted based on the
11 leftmost coordinate of alignments. In indexing, BAM uses two indices:
12 a UCSC binning index and a simple linear index. The binning index is
13 efficient for alignments spanning long distance, while the auxiliary
14 linear index helps to reduce unnecessary seek calls especially for
17 The UCSC binning scheme was suggested by Richard Durbin and Lincoln
18 Stein and is explained by Kent et al. (2002). In this scheme, each bin
19 represents a contiguous genomic region which can be fully contained in
20 another bin; each alignment is associated with a bin which represents
21 the smallest region containing the entire alignment. The binning
22 scheme is essentially another representation of R-tree. A distinct bin
23 uniquely corresponds to a distinct internal node in a R-tree. Bin A is
24 a child of Bin B if region A is contained in B.
26 In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
27 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
28 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
29 find the alignments overlapped with a region [rbeg,rend), we need to
30 calculate the list of bins that may be overlapped the region and test
31 the alignments in the bins to confirm the overlaps. If the specified
32 region is short, typically only a few alignments in six bins need to
33 be retrieved. The overlapping alignments can be quickly fetched.
37 #define BAM_MIN_CHUNK_GAP 32768
38 // 1<<14 is the size of minimum bin.
39 #define BAM_LIDX_SHIFT 14
45 #define pair64_lt(a,b) ((a).u < (b).u)
46 KSORT_INIT(off, pair64_t, pair64_lt)
58 KHASH_MAP_INIT_INT(i, bam_binlist_t)
60 struct __bam_index_t {
66 // requirement: len <= LEN_MASK
67 static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
72 k = kh_put(i, h, bin, &ret);
74 if (ret) { // not present
76 l->list = (pair64_t*)calloc(l->m, 16);
80 l->list = (pair64_t*)realloc(l->list, l->m * 16);
82 l->list[l->n].u = beg; l->list[l->n++].v = end;
85 static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
88 beg = b->core.pos >> BAM_LIDX_SHIFT;
89 end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
90 if (index2->m < end + 1) {
91 int old_m = index2->m;
93 kroundup32(index2->m);
94 index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
95 memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
97 for (i = beg + 1; i <= end; ++i)
98 if (index2->offset[i] == 0) index2->offset[i] = offset;
102 static void merge_chunks(bam_index_t *idx)
104 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
108 for (i = 0; i < idx->n; ++i) {
109 index = idx->index[i];
110 for (k = kh_begin(index); k != kh_end(index); ++k) {
112 if (!kh_exist(index, k)) continue;
113 p = &kh_value(index, k);
115 for (l = 1; l < p->n; ++l) {
116 #ifdef BAM_TRUE_OFFSET
117 if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
119 if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
121 else p->list[++m] = p->list[l];
126 #endif // defined(BAM_TRUE_OFFSET) || defined(BAM_BGZF)
129 bam_index_t *bam_index_core(bamFile fp)
135 uint32_t last_bin, save_bin;
136 int32_t last_coor, last_tid, save_tid;
138 uint64_t save_off, last_off;
140 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
141 b = (bam1_t*)calloc(1, sizeof(bam1_t));
142 h = bam_header_read(fp);
145 idx->n = h->n_targets;
146 bam_header_destroy(h);
147 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
148 for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
149 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
151 save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
152 save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
153 while ((ret = bam_read1(fp, b)) >= 0) {
154 if (last_tid != c->tid) { // change of chromosomes
156 last_bin = 0xffffffffu;
157 } else if (last_coor > c->pos) {
158 fprintf(stderr, "[bam_index_core] the alignment is not sorted. Abort!\n");
161 if (b->core.tid >= 0 && b->core.bin < 4681) insert_offset2(&idx->index2[b->core.tid], b, last_off);
162 if (c->bin != last_bin) { // then possibly write the binning index
163 if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
164 insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
166 save_bin = last_bin = c->bin;
168 if (save_tid < 0) break;
170 if (bam_tell(fp) <= last_off) {
171 fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
172 (unsigned long long)bam_tell(fp), (unsigned long long)last_off);
175 last_off = bam_tell(fp);
176 last_coor = b->core.pos;
178 if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
180 if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
181 free(b->data); free(b);
185 void bam_index_destroy(bam_index_t *idx)
189 if (idx == 0) return;
190 for (i = 0; i < idx->n; ++i) {
191 khash_t(i) *index = idx->index[i];
192 bam_lidx_t *index2 = idx->index2 + i;
193 for (k = kh_begin(index); k != kh_end(index); ++k) {
194 if (kh_exist(index, k))
195 free(kh_value(index, k).list);
197 kh_destroy(i, index);
198 free(index2->offset);
200 free(idx->index); free(idx->index2);
204 void bam_index_save(const bam_index_t *idx, FILE *fp)
208 fwrite("BAI\1", 1, 4, fp);
211 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
212 } else fwrite(&idx->n, 4, 1, fp);
213 for (i = 0; i < idx->n; ++i) {
214 khash_t(i) *index = idx->index[i];
215 bam_lidx_t *index2 = idx->index2 + i;
216 // write binning index
217 size = kh_size(index);
218 if (bam_is_be) { // big endian
220 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
221 } else fwrite(&size, 4, 1, fp);
222 for (k = kh_begin(index); k != kh_end(index); ++k) {
223 if (kh_exist(index, k)) {
224 bam_binlist_t *p = &kh_value(index, k);
225 if (bam_is_be) { // big endian
227 x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
228 x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
229 for (x = 0; (int)x < p->n; ++x) {
230 bam_swap_endian_8p(&p->list[x].u);
231 bam_swap_endian_8p(&p->list[x].v);
233 fwrite(p->list, 16, p->n, fp);
234 for (x = 0; (int)x < p->n; ++x) {
235 bam_swap_endian_8p(&p->list[x].u);
236 bam_swap_endian_8p(&p->list[x].v);
239 fwrite(&kh_key(index, k), 4, 1, fp);
240 fwrite(&p->n, 4, 1, fp);
241 fwrite(p->list, 16, p->n, fp);
245 // write linear index (index2)
248 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
249 } else fwrite(&index2->n, 4, 1, fp);
250 if (bam_is_be) { // big endian
252 for (x = 0; (int)x < index2->n; ++x)
253 bam_swap_endian_8p(&index2->offset[x]);
254 fwrite(index2->offset, 8, index2->n, fp);
255 for (x = 0; (int)x < index2->n; ++x)
256 bam_swap_endian_8p(&index2->offset[x]);
257 } else fwrite(index2->offset, 8, index2->n, fp);
262 bam_index_t *bam_index_load(const char *fn)
267 char *fnidx, magic[4];
269 fnidx = (char*)calloc(strlen(fn) + 5, 1);
270 strcpy(fnidx, fn); strcat(fnidx, ".bai");
271 fp = fopen(fnidx, "r");
272 if (fp == 0) { // try "{base}.bai"
273 char *s = strstr(fn, "bam");
274 if (s == fn + strlen(fn) - 3) {
276 fnidx[strlen(fn)-1] = 'i';
277 fp = fopen(fnidx, "r");
281 fprintf(stderr, "[bam_index_load] the alignment is not indexed. Please run `index' command first. Abort!\n");
286 fread(magic, 1, 4, fp);
287 if (strncmp(magic, "BAI\1", 4)) {
288 fprintf(stderr, "[bam_index_load] wrong magic number.\n");
292 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
293 fread(&idx->n, 4, 1, fp);
294 if (bam_is_be) bam_swap_endian_4p(&idx->n);
295 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
296 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
297 for (i = 0; i < idx->n; ++i) {
299 bam_lidx_t *index2 = idx->index2 + i;
304 index = idx->index[i] = kh_init(i);
305 // load binning index
306 fread(&size, 4, 1, fp);
307 if (bam_is_be) bam_swap_endian_4p(&size);
308 for (j = 0; j < (int)size; ++j) {
309 fread(&key, 4, 1, fp);
310 if (bam_is_be) bam_swap_endian_4p(&key);
311 k = kh_put(i, index, key, &ret);
312 p = &kh_value(index, k);
313 fread(&p->n, 4, 1, fp);
314 if (bam_is_be) bam_swap_endian_4p(&p->n);
316 p->list = (pair64_t*)malloc(p->m * 16);
317 fread(p->list, 16, p->n, fp);
320 for (x = 0; x < p->n; ++x) {
321 bam_swap_endian_8p(&p->list[x].u);
322 bam_swap_endian_8p(&p->list[x].v);
327 fread(&index2->n, 4, 1, fp);
328 if (bam_is_be) bam_swap_endian_4p(&index2->n);
329 index2->m = index2->n;
330 index2->offset = (uint64_t*)calloc(index2->m, 8);
331 fread(index2->offset, index2->n, 8, fp);
333 for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
339 int bam_index_build2(const char *fn, const char *_fnidx)
345 assert(fp = bam_open(fn, "r"));
346 idx = bam_index_core(fp);
349 fnidx = (char*)calloc(strlen(fn) + 5, 1);
350 strcpy(fnidx, fn); strcat(fnidx, ".bai");
351 } else fnidx = strdup(_fnidx);
352 fpidx = fopen(fnidx, "w");
354 fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
358 bam_index_save(idx, fpidx);
359 bam_index_destroy(idx);
365 int bam_index_build(const char *fn)
367 return bam_index_build2(fn, 0);
370 int bam_index(int argc, char *argv[])
373 fprintf(stderr, "Usage: samtools index <in.bam> [<out.index>]\n");
376 if (argc >= 3) bam_index_build2(argv[1], argv[2]);
377 else bam_index_build(argv[1]);
381 #define MAX_BIN 37450 // =(8^6-1)/7+1
383 static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN])
388 for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
389 for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
390 for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
391 for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
392 for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
396 static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
398 uint32_t rbeg = b->core.pos;
399 uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
400 return (rend > beg && rbeg < end);
403 int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
406 int i, n_bins, n_off;
412 bins = (uint16_t*)calloc(MAX_BIN, 2);
413 n_bins = reg2bins(beg, end, bins);
414 index = idx->index[tid];
415 min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? 0 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
416 for (i = n_off = 0; i < n_bins; ++i) {
417 if ((k = kh_get(i, index, bins[i])) != kh_end(index))
418 n_off += kh_value(index, k).n;
421 free(bins); return 0;
423 off = (pair64_t*)calloc(n_off, 16);
424 for (i = n_off = 0; i < n_bins; ++i) {
425 if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
427 bam_binlist_t *p = &kh_value(index, k);
428 for (j = 0; j < p->n; ++j)
429 if (p->list[j].v > min_off) off[n_off++] = p->list[j];
437 b = (bam1_t*)calloc(1, sizeof(bam1_t));
438 ks_introsort(off, n_off, off);
439 // resolve completely contained adjacent blocks
440 for (i = 1, l = 0; i < n_off; ++i)
441 if (off[l].v < off[i].v)
444 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
445 for (i = 1; i < n_off; ++i)
446 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
447 { // merge adjacent blocks
448 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
449 for (i = 1, l = 0; i < n_off; ++i) {
450 #ifdef BAM_TRUE_OFFSET
451 if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
453 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
455 else off[++l] = off[i];
460 // retrive alignments
461 n_seeks = 0; i = -1; curr_off = 0;
463 if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk
464 if (i == n_off - 1) break; // no more chunks
465 if (i >= 0) assert(curr_off == off[i].v); // otherwise bug
466 if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek
467 bam_seek(fp, off[i+1].u, SEEK_SET);
468 curr_off = bam_tell(fp);
473 if ((ret = bam_read1(fp, b)) > 0) {
474 curr_off = bam_tell(fp);
475 if (b->core.tid != tid || b->core.pos >= end) break; // no need to proceed
476 else if (is_overlap(beg, end, b)) func(b, data);
477 } else break; // end of file
479 // fprintf(stderr, "[bam_fetch] # seek calls: %d\n", n_seeks);