5 #include "bam_endian.h"
11 Alignment indexing. Before indexing, BAM must be sorted based on the
12 leftmost coordinate of alignments. In indexing, BAM uses two indices:
13 a UCSC binning index and a simple linear index. The binning index is
efficient for alignments spanning long distances, while the auxiliary
15 linear index helps to reduce unnecessary seek calls especially for
18 The UCSC binning scheme was suggested by Richard Durbin and Lincoln
19 Stein and is explained by Kent et al. (2002). In this scheme, each bin
20 represents a contiguous genomic region which can be fully contained in
21 another bin; each alignment is associated with a bin which represents
22 the smallest region containing the entire alignment. The binning
scheme is essentially another representation of an R-tree. A distinct bin
uniquely corresponds to a distinct internal node in an R-tree. Bin A is
25 a child of Bin B if region A is contained in B.
27 In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
28 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
29 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
30 find the alignments overlapped with a region [rbeg,rend), we need to
calculate the list of bins that may overlap the region and test
32 the alignments in the bins to confirm the overlaps. If the specified
33 region is short, typically only a few alignments in six bins need to
34 be retrieved. The overlapping alignments can be quickly fetched.
// Two neighbouring chunks of the same bin are merged when the gap between
// them is smaller than this (used under BAM_TRUE_OFFSET; see merge_chunks).
#define BAM_MIN_CHUNK_GAP 32768
// 1<<14 is the size of minimum bin.
#define BAM_LIDX_SHIFT 14

// Order pair64_t chunks by their start file offset (.u).
#define pair64_lt(a,b) ((a).u < (b).u)
KSORT_INIT(off, pair64_t, pair64_lt)

// Hash map: bin number -> bam_binlist_t (the bin's list of chunk offsets).
KHASH_MAP_INIT_INT(i, bam_binlist_t)
// Top-level index object: per-target binning hash (index) and linear
// index (index2), as used throughout this file.
// NOTE(review): the struct body is elided in this view — confirm the
// field list against the full source.
struct __bam_index_t {
// requirement: len <= LEN_MASK
// Record the chunk [beg,end) (file offsets) under `bin` in hash h. The
// bin's chunk list is created on first insertion and grown when full.
// NOTE(review): several body lines (local declarations, the capacity
// checks around calloc/realloc) are elided in this view.
static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
	k = kh_put(i, h, bin, &ret);
	if (ret) { // not present
		l->list = (pair64_t*)calloc(l->m, 16); // 16 == sizeof(pair64_t)
		l->list = (pair64_t*)realloc(l->list, l->m * 16); // grow the chunk list
	l->list[l->n].u = beg; l->list[l->n++].v = end; // append the chunk
// Update the linear index for alignment b: every 16kb window (1<<BAM_LIDX_SHIFT)
// spanned by b whose slot is still 0 receives `offset` — i.e. each window
// remembers the file offset of the first alignment that touches it.
static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
	beg = b->core.pos >> BAM_LIDX_SHIFT; // first window overlapped
	end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT; // last window
	if (index2->m < end + 1) { // grow (and zero-fill) offset[] on demand
		int old_m = index2->m;
		kroundup32(index2->m); // round capacity up to a power of two
		index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
		memset(index2->offset + old_m, 0, 8 * (index2->m - old_m)); // zero the new tail
	// NOTE(review): the handling of window `beg` itself appears to be on an
	// elided line; the visible loop starts at beg + 1.
	for (i = beg + 1; i <= end; ++i)
		if (index2->offset[i] == 0) index2->offset[i] = offset; // keep the first offset seen
// Post-process the binning index: within each bin, merge chunks that are
// adjacent (or nearly so) to cut down on seek calls at fetch time.
// NOTE(review): loop-local declarations and some closing lines are elided
// in this view.
static void merge_chunks(bam_index_t *idx)
#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
	for (i = 0; i < idx->n; ++i) { // for each target sequence
		index = idx->index[i];
		for (k = kh_begin(index); k != kh_end(index); ++k) { // for each bin
			if (!kh_exist(index, k)) continue;
			p = &kh_value(index, k);
			for (l = 1; l < p->n; ++l) { // in-place merge scan over the chunk list
#ifdef BAM_TRUE_OFFSET
				// merge when the gap between chunks is below BAM_MIN_CHUNK_GAP
				if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
				// merge when both virtual offsets address the same BGZF block
				// (upper bits of a virtual offset select the compressed block)
				if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
				else p->list[++m] = p->list[l]; // keep as a separate chunk
#endif // defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
// Single pass over a coordinate-sorted BAM building the in-memory index:
// per-bin chunk lists (insert_offset) plus the 16kb linear index
// (insert_offset2).
// NOTE(review): many lines (declarations, error paths, some bin/offset
// bookkeeping) are elided in this view; comments cover the visible logic.
bam_index_t *bam_index_core(bamFile fp)
	uint32_t last_bin, save_bin;
	int32_t last_coor, last_tid, save_tid;
	uint64_t save_off, last_off;
	idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
	b = (bam1_t*)calloc(1, sizeof(bam1_t));
	h = bam_header_read(fp); // header only needed for n_targets
	idx->n = h->n_targets;
	bam_header_destroy(h);
	idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
	for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
	idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
	// 0xffffffffu marks "no previous record yet"
	save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
	save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
	while ((ret = bam_read1(fp, b)) >= 0) { // one record at a time
		if (last_tid != c->tid) { // change of chromosomes
			last_bin = 0xffffffffu;
		} else if (last_coor > c->pos) { // input must be coordinate-sorted
			fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
				bam1_qname(b), last_coor, c->pos, c->tid+1);
		// linear index is only fed for bins < 4681, i.e. bins wider than
		// 16kb (see the binning description in the header comment)
		if (b->core.tid >= 0 && b->core.bin < 4681) insert_offset2(&idx->index2[b->core.tid], b, last_off);
		if (c->bin != last_bin) { // then possibly write the binning index
			if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
				insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
			save_bin = last_bin = c->bin;
			if (save_tid < 0) break;
		// file offsets must strictly increase between records
		// NOTE(review): the message prints '<' but the test above is '<='
		if (bam_tell(fp) <= last_off) {
			fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
				(unsigned long long)bam_tell(fp), (unsigned long long)last_off);
		last_off = bam_tell(fp);
		last_coor = b->core.pos;
	// flush the chunk of the final bin
	if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
	if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
	free(b->data); free(b);
// Free an index and everything it owns. A NULL idx is a no-op.
// NOTE(review): some closing/free lines are elided in this view.
void bam_index_destroy(bam_index_t *idx)
	if (idx == 0) return;
	for (i = 0; i < idx->n; ++i) { // per-target cleanup
		khash_t(i) *index = idx->index[i];
		bam_lidx_t *index2 = idx->index2 + i;
		for (k = kh_begin(index); k != kh_end(index); ++k) {
			if (kh_exist(index, k))
				free(kh_value(index, k).list); // this bin's chunk list
		kh_destroy(i, index);
		free(index2->offset); // linear-index array
	free(idx->index); free(idx->index2);
// Serialize the index to `fp` in BAI format. The on-disk format is
// little-endian, so every integer is byte-swapped on big-endian hosts
// (and swapped back after writing, to leave the in-memory index intact).
// NOTE(review): no fwrite return value is checked anywhere in this
// function; a short write goes unnoticed.
void bam_index_save(const bam_index_t *idx, FILE *fp)
	fwrite("BAI\1", 1, 4, fp); // magic
	fwrite(bam_swap_endian_4p(&x), 4, 1, fp); // n_targets, swapped
	} else fwrite(&idx->n, 4, 1, fp); // n_targets, native little-endian
	for (i = 0; i < idx->n; ++i) {
		khash_t(i) *index = idx->index[i];
		bam_lidx_t *index2 = idx->index2 + i;
		// write binning index
		size = kh_size(index); // number of distinct bins for this target
		if (bam_is_be) { // big endian
			fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
		} else fwrite(&size, 4, 1, fp);
		for (k = kh_begin(index); k != kh_end(index); ++k) {
			if (kh_exist(index, k)) {
				bam_binlist_t *p = &kh_value(index, k);
				if (bam_is_be) { // big endian
					x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp); // bin number
					x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp); // chunk count
					// swap the chunk list in place, write it, then swap back
					for (x = 0; (int)x < p->n; ++x) {
						bam_swap_endian_8p(&p->list[x].u);
						bam_swap_endian_8p(&p->list[x].v);
					fwrite(p->list, 16, p->n, fp);
					for (x = 0; (int)x < p->n; ++x) {
						bam_swap_endian_8p(&p->list[x].u);
						bam_swap_endian_8p(&p->list[x].v);
					fwrite(&kh_key(index, k), 4, 1, fp); // bin number
					fwrite(&p->n, 4, 1, fp); // chunk count
					fwrite(p->list, 16, p->n, fp); // chunk list
		// write linear index (index2)
			fwrite(bam_swap_endian_4p(&x), 4, 1, fp); // n_intv, swapped
		} else fwrite(&index2->n, 4, 1, fp);
		if (bam_is_be) { // big endian
			// swap in place, write, swap back (as above for the chunk list)
			for (x = 0; (int)x < index2->n; ++x)
				bam_swap_endian_8p(&index2->offset[x]);
			fwrite(index2->offset, 8, index2->n, fp);
			for (x = 0; (int)x < index2->n; ++x)
				bam_swap_endian_8p(&index2->offset[x]);
		} else fwrite(index2->offset, 8, index2->n, fp);
// Deserialize a BAI index from an open FILE*, byte-swapping on big-endian
// hosts. Returns the new index, or reports failure when fp is unusable.
// NOTE(review): fread return values are never checked, so a truncated
// index file yields silently incomplete data.
static bam_index_t *bam_index_load_core(FILE *fp)
	fprintf(stderr, "[bam_index_load_core] fail to load index.\n");
	fread(magic, 1, 4, fp); // magic "BAI\1"
	if (strncmp(magic, "BAI\1", 4)) {
		fprintf(stderr, "[bam_index_load] wrong magic number.\n");
	idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
	fread(&idx->n, 4, 1, fp); // n_targets
	if (bam_is_be) bam_swap_endian_4p(&idx->n);
	idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
	idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
	for (i = 0; i < idx->n; ++i) { // per-target: binning index then linear index
		bam_lidx_t *index2 = idx->index2 + i;
		index = idx->index[i] = kh_init(i);
		// load binning index
		fread(&size, 4, 1, fp); // number of distinct bins
		if (bam_is_be) bam_swap_endian_4p(&size);
		for (j = 0; j < (int)size; ++j) {
			fread(&key, 4, 1, fp); // bin number
			if (bam_is_be) bam_swap_endian_4p(&key);
			k = kh_put(i, index, key, &ret);
			p = &kh_value(index, k);
			fread(&p->n, 4, 1, fp); // chunk count
			if (bam_is_be) bam_swap_endian_4p(&p->n);
			p->list = (pair64_t*)malloc(p->m * 16); // 16 == sizeof(pair64_t)
			fread(p->list, 16, p->n, fp); // chunk list
			for (x = 0; x < p->n; ++x) { // fix endianness of each chunk pair
				bam_swap_endian_8p(&p->list[x].u);
				bam_swap_endian_8p(&p->list[x].v);
		fread(&index2->n, 4, 1, fp); // n_intv for the linear index
		if (bam_is_be) bam_swap_endian_4p(&index2->n);
		index2->m = index2->n;
		index2->offset = (uint64_t*)calloc(index2->m, 8);
		// NOTE(review): size/nmemb are swapped relative to every other
		// fread in this file; total bytes read are the same, but the
		// return-value convention differs.
		fread(index2->offset, index2->n, 8, fp);
		for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
// Open the index for `_fn` from the local filesystem. For an ftp:// URL
// only the basename is used (the index is expected in the working
// directory; see download_from_remote). Tries "<fn>.bai" first, then
// "<base>.bai" when fn ends in ".bam".
// NOTE(review): lines are elided here — in particular a copy of fn into
// fnidx before the fnidx[strlen(fn)-1]='i' trick. Also fopen uses "r"
// rather than "rb", which matters where text mode differs.
bam_index_t *bam_index_load_local(const char *_fn)
	if (strstr(_fn, "ftp://") == _fn) { // remote URL: keep only the basename
		for (p = _fn + l - 1; p >= _fn; --p)
			if (*p == '/') break;
	} else fn = strdup(_fn);
	fnidx = (char*)calloc(strlen(fn) + 5, 1); // room for ".bai" + NUL
	strcpy(fnidx, fn); strcat(fnidx, ".bai");
	fp = fopen(fnidx, "r");
	if (fp == 0) { // try "{base}.bai"
		char *s = strstr(fn, "bam");
		if (s == fn + strlen(fn) - 3) { // fn really ends with "bam"
			fnidx[strlen(fn)-1] = 'i'; // ".bam" -> ".bai"
			fp = fopen(fnidx, "r");
	free(fnidx); free(fn);
	idx = bam_index_load_core(fp);
// Download the file at an ftp:// URL into the current working directory
// under its basename. Any other URL scheme is silently ignored.
static void download_from_remote(const char *url)
	const int buf_size = 1 * 1024 * 1024; // 1MB copy buffer
	if (strstr(url, "ftp://") != url) return; // only ftp is handled
	for (fn = (char*)url + l - 1; fn >= url; --fn)
		if (*fn == '/') break;
	++fn; // fn now points to the file name
	fp_remote = knet_open(url, "r");
	if (fp_remote == 0) {
		fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
	if ((fp = fopen(fn, "w")) == 0) {
		fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
		knet_close(fp_remote);
	buf = (uint8_t*)calloc(buf_size, 1);
	// NOTE(review): fwrite's result is ignored, and the != 0 test assumes
	// knet_read never returns a negative error code — confirm its contract.
	while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
		fwrite(buf, 1, l, fp);
	knet_close(fp_remote);
// Load the index for `fn`. If no local index is found and fn is an ftp://
// URL, download "<fn>.bai" into the working directory and retry once.
bam_index_t *bam_index_load(const char *fn)
	idx = bam_index_load_local(fn); // first attempt: local filesystem
	if (idx == 0 && strstr(fn, "ftp://") == fn) {
		char *fnidx = calloc(strlen(fn) + 5, 1); // "<fn>.bai" + NUL
		strcat(strcpy(fnidx, fn), ".bai");
		fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n");
		download_from_remote(fnidx);
		idx = bam_index_load_local(fn); // second (and last) attempt
// Build an index for BAM `fn` and write it to `_fnidx`, or to "<fn>.bai"
// when _fnidx is NULL.
// NOTE(review): bam_open is called inside assert(); compiling with
// -DNDEBUG removes the call entirely. The open should be hoisted out of
// the assert. fopen also uses "w" rather than "wb" for a binary file.
int bam_index_build2(const char *fn, const char *_fnidx)
	assert(fp = bam_open(fn, "r"));
	idx = bam_index_core(fp); // full pass over the BAM
	fnidx = (char*)calloc(strlen(fn) + 5, 1); // default name: "<fn>.bai"
	strcpy(fnidx, fn); strcat(fnidx, ".bai");
	} else fnidx = strdup(_fnidx); // caller-supplied index name
	fpidx = fopen(fnidx, "w");
	fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
	bam_index_save(idx, fpidx);
	bam_index_destroy(idx);
// Build an index using the default index name "<fn>.bai".
int bam_index_build(const char *fn)
	return bam_index_build2(fn, 0);
// Command-line entry point for `samtools index`.
// NOTE(review): the return values of bam_index_build2/bam_index_build are
// discarded, so an indexing failure is not reflected in the exit status.
int bam_index(int argc, char *argv[])
	fprintf(stderr, "Usage: samtools index <in.bam> [<out.index>]\n");
	if (argc >= 3) bam_index_build2(argv[1], argv[2]); // explicit output name
	else bam_index_build(argv[1]); // default "<in.bam>.bai"
#define MAX_BIN 37450 // =(8^6-1)/7+1
// Write into list[] every bin number that may contain alignments
// overlapping [beg,end); one loop per bin level. The added constants
// (1, 9, 73, 585, 4681) are the first bin of each level, from 64Mbp bins
// down to 16Kbp bins (see the binning description in the header comment).
// Returns the number of bins written.
// NOTE(review): the initialization lines (including the bin-0 entry and
// the decrement of `end`) and the final return are elided in this view.
static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN])
	for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k; // 64Mbp bins
	for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k; // 8Mbp bins
	for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k; // 1Mbp bins
	for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k; // 128Kbp bins
	for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k; // 16Kbp bins
457 static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
459 uint32_t rbeg = b->core.pos;
460 uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
461 return (rend > beg && rbeg < end);
// Fetch all alignments on target `tid` overlapping [beg,end) and invoke
// `func(b, data)` on each. Candidate chunks come from the binning index;
// the linear index supplies min_off to prune chunks ending before the
// region; chunks are then sorted, deduplicated and merged so the file is
// read with as few seeks as possible.
// NOTE(review): many lines (declarations, loop headers such as the
// retrieval loop, frees, the n_seeks increment) are elided in this view;
// comments cover only the visible logic.
int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
	int i, n_bins, n_off;
	bins = (uint16_t*)calloc(MAX_BIN, 2);
	n_bins = reg2bins(beg, end, bins); // candidate bins for [beg,end)
	index = idx->index[tid];
	// smallest useful file offset from the linear index (0 when the
	// region starts beyond the indexed windows)
	min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? 0 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
	for (i = n_off = 0; i < n_bins; ++i) { // first pass: count chunks
		if ((k = kh_get(i, index, bins[i])) != kh_end(index))
			n_off += kh_value(index, k).n;
		free(bins); return 0; // no candidate chunks at all
	off = (pair64_t*)calloc(n_off, 16); // 16 == sizeof(pair64_t)
	for (i = n_off = 0; i < n_bins; ++i) { // second pass: collect chunks
		if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
			bam_binlist_t *p = &kh_value(index, k);
			for (j = 0; j < p->n; ++j)
				if (p->list[j].v > min_off) off[n_off++] = p->list[j]; // linear-index pruning
	b = (bam1_t*)calloc(1, sizeof(bam1_t));
	ks_introsort(off, n_off, off); // sort chunks by start offset
	// resolve completely contained adjacent blocks
	for (i = 1, l = 0; i < n_off; ++i)
		if (off[l].v < off[i].v)
	// resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
	for (i = 1; i < n_off; ++i)
		if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
	{ // merge adjacent blocks
#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
		for (i = 1, l = 0; i < n_off; ++i) {
#ifdef BAM_TRUE_OFFSET
			if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v; // close enough: merge
			if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v; // same BGZF block: merge
			else off[++l] = off[i]; // keep as a separate chunk
	// retrieve alignments
	n_seeks = 0; i = -1; curr_off = 0;
		if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk
			if (i == n_off - 1) break; // no more chunks
			if (i >= 0) assert(curr_off == off[i].v); // otherwise bug
			if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek
				bam_seek(fp, off[i+1].u, SEEK_SET);
				curr_off = bam_tell(fp);
		if ((ret = bam_read1(fp, b)) > 0) {
			curr_off = bam_tell(fp);
			if (b->core.tid != tid || b->core.pos >= end) break; // no need to proceed
			else if (is_overlap(beg, end, b)) func(b, data); // user callback on each hit
		} else break; // end of file
	// fprintf(stderr, "[bam_fetch] # seek calls: %d\n", n_seeks);