6 #include "bam_endian.h"
14 Alignment indexing. Before indexing, BAM must be sorted based on the
15 leftmost coordinate of alignments. In indexing, BAM uses two indices:
16 a UCSC binning index and a simple linear index. The binning index is
17 efficient for alignments spanning long distances, while the auxiliary
18 linear index helps to reduce unnecessary seek calls especially for
21 The UCSC binning scheme was suggested by Richard Durbin and Lincoln
22 Stein and is explained by Kent et al. (2002). In this scheme, each bin
23 represents a contiguous genomic region which can be fully contained in
24 another bin; each alignment is associated with a bin which represents
25 the smallest region containing the entire alignment. The binning
26 scheme is essentially another representation of an R-tree. A distinct bin
27 uniquely corresponds to a distinct internal node in an R-tree. Bin A is
28 a child of Bin B if region A is contained in B.
30 In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
31 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
32 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
33 find the alignments overlapped with a region [rbeg,rend), we need to
34 calculate the list of bins that may overlap the region and test
35 the alignments in the bins to confirm the overlaps. If the specified
36 region is short, typically only a few alignments in six bins need to
37 be retrieved. The overlapping alignments can be quickly fetched.
// In true-offset mode, chunks closer than this gap are merged (see merge_chunks()).
41 #define BAM_MIN_CHUNK_GAP 32768
42 // 1<<14 is the size of minimum bin.
43 #define BAM_LIDX_SHIFT 14
// One past the largest real bin number (0..37449); also used as a pseudo-bin
// key for per-target metadata (see bam_index_core()).
45 #define BAM_MAX_BIN 37450 // =(8^6-1)/7+1
// Order chunk pairs by start (virtual file) offset for ks_introsort(off, ...).
51 #define pair64_lt(a,b) ((a).u < (b).u)
52 KSORT_INIT(off, pair64_t, pair64_lt)
// Hash map: bin number -> chunk list (bam_binlist_t) for that bin.
64 KHASH_MAP_INIT_INT(i, bam_binlist_t)
// In-memory BAM index; members are elided from this view (presumably the
// per-target binning hashes `index` and linear indices `index2` used below).
66 struct __bam_index_t {
72 // requirement: len <= LEN_MASK
// Append chunk [beg,end) (virtual file offsets) to the chunk list of `bin`,
// creating the hash entry on first use and growing the list geometrically.
73 static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
78 k = kh_put(i, h, bin, &ret);
80 if (ret) { // not present
// NOTE(review): the literal 16 is presumably sizeof(pair64_t) (two uint64_t)
// — confirm against the pair64_t definition, which is elided here.
82 l->list = (pair64_t*)calloc(l->m, 16);
86 l->list = (pair64_t*)realloc(l->list, l->m * 16);
88 l->list[l->n].u = beg; l->list[l->n++].v = end;
// Record `offset` in the linear index for every 16kbp window that alignment
// `b` overlaps, keeping only the first (smallest) offset seen per window.
91 static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
// First and last 16kbp windows covered by the alignment (BAM_LIDX_SHIFT == 14).
94 beg = b->core.pos >> BAM_LIDX_SHIFT;
95 end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
96 if (index2->m < end + 1) { // grow the offset array, zero-filling new slots
97 int old_m = index2->m;
99 kroundup32(index2->m);
100 index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
101 memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
// A slot of 0 means "unset"; never overwrite a previously recorded offset.
104 if (index2->offset[beg] == 0) index2->offset[beg] = offset;
106 for (i = beg; i <= end; ++i)
107 if (index2->offset[i] == 0) index2->offset[i] = offset;
// Coalesce adjacent/nearby chunks within each bin to reduce later seek calls.
// True-offset builds merge chunks whose gap is below BAM_MIN_CHUNK_GAP;
// virtual-offset (BGZF) builds merge chunks that meet in the same BGZF block.
112 static void merge_chunks(bam_index_t *idx)
114 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
118 for (i = 0; i < idx->n; ++i) { // for each target sequence
119 index = idx->index[i];
120 for (k = kh_begin(index); k != kh_end(index); ++k) { // for each bin
122 if (!kh_exist(index, k)) continue;
123 p = &kh_value(index, k);
// Classic in-place merge: m is the last kept chunk, l scans forward.
125 for (l = 1; l < p->n; ++l) {
126 #ifdef BAM_TRUE_OFFSET
127 if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
// Upper 48 bits of a virtual offset address the BGZF block; equal => same block.
129 if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
131 else p->list[++m] = p->list[l];
136 #endif // defined(BAM_TRUE_OFFSET) || defined(BAM_BGZF)
// Back-fill unset (zero) linear-index slots with the previous window's
// offset so every window yields a usable seek position in bam_iter_query().
139 static void fill_missing(bam_index_t *idx)
142 for (i = 0; i < idx->n; ++i) {
143 bam_lidx_t *idx2 = &idx->index2[i];
// Slot 0 is intentionally left as-is; propagation starts from window 1.
144 for (j = 1; j < idx2->n; ++j)
145 if (idx2->offset[j] == 0)
146 idx2->offset[j] = idx2->offset[j-1];
// Build an in-memory index by scanning a coordinate-sorted BAM stream.
// Per bin it records chunks of file offsets; per 16kbp window it records the
// first offset (via insert_offset2). Several declarations, braces and error
// paths are elided from this view.
150 bam_index_t *bam_index_core(bamFile fp)
156 uint32_t last_bin, save_bin;
157 int32_t last_coor, last_tid, save_tid;
159 uint64_t save_off, last_off, n_mapped, n_unmapped, off_beg, off_end;
161 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
162 b = (bam1_t*)calloc(1, sizeof(bam1_t));
163 h = bam_header_read(fp);
// Only the target count is kept from the header; free it right away.
166 idx->n = h->n_targets;
167 bam_header_destroy(h);
168 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
169 for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
170 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
// 0xffffffffu sentinels mean "no previous record seen yet".
172 save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
173 save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
174 n_mapped = n_unmapped = off_end = 0;
175 off_beg = off_end = bam_tell(fp);
// NOTE(review): `c` presumably aliases &b->core — its declaration is elided.
176 while ((ret = bam_read1(fp, b)) >= 0) {
177 if (last_tid != c->tid) { // change of chromosomes
179 last_bin = 0xffffffffu;
180 } else if (last_coor > c->pos) {
// Input must be sorted by coordinate; complain on out-of-order records.
181 fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
182 bam1_qname(b), last_coor, c->pos, c->tid+1);
// Linear index: register the pre-read offset for this alignment's windows.
185 if (c->tid >= 0) insert_offset2(&idx->index2[b->core.tid], b, last_off);
186 if (c->bin != last_bin) { // then possibly write the binning index
187 if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
188 insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
189 if (last_bin == 0xffffffffu && save_tid != 0xffffffffu) { // write the meta element
// Pseudo-bin BAM_MAX_BIN stores the target's file range and read counts;
// bam_idxstats() reads these back as list[0]/list[1].
191 insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end);
192 insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
193 n_mapped = n_unmapped = 0;
197 save_bin = last_bin = c->bin;
199 if (save_tid < 0) break;
// File offsets must be strictly increasing; otherwise the BGZF/RAZF layer is buggy.
201 if (bam_tell(fp) <= last_off) {
202 fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
203 (unsigned long long)bam_tell(fp), (unsigned long long)last_off);
206 if (c->flag & BAM_FUNMAP) ++n_unmapped;
208 last_off = bam_tell(fp);
209 last_coor = b->core.pos;
// Flush the final bin's pending chunk after the read loop ends.
211 if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
214 if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
215 free(b->data); free(b);
// Free an index and everything it owns; a NULL index is a no-op.
219 void bam_index_destroy(bam_index_t *idx)
223 if (idx == 0) return;
224 for (i = 0; i < idx->n; ++i) {
225 khash_t(i) *index = idx->index[i];
226 bam_lidx_t *index2 = idx->index2 + i;
// Free each bin's chunk list before destroying the hash itself.
227 for (k = kh_begin(index); k != kh_end(index); ++k) {
228 if (kh_exist(index, k))
229 free(kh_value(index, k).list);
231 kh_destroy(i, index);
232 free(index2->offset);
234 free(idx->index); free(idx->index2);
// Serialize the index to `fp` in BAI format ("BAI\1" magic). On big-endian
// hosts every integer is byte-swapped so the on-disk layout stays
// little-endian.
// NOTE(review): no fwrite return value is checked; a full disk goes unnoticed.
238 void bam_index_save(const bam_index_t *idx, FILE *fp)
242 fwrite("BAI\1", 1, 4, fp);
245 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
246 } else fwrite(&idx->n, 4, 1, fp);
247 for (i = 0; i < idx->n; ++i) {
248 khash_t(i) *index = idx->index[i];
249 bam_lidx_t *index2 = idx->index2 + i;
250 // write binning index
251 size = kh_size(index);
252 if (bam_is_be) { // big endian
254 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
255 } else fwrite(&size, 4, 1, fp);
256 for (k = kh_begin(index); k != kh_end(index); ++k) {
257 if (kh_exist(index, k)) {
258 bam_binlist_t *p = &kh_value(index, k);
259 if (bam_is_be) { // big endian
261 x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
262 x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
// Swap the chunks in place, write them, then swap back so the in-memory
// index remains usable after saving.
263 for (x = 0; (int)x < p->n; ++x) {
264 bam_swap_endian_8p(&p->list[x].u);
265 bam_swap_endian_8p(&p->list[x].v);
267 fwrite(p->list, 16, p->n, fp);
268 for (x = 0; (int)x < p->n; ++x) {
269 bam_swap_endian_8p(&p->list[x].u);
270 bam_swap_endian_8p(&p->list[x].v);
// Little-endian host: raw structs are already in on-disk byte order.
273 fwrite(&kh_key(index, k), 4, 1, fp);
274 fwrite(&p->n, 4, 1, fp);
275 fwrite(p->list, 16, p->n, fp);
279 // write linear index (index2)
282 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
283 } else fwrite(&index2->n, 4, 1, fp);
284 if (bam_is_be) { // big endian
// Same swap / write / swap-back dance for the linear-index offsets.
286 for (x = 0; (int)x < index2->n; ++x)
287 bam_swap_endian_8p(&index2->offset[x]);
288 fwrite(index2->offset, 8, index2->n, fp);
289 for (x = 0; (int)x < index2->n; ++x)
290 bam_swap_endian_8p(&index2->offset[x]);
291 } else fwrite(index2->offset, 8, index2->n, fp);
// Deserialize a BAI index from `fp`, byte-swapping on big-endian hosts.
// NOTE(review): fread return values are unchecked throughout; a truncated
// index file is read as garbage rather than rejected.
296 static bam_index_t *bam_index_load_core(FILE *fp)
302 fprintf(stderr, "[bam_index_load_core] fail to load index.\n");
305 fread(magic, 1, 4, fp);
306 if (strncmp(magic, "BAI\1", 4)) {
307 fprintf(stderr, "[bam_index_load] wrong magic number.\n");
311 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
312 fread(&idx->n, 4, 1, fp);
313 if (bam_is_be) bam_swap_endian_4p(&idx->n);
314 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
315 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
316 for (i = 0; i < idx->n; ++i) { // one binning hash + linear index per target
318 bam_lidx_t *index2 = idx->index2 + i;
323 index = idx->index[i] = kh_init(i);
324 // load binning index
325 fread(&size, 4, 1, fp);
326 if (bam_is_be) bam_swap_endian_4p(&size);
327 for (j = 0; j < (int)size; ++j) {
328 fread(&key, 4, 1, fp);
329 if (bam_is_be) bam_swap_endian_4p(&key);
330 k = kh_put(i, index, key, &ret);
331 p = &kh_value(index, k);
332 fread(&p->n, 4, 1, fp);
333 if (bam_is_be) bam_swap_endian_4p(&p->n);
335 p->list = (pair64_t*)malloc(p->m * 16);
336 fread(p->list, 16, p->n, fp);
// On big-endian hosts, swap each chunk pair after reading.
339 for (x = 0; x < p->n; ++x) {
340 bam_swap_endian_8p(&p->list[x].u);
341 bam_swap_endian_8p(&p->list[x].v);
// Load the linear index for this target.
346 fread(&index2->n, 4, 1, fp);
347 if (bam_is_be) bam_swap_endian_4p(&index2->n);
348 index2->m = index2->n;
349 index2->offset = (uint64_t*)calloc(index2->m, 8);
// NOTE(review): size/count arguments are swapped relative to the other
// fread/fwrite calls (elsewhere: (buf, 8, n)); the byte count is the same.
350 fread(index2->offset, index2->n, 8, fp);
352 for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
// Open "<fn>.bai" (falling back to "<base>.bai") from the local filesystem.
// For ftp:// and http:// inputs only the basename is used, i.e. the index is
// looked up in the current working directory.
357 bam_index_t *bam_index_load_local(const char *_fn)
362 if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) {
// Walk backwards to the last '/' to isolate the URL's final path component.
365 for (p = _fn + l - 1; p >= _fn; --p)
366 if (*p == '/') break;
368 } else fn = strdup(_fn);
// +5 leaves room for ".bai" plus the NUL terminator.
369 fnidx = (char*)calloc(strlen(fn) + 5, 1);
370 strcpy(fnidx, fn); strcat(fnidx, ".bai");
371 fp = fopen(fnidx, "rb");
372 if (fp == 0) { // try "{base}.bai"
// If the name ends in "bam", try swapping that suffix for "bai" instead
// (a copy step is elided from this view — confirm against full source).
373 char *s = strstr(fn, "bam");
374 if (s == fn + strlen(fn) - 3) {
376 fnidx[strlen(fn)-1] = 'i';
377 fp = fopen(fnidx, "rb");
380 free(fnidx); free(fn);
382 bam_index_t *idx = bam_index_load_core(fp);
// Download a remote ftp:// or http:// file into the current working
// directory under its basename, streaming through knetfile in 1MB chunks.
389 static void download_from_remote(const char *url)
391 const int buf_size = 1 * 1024 * 1024;
// Silently ignore anything that is not an ftp/http URL.
397 if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
// Walk back to the last '/' so fn ends up pointing at the basename.
399 for (fn = (char*)url + l - 1; fn >= url; --fn)
400 if (*fn == '/') break;
401 ++fn; // fn now points to the file name
402 fp_remote = knet_open(url, "r");
403 if (fp_remote == 0) {
404 fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
407 if ((fp = fopen(fn, "wb")) == 0) {
408 fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
409 knet_close(fp_remote);
412 buf = (uint8_t*)calloc(buf_size, 1);
413 while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
414 fwrite(buf, 1, l, fp);
417 knet_close(fp_remote);
// Second definition of download_from_remote — presumably a no-op stub in an
// #else branch when knetfile support is compiled out; the guard and body are
// elided from this view.
420 static void download_from_remote(const char *url)
// Load the index for `fn`; if `fn` is a remote URL and no local index is
// found, try downloading "<fn>.bai" into the CWD and load again.
426 bam_index_t *bam_index_load(const char *fn)
429 idx = bam_index_load_local(fn);
430 if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) {
431 char *fnidx = calloc(strlen(fn) + 5, 1);
432 strcat(strcpy(fnidx, fn), ".bai");
433 fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n");
434 download_from_remote(fnidx);
// Retry: the download (if it succeeded) placed the index in the CWD.
435 idx = bam_index_load_local(fn);
437 if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n");
// Index BAM file `fn`, writing to `_fnidx`, or to "<fn>.bai" when `_fnidx`
// is NULL. Error paths and return statements are partly elided here.
441 int bam_index_build2(const char *fn, const char *_fnidx)
447 if ((fp = bam_open(fn, "r")) == 0) {
448 fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n");
451 idx = bam_index_core(fp);
// Default output name: input name + ".bai" (+5 covers ".bai" and the NUL).
454 fnidx = (char*)calloc(strlen(fn) + 5, 1);
455 strcpy(fnidx, fn); strcat(fnidx, ".bai");
456 } else fnidx = strdup(_fnidx);
457 fpidx = fopen(fnidx, "wb");
459 fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
463 bam_index_save(idx, fpidx);
464 bam_index_destroy(idx);
// Convenience wrapper: build the index for `fn` under the default name.
470 int bam_index_build(const char *fn)
472 return bam_index_build2(fn, 0);
// CLI entry point for `samtools index`: optional second argument names the
// output index file.
475 int bam_index(int argc, char *argv[])
478 fprintf(stderr, "Usage: samtools index <in.bam> [out.index]\n");
481 if (argc >= 3) bam_index_build2(argv[1], argv[2]);
482 else bam_index_build(argv[1]);
// CLI entry point for `samtools idxstats`: per target, print name, length
// and the mapped/unmapped counts stored in the BAM_MAX_BIN pseudo-bin.
486 int bam_idxstats(int argc, char *argv[])
489 bam_header_t *header;
493 fprintf(stderr, "Usage: samtools idxstats <in.bam>\n");
496 fp = bam_open(argv[1], "r");
497 if (fp == 0) { fprintf(stderr, "[%s] fail to open BAM.\n", __func__); return 1; }
498 header = bam_header_read(fp);
500 idx = bam_index_load(argv[1]);
501 if (idx == 0) { fprintf(stderr, "[%s] fail to load the index.\n", __func__); return 1; }
502 for (i = 0; i < idx->n; ++i) {
504 khash_t(i) *h = idx->index[i];
505 printf("%s\t%d", header->target_name[i], header->target_len[i]);
// The meta bin's second chunk (list[1]) holds (n_mapped, n_unmapped); see
// bam_index_core(). NOTE(review): %llu paired with a (long long) cast —
// works in practice but the signedness is inconsistent.
506 k = kh_get(i, h, BAM_MAX_BIN);
508 printf("\t%llu\t%llu", (long long)kh_val(h, k).list[1].u, (long long)kh_val(h, k).list[1].v);
511 bam_header_destroy(header);
512 bam_index_destroy(idx);
// Fill `list` with every bin number that may overlap region [beg,end) and
// return the count (the return statement, loop-variable setup and the
// level-0 bin handling are elided from this view).
516 static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[BAM_MAX_BIN])
519 if (beg >= end) return 0;
// Clamp to the 512Mbp maximum span covered by the binning scheme.
520 if (end >= 1u<<29) end = 1u<<29;
// One loop per bin level: 64Mbp, 8Mbp, 1Mbp, 128Kbp and 16Kbp bins
// (offsets 1, 9, 73, 585, 4681 are the first bin number of each level).
523 for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
524 for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
525 for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
526 for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
527 for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
// Does alignment `b` overlap the zero-based half-open region [beg,end)?
// A record without CIGAR is treated as covering a single base.
531 static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
533 uint32_t rbeg = b->core.pos;
534 uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
535 return (rend > beg && rbeg < end);
// State for a region iterator created by bam_iter_query(); remaining fields
// (presumably curr_off and the chunk array `off`) are elided from this view.
538 struct __bam_iter_t {
539 int from_first; // read from the first record; no random access
540 int tid, beg, end, n_off, i, finished;
545 // bam_fetch helper function retrieves
// Create an iterator over region [beg,end) on target `tid`: compute the
// candidate bins, gather their chunks, discard chunks ending before the
// linear-index lower bound, then sort, de-overlap and merge what remains.
546 bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end)
549 int i, n_bins, n_off;
556 if (beg < 0) beg = 0;
557 if (end < beg) return 0;
559 iter = calloc(1, sizeof(struct __bam_iter_t));
560 iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
// Candidate bins overlapping the region (at most BAM_MAX_BIN of them).
562 bins = (uint16_t*)calloc(BAM_MAX_BIN, 2);
563 n_bins = reg2bins(beg, end, bins);
564 index = idx->index[tid];
// Linear-index lower bound: a chunk ending at or before min_off cannot
// contain alignments that reach the region.
565 if (idx->index2[tid].n > 0) {
566 min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1]
567 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
568 if (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4
// Scan backwards for the nearest nonzero linear-index entry.
569 int n = beg>>BAM_LIDX_SHIFT;
570 if (n > idx->index2[tid].n) n = idx->index2[tid].n;
571 for (i = n - 1; i >= 0; --i)
572 if (idx->index2[tid].offset[i] != 0) break;
573 if (i >= 0) min_off = idx->index2[tid].offset[i];
575 } else min_off = 0; // tabix 0.1.2 may produce such index files
// First pass: count chunks so `off` can be allocated in one shot.
576 for (i = n_off = 0; i < n_bins; ++i) {
577 if ((k = kh_get(i, index, bins[i])) != kh_end(index))
578 n_off += kh_value(index, k).n;
581 free(bins); return iter;
583 off = (pair64_t*)calloc(n_off, 16);
// Second pass: collect the chunks that survive the min_off filter.
584 for (i = n_off = 0; i < n_bins; ++i) {
585 if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
587 bam_binlist_t *p = &kh_value(index, k);
588 for (j = 0; j < p->n; ++j)
589 if (p->list[j].v > min_off) off[n_off++] = p->list[j];
594 bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
// Sort chunks by start offset before merging.
596 ks_introsort(off, n_off, off);
597 // resolve completely contained adjacent blocks
598 for (i = 1, l = 0; i < n_off; ++i)
599 if (off[l].v < off[i].v)
602 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
603 for (i = 1; i < n_off; ++i)
604 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
605 { // merge adjacent blocks
606 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
607 for (i = 1, l = 0; i < n_off; ++i) {
608 #ifdef BAM_TRUE_OFFSET
609 if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
// Same BGZF block (upper 48 bits of the virtual offsets match) => merge.
611 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
613 else off[++l] = off[i];
620 iter->n_off = n_off; iter->off = off;
// pysam compatibility shim: return the merged chunk list for a region and
// its length via *cnt_off (iterator cleanup is elided from this view).
// NOTE(review): bam_iter_query() can return NULL (end < beg), which would be
// dereferenced here unchecked.
624 pair64_t *get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int *cnt_off)
625 { // for pysam compatibility
628 iter = bam_iter_query(idx, tid, beg, end);
629 off = iter->off; *cnt_off = iter->n_off;
// Free an iterator and its chunk list; NULL is a no-op.
634 void bam_iter_destroy(bam_iter_t iter)
636 if (iter) { free(iter->off); free(iter); }
// Read the next alignment overlapping the iterator's region into `b`.
// Returns bam_read1()'s positive size on success; a negative value means the
// iteration has finished (braces/return paths are partly elided here).
639 int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b)
641 if (iter->finished) return -1;
642 if (iter->from_first) {
// Sequential mode: stream records with no random access at all.
643 int ret = bam_read1(fp, b);
644 if (ret < 0) iter->finished = 1;
647 if (iter->off == 0) return -1;
// Chunk-hopping loop: advance to the next chunk when the current one is done.
650 if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
651 if (iter->i == iter->n_off - 1) break; // no more chunks
652 if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug
653 if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
654 bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
655 iter->curr_off = bam_tell(fp);
659 if ((ret = bam_read1(fp, b)) > 0) {
660 iter->curr_off = bam_tell(fp);
// Past the region on this target: coordinate-sorted input guarantees no
// later record can overlap, so stop early.
661 if (b->core.tid != iter->tid || b->core.pos >= iter->end) break; // no need to proceed
662 else if (is_overlap(iter->beg, iter->end, b)) return ret;
663 } else break; // end of file
669 int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
674 iter = bam_iter_query(idx, tid, beg, end);
675 while (bam_iter_read(fp, iter, b) >= 0) func(b, data);