6 #include "bam_endian.h"
14 Alignment indexing. Before indexing, BAM must be sorted based on the
15 leftmost coordinate of alignments. In indexing, BAM uses two indices:
16 a UCSC binning index and a simple linear index. The binning index is
17 efficient for alignments spanning long distances, while the auxiliary
18 linear index helps to reduce unnecessary seek calls especially for
21 The UCSC binning scheme was suggested by Richard Durbin and Lincoln
22 Stein and is explained by Kent et al. (2002). In this scheme, each bin
23 represents a contiguous genomic region which can be fully contained in
24 another bin; each alignment is associated with a bin which represents
25 the smallest region containing the entire alignment. The binning
26 scheme is essentially another representation of an R-tree. A distinct bin
27 uniquely corresponds to a distinct internal node in an R-tree. Bin A is
28 a child of Bin B if region A is contained in B.
30 In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
31 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
32 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
33 find the alignments overlapped with a region [rbeg,rend), we need to
34 calculate the list of bins that may overlap the region and test
35 the alignments in the bins to confirm the overlaps. If the specified
36 region is short, typically only a few alignments in six bins need to
37 be retrieved. The overlapping alignments can be quickly fetched.
// Chunk-merge threshold: in merge_chunks(), adjacent chunks whose gap is
// smaller than this are coalesced (used in the BAM_TRUE_OFFSET branch).
41 #define BAM_MIN_CHUNK_GAP 32768
42 // 1<<14 is the size of minimum bin.
// Linear-index granularity: one slot per 2^14 = 16kb window of the reference.
43 #define BAM_LIDX_SHIFT 14
// One more than the largest real bin (37449); used below as the key of a
// pseudo-bin carrying per-reference metadata (offset range, read counts).
45 #define BAM_MAX_BIN 37450 // =(8^6-1)/7+1
// Order chunks by start offset; instantiates ks_introsort(off, ...) used later.
51 #define pair64_lt(a,b) ((a).u < (b).u)
52 KSORT_INIT(off, pair64_t, pair64_lt)
// Hash type "i": bin number (int) -> bam_binlist_t (that bin's chunk list).
64 KHASH_MAP_INIT_INT(i, bam_binlist_t)
// In-memory representation of a BAI index.
// NOTE(review): this excerpt omits most members; presumably it also holds the
// per-reference binning hashes (idx->index) and linear indices (idx->index2)
// referenced throughout this file — confirm against the full source.
66 struct __bam_index_t {
68 	uint64_t n_no_coor; // unmapped reads without coordinate
73 // requirement: len <= LEN_MASK
// Append the chunk [beg, end) to the list of bin `bin` in hash `h`,
// creating the list on first use and growing it geometrically.
// NOTE(review): the literal 16 is presumably sizeof(pair64_t); calloc/realloc
// results are not checked for NULL (OOM aborts elsewhere or is ignored).
74 static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
79 	k = kh_put(i, h, bin, &ret);
// ret != 0 means the key was newly inserted: initialize an empty chunk list.
81 	if (ret) { // not present
83 		l->list = (pair64_t*)calloc(l->m, 16);
87 		l->list = (pair64_t*)realloc(l->list, l->m * 16);
// Record the chunk's start (u) and end (v) virtual file offsets.
89 	l->list[l->n].u = beg; l->list[l->n++].v = end;
// Update the linear index: for every 16kb window [beg..end] overlapped by
// alignment b, record file offset `offset` if the window has no offset yet
// (so each slot keeps the smallest offset of any read touching that window).
92 static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
95 	beg = b->core.pos >> BAM_LIDX_SHIFT;
// End window is derived from the rightmost reference base covered (calend - 1).
96 	end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
// Grow the offset array (zero-filling the new tail) when `end` is past it.
97 	if (index2->m < end + 1) {
98 		int old_m = index2->m;
100 		kroundup32(index2->m);
// NOTE(review): realloc result unchecked; 8 is presumably sizeof(uint64_t).
101 		index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
102 		memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
// 0 doubles as "unset" here; a slot, once set, is never overwritten.
105 		if (index2->offset[beg] == 0) index2->offset[beg] = offset;
107 		for (i = beg; i <= end; ++i)
108 			if (index2->offset[i] == 0) index2->offset[i] = offset;
// Post-pass over the binning index: merge adjacent chunks within each bin
// when they are close enough that a seek between them is not worthwhile.
// The BAM_MAX_BIN pseudo-bin (metadata) is skipped.
113 static void merge_chunks(bam_index_t *idx)
115 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
119 	for (i = 0; i < idx->n; ++i) {
120 		index = idx->index[i];
121 		for (k = kh_begin(index); k != kh_end(index); ++k) {
123 			if (!kh_exist(index, k) || kh_key(index, k) == BAM_MAX_BIN) continue;
124 			p = &kh_value(index, k);
// m trails l: p->list[m] is the chunk currently being extended/kept.
126 			for (l = 1; l < p->n; ++l) {
127 #ifdef BAM_TRUE_OFFSET
// True offsets: merge when the gap is under BAM_MIN_CHUNK_GAP bytes.
128 				if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
// BGZF virtual offsets: >>16 yields the compressed-block file offset, so this
// merges chunks whose boundary falls inside the same BGZF block.
130 				if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
132 				else p->list[++m] = p->list[l];
137 #endif // defined(BAM_TRUE_OFFSET) || defined(BAM_BGZF)
// Post-pass over the linear index: replace empty (0) slots with the previous
// slot's offset, so every window has a usable starting file offset.
140 static void fill_missing(bam_index_t *idx)
143 	for (i = 0; i < idx->n; ++i) {
144 		bam_lidx_t *idx2 = &idx->index2[i];
// Starts at j=1: slot 0 has no predecessor and stays as-is if empty.
145 		for (j = 1; j < idx2->n; ++j)
146 			if (idx2->offset[j] == 0)
147 				idx2->offset[j] = idx2->offset[j-1];
// Build a BAI index for a coordinate-sorted BAM stream.
// Reads alignments sequentially, verifying sort order and BIN values, filling
// the per-reference binning index (via insert_offset) and linear index (via
// insert_offset2), and recording mapped/unmapped counts in the BAM_MAX_BIN
// pseudo-bin. Reads with tid < 0 at the end are tallied into n_no_coor.
// NOTE(review): many lines (declarations, error exits, closing braces) are
// omitted in this excerpt; comments below describe only the visible logic.
151 bam_index_t *bam_index_core(bamFile fp)
157 	uint32_t last_bin, save_bin, recalculated_bin;
158 	int32_t last_coor, last_tid, save_tid;
160 	uint64_t save_off, last_off, n_mapped, n_unmapped, off_beg, off_end, n_no_coor;
162 	h = bam_header_read(fp);
164 		fprintf(stderr, "[bam_index_core] Invalid BAM header.");
168 	idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
169 	b = (bam1_t*)calloc(1, sizeof(bam1_t));
// One binning hash and one linear index per reference sequence.
172 	idx->n = h->n_targets;
173 	bam_header_destroy(h);
174 	idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
175 	for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
176 	idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
// 0xffffffffu is the "no previous record" sentinel for bin/tid trackers.
178 	save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
179 	save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
180 	n_mapped = n_unmapped = n_no_coor = off_end = 0;
181 	off_beg = off_end = bam_tell(fp);
// Main pass: one iteration per alignment record.
182 	while ((ret = bam_read1(fp, b)) >= 0) {
183 		if (c->tid < 0) ++n_no_coor;
184 		if (last_tid < c->tid || (last_tid >= 0 && c->tid < 0)) { // change of chromosomes
186 			last_bin = 0xffffffffu;
// tid went backwards: input is not coordinate-sorted — report and bail.
187 		} else if ((uint32_t)last_tid > (uint32_t)c->tid) {
188 			fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %d-th chr > %d-th chr\n",
189 					bam1_qname(b), last_tid+1, c->tid+1);
// Position went backwards within a chromosome: also unsorted.
191 		} else if ((int32_t)c->tid >= 0 && last_coor > c->pos) {
192 			fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
193 					bam1_qname(b), last_coor, c->pos, c->tid+1);
// Cross-check the record's stored BIN against one recomputed from pos/CIGAR.
197 		recalculated_bin = bam_reg2bin(c->pos, bam_calend(c, bam1_cigar(b)));
198 		if (c->bin != recalculated_bin) {
199 			fprintf(stderr, "[bam_index_core] read '%s' mapped at POS %d to %d has BIN %d but should be %d\n",
200 					bam1_qname(b), c->pos + 1, bam_calend(c, bam1_cigar(b)), c->bin, recalculated_bin);
201 			fprintf(stderr, "[bam_index_core] Fix it by using BAM->SAM->BAM to force a recalculation of the BIN field\n");
// Mapped, placed reads contribute to the linear index at their start offset.
205 		if (c->tid >= 0 && !(c->flag & BAM_FUNMAP)) insert_offset2(&idx->index2[b->core.tid], b, last_off);
// Bin changed: flush the accumulated chunk for the previous bin.
206 		if (c->bin != last_bin) { // then possibly write the binning index
207 			if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
208 				insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
// On a chromosome change, emit the metadata pseudo-bin for the finished ref:
// one chunk holding the ref's offset span, one holding its read counts.
209 			if (last_bin == 0xffffffffu && save_tid != 0xffffffffu) { // write the meta element
211 				insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end);
212 				insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
213 				n_mapped = n_unmapped = 0;
217 			save_bin = last_bin = c->bin;
// Coordinate-less reads at the tail: stop indexing and count them below.
219 			if (save_tid < 0) break;
// Sanity check: the stream offset must strictly advance per record.
221 		if (bam_tell(fp) <= last_off) {
222 			fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
223 					(unsigned long long)bam_tell(fp), (unsigned long long)last_off);
226 		if (c->flag & BAM_FUNMAP) ++n_unmapped;
228 		last_off = bam_tell(fp);
229 		last_coor = b->core.pos;
// EOF (or break): flush the final bin and the last ref's metadata pseudo-bin.
232 		insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
233 		insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, bam_tell(fp));
234 		insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
// Drain trailing records: all must be coordinate-less or the file is unsorted.
239 	while ((ret = bam_read1(fp, b)) >= 0) {
241 		if (c->tid >= 0 && n_no_coor) {
242 			fprintf(stderr, "[bam_index_core] the alignment is not sorted: reads without coordinates prior to reads with coordinates.\n");
247 	if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
248 	free(b->data); free(b);
249 	idx->n_no_coor = n_no_coor;
// Free an index created by bam_index_core()/bam_index_load(): every bin's
// chunk list, each per-reference hash and linear-index array, then the
// top-level arrays. Safe to call with NULL.
253 void bam_index_destroy(bam_index_t *idx)
257 	if (idx == 0) return;
258 	for (i = 0; i < idx->n; ++i) {
259 		khash_t(i) *index = idx->index[i];
260 		bam_lidx_t *index2 = idx->index2 + i;
261 		for (k = kh_begin(index); k != kh_end(index); ++k) {
262 			if (kh_exist(index, k))
263 				free(kh_value(index, k).list);
265 		kh_destroy(i, index);
266 		free(index2->offset);
268 	free(idx->index); free(idx->index2);
// Serialize an index to `fp` in the .bai on-disk format: magic "BAI\1",
// n_ref, then per reference the binning index (bin, n_chunk, chunk pairs)
// followed by the linear index, and finally the no-coordinate read count.
// On big-endian hosts every integer is byte-swapped before writing (and the
// in-memory arrays are swapped back afterwards, leaving idx unchanged).
// NOTE(review): fwrite return values are not checked anywhere here.
272 void bam_index_save(const bam_index_t *idx, FILE *fp)
276 	fwrite("BAI\1", 1, 4, fp);
279 		fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
280 	} else fwrite(&idx->n, 4, 1, fp);
281 	for (i = 0; i < idx->n; ++i) {
282 		khash_t(i) *index = idx->index[i];
283 		bam_lidx_t *index2 = idx->index2 + i;
284 		// write binning index
285 		size = kh_size(index);
286 		if (bam_is_be) { // big endian
288 			fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
289 		} else fwrite(&size, 4, 1, fp);
290 		for (k = kh_begin(index); k != kh_end(index); ++k) {
291 			if (kh_exist(index, k)) {
292 				bam_binlist_t *p = &kh_value(index, k);
293 				if (bam_is_be) { // big endian
295 					x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
296 					x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
// Swap the chunk pairs in place, write them, then swap back to restore idx.
297 					for (x = 0; (int)x < p->n; ++x) {
298 						bam_swap_endian_8p(&p->list[x].u);
299 						bam_swap_endian_8p(&p->list[x].v);
301 					fwrite(p->list, 16, p->n, fp);
302 					for (x = 0; (int)x < p->n; ++x) {
303 						bam_swap_endian_8p(&p->list[x].u);
304 						bam_swap_endian_8p(&p->list[x].v);
// Little-endian host: write key, count and chunk pairs directly.
307 					fwrite(&kh_key(index, k), 4, 1, fp);
308 					fwrite(&p->n, 4, 1, fp);
309 					fwrite(p->list, 16, p->n, fp);
313 		// write linear index (index2)
316 			fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
317 		} else fwrite(&index2->n, 4, 1, fp);
318 		if (bam_is_be) { // big endian
// Same swap/write/swap-back dance for the linear-index offsets.
320 			for (x = 0; (int)x < index2->n; ++x)
321 				bam_swap_endian_8p(&index2->offset[x]);
322 			fwrite(index2->offset, 8, index2->n, fp);
323 			for (x = 0; (int)x < index2->n; ++x)
324 				bam_swap_endian_8p(&index2->offset[x]);
325 		} else fwrite(index2->offset, 8, index2->n, fp);
327 	{ // write the number of reads coor-less records.
328 		uint64_t x = idx->n_no_coor;
329 		if (bam_is_be) bam_swap_endian_8p(&x);
330 		fwrite(&x, 8, 1, fp);
// Deserialize a .bai index from an open FILE*: the inverse of
// bam_index_save(). Verifies the magic, then reads per-reference binning and
// linear indices, byte-swapping on big-endian hosts.
// NOTE(review): fread return values are unchecked except for the final
// n_no_coor read, so a truncated index is silently accepted.
335 static bam_index_t *bam_index_load_core(FILE *fp)
341 		fprintf(stderr, "[bam_index_load_core] fail to load index.\n");
344 	fread(magic, 1, 4, fp);
345 	if (strncmp(magic, "BAI\1", 4)) {
346 		fprintf(stderr, "[bam_index_load] wrong magic number.\n");
350 	idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
351 	fread(&idx->n, 4, 1, fp);
352 	if (bam_is_be) bam_swap_endian_4p(&idx->n);
353 	idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
354 	idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
355 	for (i = 0; i < idx->n; ++i) {
357 		bam_lidx_t *index2 = idx->index2 + i;
362 		index = idx->index[i] = kh_init(i);
363 		// load binning index
364 		fread(&size, 4, 1, fp);
365 		if (bam_is_be) bam_swap_endian_4p(&size);
// One (bin key, chunk list) entry per iteration.
366 		for (j = 0; j < (int)size; ++j) {
367 			fread(&key, 4, 1, fp);
368 			if (bam_is_be) bam_swap_endian_4p(&key);
369 			k = kh_put(i, index, key, &ret);
370 			p = &kh_value(index, k);
371 			fread(&p->n, 4, 1, fp);
372 			if (bam_is_be) bam_swap_endian_4p(&p->n);
373 			p->list = (pair64_t*)malloc(p->m * 16);
375 			fread(p->list, 16, p->n, fp);
378 				for (x = 0; x < p->n; ++x) {
379 					bam_swap_endian_8p(&p->list[x].u);
380 					bam_swap_endian_8p(&p->list[x].v);
// Load the linear index for this reference.
385 		fread(&index2->n, 4, 1, fp);
386 		if (bam_is_be) bam_swap_endian_4p(&index2->n);
387 		index2->m = index2->n;
388 		index2->offset = (uint64_t*)calloc(index2->m, 8);
// NOTE(review): size/nmemb are swapped relative to the other fread calls in
// this function (n as size, 8 as count); same byte total, but inconsistent.
389 		fread(index2->offset, index2->n, 8, fp);
391 			for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
// Older .bai files lack the trailing n_no_coor field; treat absence as 0.
393 	if (fread(&idx->n_no_coor, 8, 1, fp) == 0) idx->n_no_coor = 0;
394 	if (bam_is_be) bam_swap_endian_8p(&idx->n_no_coor);
// Load the index for BAM `_fn` from the local filesystem. For ftp://http://
// URLs only the basename is used (the index is expected in the working
// directory). Tries "{fn}.bai" first, then falls back to replacing a
// trailing ".bam" with ".bai".
398 bam_index_t *bam_index_load_local(const char *_fn)
403 	if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) {
// Scan backwards for the last '/' to isolate the URL's basename.
406 		for (p = _fn + l - 1; p >= _fn; --p)
407 			if (*p == '/') break;
409 	} else fn = strdup(_fn);
// +5 leaves room for ".bai" plus the NUL terminator.
410 	fnidx = (char*)calloc(strlen(fn) + 5, 1);
411 	strcpy(fnidx, fn); strcat(fnidx, ".bai");
412 	fp = fopen(fnidx, "rb");
413 	if (fp == 0) { // try "{base}.bai"
414 		char *s = strstr(fn, "bam");
415 		if (s == fn + strlen(fn) - 3) {
// Turns the trailing "bam" into "bai". NOTE(review): this excerpt appears to
// omit a line that first re-copies fn into fnidx (truncating the ".bai"
// suffix appended above) — confirm against the full source.
417 			fnidx[strlen(fn)-1] = 'i';
418 			fp = fopen(fnidx, "rb");
421 	free(fnidx); free(fn);
423 	bam_index_t *idx = bam_index_load_core(fp);
// Fetch a remote ftp:// or http:// file via knet and save it under its
// basename in the current working directory. Errors are reported to stderr;
// the function returns void so callers retry the local load afterwards.
// NOTE(review): fwrite results are unchecked, and the buf free / fclose lines
// are not visible in this excerpt — presumably they follow the loop.
430 static void download_from_remote(const char *url)
432 	const int buf_size = 1 * 1024 * 1024;
// Only URLs with a recognized scheme are handled; anything else is a no-op.
438 	if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
440 	for (fn = (char*)url + l - 1; fn >= url; --fn)
441 		if (*fn == '/') break;
442 	++fn; // fn now points to the file name
443 	fp_remote = knet_open(url, "r");
444 	if (fp_remote == 0) {
445 		fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
448 	if ((fp = fopen(fn, "wb")) == 0) {
449 		fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
450 		knet_close(fp_remote);
// Stream the remote file in 1MB chunks.
453 	buf = (uint8_t*)calloc(buf_size, 1);
454 	while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
455 		fwrite(buf, 1, l, fp);
458 	knet_close(fp_remote);
// Second definition of download_from_remote — presumably the no-op stub for
// builds without knetfile support (the #if/#else lines are not visible in
// this excerpt; confirm against the full source).
461 static void download_from_remote(const char *url)
// Load the index for `fn`: try locally first; for remote URLs, download
// "{fn}.bai" into the working directory and retry. Returns NULL (with a
// message on stderr) if no index can be loaded.
467 bam_index_t *bam_index_load(const char *fn)
470 	idx = bam_index_load_local(fn);
471 	if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) {
472 		char *fnidx = calloc(strlen(fn) + 5, 1);
473 		strcat(strcpy(fnidx, fn), ".bai");
474 		fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n");
475 		download_from_remote(fnidx);
// Retry: the downloaded basename is what bam_index_load_local looks for.
477 		idx = bam_index_load_local(fn);
479 	if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n");
// Index BAM file `fn` and write the result to `_fnidx`, or to "{fn}.bai"
// when _fnidx is NULL. Returns nonzero on failure (open/index/create errors
// are reported to stderr).
483 int bam_index_build2(const char *fn, const char *_fnidx)
489 	if ((fp = bam_open(fn, "r")) == 0) {
490 		fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n");
493 	idx = bam_index_core(fp);
496 		fprintf(stderr, "[bam_index_build2] fail to index the BAM file.\n");
// Default output name: input name + ".bai" (+5 covers suffix and NUL).
500 		fnidx = (char*)calloc(strlen(fn) + 5, 1);
501 		strcpy(fnidx, fn); strcat(fnidx, ".bai");
502 	} else fnidx = strdup(_fnidx);
503 	fpidx = fopen(fnidx, "wb");
505 		fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
507 		bam_index_destroy(idx);
510 	bam_index_save(idx, fpidx);
511 	bam_index_destroy(idx);
// Convenience wrapper: build the index with the default "{fn}.bai" name.
517 int bam_index_build(const char *fn)
519 	return bam_index_build2(fn, 0);
// CLI entry point for "samtools index": argv[1] is the BAM, optional argv[2]
// the output index name.
522 int bam_index(int argc, char *argv[])
525 		fprintf(stderr, "Usage: samtools index <in.bam> [out.index]\n");
528 	if (argc >= 3) bam_index_build2(argv[1], argv[2]);
529 	else bam_index_build(argv[1]);
// CLI entry point for "samtools idxstats": prints, per reference, its name,
// length, and the mapped/unmapped counts stored in the BAM_MAX_BIN
// metadata pseudo-bin; a final "*" line reports coordinate-less reads.
533 int bam_idxstats(int argc, char *argv[])
536 	bam_header_t *header;
540 		fprintf(stderr, "Usage: samtools idxstats <in.bam>\n");
543 	fp = bam_open(argv[1], "r");
544 	if (fp == 0) { fprintf(stderr, "[%s] fail to open BAM.\n", __func__); return 1; }
545 	header = bam_header_read(fp);
547 	idx = bam_index_load(argv[1]);
548 	if (idx == 0) { fprintf(stderr, "[%s] fail to load the index.\n", __func__); return 1; }
549 	for (i = 0; i < idx->n; ++i) {
551 		khash_t(i) *h = idx->index[i];
552 		printf("%s\t%d", header->target_name[i], header->target_len[i]);
553 		k = kh_get(i, h, BAM_MAX_BIN);
// The pseudo-bin's second chunk pair holds (n_mapped, n_unmapped); see
// bam_index_core where it is written after the offset-span pair.
555 			printf("\t%llu\t%llu", (long long)kh_val(h, k).list[1].u, (long long)kh_val(h, k).list[1].v);
556 		else printf("\t0\t0");
559 	printf("*\t0\t0\t%llu\n", (long long)idx->n_no_coor);
560 	bam_header_destroy(header);
561 	bam_index_destroy(idx);
// Fill `list` with every bin number that could contain an alignment
// overlapping [beg, end) and return the count. One loop per bin level
// (64Mbp, 8Mbp, 1Mbp, 128Kbp, 16Kbp); the level offsets 1/9/73/585/4681 are
// the first bin numbers of each level in the UCSC binning scheme.
565 static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[BAM_MAX_BIN])
568 	if (beg >= end) return 0;
// Clamp to the 512Mbp maximum addressable by the scheme.
569 	if (end >= 1u<<29) end = 1u<<29;
572 	for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
573 	for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
574 	for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
575 	for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
576 	for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
// True if alignment b overlaps the half-open region [beg, end). A record
// with no CIGAR is treated as covering a single base at its position.
580 static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
582 	uint32_t rbeg = b->core.pos;
583 	uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
584 	return (rend > beg && rbeg < end);
// Iterator state for region queries. NOTE(review): this excerpt omits some
// members; bam_iter_read below also uses iter->off (chunk array) and
// iter->curr_off — presumably declared in the hidden lines.
587 struct __bam_iter_t {
588 	int from_first; // read from the first record; no random access
589 	int tid, beg, end, n_off, i, finished;
594 // bam_fetch helper function retrieves
// Build an iterator over region [beg, end) of reference `tid`: collect the
// candidate bins via reg2bins, gather their chunks, drop chunks that end
// before the linear-index lower bound, then sort and merge the remainder.
595 bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end)
598 	int i, n_bins, n_off;
605 	if (beg < 0) beg = 0;
606 	if (end < beg) return 0;
608 	iter = calloc(1, sizeof(struct __bam_iter_t));
609 	iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
611 	bins = (uint16_t*)calloc(BAM_MAX_BIN, 2);
612 	n_bins = reg2bins(beg, end, bins);
613 	index = idx->index[tid];
// Linear index gives min_off: the smallest file offset that can contain a
// read overlapping beg. Chunks ending at or before it are skipped.
614 	if (idx->index2[tid].n > 0) {
615 		min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1]
616 			: idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
617 		if (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4
618 			int n = beg>>BAM_LIDX_SHIFT;
619 			if (n > idx->index2[tid].n) n = idx->index2[tid].n;
// Walk backwards to the nearest non-empty linear-index slot.
620 			for (i = n - 1; i >= 0; --i)
621 				if (idx->index2[tid].offset[i] != 0) break;
622 			if (i >= 0) min_off = idx->index2[tid].offset[i];
624 	} else min_off = 0; // tabix 0.1.2 may produce such index files
// First pass: count chunks across all candidate bins to size the array.
625 	for (i = n_off = 0; i < n_bins; ++i) {
626 		if ((k = kh_get(i, index, bins[i])) != kh_end(index))
627 			n_off += kh_value(index, k).n;
630 		free(bins); return iter;
632 	off = (pair64_t*)calloc(n_off, 16);
// Second pass: copy chunks that may still be relevant (end past min_off).
633 	for (i = n_off = 0; i < n_bins; ++i) {
634 		if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
636 			bam_binlist_t *p = &kh_value(index, k);
637 			for (j = 0; j < p->n; ++j)
638 				if (p->list[j].v > min_off) off[n_off++] = p->list[j];
643 		free(off); return iter;
646 	bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
// Sort chunks by start offset (comparator pair64_lt defined above).
648 	ks_introsort(off, n_off, off);
649 	// resolve completely contained adjacent blocks
650 	for (i = 1, l = 0; i < n_off; ++i)
651 		if (off[l].v < off[i].v)
654 	// resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
655 	for (i = 1; i < n_off; ++i)
656 		if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
657 	{ // merge adjacent blocks
658 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
659 		for (i = 1, l = 0; i < n_off; ++i) {
660 #ifdef BAM_TRUE_OFFSET
661 			if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
// BGZF mode: merge when both boundaries fall inside the same compressed block.
663 			if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
665 			else off[++l] = off[i];
672 	iter->n_off = n_off; iter->off = off;
// pysam compatibility shim: run a region query and hand back the raw chunk
// array plus its length via *cnt_off. NOTE(review): ownership handling of
// `iter` is not visible in this excerpt — presumably the iterator struct is
// freed while `off` is transferred to the caller; confirm against the source.
676 pair64_t *get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int *cnt_off)
677 { // for pysam compatibility
680 	iter = bam_iter_query(idx, tid, beg, end);
681 	off = iter->off; *cnt_off = iter->n_off;
// Free an iterator and its chunk array. NULL is a no-op.
686 void bam_iter_destroy(bam_iter_t iter)
688 	if (iter) { free(iter->off); free(iter); }
// Read the next alignment overlapping the iterator's region into b.
// Returns bam_read1's result (>=0 record length) on success; -1 at end of
// region/file. With a NULL iterator or from_first set, it degrades to plain
// sequential reading with no seeking.
691 int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b)
694 	if (iter && iter->finished) return -1;
695 	if (iter == 0 || iter->from_first) {
696 		ret = bam_read1(fp, b);
697 		if (ret < 0 && iter) iter->finished = 1;
700 	if (iter->off == 0) return -1;
// Reached the end of the current chunk: advance to the next one.
702 		if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
703 			if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
704 			if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug
// Seek only when chunks are not contiguous in the file.
705 			if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
706 				bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
707 				iter->curr_off = bam_tell(fp);
711 		if ((ret = bam_read1(fp, b)) >= 0) {
712 			iter->curr_off = bam_tell(fp);
// Records are coordinate-sorted, so leaving tid or passing `end` means done.
713 			if (b->core.tid != iter->tid || b->core.pos >= iter->end) { // no need to proceed
714 				ret = bam_validate1(NULL, b)? -1 : -5; // determine whether end of region or error
// Only records actually overlapping [beg, end) are returned to the caller.
717 			else if (is_overlap(iter->beg, iter->end, b)) return ret;
718 		} else break; // end of file or error
// Invoke `func(b, data)` for every alignment overlapping [beg, end) on
// reference tid, using the iterator API above. Returns 0 on normal
// completion (ret == -1) or the iterator's error code otherwise.
724 int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
730 	iter = bam_iter_query(idx, tid, beg, end);
731 	while ((ret = bam_iter_read(fp, iter, b)) >= 0) func(b, data);
732 	bam_iter_destroy(iter);
734 	return (ret == -1)? 0 : ret;