6 #include "bam_endian.h"
14 Alignment indexing. Before indexing, BAM must be sorted based on the
15 leftmost coordinate of alignments. In indexing, BAM uses two indices:
16 a UCSC binning index and a simple linear index. The binning index is
17 efficient for alignments spanning long distances, while the auxiliary
18 linear index helps to reduce unnecessary seek calls especially for
21 The UCSC binning scheme was suggested by Richard Durbin and Lincoln
22 Stein and is explained by Kent et al. (2002). In this scheme, each bin
23 represents a contiguous genomic region which can be fully contained in
24 another bin; each alignment is associated with a bin which represents
25 the smallest region containing the entire alignment. The binning
26 scheme is essentially another representation of an R-tree. A distinct bin
27 uniquely corresponds to a distinct internal node in an R-tree. Bin A is
28 a child of Bin B if region A is contained in B.
30 In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
31 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
32 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
33 find the alignments overlapped with a region [rbeg,rend), we need to
34 calculate the list of bins that may overlap the region and test
35 the alignments in the bins to confirm the overlaps. If the specified
36 region is short, typically only a few alignments in six bins need to
37 be retrieved. The overlapping alignments can be quickly fetched.
41 #define BAM_MIN_CHUNK_GAP 32768
42 // 1<<14 is the size of minimum bin.
43 #define BAM_LIDX_SHIFT 14
45 #define BAM_MAX_BIN 37450 // =(8^6-1)/7+1
51 #define pair64_lt(a,b) ((a).u < (b).u)
52 KSORT_INIT(off, pair64_t, pair64_lt)
64 KHASH_MAP_INIT_INT(i, bam_binlist_t)
66 struct __bam_index_t {
68 uint64_t n_no_coor; // unmapped reads without coordinate
73 // requirement: len <= LEN_MASK
74 static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
79 k = kh_put(i, h, bin, &ret);
81 if (ret) { // not present
83 l->list = (pair64_t*)calloc(l->m, 16);
87 l->list = (pair64_t*)realloc(l->list, l->m * 16);
89 l->list[l->n].u = beg; l->list[l->n++].v = end;
92 static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
95 beg = b->core.pos >> BAM_LIDX_SHIFT;
96 end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
97 if (index2->m < end + 1) {
98 int old_m = index2->m;
100 kroundup32(index2->m);
101 index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
102 memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
105 if (index2->offset[beg] == 0) index2->offset[beg] = offset;
107 for (i = beg; i <= end; ++i)
108 if (index2->offset[i] == 0) index2->offset[i] = offset;
113 static void merge_chunks(bam_index_t *idx)
115 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
119 for (i = 0; i < idx->n; ++i) {
120 index = idx->index[i];
121 for (k = kh_begin(index); k != kh_end(index); ++k) {
123 if (!kh_exist(index, k) || kh_key(index, k) == BAM_MAX_BIN) continue;
124 p = &kh_value(index, k);
126 for (l = 1; l < p->n; ++l) {
127 #ifdef BAM_TRUE_OFFSET
128 if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
130 if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
132 else p->list[++m] = p->list[l];
137 #endif // defined(BAM_TRUE_OFFSET) || defined(BAM_BGZF)
140 static void fill_missing(bam_index_t *idx)
143 for (i = 0; i < idx->n; ++i) {
144 bam_lidx_t *idx2 = &idx->index2[i];
145 for (j = 1; j < idx2->n; ++j)
146 if (idx2->offset[j] == 0)
147 idx2->offset[j] = idx2->offset[j-1];
151 bam_index_t *bam_index_core(bamFile fp)
157 uint32_t last_bin, save_bin;
158 int32_t last_coor, last_tid, save_tid;
160 uint64_t save_off, last_off, n_mapped, n_unmapped, off_beg, off_end, n_no_coor;
162 h = bam_header_read(fp);
164 fprintf(stderr, "[bam_index_core] Invalid BAM header.");
168 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
169 b = (bam1_t*)calloc(1, sizeof(bam1_t));
172 idx->n = h->n_targets;
173 bam_header_destroy(h);
174 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
175 for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
176 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
178 save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
179 save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
180 n_mapped = n_unmapped = n_no_coor = off_end = 0;
181 off_beg = off_end = bam_tell(fp);
182 while ((ret = bam_read1(fp, b)) >= 0) {
183 if (c->tid < 0) ++n_no_coor;
184 if (last_tid < c->tid || (last_tid >= 0 && c->tid < 0)) { // change of chromosomes
186 last_bin = 0xffffffffu;
187 } else if ((uint32_t)last_tid > (uint32_t)c->tid) {
188 fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %d-th chr > %d-th chr\n",
189 bam1_qname(b), last_tid+1, c->tid+1);
191 } else if ((int32_t)c->tid >= 0 && last_coor > c->pos) {
192 fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
193 bam1_qname(b), last_coor, c->pos, c->tid+1);
196 if (c->tid >= 0 && !(c->flag & BAM_FUNMAP)) insert_offset2(&idx->index2[b->core.tid], b, last_off);
197 if (c->bin != last_bin) { // then possibly write the binning index
198 if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
199 insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
200 if (last_bin == 0xffffffffu && save_tid != 0xffffffffu) { // write the meta element
202 insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end);
203 insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
204 n_mapped = n_unmapped = 0;
208 save_bin = last_bin = c->bin;
210 if (save_tid < 0) break;
212 if (bam_tell(fp) <= last_off) {
213 fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
214 (unsigned long long)bam_tell(fp), (unsigned long long)last_off);
217 if (c->flag & BAM_FUNMAP) ++n_unmapped;
219 last_off = bam_tell(fp);
220 last_coor = b->core.pos;
223 insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
224 insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, bam_tell(fp));
225 insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
230 while ((ret = bam_read1(fp, b)) >= 0) {
232 if (c->tid >= 0 && n_no_coor) {
233 fprintf(stderr, "[bam_index_core] the alignment is not sorted: reads without coordinates prior to reads with coordinates.\n");
238 if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
239 free(b->data); free(b);
240 idx->n_no_coor = n_no_coor;
244 void bam_index_destroy(bam_index_t *idx)
248 if (idx == 0) return;
249 for (i = 0; i < idx->n; ++i) {
250 khash_t(i) *index = idx->index[i];
251 bam_lidx_t *index2 = idx->index2 + i;
252 for (k = kh_begin(index); k != kh_end(index); ++k) {
253 if (kh_exist(index, k))
254 free(kh_value(index, k).list);
256 kh_destroy(i, index);
257 free(index2->offset);
259 free(idx->index); free(idx->index2);
263 void bam_index_save(const bam_index_t *idx, FILE *fp)
267 fwrite("BAI\1", 1, 4, fp);
270 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
271 } else fwrite(&idx->n, 4, 1, fp);
272 for (i = 0; i < idx->n; ++i) {
273 khash_t(i) *index = idx->index[i];
274 bam_lidx_t *index2 = idx->index2 + i;
275 // write binning index
276 size = kh_size(index);
277 if (bam_is_be) { // big endian
279 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
280 } else fwrite(&size, 4, 1, fp);
281 for (k = kh_begin(index); k != kh_end(index); ++k) {
282 if (kh_exist(index, k)) {
283 bam_binlist_t *p = &kh_value(index, k);
284 if (bam_is_be) { // big endian
286 x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
287 x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
288 for (x = 0; (int)x < p->n; ++x) {
289 bam_swap_endian_8p(&p->list[x].u);
290 bam_swap_endian_8p(&p->list[x].v);
292 fwrite(p->list, 16, p->n, fp);
293 for (x = 0; (int)x < p->n; ++x) {
294 bam_swap_endian_8p(&p->list[x].u);
295 bam_swap_endian_8p(&p->list[x].v);
298 fwrite(&kh_key(index, k), 4, 1, fp);
299 fwrite(&p->n, 4, 1, fp);
300 fwrite(p->list, 16, p->n, fp);
304 // write linear index (index2)
307 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
308 } else fwrite(&index2->n, 4, 1, fp);
309 if (bam_is_be) { // big endian
311 for (x = 0; (int)x < index2->n; ++x)
312 bam_swap_endian_8p(&index2->offset[x]);
313 fwrite(index2->offset, 8, index2->n, fp);
314 for (x = 0; (int)x < index2->n; ++x)
315 bam_swap_endian_8p(&index2->offset[x]);
316 } else fwrite(index2->offset, 8, index2->n, fp);
318 { // write the number of reads coor-less records.
319 uint64_t x = idx->n_no_coor;
320 if (bam_is_be) bam_swap_endian_8p(&x);
321 fwrite(&x, 8, 1, fp);
326 static bam_index_t *bam_index_load_core(FILE *fp)
332 fprintf(stderr, "[bam_index_load_core] fail to load index.\n");
335 fread(magic, 1, 4, fp);
336 if (strncmp(magic, "BAI\1", 4)) {
337 fprintf(stderr, "[bam_index_load] wrong magic number.\n");
341 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
342 fread(&idx->n, 4, 1, fp);
343 if (bam_is_be) bam_swap_endian_4p(&idx->n);
344 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
345 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
346 for (i = 0; i < idx->n; ++i) {
348 bam_lidx_t *index2 = idx->index2 + i;
353 index = idx->index[i] = kh_init(i);
354 // load binning index
355 fread(&size, 4, 1, fp);
356 if (bam_is_be) bam_swap_endian_4p(&size);
357 for (j = 0; j < (int)size; ++j) {
358 fread(&key, 4, 1, fp);
359 if (bam_is_be) bam_swap_endian_4p(&key);
360 k = kh_put(i, index, key, &ret);
361 p = &kh_value(index, k);
362 fread(&p->n, 4, 1, fp);
363 if (bam_is_be) bam_swap_endian_4p(&p->n);
365 p->list = (pair64_t*)malloc(p->m * 16);
366 fread(p->list, 16, p->n, fp);
369 for (x = 0; x < p->n; ++x) {
370 bam_swap_endian_8p(&p->list[x].u);
371 bam_swap_endian_8p(&p->list[x].v);
376 fread(&index2->n, 4, 1, fp);
377 if (bam_is_be) bam_swap_endian_4p(&index2->n);
378 index2->m = index2->n;
379 index2->offset = (uint64_t*)calloc(index2->m, 8);
380 fread(index2->offset, index2->n, 8, fp);
382 for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
384 if (fread(&idx->n_no_coor, 8, 1, fp) == 0) idx->n_no_coor = 0;
385 if (bam_is_be) bam_swap_endian_8p(&idx->n_no_coor);
389 bam_index_t *bam_index_load_local(const char *_fn)
394 if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) {
397 for (p = _fn + l - 1; p >= _fn; --p)
398 if (*p == '/') break;
400 } else fn = strdup(_fn);
401 fnidx = (char*)calloc(strlen(fn) + 5, 1);
402 strcpy(fnidx, fn); strcat(fnidx, ".bai");
403 fp = fopen(fnidx, "rb");
404 if (fp == 0) { // try "{base}.bai"
405 char *s = strstr(fn, "bam");
406 if (s == fn + strlen(fn) - 3) {
408 fnidx[strlen(fn)-1] = 'i';
409 fp = fopen(fnidx, "rb");
412 free(fnidx); free(fn);
414 bam_index_t *idx = bam_index_load_core(fp);
421 static void download_from_remote(const char *url)
423 const int buf_size = 1 * 1024 * 1024;
429 if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
431 for (fn = (char*)url + l - 1; fn >= url; --fn)
432 if (*fn == '/') break;
433 ++fn; // fn now points to the file name
434 fp_remote = knet_open(url, "r");
435 if (fp_remote == 0) {
436 fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
439 if ((fp = fopen(fn, "wb")) == 0) {
440 fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
441 knet_close(fp_remote);
444 buf = (uint8_t*)calloc(buf_size, 1);
445 while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
446 fwrite(buf, 1, l, fp);
449 knet_close(fp_remote);
452 static void download_from_remote(const char *url)
458 bam_index_t *bam_index_load(const char *fn)
461 idx = bam_index_load_local(fn);
462 if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) {
463 char *fnidx = calloc(strlen(fn) + 5, 1);
464 strcat(strcpy(fnidx, fn), ".bai");
465 fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n");
466 download_from_remote(fnidx);
467 idx = bam_index_load_local(fn);
469 if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n");
473 int bam_index_build2(const char *fn, const char *_fnidx)
479 if ((fp = bam_open(fn, "r")) == 0) {
480 fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n");
483 idx = bam_index_core(fp);
486 fprintf(stderr, "[bam_index_build2] fail to index the BAM file.\n");
490 fnidx = (char*)calloc(strlen(fn) + 5, 1);
491 strcpy(fnidx, fn); strcat(fnidx, ".bai");
492 } else fnidx = strdup(_fnidx);
493 fpidx = fopen(fnidx, "wb");
495 fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
499 bam_index_save(idx, fpidx);
500 bam_index_destroy(idx);
506 int bam_index_build(const char *fn)
508 return bam_index_build2(fn, 0);
511 int bam_index(int argc, char *argv[])
514 fprintf(stderr, "Usage: samtools index <in.bam> [out.index]\n");
517 if (argc >= 3) bam_index_build2(argv[1], argv[2]);
518 else bam_index_build(argv[1]);
522 int bam_idxstats(int argc, char *argv[])
525 bam_header_t *header;
529 fprintf(stderr, "Usage: samtools idxstats <in.bam>\n");
532 fp = bam_open(argv[1], "r");
533 if (fp == 0) { fprintf(stderr, "[%s] fail to open BAM.\n", __func__); return 1; }
534 header = bam_header_read(fp);
536 idx = bam_index_load(argv[1]);
537 if (idx == 0) { fprintf(stderr, "[%s] fail to load the index.\n", __func__); return 1; }
538 for (i = 0; i < idx->n; ++i) {
540 khash_t(i) *h = idx->index[i];
541 printf("%s\t%d", header->target_name[i], header->target_len[i]);
542 k = kh_get(i, h, BAM_MAX_BIN);
544 printf("\t%llu\t%llu", (long long)kh_val(h, k).list[1].u, (long long)kh_val(h, k).list[1].v);
545 else printf("\t0\t0");
548 printf("*\t0\t0\t%llu\n", (long long)idx->n_no_coor);
549 bam_header_destroy(header);
550 bam_index_destroy(idx);
554 static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[BAM_MAX_BIN])
557 if (beg >= end) return 0;
558 if (end >= 1u<<29) end = 1u<<29;
561 for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
562 for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
563 for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
564 for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
565 for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
569 static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
571 uint32_t rbeg = b->core.pos;
572 uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
573 return (rend > beg && rbeg < end);
576 struct __bam_iter_t {
577 int from_first; // read from the first record; no random access
578 int tid, beg, end, n_off, i, finished;
583 // bam_fetch helper function retrieves
584 bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end)
587 int i, n_bins, n_off;
594 if (beg < 0) beg = 0;
595 if (end < beg) return 0;
597 iter = calloc(1, sizeof(struct __bam_iter_t));
598 iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
600 bins = (uint16_t*)calloc(BAM_MAX_BIN, 2);
601 n_bins = reg2bins(beg, end, bins);
602 index = idx->index[tid];
603 if (idx->index2[tid].n > 0) {
604 min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1]
605 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
606 if (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4
607 int n = beg>>BAM_LIDX_SHIFT;
608 if (n > idx->index2[tid].n) n = idx->index2[tid].n;
609 for (i = n - 1; i >= 0; --i)
610 if (idx->index2[tid].offset[i] != 0) break;
611 if (i >= 0) min_off = idx->index2[tid].offset[i];
613 } else min_off = 0; // tabix 0.1.2 may produce such index files
614 for (i = n_off = 0; i < n_bins; ++i) {
615 if ((k = kh_get(i, index, bins[i])) != kh_end(index))
616 n_off += kh_value(index, k).n;
619 free(bins); return iter;
621 off = (pair64_t*)calloc(n_off, 16);
622 for (i = n_off = 0; i < n_bins; ++i) {
623 if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
625 bam_binlist_t *p = &kh_value(index, k);
626 for (j = 0; j < p->n; ++j)
627 if (p->list[j].v > min_off) off[n_off++] = p->list[j];
632 free(off); return iter;
635 bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
637 ks_introsort(off, n_off, off);
638 // resolve completely contained adjacent blocks
639 for (i = 1, l = 0; i < n_off; ++i)
640 if (off[l].v < off[i].v)
643 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
644 for (i = 1; i < n_off; ++i)
645 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
646 { // merge adjacent blocks
647 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
648 for (i = 1, l = 0; i < n_off; ++i) {
649 #ifdef BAM_TRUE_OFFSET
650 if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
652 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
654 else off[++l] = off[i];
661 iter->n_off = n_off; iter->off = off;
665 pair64_t *get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int *cnt_off)
666 { // for pysam compatibility
669 iter = bam_iter_query(idx, tid, beg, end);
670 off = iter->off; *cnt_off = iter->n_off;
675 void bam_iter_destroy(bam_iter_t iter)
677 if (iter) { free(iter->off); free(iter); }
680 int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b)
683 if (iter && iter->finished) return -1;
684 if (iter == 0 || iter->from_first) {
685 ret = bam_read1(fp, b);
686 if (ret < 0 && iter) iter->finished = 1;
689 if (iter->off == 0) return -1;
691 if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
692 if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
693 if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug
694 if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
695 bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
696 iter->curr_off = bam_tell(fp);
700 if ((ret = bam_read1(fp, b)) >= 0) {
701 iter->curr_off = bam_tell(fp);
702 if (b->core.tid != iter->tid || b->core.pos >= iter->end) { // no need to proceed
703 ret = bam_validate1(NULL, b)? -1 : -5; // determine whether end of region or error
706 else if (is_overlap(iter->beg, iter->end, b)) return ret;
707 } else break; // end of file or error
713 int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
719 iter = bam_iter_query(idx, tid, beg, end);
720 while ((ret = bam_iter_read(fp, iter, b)) >= 0) func(b, data);
721 bam_iter_destroy(iter);
723 return (ret == -1)? 0 : ret;