6 #include "bam_endian.h"
14 Alignment indexing. Before indexing, BAM must be sorted based on the
15 leftmost coordinate of alignments. In indexing, BAM uses two indices:
16 a UCSC binning index and a simple linear index. The binning index is
17 efficient for alignments spanning long distances, while the auxiliary
18 linear index helps to reduce unnecessary seek calls especially for
21 The UCSC binning scheme was suggested by Richard Durbin and Lincoln
22 Stein and is explained by Kent et al. (2002). In this scheme, each bin
23 represents a contiguous genomic region which can be fully contained in
24 another bin; each alignment is associated with a bin which represents
25 the smallest region containing the entire alignment. The binning
26 scheme is essentially another representation of an R-tree. A distinct bin
27 uniquely corresponds to a distinct internal node in an R-tree. Bin A is
28 a child of Bin B if region A is contained in B.
30 In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
31 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
32 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
33 find the alignments overlapped with a region [rbeg,rend), we need to
34 calculate the list of bins that may overlap the region and test
35 the alignments in the bins to confirm the overlaps. If the specified
36 region is short, typically only a few alignments in six bins need to
37 be retrieved. The overlapping alignments can be quickly fetched.
// With true file offsets, chunks closer than this are merged (see merge_chunks).
41 #define BAM_MIN_CHUNK_GAP 32768
42 // 1<<14 is the size of minimum bin.
43 #define BAM_LIDX_SHIFT 14
// Number of distinct bins (0..37449) plus one; the value BAM_MAX_BIN itself
// is used below as the key of the per-target "meta" pseudo-bin.
45 #define BAM_MAX_BIN 37450 // =(8^6-1)/7+1
// Order chunk pairs by start offset; instantiates ks_introsort(off, ...).
51 #define pair64_lt(a,b) ((a).u < (b).u)
52 KSORT_INIT(off, pair64_t, pair64_lt)
// Hash map: bin number -> bam_binlist_t (list of chunk {u=beg, v=end} pairs).
64 KHASH_MAP_INIT_INT(i, bam_binlist_t)
// Top-level index object (the opaque bam_index_t). NOTE(review): other
// members (n, index, index2) are referenced throughout this file but their
// declarations are elided from this view.
66 struct __bam_index_t {
// Written/read as the optional trailing 8 bytes of the .bai file.
68 uint64_t n_no_coor; // unmapped reads without coordinate
73 // requirement: len <= LEN_MASK
// Append the chunk [beg,end) (file offsets) to the list kept for `bin` in
// hash h, creating the bin entry on first use. Each pair64_t occupies 16 bytes.
74 static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
79 k = kh_put(i, h, bin, &ret);
81 if (ret) { // not present
83 l->list = (pair64_t*)calloc(l->m, 16);
// NOTE(review): realloc result is assigned straight back; on OOM the old
// block would leak. The growth condition/size lines are elided from this view.
87 l->list = (pair64_t*)realloc(l->list, l->m * 16);
89 l->list[l->n].u = beg; l->list[l->n++].v = end;
// Record `offset` in the linear index: for each 16kb window (BAM_LIDX_SHIFT)
// overlapped by alignment b, keep the first (smallest) file offset seen.
92 static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
95 beg = b->core.pos >> BAM_LIDX_SHIFT;
// End window derived from the rightmost reference base covered by the CIGAR.
96 end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
97 if (index2->m < end + 1) {
98 int old_m = index2->m;
100 kroundup32(index2->m);
// Grow to a power of two (8 == sizeof(uint64_t)); zero the new tail so
// "no offset recorded yet" stays distinguishable from real offsets.
101 index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
102 memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
// A zero entry means "unset"; only the first offset per window is kept.
// NOTE(review): the branch structure around these two writes is partly
// elided from this view (they are presumably the beg==end fast path and
// the general loop).
105 if (index2->offset[beg] == 0) index2->offset[beg] = offset;
107 for (i = beg; i <= end; ++i)
108 if (index2->offset[i] == 0) index2->offset[i] = offset;
// Coalesce adjacent chunks inside each bin after indexing: with true file
// offsets, merge chunks separated by less than BAM_MIN_CHUNK_GAP bytes;
// with BGZF virtual offsets, merge chunks whose boundary offsets fall in
// the same compressed block (offset>>16). The meta bin (BAM_MAX_BIN) is
// skipped because its pairs are counters/spans, not real chunks.
113 static void merge_chunks(bam_index_t *idx)
115 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
119 for (i = 0; i < idx->n; ++i) {
120 index = idx->index[i];
121 for (k = kh_begin(index); k != kh_end(index); ++k) {
123 if (!kh_exist(index, k) || kh_key(index, k) == BAM_MAX_BIN) continue;
124 p = &kh_value(index, k);
// m tracks the last merged chunk while l scans the remaining ones.
126 for (l = 1; l < p->n; ++l) {
127 #ifdef BAM_TRUE_OFFSET
128 if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
130 if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
132 else p->list[++m] = p->list[l];
137 #endif // defined(BAM_TRUE_OFFSET) || defined(BAM_BGZF)
// Fill zero ("unset") linear-index slots with the previous window's offset
// so every 16kb window has a usable position to seek to.
140 static void fill_missing(bam_index_t *idx)
143 for (i = 0; i < idx->n; ++i) {
144 bam_lidx_t *idx2 = &idx->index2[i];
145 for (j = 1; j < idx2->n; ++j)
146 if (idx2->offset[j] == 0)
147 idx2->offset[j] = idx2->offset[j-1];
// Single pass over a coordinate-sorted BAM stream: build the binning index
// (bin -> chunk list) and the 16kb linear index for every target, plus
// per-target mapped/unmapped counts and the count of coordinate-less reads.
151 bam_index_t *bam_index_core(bamFile fp)
157 uint32_t last_bin, save_bin;
158 int32_t last_coor, last_tid, save_tid;
160 uint64_t save_off, last_off, n_mapped, n_unmapped, off_beg, off_end, n_no_coor;
162 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
163 b = (bam1_t*)calloc(1, sizeof(bam1_t));
164 h = bam_header_read(fp);
// Only the target count is needed from the header.
167 idx->n = h->n_targets;
168 bam_header_destroy(h);
169 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
170 for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
171 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
// 0xffffffffu sentinels mean "no record seen yet" (== -1 for the int32s).
173 save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
174 save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
175 n_mapped = n_unmapped = n_no_coor = off_end = 0;
176 off_beg = off_end = bam_tell(fp);
// NOTE(review): c is presumably &b->core — its declaration is elided here.
177 while ((ret = bam_read1(fp, b)) >= 0) {
178 if (c->tid < 0) ++n_no_coor;
179 if (last_tid != c->tid) { // change of chromosomes
181 last_bin = 0xffffffffu;
// Enforce the coordinate-sort precondition stated at the top of the file.
182 } else if (last_coor > c->pos) {
183 fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
184 bam1_qname(b), last_coor, c->pos, c->tid+1);
// Every placed record contributes its start offset to the linear index.
187 if (c->tid >= 0) insert_offset2(&idx->index2[b->core.tid], b, last_off);
188 if (c->bin != last_bin) { // then possibly write the binning index
189 if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
190 insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
// The meta bin stores two special pairs per target: its file-offset span
// and its (n_mapped, n_unmapped) counters (read back by bam_idxstats).
191 if (last_bin == 0xffffffffu && save_tid != 0xffffffffu) { // write the meta element
193 insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end);
194 insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
195 n_mapped = n_unmapped = 0;
199 save_bin = last_bin = c->bin;
201 if (save_tid < 0) break;
// File offsets must be strictly increasing; anything else is a BGZF/RAZF bug.
203 if (bam_tell(fp) <= last_off) {
204 fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
205 (unsigned long long)bam_tell(fp), (unsigned long long)last_off);
208 if (c->flag & BAM_FUNMAP) ++n_unmapped;
210 last_off = bam_tell(fp);
211 last_coor = b->core.pos;
// Flush the final bin and the last target's meta element.
214 insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
215 insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end);
216 insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
// Trailing records with no coordinate are only counted, not indexed.
221 while ((ret = bam_read1(fp, b)) >= 0) ++n_no_coor;
222 if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
223 free(b->data); free(b);
224 idx->n_no_coor = n_no_coor;
// Free all memory owned by an index: per-bin chunk lists, each target's
// hash and linear-offset array, then the top-level arrays. NULL is a no-op.
228 void bam_index_destroy(bam_index_t *idx)
232 if (idx == 0) return;
233 for (i = 0; i < idx->n; ++i) {
234 khash_t(i) *index = idx->index[i];
235 bam_lidx_t *index2 = idx->index2 + i;
236 for (k = kh_begin(index); k != kh_end(index); ++k) {
237 if (kh_exist(index, k))
238 free(kh_value(index, k).list);
240 kh_destroy(i, index);
241 free(index2->offset);
243 free(idx->index); free(idx->index2);
// Serialize the index in BAI layout: "BAI\1" magic, n_ref, then per target
// the binning index (n_bin; per bin: key, n_chunk, 16-byte chunk pairs)
// followed by the linear index (n_intv, 8-byte offsets), and finally the
// no-coordinate read count. Output is little-endian: on big-endian hosts
// values are byte-swapped before writing, and in-place-swapped arrays are
// swapped back afterwards so the in-memory index is left intact.
// NOTE(review): fwrite return values are not checked anywhere here.
247 void bam_index_save(const bam_index_t *idx, FILE *fp)
251 fwrite("BAI\1", 1, 4, fp);
254 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
255 } else fwrite(&idx->n, 4, 1, fp);
256 for (i = 0; i < idx->n; ++i) {
257 khash_t(i) *index = idx->index[i];
258 bam_lidx_t *index2 = idx->index2 + i;
259 // write binning index
260 size = kh_size(index);
261 if (bam_is_be) { // big endian
263 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
264 } else fwrite(&size, 4, 1, fp);
265 for (k = kh_begin(index); k != kh_end(index); ++k) {
266 if (kh_exist(index, k)) {
267 bam_binlist_t *p = &kh_value(index, k);
268 if (bam_is_be) { // big endian
270 x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
271 x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
// Swap chunk offsets in place, write them, then swap back to restore.
272 for (x = 0; (int)x < p->n; ++x) {
273 bam_swap_endian_8p(&p->list[x].u);
274 bam_swap_endian_8p(&p->list[x].v);
276 fwrite(p->list, 16, p->n, fp);
277 for (x = 0; (int)x < p->n; ++x) {
278 bam_swap_endian_8p(&p->list[x].u);
279 bam_swap_endian_8p(&p->list[x].v);
282 fwrite(&kh_key(index, k), 4, 1, fp);
283 fwrite(&p->n, 4, 1, fp);
284 fwrite(p->list, 16, p->n, fp);
288 // write linear index (index2)
291 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
292 } else fwrite(&index2->n, 4, 1, fp);
293 if (bam_is_be) { // big endian
295 for (x = 0; (int)x < index2->n; ++x)
296 bam_swap_endian_8p(&index2->offset[x]);
297 fwrite(index2->offset, 8, index2->n, fp);
298 for (x = 0; (int)x < index2->n; ++x)
299 bam_swap_endian_8p(&index2->offset[x]);
300 } else fwrite(index2->offset, 8, index2->n, fp);
302 { // write the number of reads coor-less records.
303 uint64_t x = idx->n_no_coor;
304 if (bam_is_be) bam_swap_endian_8p(&x);
305 fwrite(&x, 8, 1, fp);
// Parse a BAI stream written by bam_index_save() into a fresh bam_index_t,
// byte-swapping every integer on big-endian hosts. Returns NULL on a bad
// stream (NULL fp / wrong magic; the early-return lines are partly elided
// from this view).
// NOTE(review): most fread return values are unchecked, so a truncated
// index yields partially-initialized structures rather than an error.
310 static bam_index_t *bam_index_load_core(FILE *fp)
316 fprintf(stderr, "[bam_index_load_core] fail to load index.\n");
319 fread(magic, 1, 4, fp);
320 if (strncmp(magic, "BAI\1", 4)) {
321 fprintf(stderr, "[bam_index_load] wrong magic number.\n");
325 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
326 fread(&idx->n, 4, 1, fp);
327 if (bam_is_be) bam_swap_endian_4p(&idx->n);
328 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
329 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
330 for (i = 0; i < idx->n; ++i) {
332 bam_lidx_t *index2 = idx->index2 + i;
337 index = idx->index[i] = kh_init(i);
338 // load binning index
339 fread(&size, 4, 1, fp);
340 if (bam_is_be) bam_swap_endian_4p(&size);
341 for (j = 0; j < (int)size; ++j) {
342 fread(&key, 4, 1, fp);
343 if (bam_is_be) bam_swap_endian_4p(&key);
344 k = kh_put(i, index, key, &ret);
345 p = &kh_value(index, k);
346 fread(&p->n, 4, 1, fp);
347 if (bam_is_be) bam_swap_endian_4p(&p->n);
349 p->list = (pair64_t*)malloc(p->m * 16);
350 fread(p->list, 16, p->n, fp);
// Convert chunk offsets to host order (big-endian builds only).
353 for (x = 0; x < p->n; ++x) {
354 bam_swap_endian_8p(&p->list[x].u);
355 bam_swap_endian_8p(&p->list[x].v);
// Load the linear index for this target.
360 fread(&index2->n, 4, 1, fp);
361 if (bam_is_be) bam_swap_endian_4p(&index2->n);
362 index2->m = index2->n;
363 index2->offset = (uint64_t*)calloc(index2->m, 8);
// NOTE(review): size/nmemb arguments are swapped relative to the matching
// fwrite (8-byte items, index2->n of them). Same total byte count, but the
// return value's meaning differs; harmless only because it is unchecked.
364 fread(index2->offset, index2->n, 8, fp);
366 for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
// The trailing no-coor count is optional (absent in older index files).
368 if (fread(&idx->n_no_coor, 8, 1, fp) == 0) idx->n_no_coor = 0;
369 if (bam_is_be) bam_swap_endian_8p(&idx->n_no_coor);
// Open the on-disk index for _fn: for ftp:// or http:// URLs only the
// basename is used (the index is expected in the working directory).
// Tries "fn.bai" first, then falls back to rewriting a trailing "bam"
// extension into "bai".
373 bam_index_t *bam_index_load_local(const char *_fn)
378 if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) {
// Scan backwards for the last '/' to isolate the basename.
381 for (p = _fn + l - 1; p >= _fn; --p)
382 if (*p == '/') break;
384 } else fn = strdup(_fn);
385 fnidx = (char*)calloc(strlen(fn) + 5, 1);
386 strcpy(fnidx, fn); strcat(fnidx, ".bai");
387 fp = fopen(fnidx, "rb");
388 if (fp == 0) { // try "{base}.bai"
389 char *s = strstr(fn, "bam");
// Only when the name literally ends in "bam". NOTE(review): a preceding
// line is elided here; presumably it truncates fnidx so that flipping the
// final 'm' to 'i' below yields "{base}.bai".
390 if (s == fn + strlen(fn) - 3) {
392 fnidx[strlen(fn)-1] = 'i';
393 fp = fopen(fnidx, "rb");
396 free(fnidx); free(fn);
398 bam_index_t *idx = bam_index_load_core(fp);
// Fetch a remote (ftp:// or http://) file via knet and store it under its
// basename in the current working directory, copying in 1MB blocks.
405 static void download_from_remote(const char *url)
407 const int buf_size = 1 * 1024 * 1024;
// Silently ignore inputs that are not recognized URLs.
413 if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
415 for (fn = (char*)url + l - 1; fn >= url; --fn)
416 if (*fn == '/') break;
417 ++fn; // fn now points to the file name
418 fp_remote = knet_open(url, "r");
419 if (fp_remote == 0) {
420 fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
423 if ((fp = fopen(fn, "wb")) == 0) {
424 fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
425 knet_close(fp_remote);
428 buf = (uint8_t*)calloc(buf_size, 1);
// Stream until knet_read() reports no more data.
429 while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
430 fwrite(buf, 1, l, fp);
433 knet_close(fp_remote);
// Second definition of download_from_remote — presumably the no-network
// stub behind an #else of the knet guard; its body is elided from this view.
436 static void download_from_remote(const char *url)
// Load the index for fn; if that fails and fn is a remote URL, download
// "fn.bai" into the working directory and retry via the local loader.
442 bam_index_t *bam_index_load(const char *fn)
445 idx = bam_index_load_local(fn);
446 if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) {
447 char *fnidx = calloc(strlen(fn) + 5, 1);
448 strcat(strcpy(fnidx, fn), ".bai");
449 fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n");
450 download_from_remote(fnidx);
// bam_index_load_local() strips the URL to its basename, matching where
// download_from_remote() just stored the index.
451 idx = bam_index_load_local(fn);
453 if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n");
// Build an index for BAM file fn and write it to _fnidx, or to "fn.bai"
// when _fnidx is NULL. Prints an error and returns early on open/create
// failure (the return statements are partly elided from this view).
457 int bam_index_build2(const char *fn, const char *_fnidx)
463 if ((fp = bam_open(fn, "r")) == 0) {
464 fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n");
467 idx = bam_index_core(fp);
470 fnidx = (char*)calloc(strlen(fn) + 5, 1);
471 strcpy(fnidx, fn); strcat(fnidx, ".bai");
472 } else fnidx = strdup(_fnidx);
473 fpidx = fopen(fnidx, "wb");
475 fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
479 bam_index_save(idx, fpidx);
480 bam_index_destroy(idx);
// Convenience wrapper: build the index with the default "fn.bai" name.
486 int bam_index_build(const char *fn)
488 return bam_index_build2(fn, 0);
// CLI entry for `samtools index`: argv[1] is the BAM file, optional
// argv[2] the output index name.
491 int bam_index(int argc, char *argv[])
494 fprintf(stderr, "Usage: samtools index <in.bam> [out.index]\n");
497 if (argc >= 3) bam_index_build2(argv[1], argv[2]);
498 else bam_index_build(argv[1]);
// CLI entry for `samtools idxstats`: per target print name, length and the
// mapped/unmapped counts stored in the meta bin, then a final line for
// reads without coordinates.
502 int bam_idxstats(int argc, char *argv[])
505 bam_header_t *header;
509 fprintf(stderr, "Usage: samtools idxstats <in.bam>\n");
512 fp = bam_open(argv[1], "r");
513 if (fp == 0) { fprintf(stderr, "[%s] fail to open BAM.\n", __func__); return 1; }
514 header = bam_header_read(fp);
516 idx = bam_index_load(argv[1]);
517 if (idx == 0) { fprintf(stderr, "[%s] fail to load the index.\n", __func__); return 1; }
518 for (i = 0; i < idx->n; ++i) {
520 khash_t(i) *h = idx->index[i];
521 printf("%s\t%d", header->target_name[i], header->target_len[i]);
// list[1] of the meta bin holds (n_mapped, n_unmapped) as written by
// bam_index_core (list[0] is the target's file-offset span).
522 k = kh_get(i, h, BAM_MAX_BIN);
524 printf("\t%llu\t%llu", (long long)kh_val(h, k).list[1].u, (long long)kh_val(h, k).list[1].v);
529 if (!no_stats) printf("\t0\t%llu", (long long)idx->n_no_coor);
531 bam_header_destroy(header);
532 bam_index_destroy(idx);
// Fill list[] with every bin number that may overlap the zero-based,
// half-open region [beg,end); the filled count is returned.
// NOTE(review): the initialization of i, the inclusive-end adjustment,
// the bin-0 entry and the final `return i;` are elided from this view.
536 static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[BAM_MAX_BIN])
539 if (beg >= end) return 0;
// Clamp to the 512Mbp addressable range (2^29).
540 if (end >= 1u<<29) end = 1u<<29;
// One loop per bin level: 64Mbp, 8Mbp, 1Mbp, 128Kbp and 16Kbp bins.
543 for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
544 for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
545 for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
546 for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
547 for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
551 static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
553 uint32_t rbeg = b->core.pos;
554 uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
555 return (rend > beg && rbeg < end);
// Iterator state for region queries (the opaque bam_iter_t). Remaining
// members (curr_off, off, ...) are referenced below but elided from this view.
558 struct __bam_iter_t {
559 int from_first; // read from the first record; no random access
// n_off/i index into the chunk list; finished flags end-of-iteration.
560 int tid, beg, end, n_off, i, finished;
565 // bam_fetch helper function retrieves
// Build an iterator over all index chunks that may contain alignments
// overlapping tid:[beg,end). Candidate bins come from reg2bins(); the
// linear index supplies min_off, a lower bound that prunes chunks ending
// at or before it. Surviving chunks are sorted and merged.
566 bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end)
569 int i, n_bins, n_off;
576 if (beg < 0) beg = 0;
// An empty/invalid region yields a NULL iterator.
577 if (end < beg) return 0;
579 iter = calloc(1, sizeof(struct __bam_iter_t));
580 iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
582 bins = (uint16_t*)calloc(BAM_MAX_BIN, 2);
583 n_bins = reg2bins(beg, end, bins);
584 index = idx->index[tid];
585 if (idx->index2[tid].n > 0) {
// min_off: linear-index offset of the 16kb window containing beg,
// clamped to the last recorded window.
586 min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1]
587 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
588 if (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4
589 int n = beg>>BAM_LIDX_SHIFT;
590 if (n > idx->index2[tid].n) n = idx->index2[tid].n;
// Walk back to the nearest non-zero linear-index entry.
591 for (i = n - 1; i >= 0; --i)
592 if (idx->index2[tid].offset[i] != 0) break;
593 if (i >= 0) min_off = idx->index2[tid].offset[i];
595 } else min_off = 0; // tabix 0.1.2 may produce such index files
// First pass: count candidate chunks across all overlapping bins.
596 for (i = n_off = 0; i < n_bins; ++i) {
597 if ((k = kh_get(i, index, bins[i])) != kh_end(index))
598 n_off += kh_value(index, k).n;
// No candidate chunks: return the (empty) iterator as-is.
601 free(bins); return iter;
603 off = (pair64_t*)calloc(n_off, 16);
// Second pass: collect only chunks that end beyond min_off.
604 for (i = n_off = 0; i < n_bins; ++i) {
605 if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
607 bam_binlist_t *p = &kh_value(index, k);
608 for (j = 0; j < p->n; ++j)
609 if (p->list[j].v > min_off) off[n_off++] = p->list[j];
614 bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
616 ks_introsort(off, n_off, off);
617 // resolve completely contained adjacent blocks
618 for (i = 1, l = 0; i < n_off; ++i)
619 if (off[l].v < off[i].v)
622 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
623 for (i = 1; i < n_off; ++i)
624 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
625 { // merge adjacent blocks
// Same merge criteria as merge_chunks(): true-offset gap vs. BGZF block.
626 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
627 for (i = 1, l = 0; i < n_off; ++i) {
628 #ifdef BAM_TRUE_OFFSET
629 if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
631 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
633 else off[++l] = off[i];
640 iter->n_off = n_off; iter->off = off;
// pysam compatibility shim: run a query and hand back the raw chunk array
// (ownership passes to the caller) with its length in *cnt_off.
// NOTE(review): bam_iter_query() returns NULL for end < beg; iter is
// dereferenced here without a NULL check.
644 pair64_t *get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int *cnt_off)
645 { // for pysam compatibility
648 iter = bam_iter_query(idx, tid, beg, end);
649 off = iter->off; *cnt_off = iter->n_off;
654 void bam_iter_destroy(bam_iter_t iter)
656 if (iter) { free(iter->off); free(iter); }
// Pull the next alignment overlapping the query region into b. Returns the
// bam_read1() record length on success, negative when exhausted or at EOF.
// Sequential mode (from_first) just streams records; indexed mode walks
// iter->off chunks, seeking only between non-adjacent chunks.
659 int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b)
661 if (iter->finished) return -1;
662 if (iter->from_first) {
663 int ret = bam_read1(fp, b);
664 if (ret < 0) iter->finished = 1;
// NULL chunk list means the query matched nothing.
667 if (iter->off == 0) return -1;
// Chunk-walking loop; its enclosing `for (;;)` header is elided from this view.
670 if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
671 if (iter->i == iter->n_off - 1) break; // no more chunks
672 if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug
673 if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
674 bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
675 iter->curr_off = bam_tell(fp);
679 if ((ret = bam_read1(fp, b)) > 0) {
680 iter->curr_off = bam_tell(fp);
// Input is coordinate-sorted, so a record past the region ends the scan.
681 if (b->core.tid != iter->tid || b->core.pos >= iter->end) break; // no need to proceed
682 else if (is_overlap(iter->beg, iter->end, b)) return ret;
683 } else break; // end of file
// Invoke `func(b, data)` for every alignment overlapping tid:[beg,end).
// The iterator/record teardown and return value run past this view.
689 int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
694 iter = bam_iter_query(idx, tid, beg, end);
695 while (bam_iter_read(fp, iter, b) >= 0) func(b, data);