6 #include "bam_endian.h"
14 Alignment indexing. Before indexing, BAM must be sorted based on the
15 leftmost coordinate of alignments. In indexing, BAM uses two indices:
16 a UCSC binning index and a simple linear index. The binning index is
17 efficient for alignments spanning long distances, while the auxiliary
18 linear index helps to reduce unnecessary seek calls especially for
21 The UCSC binning scheme was suggested by Richard Durbin and Lincoln
22 Stein and is explained by Kent et al. (2002). In this scheme, each bin
23 represents a contiguous genomic region which can be fully contained in
24 another bin; each alignment is associated with a bin which represents
25 the smallest region containing the entire alignment. The binning
26 scheme is essentially another representation of R-tree. A distinct bin
27 uniquely corresponds to a distinct internal node in a R-tree. Bin A is
28 a child of Bin B if region A is contained in B.
30 In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
31 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
32 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
33 find the alignments overlapped with a region [rbeg,rend), we need to
34 calculate the list of bins that may overlap the region and test
35 the alignments in the bins to confirm the overlaps. If the specified
36 region is short, typically only a few alignments in six bins need to
37 be retrieved. The overlapping alignments can be quickly fetched.
// Gap threshold (bytes of true file offset) below which adjacent chunks in a
// bin are coalesced by merge_chunks() under BAM_TRUE_OFFSET builds.
41 #define BAM_MIN_CHUNK_GAP 32768
42 // 1<<14 is the size of minimum bin.
// Shift converting a 0-based coordinate into its 16kb linear-index window.
43 #define BAM_LIDX_SHIFT 14
// One past the largest real bin (37449); bin 37450 is a pseudo-bin carrying
// per-reference meta data (chunk span, mapped/unmapped counts) -- see
// bam_index_core() and bam_idxstats().
45 #define BAM_MAX_BIN 37450 // =(8^6-1)/7+1
// Order chunks by start virtual offset; instantiates ks_introsort(off, ...)
// used when sorting chunk lists in bam_iter_query().
51 #define pair64_lt(a,b) ((a).u < (b).u)
52 KSORT_INIT(off, pair64_t, pair64_lt)
// Hash map type "i": bin number (int) -> bam_binlist_t (list of chunks).
64 KHASH_MAP_INIT_INT(i, bam_binlist_t)
// BAM index: per-reference binning index (hash bin -> chunk list) plus a
// 16kb linear index; remaining members are elided from this view.
66 struct __bam_index_t {
68 uint64_t n_no_coor; // unmapped reads without coordinate
73 // requirement: len <= LEN_MASK
// Append chunk [beg,end) (virtual file offsets) to the chunk list of `bin`
// in binning-index hash `h`, creating the bin's entry on first use.
74 static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
79 k = kh_put(i, h, bin, &ret);
81 if (ret) { // not present
// 16 == sizeof(pair64_t). NOTE(review): calloc/realloc results are not
// checked for failure -- confirm whether OOM is acceptable to crash on here.
83 l->list = (pair64_t*)calloc(l->m, 16);
87 l->list = (pair64_t*)realloc(l->list, l->m * 16);
89 l->list[l->n].u = beg; l->list[l->n++].v = end;
92 static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
95 beg = b->core.pos >> BAM_LIDX_SHIFT;
96 end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
97 if (index2->m < end + 1) {
98 int old_m = index2->m;
100 kroundup32(index2->m);
101 index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
102 memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
105 if (index2->offset[beg] == 0) index2->offset[beg] = offset;
107 for (i = beg; i <= end; ++i)
108 if (index2->offset[i] == 0) index2->offset[i] = offset;
// Coalesce adjacent chunks within each bin to reduce the number of seeks:
// a chunk is absorbed into its predecessor when the gap between them is
// small (true-file-offset builds) or when both boundaries fall in the same
// BGZF compressed block (virtual-offset builds, compared via offset>>16).
// The BAM_MAX_BIN meta pseudo-bin is never merged.
113 static void merge_chunks(bam_index_t *idx)
115 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
119 for (i = 0; i < idx->n; ++i) {
120 index = idx->index[i];
121 for (k = kh_begin(index); k != kh_end(index); ++k) {
123 if (!kh_exist(index, k) || kh_key(index, k) == BAM_MAX_BIN) continue;
124 p = &kh_value(index, k);
// m indexes the last kept (write) chunk; l scans the remaining (read) chunks.
126 for (l = 1; l < p->n; ++l) {
127 #ifdef BAM_TRUE_OFFSET
128 if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
130 if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
132 else p->list[++m] = p->list[l];
137 #endif // defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
140 static void fill_missing(bam_index_t *idx)
143 for (i = 0; i < idx->n; ++i) {
144 bam_lidx_t *idx2 = &idx->index2[i];
145 for (j = 1; j < idx2->n; ++j)
146 if (idx2->offset[j] == 0)
147 idx2->offset[j] = idx2->offset[j-1];
// Build a bam_index_t in one sweep over a coordinate-sorted BAM stream:
// accumulates binning-index chunks per (tid,bin), feeds the 16kb linear
// index, writes each reference's meta pseudo-bin (BAM_MAX_BIN), and counts
// trailing coordinate-less reads. Error paths (returns on unsorted input or
// bad header) are partially elided from this view.
151 bam_index_t *bam_index_core(bamFile fp)
157 uint32_t last_bin, save_bin;
158 int32_t last_coor, last_tid, save_tid;
160 uint64_t save_off, last_off, n_mapped, n_unmapped, off_beg, off_end, n_no_coor;
162 h = bam_header_read(fp);
// NOTE(review): this error message lacks a trailing '\n'.
164 fprintf(stderr, "[bam_index_core] Invalid BAM header.");
168 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
169 b = (bam1_t*)calloc(1, sizeof(bam1_t));
// One binning index and one linear index per target sequence.
172 idx->n = h->n_targets;
173 bam_header_destroy(h);
174 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
175 for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
176 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
// 0xffffffffu marks "no previous record yet" for the bin/tid trackers.
178 save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
179 save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
180 n_mapped = n_unmapped = n_no_coor = off_end = 0;
181 off_beg = off_end = bam_tell(fp); // virtual offset of the first record
182 while ((ret = bam_read1(fp, b)) >= 0) {
// `c` presumably aliases &b->core (declaration elided) -- see line 196.
183 if (c->tid < 0) ++n_no_coor;
// Sort-order checks: tid must be non-decreasing, coordinates non-decreasing
// within a tid; violations are fatal (error-return lines elided).
184 if (last_tid < c->tid || (last_tid >= 0 && c->tid < 0)) { // change of chromosomes
186 last_bin = 0xffffffffu;
187 } else if ((uint32_t)last_tid > (uint32_t)c->tid) {
188 fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %d-th chr > %d-th chr\n",
189 bam1_qname(b), last_tid+1, c->tid+1);
191 } else if ((int32_t)c->tid >= 0 && last_coor > c->pos) {
192 fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
193 bam1_qname(b), last_coor, c->pos, c->tid+1);
// Mapped, placed reads feed the linear index with the record's start offset.
196 if (c->tid >= 0 && !(c->flag & BAM_FUNMAP)) insert_offset2(&idx->index2[b->core.tid], b, last_off);
197 if (c->bin != last_bin) { // then possibly write the binning index
198 if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
199 insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
200 if (last_bin == 0xffffffffu && save_tid != 0xffffffffu) { // write the meta element
// Meta pseudo-bin: chunk 0 = [off_beg,off_end) span, chunk 1 = (n_mapped,n_unmapped).
202 insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end);
203 insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
204 n_mapped = n_unmapped = 0;
208 save_bin = last_bin = c->bin;
210 if (save_tid < 0) break; // very first record has no coordinate: stop
// File offsets must strictly advance between records; anything else is a
// bug in the compression layer.
212 if (bam_tell(fp) <= last_off) {
213 fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
214 (unsigned long long)bam_tell(fp), (unsigned long long)last_off);
217 if (c->flag & BAM_FUNMAP) ++n_unmapped;
219 last_off = bam_tell(fp);
220 last_coor = b->core.pos;
// Flush the final bin and the last reference's meta pseudo-bin.
223 insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
224 insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, bam_tell(fp));
225 insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
// Trailing coordinate-less reads: count them; a placed read appearing after
// unplaced ones means the file was not sorted.
230 while ((ret = bam_read1(fp, b)) >= 0) {
232 if (c->tid >= 0 && n_no_coor) {
233 fprintf(stderr, "[bam_index_core] the alignment is not sorted: reads without coordinates prior to reads with coordinates.\n");
238 if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
239 free(b->data); free(b);
240 idx->n_no_coor = n_no_coor;
// Free all memory owned by a BAM index; a NULL idx is a no-op.
244 void bam_index_destroy(bam_index_t *idx)
248 if (idx == 0) return;
249 for (i = 0; i < idx->n; ++i) {
250 khash_t(i) *index = idx->index[i];
251 bam_lidx_t *index2 = idx->index2 + i;
// Free each bin's chunk list before destroying the hash itself.
252 for (k = kh_begin(index); k != kh_end(index); ++k) {
253 if (kh_exist(index, k))
254 free(kh_value(index, k).list);
256 kh_destroy(i, index);
257 free(index2->offset);
259 free(idx->index); free(idx->index2);
// NOTE(review): free(idx) itself is presumably on an elided line -- confirm.
// Serialize the index in BAI format: magic "BAI\1", n_ref, then per reference
// the binning index (n_bin; per bin: key, n_chunk, chunk pairs) followed by
// the linear index (n_intv, offsets), and finally n_no_coor. On big-endian
// hosts every value is byte-swapped; arrays are swapped in place and then
// swapped back so the in-memory index remains usable.
263 void bam_index_save(const bam_index_t *idx, FILE *fp)
267 fwrite("BAI\1", 1, 4, fp);
270 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
271 } else fwrite(&idx->n, 4, 1, fp);
272 for (i = 0; i < idx->n; ++i) {
273 khash_t(i) *index = idx->index[i];
274 bam_lidx_t *index2 = idx->index2 + i;
275 // write binning index
276 size = kh_size(index);
277 if (bam_is_be) { // big endian
279 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
280 } else fwrite(&size, 4, 1, fp);
281 for (k = kh_begin(index); k != kh_end(index); ++k) {
282 if (kh_exist(index, k)) {
283 bam_binlist_t *p = &kh_value(index, k);
284 if (bam_is_be) { // big endian
286 x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
287 x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
// Swap chunk pairs in place, write them (16 == sizeof(pair64_t)), swap back.
288 for (x = 0; (int)x < p->n; ++x) {
289 bam_swap_endian_8p(&p->list[x].u);
290 bam_swap_endian_8p(&p->list[x].v);
292 fwrite(p->list, 16, p->n, fp);
293 for (x = 0; (int)x < p->n; ++x) {
294 bam_swap_endian_8p(&p->list[x].u);
295 bam_swap_endian_8p(&p->list[x].v);
// Little-endian host: write key, chunk count, and chunk pairs directly.
298 fwrite(&kh_key(index, k), 4, 1, fp);
299 fwrite(&p->n, 4, 1, fp);
300 fwrite(p->list, 16, p->n, fp);
304 // write linear index (index2)
307 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
308 } else fwrite(&index2->n, 4, 1, fp);
309 if (bam_is_be) { // big endian
// Same swap/write/swap-back dance for the linear-index offsets.
311 for (x = 0; (int)x < index2->n; ++x)
312 bam_swap_endian_8p(&index2->offset[x]);
313 fwrite(index2->offset, 8, index2->n, fp);
314 for (x = 0; (int)x < index2->n; ++x)
315 bam_swap_endian_8p(&index2->offset[x]);
316 } else fwrite(index2->offset, 8, index2->n, fp);
318 { // write the number of reads coor-less records.
319 uint64_t x = idx->n_no_coor;
320 if (bam_is_be) bam_swap_endian_8p(&x);
321 fwrite(&x, 8, 1, fp);
// NOTE(review): fwrite return values are unchecked throughout -- confirm
// whether a short write is caught elsewhere (e.g. fflush/ferror by caller).
// Deserialize a BAI index from `fp`; mirrors bam_index_save(), byte-swapping
// on big-endian hosts. NULL fp or a bad magic is an error (the NULL-return
// lines are elided from this view).
326 static bam_index_t *bam_index_load_core(FILE *fp)
332 fprintf(stderr, "[bam_index_load_core] fail to load index.\n");
335 fread(magic, 1, 4, fp);
336 if (strncmp(magic, "BAI\1", 4)) {
337 fprintf(stderr, "[bam_index_load] wrong magic number.\n");
341 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
342 fread(&idx->n, 4, 1, fp);
343 if (bam_is_be) bam_swap_endian_4p(&idx->n);
344 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
345 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
346 for (i = 0; i < idx->n; ++i) {
348 bam_lidx_t *index2 = idx->index2 + i;
353 index = idx->index[i] = kh_init(i);
354 // load binning index
355 fread(&size, 4, 1, fp);
356 if (bam_is_be) bam_swap_endian_4p(&size);
357 for (j = 0; j < (int)size; ++j) {
358 fread(&key, 4, 1, fp);
359 if (bam_is_be) bam_swap_endian_4p(&key);
360 k = kh_put(i, index, key, &ret);
361 p = &kh_value(index, k);
362 fread(&p->n, 4, 1, fp);
363 if (bam_is_be) bam_swap_endian_4p(&p->n);
// 16 == sizeof(pair64_t).
365 p->list = (pair64_t*)malloc(p->m * 16);
366 fread(p->list, 16, p->n, fp);
369 for (x = 0; x < p->n; ++x) {
370 bam_swap_endian_8p(&p->list[x].u);
371 bam_swap_endian_8p(&p->list[x].v);
// Load the linear index.
376 fread(&index2->n, 4, 1, fp);
377 if (bam_is_be) bam_swap_endian_4p(&index2->n);
378 index2->m = index2->n;
379 index2->offset = (uint64_t*)calloc(index2->m, 8);
// NOTE(review): size/count arguments are transposed relative to the save
// path (same total byte count, but short reads are harder to detect).
380 fread(index2->offset, index2->n, 8, fp);
382 for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
// Older index files lack the trailing n_no_coor field; default it to 0.
384 if (fread(&idx->n_no_coor, 8, 1, fp) == 0) idx->n_no_coor = 0;
385 if (bam_is_be) bam_swap_endian_8p(&idx->n_no_coor);
// NOTE(review): fread results (other than the last) are unchecked; a
// truncated index file could leave partially-initialized structures.
// Open and load the local ".bai" companion of `_fn`. For ftp:// or http://
// paths only the basename is used (the downloaded index is expected in the
// working directory). Falls back from "file.bam.bai" to "file.bai".
389 bam_index_t *bam_index_load_local(const char *_fn)
394 if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) {
// Strip the URL down to its final path component.
397 for (p = _fn + l - 1; p >= _fn; --p)
398 if (*p == '/') break;
400 } else fn = strdup(_fn);
// "+5" leaves room for ".bai" plus the terminating NUL.
401 fnidx = (char*)calloc(strlen(fn) + 5, 1);
402 strcpy(fnidx, fn); strcat(fnidx, ".bai");
403 fp = fopen(fnidx, "rb");
404 if (fp == 0) { // try "{base}.bai"
405 char *s = strstr(fn, "bam");
406 if (s == fn + strlen(fn) - 3) {
// fn ends in "bam": turn it into "{base}.bai" by rewriting the final 'm' to
// 'i' (fnidx is presumably reset to fn on an elided line -- confirm).
408 fnidx[strlen(fn)-1] = 'i';
409 fp = fopen(fnidx, "rb");
412 free(fnidx); free(fn);
414 bam_index_t *idx = bam_index_load_core(fp);
// Download `url` (ftp:// or http:// only) via knet into a file of the same
// basename in the current working directory, copying through a 1 MB buffer.
421 static void download_from_remote(const char *url)
423 const int buf_size = 1 * 1024 * 1024;
// Silently ignore anything that is not a supported remote URL.
429 if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
// Locate the basename of the URL to use as the local file name.
431 for (fn = (char*)url + l - 1; fn >= url; --fn)
432 if (*fn == '/') break;
433 ++fn; // fn now points to the file name
434 fp_remote = knet_open(url, "r");
435 if (fp_remote == 0) {
436 fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
439 if ((fp = fopen(fn, "wb")) == 0) {
440 fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
441 knet_close(fp_remote);
444 buf = (uint8_t*)calloc(buf_size, 1);
445 while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
446 fwrite(buf, 1, l, fp);
449 knet_close(fp_remote);
// Second definition below: presumably the no-op stub for builds without
// knetfile support (the surrounding #ifdef/#else is elided) -- confirm.
452 static void download_from_remote(const char *url)
// Load the index for `fn`: try locally first; for remote URLs, download
// "{fn}.bai" into the working directory and retry locally.
458 bam_index_t *bam_index_load(const char *fn)
461 idx = bam_index_load_local(fn);
462 if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) {
463 char *fnidx = calloc(strlen(fn) + 5, 1);
464 strcat(strcpy(fnidx, fn), ".bai");
465 fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n");
466 download_from_remote(fnidx);
// Retry now that the index should exist in the working directory.
468 idx = bam_index_load_local(fn);
470 if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n");
// Build the index for BAM file `fn` and save it to `_fnidx`, or to
// "{fn}.bai" when _fnidx is NULL. Returns nonzero on failure (the error
// return statements are on elided lines).
474 int bam_index_build2(const char *fn, const char *_fnidx)
480 if ((fp = bam_open(fn, "r")) == 0) {
481 fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n");
484 idx = bam_index_core(fp);
487 fprintf(stderr, "[bam_index_build2] fail to index the BAM file.\n");
// Default index name: "{fn}.bai" ("+5" covers ".bai" plus NUL).
491 fnidx = (char*)calloc(strlen(fn) + 5, 1);
492 strcpy(fnidx, fn); strcat(fnidx, ".bai");
493 } else fnidx = strdup(_fnidx);
494 fpidx = fopen(fnidx, "wb");
496 fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
// On failure to open the output, release the freshly built index.
498 bam_index_destroy(idx);
501 bam_index_save(idx, fpidx);
502 bam_index_destroy(idx);
// Convenience wrapper: index `fn`, writing the default "{fn}.bai".
508 int bam_index_build(const char *fn)
510 return bam_index_build2(fn, 0);
// Command-line entry for "samtools index": argv[1] = input BAM, optional
// argv[2] = output index path.
513 int bam_index(int argc, char *argv[])
516 fprintf(stderr, "Usage: samtools index <in.bam> [out.index]\n");
519 if (argc >= 3) bam_index_build2(argv[1], argv[2]);
520 else bam_index_build(argv[1]);
// Command-line entry for "samtools idxstats": for each reference print
// name, length, #mapped and #unmapped reads (read from the BAM_MAX_BIN meta
// pseudo-bin), then a "*" line with the no-coordinate read count.
524 int bam_idxstats(int argc, char *argv[])
527 bam_header_t *header;
531 fprintf(stderr, "Usage: samtools idxstats <in.bam>\n");
534 fp = bam_open(argv[1], "r");
535 if (fp == 0) { fprintf(stderr, "[%s] fail to open BAM.\n", __func__); return 1; }
536 header = bam_header_read(fp);
538 idx = bam_index_load(argv[1]);
539 if (idx == 0) { fprintf(stderr, "[%s] fail to load the index.\n", __func__); return 1; }
540 for (i = 0; i < idx->n; ++i) {
542 khash_t(i) *h = idx->index[i];
543 printf("%s\t%d", header->target_name[i], header->target_len[i]);
544 k = kh_get(i, h, BAM_MAX_BIN);
// Meta bin: list[1] carries (n_mapped, n_unmapped) as written by
// bam_index_core(); list[0] holds the chunk span.
// NOTE(review): "%llu" with a (long long) cast mixes signedness -- tidy.
546 printf("\t%llu\t%llu", (long long)kh_val(h, k).list[1].u, (long long)kh_val(h, k).list[1].v);
547 else printf("\t0\t0");
550 printf("*\t0\t0\t%llu\n", (long long)idx->n_no_coor);
551 bam_header_destroy(header);
552 bam_index_destroy(idx);
// Write into `list` every bin that may contain alignments overlapping the
// 0-based half-open region [beg,end); returns the number of bins written
// (the prologue that makes `end` inclusive, appends bin 0, and the final
// return are presumably on elided lines -- confirm). One loop per level of
// the six-level binning hierarchy described in the file header.
556 static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[BAM_MAX_BIN])
559 if (beg >= end) return 0;
// Clamp to the 512Mbp maximum a bin scheme of this depth can address.
560 if (end >= 1u<<29) end = 1u<<29;
563 for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
564 for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
565 for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
566 for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
567 for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
571 static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
573 uint32_t rbeg = b->core.pos;
574 uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
575 return (rend > beg && rbeg < end);
// State for region iteration: target/region bounds, progress through the
// chunk list (remaining members, e.g. the chunk array and the current
// virtual offset, are elided from this view).
578 struct __bam_iter_t {
579 int from_first; // read from the first record; no random access
580 int tid, beg, end, n_off, i, finished;
// Create an iterator over alignments overlapping tid:[beg,end). Candidate
// bins come from reg2bins(); their chunks are filtered by a linear-index
// lower bound (min_off), sorted by start offset, and coalesced (contained,
// overlapping, and same-BGZF-block chunks merged) to minimize seeks.
// Returns NULL for an empty/invalid region.
585 // bam_fetch helper function retrieves
586 bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end)
589 int i, n_bins, n_off;
596 if (beg < 0) beg = 0;
597 if (end < beg) return 0;
599 iter = calloc(1, sizeof(struct __bam_iter_t));
600 iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
// 2 == sizeof(uint16_t): scratch array large enough for every possible bin.
602 bins = (uint16_t*)calloc(BAM_MAX_BIN, 2);
603 n_bins = reg2bins(beg, end, bins);
604 index = idx->index[tid];
// Lower bound from the linear index: the smallest virtual offset that could
// hold a read overlapping beg; chunks ending at or before it are skipped.
605 if (idx->index2[tid].n > 0) {
606 min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1]
607 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
608 if (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4
609 int n = beg>>BAM_LIDX_SHIFT;
610 if (n > idx->index2[tid].n) n = idx->index2[tid].n;
// Scan backwards for the nearest preceding non-zero offset.
611 for (i = n - 1; i >= 0; --i)
612 if (idx->index2[tid].offset[i] != 0) break;
613 if (i >= 0) min_off = idx->index2[tid].offset[i];
615 } else min_off = 0; // tabix 0.1.2 may produce such index files
// First pass: count chunks in the candidate bins to size the array.
616 for (i = n_off = 0; i < n_bins; ++i) {
617 if ((k = kh_get(i, index, bins[i])) != kh_end(index))
618 n_off += kh_value(index, k).n;
621 free(bins); return iter;
// 16 == sizeof(pair64_t).
623 off = (pair64_t*)calloc(n_off, 16);
// Second pass: collect only the chunks ending after min_off.
624 for (i = n_off = 0; i < n_bins; ++i) {
625 if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
627 bam_binlist_t *p = &kh_value(index, k);
628 for (j = 0; j < p->n; ++j)
629 if (p->list[j].v > min_off) off[n_off++] = p->list[j];
634 free(off); return iter;
637 bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
639 ks_introsort(off, n_off, off);
640 // resolve completely contained adjacent blocks
641 for (i = 1, l = 0; i < n_off; ++i)
642 if (off[l].v < off[i].v)
645 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
646 for (i = 1; i < n_off; ++i)
647 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
648 { // merge adjacent blocks
649 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
650 for (i = 1, l = 0; i < n_off; ++i) {
651 #ifdef BAM_TRUE_OFFSET
652 if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
654 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
656 else off[++l] = off[i];
// Hand the final chunk list to the iterator (owned by it until destroy).
663 iter->n_off = n_off; iter->off = off;
// pysam compatibility shim: run a region query and hand the caller the raw
// chunk array (ownership transfers to the caller; elided lines presumably
// free the iterator shell itself). NOTE(review): iter is dereferenced
// without a NULL check, yet bam_iter_query() can return 0 for an invalid
// region -- confirm callers guarantee a valid region.
667 pair64_t *get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int *cnt_off)
668 { // for pysam compatibility
671 iter = bam_iter_query(idx, tid, beg, end);
672 off = iter->off; *cnt_off = iter->n_off;
// Free an iterator and its chunk list; a NULL iterator is a no-op.
677 void bam_iter_destroy(bam_iter_t iter)
679 if (iter) { free(iter->off); free(iter); }
// Read the next alignment for the iterator into `b`. With no iterator (or
// from_first set) this is plain sequential bam_read1(). Otherwise walk the
// chunk list: seek only at non-adjacent chunk boundaries, stop once a read
// starts beyond the region, and return only records overlapping it.
// Returns bam_read1's >=0 length on success; negative at end or on error
// (-5 marks an off-target record that cleanly ended the scan).
682 int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b)
685 if (iter && iter->finished) return -1;
686 if (iter == 0 || iter->from_first) {
687 ret = bam_read1(fp, b);
688 if (ret < 0 && iter) iter->finished = 1;
// No chunks at all: the region has nothing to yield.
691 if (iter->off == 0) return -1;
693 if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
694 if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
695 if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug
696 if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
697 bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
698 iter->curr_off = bam_tell(fp);
702 if ((ret = bam_read1(fp, b)) >= 0) {
703 iter->curr_off = bam_tell(fp);
// Sorted input guarantees nothing past this record can overlap the region.
704 if (b->core.tid != iter->tid || b->core.pos >= iter->end) { // no need to proceed
705 ret = bam_validate1(NULL, b)? -1 : -5; // determine whether end of region or error
708 else if (is_overlap(iter->beg, iter->end, b)) return ret;
709 } else break; // end of file or error
// Convenience wrapper: call `func(b, data)` for every alignment overlapping
// tid:[beg,end). Returns 0 on a normal end of region (iterator returned -1),
// otherwise propagates the iterator's negative error code.
715 int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
721 iter = bam_iter_query(idx, tid, beg, end);
722 while ((ret = bam_iter_read(fp, iter, b)) >= 0) func(b, data);
723 bam_iter_destroy(iter);
725 return (ret == -1)? 0 : ret;