#include "bam_endian.h"
Alignment indexing. Before indexing, BAM must be sorted by the
leftmost coordinate of alignments. For indexing, BAM uses two indices:
a UCSC binning index and a simple linear index. The binning index is
efficient for alignments spanning long distances, while the auxiliary
linear index helps to reduce unnecessary seek calls, especially for
short alignments.
The UCSC binning scheme was suggested by Richard Durbin and Lincoln
Stein and is explained in Kent et al. (2002). In this scheme, each bin
represents a contiguous genomic region that can be fully contained in
another bin; each alignment is associated with the bin that represents
the smallest region containing the entire alignment. The binning
scheme is essentially another representation of an R-tree: a distinct
bin uniquely corresponds to a distinct internal node of an R-tree, and
bin A is a child of bin B if region A is contained in B.
In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
0 spans a 512Mbp region; bins 1-8 span 64Mbp each; bins 9-72, 8Mbp;
bins 73-584, 1Mbp; bins 585-4680, 128Kbp; and bins 4681-37448 span
16Kbp regions. To find the alignments overlapping a region [rbeg,rend),
we calculate the list of bins that may overlap the region and then test
the alignments in those bins to confirm the overlaps. If the specified
region is short, typically only a few alignments in six bins need to
be retrieved, so the overlapping alignments can be fetched quickly.
#define BAM_MIN_CHUNK_GAP 32768
// 1<<14 is the size of the smallest bin
#define BAM_LIDX_SHIFT 14

#define BAM_MAX_BIN 37450 // = (8^6-1)/7+1: one more than the 37449 real bins; used as the pseudo-bin key for per-target metadata
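
/* For reference, the bin of an alignment covering [beg,end) (0-based,
 * half-open) is the bin at the smallest level that wholly contains the
 * alignment. A minimal sketch of the standard calculation from the SAM
 * specification (the library's own copy is bam_reg2bin() in bam.h; this
 * illustrative version is not used below):
 */
static inline int reg2bin_sketch(uint32_t beg, uint32_t end)
{
    --end; // convert to an inclusive end coordinate
    if (beg>>14 == end>>14) return ((1<<15)-1)/7 + (beg>>14); // 16Kbp bins, 4681-37448
    if (beg>>17 == end>>17) return ((1<<12)-1)/7 + (beg>>17); // 128Kbp bins, 585-4680
    if (beg>>20 == end>>20) return ((1<<9)-1)/7  + (beg>>20); // 1Mbp bins, 73-584
    if (beg>>23 == end>>23) return ((1<<6)-1)/7  + (beg>>23); // 8Mbp bins, 9-72
    if (beg>>26 == end>>26) return ((1<<3)-1)/7  + (beg>>26); // 64Mbp bins, 1-8
    return 0; // the single 512Mbp bin
}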
#define pair64_lt(a,b) ((a).u < (b).u)
KSORT_INIT(off, pair64_t, pair64_lt)
KHASH_MAP_INIT_INT(i, bam_binlist_t)

struct __bam_index_t {
    int32_t n; // number of reference sequences
    uint64_t n_no_coor; // unmapped reads without coordinates
    khash_t(i) **index; // binning index: one bin -> chunk-list hash per reference
    bam_lidx_t *index2; // linear index, one per reference
};

// requirement: len <= LEN_MASK
static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
{
    int ret;
    khint_t k = kh_put(i, h, bin, &ret);
    bam_binlist_t *l = &kh_value(h, k);
    if (ret) l->m = 1, l->n = 0, l->list = (pair64_t*)calloc(l->m, 16); // bin not present yet
    if (l->n == l->m) l->m <<= 1, l->list = (pair64_t*)realloc(l->list, l->m * 16); // grow the chunk list
    l->list[l->n].u = beg; l->list[l->n++].v = end;
}
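
// The linear index (bam_lidx_t) records, for each 16kb window of the
// reference, the virtual file offset of the first alignment overlapping
// that window. For a query starting in window w, any chunk that ends
// before offset[w] cannot contain overlapping alignments and can be
// skipped. insert_offset2() below updates this array for one alignment
// b located at virtual file offset `offset`.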
static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
{
    int i, beg, end;
    beg = b->core.pos >> BAM_LIDX_SHIFT;
    end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
    if (index2->m < end + 1) { // grow the window array, zero-filling the new slots
        int old_m = index2->m;
        index2->m = end + 1;
        kroundup32(index2->m);
        index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
        memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
    }
    if (beg == end) { // the alignment falls entirely within one 16kb window
        if (index2->offset[beg] == 0) index2->offset[beg] = offset;
    } else {
        for (i = beg; i <= end; ++i)
            if (index2->offset[i] == 0) index2->offset[i] = offset;
    }
    index2->n = end + 1;
}
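
// Merge adjacent chunks within each bin when reading them separately would
// not save any work: with true (uncompressed) offsets, chunks closer than
// BAM_MIN_CHUNK_GAP bytes are joined; with BGZF virtual offsets, chunks
// whose boundary offsets fall in the same compressed block (identical
// upper 48 bits, hence the >>16 comparison) are joined.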
static void merge_chunks(bam_index_t *idx)
{
#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
    khash_t(i) *index;
    int i, l, m;
    khint_t k;
    for (i = 0; i < idx->n; ++i) {
        index = idx->index[i];
        for (k = kh_begin(index); k != kh_end(index); ++k) {
            bam_binlist_t *p;
            if (!kh_exist(index, k) || kh_key(index, k) == BAM_MAX_BIN) continue;
            p = &kh_value(index, k);
            m = 0;
            for (l = 1; l < p->n; ++l) {
#ifdef BAM_TRUE_OFFSET
                if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
#else
                if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
#endif
                else p->list[++m] = p->list[l];
            }
            p->n = m + 1;
        }
    }
#endif // defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
}
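
// A zero entry in the linear index means no alignment was seen for that
// 16kb window. Copying the previous window's offset forward gives every
// window a usable seek position, so a lookup never lands on an empty slot.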
static void fill_missing(bam_index_t *idx)
{
    int i, j;
    for (i = 0; i < idx->n; ++i) {
        bam_lidx_t *idx2 = &idx->index2[i];
        for (j = 1; j < idx2->n; ++j)
            if (idx2->offset[j] == 0)
                idx2->offset[j] = idx2->offset[j-1];
    }
}
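
// bam_index_core streams a coordinate-sorted BAM once. Each time the bin
// number changes, the chunk [save_off,last_off) accumulated for the previous
// bin is flushed into the binning index; insert_offset2() maintains the
// linear index as records go by; and the per-target mapped/unmapped counts
// are stored under the pseudo-bin BAM_MAX_BIN.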
bam_index_t *bam_index_core(bamFile fp)
{
    bam1_t *b;
    bam_header_t *h;
    int i, ret;
    bam_index_t *idx;
    uint32_t last_bin, save_bin;
    int32_t last_coor, last_tid, save_tid;
    bam1_core_t *c;
    uint64_t save_off, last_off, n_mapped, n_unmapped, off_beg, off_end, n_no_coor;

    idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
    b = (bam1_t*)calloc(1, sizeof(bam1_t));
    h = bam_header_read(fp);
    c = &b->core;
    idx->n = h->n_targets;
    bam_header_destroy(h);
    idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
    for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
    idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));

    save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
    save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
    n_mapped = n_unmapped = n_no_coor = off_end = 0;
    off_beg = off_end = bam_tell(fp);
    while ((ret = bam_read1(fp, b)) >= 0) {
        if (c->tid < 0) ++n_no_coor;
        if (last_tid != c->tid) { // change of chromosome
            last_tid = c->tid;
            last_bin = 0xffffffffu;
        } else if (last_coor > c->pos) {
            fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
                    bam1_qname(b), last_coor, c->pos, c->tid+1);
            exit(1);
        }
        if (c->tid >= 0) insert_offset2(&idx->index2[b->core.tid], b, last_off);
        if (c->bin != last_bin) { // then possibly write the binning index
            if (save_bin != 0xffffffffu) // save_bin==0xffffffffu happens only for the first record
                insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
            if (last_bin == 0xffffffffu && save_tid != 0xffffffffu) { // write the meta element
                off_end = last_off;
                insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end);
                insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
                n_mapped = n_unmapped = 0;
                off_beg = off_end;
            }
            save_off = last_off;
            save_bin = last_bin = c->bin;
            save_tid = c->tid;
            if (save_tid < 0) break;
        }
        if (bam_tell(fp) <= last_off) {
            fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx <= %llx\n",
                    (unsigned long long)bam_tell(fp), (unsigned long long)last_off);
            exit(1);
        }
        if (c->flag & BAM_FUNMAP) ++n_unmapped;
        else ++n_mapped;
        last_off = bam_tell(fp);
        last_coor = b->core.pos;
    }
    insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
    insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end);
    insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
    merge_chunks(idx); fill_missing(idx);
    // count the coordinate-less reads at the end of the file
    while ((ret = bam_read1(fp, b)) >= 0) ++n_no_coor;
    if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
    free(b->data); free(b);
    idx->n_no_coor = n_no_coor;
    return idx;
}
void bam_index_destroy(bam_index_t *idx)
{
    khint_t k;
    int i;
    if (idx == 0) return;
    for (i = 0; i < idx->n; ++i) {
        khash_t(i) *index = idx->index[i];
        bam_lidx_t *index2 = idx->index2 + i;
        for (k = kh_begin(index); k != kh_end(index); ++k) {
            if (kh_exist(index, k))
                free(kh_value(index, k).list);
        }
        kh_destroy(i, index);
        free(index2->offset);
    }
    free(idx->index); free(idx->index2);
    free(idx);
}
void bam_index_save(const bam_index_t *idx, FILE *fp)
{
    int32_t i, size;
    khint_t k;
    fwrite("BAI\1", 1, 4, fp);
    if (bam_is_be) { // big endian
        uint32_t x = idx->n;
        fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
    } else fwrite(&idx->n, 4, 1, fp);
    for (i = 0; i < idx->n; ++i) {
        khash_t(i) *index = idx->index[i];
        bam_lidx_t *index2 = idx->index2 + i;
        // write binning index
        size = kh_size(index);
        if (bam_is_be) { // big endian
            uint32_t x = size;
            fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
        } else fwrite(&size, 4, 1, fp);
        for (k = kh_begin(index); k != kh_end(index); ++k) {
            if (kh_exist(index, k)) {
                bam_binlist_t *p = &kh_value(index, k);
                if (bam_is_be) { // big endian: swap, write, then swap back
                    uint32_t x;
                    x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
                    x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
                    for (x = 0; (int)x < p->n; ++x) {
                        bam_swap_endian_8p(&p->list[x].u);
                        bam_swap_endian_8p(&p->list[x].v);
                    }
                    fwrite(p->list, 16, p->n, fp);
                    for (x = 0; (int)x < p->n; ++x) { // restore the in-memory byte order
                        bam_swap_endian_8p(&p->list[x].u);
                        bam_swap_endian_8p(&p->list[x].v);
                    }
                } else {
                    fwrite(&kh_key(index, k), 4, 1, fp);
                    fwrite(&p->n, 4, 1, fp);
                    fwrite(p->list, 16, p->n, fp);
                }
            }
        }
        // write linear index (index2)
        if (bam_is_be) { // big endian
            uint32_t x = index2->n;
            fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
        } else fwrite(&index2->n, 4, 1, fp);
        if (bam_is_be) { // big endian
            uint32_t x;
            for (x = 0; (int)x < index2->n; ++x)
                bam_swap_endian_8p(&index2->offset[x]);
            fwrite(index2->offset, 8, index2->n, fp);
            for (x = 0; (int)x < index2->n; ++x)
                bam_swap_endian_8p(&index2->offset[x]);
        } else fwrite(index2->offset, 8, index2->n, fp);
    }
    { // write the number of coordinate-less reads
        uint64_t x = idx->n_no_coor;
        if (bam_is_be) bam_swap_endian_8p(&x);
        fwrite(&x, 8, 1, fp);
    }
    fflush(fp);
}
static bam_index_t *bam_index_load_core(FILE *fp)
{
    char magic[4];
    int i;
    bam_index_t *idx;
    if (fp == 0) {
        fprintf(stderr, "[bam_index_load_core] fail to load index.\n");
        return 0;
    }
    fread(magic, 1, 4, fp);
    if (strncmp(magic, "BAI\1", 4)) {
        fprintf(stderr, "[bam_index_load] wrong magic number.\n");
        fclose(fp);
        return 0;
    }
    idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
    fread(&idx->n, 4, 1, fp);
    if (bam_is_be) bam_swap_endian_4p(&idx->n);
    idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
    idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
    for (i = 0; i < idx->n; ++i) {
        khash_t(i) *index;
        bam_lidx_t *index2 = idx->index2 + i;
        uint32_t key, size;
        khint_t k;
        int j, ret;
        bam_binlist_t *p;
        index = idx->index[i] = kh_init(i);
        // load binning index
        fread(&size, 4, 1, fp);
        if (bam_is_be) bam_swap_endian_4p(&size);
        for (j = 0; j < (int)size; ++j) {
            fread(&key, 4, 1, fp);
            if (bam_is_be) bam_swap_endian_4p(&key);
            k = kh_put(i, index, key, &ret);
            p = &kh_value(index, k);
            fread(&p->n, 4, 1, fp);
            if (bam_is_be) bam_swap_endian_4p(&p->n);
            p->m = p->n;
            p->list = (pair64_t*)malloc(p->m * 16);
            fread(p->list, 16, p->n, fp);
            if (bam_is_be) { // big endian: swap the chunk offsets after reading
                uint32_t x;
                for (x = 0; x < p->n; ++x) {
                    bam_swap_endian_8p(&p->list[x].u);
                    bam_swap_endian_8p(&p->list[x].v);
                }
            }
        }
        // load linear index
        fread(&index2->n, 4, 1, fp);
        if (bam_is_be) bam_swap_endian_4p(&index2->n);
        index2->m = index2->n;
        index2->offset = (uint64_t*)calloc(index2->m, 8);
        fread(index2->offset, 8, index2->n, fp);
        if (bam_is_be)
            for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
    }
    if (fread(&idx->n_no_coor, 8, 1, fp) == 0) idx->n_no_coor = 0;
    if (bam_is_be) bam_swap_endian_8p(&idx->n_no_coor);
    return idx;
}
bam_index_t *bam_index_load_local(const char *_fn)
{
    FILE *fp;
    char *fnidx, *fn;

    if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) {
        const char *p;
        int l = strlen(_fn);
        for (p = _fn + l - 1; p >= _fn; --p)
            if (*p == '/') break;
        fn = strdup(p + 1);
    } else fn = strdup(_fn);
    fnidx = (char*)calloc(strlen(fn) + 5, 1);
    strcpy(fnidx, fn); strcat(fnidx, ".bai");
    fp = fopen(fnidx, "rb");
    if (fp == 0) { // try "{base}.bai"
        char *s = strstr(fn, "bam");
        if (s == fn + strlen(fn) - 3) {
            strcpy(fnidx, fn);
            fnidx[strlen(fn)-1] = 'i';
            fp = fopen(fnidx, "rb");
        }
    }
    free(fnidx); free(fn);
    if (fp) {
        bam_index_t *idx = bam_index_load_core(fp);
        fclose(fp);
        return idx;
    } else return 0;
}
#ifdef _USE_KNETFILE
static void download_from_remote(const char *url)
{
    const int buf_size = 1 * 1024 * 1024;
    char *fn; FILE *fp;
    uint8_t *buf;
    knetFile *fp_remote;
    int l;
    if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
    l = strlen(url);
    for (fn = (char*)url + l - 1; fn >= url; --fn)
        if (*fn == '/') break;
    ++fn; // fn now points to the file name
    fp_remote = knet_open(url, "r");
    if (fp_remote == 0) {
        fprintf(stderr, "[download_from_remote] fail to open the remote file.\n");
        return;
    }
    if ((fp = fopen(fn, "wb")) == 0) {
        fprintf(stderr, "[download_from_remote] fail to create the file in the working directory.\n");
        knet_close(fp_remote);
        return;
    }
    buf = (uint8_t*)calloc(buf_size, 1);
    while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
        fwrite(buf, 1, l, fp);
    free(buf);
    fclose(fp);
    knet_close(fp_remote);
}
#else
static void download_from_remote(const char *url)
{
    return;
}
#endif
bam_index_t *bam_index_load(const char *fn)
{
    bam_index_t *idx;
    idx = bam_index_load_local(fn);
    if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) {
        char *fnidx = calloc(strlen(fn) + 5, 1);
        strcat(strcpy(fnidx, fn), ".bai");
        fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n");
        download_from_remote(fnidx);
        idx = bam_index_load_local(fn);
        free(fnidx);
    }
    if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n");
    return idx;
}
int bam_index_build2(const char *fn, const char *_fnidx)
{
    char *fnidx; FILE *fpidx; bamFile fp; bam_index_t *idx;
    if ((fp = bam_open(fn, "r")) == 0) {
        fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n");
        return -1;
    }
    idx = bam_index_core(fp);
    bam_close(fp);
    if (_fnidx == 0) {
        fnidx = (char*)calloc(strlen(fn) + 5, 1);
        strcpy(fnidx, fn); strcat(fnidx, ".bai");
    } else fnidx = strdup(_fnidx);
    fpidx = fopen(fnidx, "wb");
    if (fpidx == 0) {
        fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
        free(fnidx);
        return -1;
    }
    bam_index_save(idx, fpidx);
    bam_index_destroy(idx);
    fclose(fpidx); free(fnidx);
    return 0;
}
int bam_index_build(const char *fn)
{
    return bam_index_build2(fn, 0);
}
int bam_index(int argc, char *argv[])
{
    if (argc < 2) {
        fprintf(stderr, "Usage: samtools index <in.bam> [out.index]\n");
        return 1;
    }
    if (argc >= 3) bam_index_build2(argv[1], argv[2]);
    else bam_index_build(argv[1]);
    return 0;
}
int bam_idxstats(int argc, char *argv[])
{
    bam_index_t *idx;
    bam_header_t *header;
    bamFile fp;
    int i;
    if (argc < 2) {
        fprintf(stderr, "Usage: samtools idxstats <in.bam>\n");
        return 1;
    }
    fp = bam_open(argv[1], "r");
    if (fp == 0) { fprintf(stderr, "[%s] fail to open BAM.\n", __func__); return 1; }
    header = bam_header_read(fp);
    bam_close(fp);
    idx = bam_index_load(argv[1]);
    if (idx == 0) { fprintf(stderr, "[%s] fail to load the index.\n", __func__); return 1; }
    for (i = 0; i < idx->n; ++i) {
        khint_t k;
        khash_t(i) *h = idx->index[i];
        printf("%s\t%d", header->target_name[i], header->target_len[i]);
        k = kh_get(i, h, BAM_MAX_BIN); // the pseudo-bin holds (n_mapped,n_unmapped) in its second chunk
        if (k != kh_end(h))
            printf("\t%llu\t%llu", (unsigned long long)kh_val(h, k).list[1].u, (unsigned long long)kh_val(h, k).list[1].v);
        else printf("\t0\t0");
        putchar('\n');
    }
    printf("*\t0\t0\t%llu\n", (unsigned long long)idx->n_no_coor);
    bam_header_destroy(header);
    bam_index_destroy(idx);
    return 0;
}
static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[BAM_MAX_BIN])
{
    int i = 0, k;
    if (beg >= end) return 0;
    if (end >= 1u<<29) end = 1u<<29;
    --end;
    list[i++] = 0;
    for (k =    1 + (beg>>26); k <=    1 + (end>>26); ++k) list[i++] = k;
    for (k =    9 + (beg>>23); k <=    9 + (end>>23); ++k) list[i++] = k;
    for (k =   73 + (beg>>20); k <=   73 + (end>>20); ++k) list[i++] = k;
    for (k =  585 + (beg>>17); k <=  585 + (end>>17); ++k) list[i++] = k;
    for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
    return i;
}
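
// Worked example: reg2bins(0, 20000, list) returns 7 and fills list with
// {0, 1, 9, 73, 585, 4681, 4682}: bin 0, one bin at each intermediate level,
// and the two 16kb bins covering positions 0-16383 and 16384-19999.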
static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
{
    uint32_t rbeg = b->core.pos;
    uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
    return (rend > beg && rbeg < end);
}
struct __bam_iter_t {
    int from_first; // read from the first record; no random access
    int tid, beg, end, n_off, i, finished;
    uint64_t curr_off;
    pair64_t *off;
};
// bam_fetch helper function: retrieves the chunks of alignments that may overlap [beg,end) on tid
bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end)
{
    uint16_t *bins;
    int i, n_bins, n_off;
    pair64_t *off;
    khint_t k;
    khash_t(i) *index;
    uint64_t min_off;
    bam_iter_t iter = 0;

    if (beg < 0) beg = 0;
    if (end < beg) return 0;
    // initialize the iterator
    iter = calloc(1, sizeof(struct __bam_iter_t));
    iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
    // collect the candidate bins and the linear-index lower bound
    bins = (uint16_t*)calloc(BAM_MAX_BIN, 2);
    n_bins = reg2bins(beg, end, bins);
    index = idx->index[tid];
    if (idx->index2[tid].n > 0) {
        min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1]
            : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
        if (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4
            int n = beg>>BAM_LIDX_SHIFT;
            if (n > idx->index2[tid].n) n = idx->index2[tid].n;
            for (i = n - 1; i >= 0; --i)
                if (idx->index2[tid].offset[i] != 0) break;
            if (i >= 0) min_off = idx->index2[tid].offset[i];
        }
    } else min_off = 0; // tabix 0.1.2 may produce such index files
    for (i = n_off = 0; i < n_bins; ++i) {
        if ((k = kh_get(i, index, bins[i])) != kh_end(index))
            n_off += kh_value(index, k).n;
    }
    if (n_off == 0) {
        free(bins); return iter;
    }
    off = (pair64_t*)calloc(n_off, 16);
    for (i = n_off = 0; i < n_bins; ++i) {
        if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
            int j;
            bam_binlist_t *p = &kh_value(index, k);
            for (j = 0; j < p->n; ++j)
                if (p->list[j].v > min_off) off[n_off++] = p->list[j];
        }
    }
    free(bins);
    {
        bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
        int l;
        ks_introsort(off, n_off, off);
        // resolve completely contained adjacent blocks
        for (i = 1, l = 0; i < n_off; ++i)
            if (off[l].v < off[i].v) off[++l] = off[i];
        n_off = l + 1;
        // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
        for (i = 1; i < n_off; ++i)
            if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
        { // merge adjacent blocks
#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
            for (i = 1, l = 0; i < n_off; ++i) {
#ifdef BAM_TRUE_OFFSET
                if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
#else
                if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
#endif
                else off[++l] = off[i];
            }
            n_off = l + 1;
#endif
        }
        free(b->data); free(b);
    }
    iter->n_off = n_off; iter->off = off;
    return iter;
}
pair64_t *get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int *cnt_off)
{ // for pysam compatibility
    pair64_t *off;
    bam_iter_t iter;
    iter = bam_iter_query(idx, tid, beg, end);
    off = iter->off; *cnt_off = iter->n_off;
    free(iter);
    return off;
}

void bam_iter_destroy(bam_iter_t iter)
{
    if (iter) { free(iter->off); free(iter); }
}
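
/* Typical use of the iterator API defined below (a minimal sketch; the file
 * name and region are illustrative):
 *
 *     bamFile fp = bam_open("aln.bam", "r");
 *     bam_header_t *h = bam_header_read(fp); // read past the header first
 *     bam_index_t *idx = bam_index_load("aln.bam");
 *     bam1_t *b = bam_init1();
 *     bam_iter_t iter = bam_iter_query(idx, 0, 100000, 200000); // tid 0, [100000,200000)
 *     while (bam_iter_read(fp, iter, b) >= 0) {
 *         // process the overlapping alignment b
 *     }
 *     bam_iter_destroy(iter); bam_destroy1(b);
 *     bam_index_destroy(idx); bam_header_destroy(h); bam_close(fp);
 */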
int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b)
{
    int ret;
    if (iter && iter->finished) return -1;
    if (iter == 0 || iter->from_first) {
        ret = bam_read1(fp, b);
        if (ret < 0 && iter) iter->finished = 1;
        return ret;
    }
    if (iter->off == 0) return -1;
    for (;;) {
        if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
            if (iter->i == iter->n_off - 1) break; // no more chunks
            if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug
            if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
                bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
                iter->curr_off = bam_tell(fp);
            }
            ++iter->i;
        }
        if ((ret = bam_read1(fp, b)) > 0) {
            iter->curr_off = bam_tell(fp);
            if (b->core.tid != iter->tid || b->core.pos >= iter->end) break; // no need to proceed
            else if (is_overlap(iter->beg, iter->end, b)) return ret;
        } else break; // end of file
    }
    iter->finished = 1;
    return -1;
}
int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
{
    bam_iter_t iter;
    bam1_t *b = bam_init1();
    iter = bam_iter_query(idx, tid, beg, end);
    while (bam_iter_read(fp, iter, b) >= 0) func(b, data);
    bam_iter_destroy(iter);
    bam_destroy1(b);
    return 0;
}