6 #include "bam_endian.h"
14 Alignment indexing. Before indexing, BAM must be sorted based on the
15 leftmost coordinate of alignments. In indexing, BAM uses two indices:
16 a UCSC binning index and a simple linear index. The binning index is
17 efficient for alignments spanning long distance, while the auxiliary
18 linear index helps to reduce unnecessary seek calls especially for
21 The UCSC binning scheme was suggested by Richard Durbin and Lincoln
22 Stein and is explained by Kent et al. (2002). In this scheme, each bin
23 represents a contiguous genomic region which can be fully contained in
24 another bin; each alignment is associated with a bin which represents
25 the smallest region containing the entire alignment. The binning
26 scheme is essentially another representation of R-tree. A distinct bin
27 uniquely corresponds to a distinct internal node in a R-tree. Bin A is
28 a child of Bin B if region A is contained in B.
30 In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
31 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
32 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
33 find the alignments overlapped with a region [rbeg,rend), we need to
34 calculate the list of bins that may be overlapped the region and test
35 the alignments in the bins to confirm the overlaps. If the specified
36 region is short, typically only a few alignments in six bins need to
37 be retrieved. The overlapping alignments can be quickly fetched.
41 #define BAM_MIN_CHUNK_GAP 32768
42 // 1<<14 is the size of minimum bin.
43 #define BAM_LIDX_SHIFT 14
45 #define BAM_MAX_BIN 37450 // =(8^6-1)/7+1
51 #define pair64_lt(a,b) ((a).u < (b).u)
52 KSORT_INIT(off, pair64_t, pair64_lt)
64 KHASH_MAP_INIT_INT(i, bam_binlist_t)
66 struct __bam_index_t {
68 uint64_t n_no_coor; // unmapped reads without coordinate
73 // requirement: len <= LEN_MASK
74 static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
79 k = kh_put(i, h, bin, &ret);
81 if (ret) { // not present
83 l->list = (pair64_t*)calloc(l->m, 16);
87 l->list = (pair64_t*)realloc(l->list, l->m * 16);
89 l->list[l->n].u = beg; l->list[l->n++].v = end;
92 static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
95 beg = b->core.pos >> BAM_LIDX_SHIFT;
96 end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
97 if (index2->m < end + 1) {
98 int old_m = index2->m;
100 kroundup32(index2->m);
101 index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
102 memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
105 if (index2->offset[beg] == 0) index2->offset[beg] = offset;
107 for (i = beg; i <= end; ++i)
108 if (index2->offset[i] == 0) index2->offset[i] = offset;
113 static void merge_chunks(bam_index_t *idx)
115 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
119 for (i = 0; i < idx->n; ++i) {
120 index = idx->index[i];
121 for (k = kh_begin(index); k != kh_end(index); ++k) {
123 if (!kh_exist(index, k) || kh_key(index, k) == BAM_MAX_BIN) continue;
124 p = &kh_value(index, k);
126 for (l = 1; l < p->n; ++l) {
127 #ifdef BAM_TRUE_OFFSET
128 if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
130 if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
132 else p->list[++m] = p->list[l];
137 #endif // defined(BAM_TRUE_OFFSET) || defined(BAM_BGZF)
140 static void fill_missing(bam_index_t *idx)
143 for (i = 0; i < idx->n; ++i) {
144 bam_lidx_t *idx2 = &idx->index2[i];
145 for (j = 1; j < idx2->n; ++j)
146 if (idx2->offset[j] == 0)
147 idx2->offset[j] = idx2->offset[j-1];
151 bam_index_t *bam_index_core(bamFile fp)
157 uint32_t last_bin, save_bin;
158 int32_t last_coor, last_tid, save_tid;
160 uint64_t save_off, last_off, n_mapped, n_unmapped, off_beg, off_end, n_no_coor;
162 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
163 b = (bam1_t*)calloc(1, sizeof(bam1_t));
164 h = bam_header_read(fp);
167 idx->n = h->n_targets;
168 bam_header_destroy(h);
169 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
170 for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
171 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
173 save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
174 save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
175 n_mapped = n_unmapped = n_no_coor = off_end = 0;
176 off_beg = off_end = bam_tell(fp);
177 while ((ret = bam_read1(fp, b)) >= 0) {
178 if (c->tid < 0) ++n_no_coor;
179 if (last_tid != c->tid) { // change of chromosomes
181 last_bin = 0xffffffffu;
182 } else if (last_coor > c->pos) {
183 fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
184 bam1_qname(b), last_coor, c->pos, c->tid+1);
187 if (c->tid >= 0) insert_offset2(&idx->index2[b->core.tid], b, last_off);
188 if (c->bin != last_bin) { // then possibly write the binning index
189 if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
190 insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
191 if (last_bin == 0xffffffffu && save_tid != 0xffffffffu) { // write the meta element
193 insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end);
194 insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
195 n_mapped = n_unmapped = 0;
199 save_bin = last_bin = c->bin;
201 if (save_tid < 0) break;
203 if (bam_tell(fp) <= last_off) {
204 fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
205 (unsigned long long)bam_tell(fp), (unsigned long long)last_off);
208 if (c->flag & BAM_FUNMAP) ++n_unmapped;
210 last_off = bam_tell(fp);
211 last_coor = b->core.pos;
214 insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
215 insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, bam_tell(fp));
216 insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
221 while ((ret = bam_read1(fp, b)) >= 0) {
223 if (c->tid >= 0 && n_no_coor) {
224 fprintf(stderr, "[bam_index_core] the alignment is not sorted: reads without coordinates prior to reads with coordinates.\n");
229 if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
230 free(b->data); free(b);
231 idx->n_no_coor = n_no_coor;
235 void bam_index_destroy(bam_index_t *idx)
239 if (idx == 0) return;
240 for (i = 0; i < idx->n; ++i) {
241 khash_t(i) *index = idx->index[i];
242 bam_lidx_t *index2 = idx->index2 + i;
243 for (k = kh_begin(index); k != kh_end(index); ++k) {
244 if (kh_exist(index, k))
245 free(kh_value(index, k).list);
247 kh_destroy(i, index);
248 free(index2->offset);
250 free(idx->index); free(idx->index2);
254 void bam_index_save(const bam_index_t *idx, FILE *fp)
258 fwrite("BAI\1", 1, 4, fp);
261 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
262 } else fwrite(&idx->n, 4, 1, fp);
263 for (i = 0; i < idx->n; ++i) {
264 khash_t(i) *index = idx->index[i];
265 bam_lidx_t *index2 = idx->index2 + i;
266 // write binning index
267 size = kh_size(index);
268 if (bam_is_be) { // big endian
270 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
271 } else fwrite(&size, 4, 1, fp);
272 for (k = kh_begin(index); k != kh_end(index); ++k) {
273 if (kh_exist(index, k)) {
274 bam_binlist_t *p = &kh_value(index, k);
275 if (bam_is_be) { // big endian
277 x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
278 x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
279 for (x = 0; (int)x < p->n; ++x) {
280 bam_swap_endian_8p(&p->list[x].u);
281 bam_swap_endian_8p(&p->list[x].v);
283 fwrite(p->list, 16, p->n, fp);
284 for (x = 0; (int)x < p->n; ++x) {
285 bam_swap_endian_8p(&p->list[x].u);
286 bam_swap_endian_8p(&p->list[x].v);
289 fwrite(&kh_key(index, k), 4, 1, fp);
290 fwrite(&p->n, 4, 1, fp);
291 fwrite(p->list, 16, p->n, fp);
295 // write linear index (index2)
298 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
299 } else fwrite(&index2->n, 4, 1, fp);
300 if (bam_is_be) { // big endian
302 for (x = 0; (int)x < index2->n; ++x)
303 bam_swap_endian_8p(&index2->offset[x]);
304 fwrite(index2->offset, 8, index2->n, fp);
305 for (x = 0; (int)x < index2->n; ++x)
306 bam_swap_endian_8p(&index2->offset[x]);
307 } else fwrite(index2->offset, 8, index2->n, fp);
309 { // write the number of reads coor-less records.
310 uint64_t x = idx->n_no_coor;
311 if (bam_is_be) bam_swap_endian_8p(&x);
312 fwrite(&x, 8, 1, fp);
317 static bam_index_t *bam_index_load_core(FILE *fp)
323 fprintf(stderr, "[bam_index_load_core] fail to load index.\n");
326 fread(magic, 1, 4, fp);
327 if (strncmp(magic, "BAI\1", 4)) {
328 fprintf(stderr, "[bam_index_load] wrong magic number.\n");
332 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
333 fread(&idx->n, 4, 1, fp);
334 if (bam_is_be) bam_swap_endian_4p(&idx->n);
335 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
336 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
337 for (i = 0; i < idx->n; ++i) {
339 bam_lidx_t *index2 = idx->index2 + i;
344 index = idx->index[i] = kh_init(i);
345 // load binning index
346 fread(&size, 4, 1, fp);
347 if (bam_is_be) bam_swap_endian_4p(&size);
348 for (j = 0; j < (int)size; ++j) {
349 fread(&key, 4, 1, fp);
350 if (bam_is_be) bam_swap_endian_4p(&key);
351 k = kh_put(i, index, key, &ret);
352 p = &kh_value(index, k);
353 fread(&p->n, 4, 1, fp);
354 if (bam_is_be) bam_swap_endian_4p(&p->n);
356 p->list = (pair64_t*)malloc(p->m * 16);
357 fread(p->list, 16, p->n, fp);
360 for (x = 0; x < p->n; ++x) {
361 bam_swap_endian_8p(&p->list[x].u);
362 bam_swap_endian_8p(&p->list[x].v);
367 fread(&index2->n, 4, 1, fp);
368 if (bam_is_be) bam_swap_endian_4p(&index2->n);
369 index2->m = index2->n;
370 index2->offset = (uint64_t*)calloc(index2->m, 8);
371 fread(index2->offset, index2->n, 8, fp);
373 for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
375 if (fread(&idx->n_no_coor, 8, 1, fp) == 0) idx->n_no_coor = 0;
376 if (bam_is_be) bam_swap_endian_8p(&idx->n_no_coor);
380 bam_index_t *bam_index_load_local(const char *_fn)
385 if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) {
388 for (p = _fn + l - 1; p >= _fn; --p)
389 if (*p == '/') break;
391 } else fn = strdup(_fn);
392 fnidx = (char*)calloc(strlen(fn) + 5, 1);
393 strcpy(fnidx, fn); strcat(fnidx, ".bai");
394 fp = fopen(fnidx, "rb");
395 if (fp == 0) { // try "{base}.bai"
396 char *s = strstr(fn, "bam");
397 if (s == fn + strlen(fn) - 3) {
399 fnidx[strlen(fn)-1] = 'i';
400 fp = fopen(fnidx, "rb");
403 free(fnidx); free(fn);
405 bam_index_t *idx = bam_index_load_core(fp);
412 static void download_from_remote(const char *url)
414 const int buf_size = 1 * 1024 * 1024;
420 if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
422 for (fn = (char*)url + l - 1; fn >= url; --fn)
423 if (*fn == '/') break;
424 ++fn; // fn now points to the file name
425 fp_remote = knet_open(url, "r");
426 if (fp_remote == 0) {
427 fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
430 if ((fp = fopen(fn, "wb")) == 0) {
431 fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
432 knet_close(fp_remote);
435 buf = (uint8_t*)calloc(buf_size, 1);
436 while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
437 fwrite(buf, 1, l, fp);
440 knet_close(fp_remote);
443 static void download_from_remote(const char *url)
449 bam_index_t *bam_index_load(const char *fn)
452 idx = bam_index_load_local(fn);
453 if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) {
454 char *fnidx = calloc(strlen(fn) + 5, 1);
455 strcat(strcpy(fnidx, fn), ".bai");
456 fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n");
457 download_from_remote(fnidx);
458 idx = bam_index_load_local(fn);
460 if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n");
464 int bam_index_build2(const char *fn, const char *_fnidx)
470 if ((fp = bam_open(fn, "r")) == 0) {
471 fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n");
474 idx = bam_index_core(fp);
477 fnidx = (char*)calloc(strlen(fn) + 5, 1);
478 strcpy(fnidx, fn); strcat(fnidx, ".bai");
479 } else fnidx = strdup(_fnidx);
480 fpidx = fopen(fnidx, "wb");
482 fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
486 bam_index_save(idx, fpidx);
487 bam_index_destroy(idx);
493 int bam_index_build(const char *fn)
495 return bam_index_build2(fn, 0);
498 int bam_index(int argc, char *argv[])
501 fprintf(stderr, "Usage: samtools index <in.bam> [out.index]\n");
504 if (argc >= 3) bam_index_build2(argv[1], argv[2]);
505 else bam_index_build(argv[1]);
509 int bam_idxstats(int argc, char *argv[])
512 bam_header_t *header;
516 fprintf(stderr, "Usage: samtools idxstats <in.bam>\n");
519 fp = bam_open(argv[1], "r");
520 if (fp == 0) { fprintf(stderr, "[%s] fail to open BAM.\n", __func__); return 1; }
521 header = bam_header_read(fp);
523 idx = bam_index_load(argv[1]);
524 if (idx == 0) { fprintf(stderr, "[%s] fail to load the index.\n", __func__); return 1; }
525 for (i = 0; i < idx->n; ++i) {
527 khash_t(i) *h = idx->index[i];
528 printf("%s\t%d", header->target_name[i], header->target_len[i]);
529 k = kh_get(i, h, BAM_MAX_BIN);
531 printf("\t%llu\t%llu", (long long)kh_val(h, k).list[1].u, (long long)kh_val(h, k).list[1].v);
532 else printf("\t0\t0");
535 printf("*\t0\t0\t%llu\n", (long long)idx->n_no_coor);
536 bam_header_destroy(header);
537 bam_index_destroy(idx);
541 static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[BAM_MAX_BIN])
544 if (beg >= end) return 0;
545 if (end >= 1u<<29) end = 1u<<29;
548 for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
549 for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
550 for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
551 for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
552 for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
556 static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
558 uint32_t rbeg = b->core.pos;
559 uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
560 return (rend > beg && rbeg < end);
563 struct __bam_iter_t {
564 int from_first; // read from the first record; no random access
565 int tid, beg, end, n_off, i, finished;
570 // bam_fetch helper function retrieves
571 bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end)
574 int i, n_bins, n_off;
581 if (beg < 0) beg = 0;
582 if (end < beg) return 0;
584 iter = calloc(1, sizeof(struct __bam_iter_t));
585 iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
587 bins = (uint16_t*)calloc(BAM_MAX_BIN, 2);
588 n_bins = reg2bins(beg, end, bins);
589 index = idx->index[tid];
590 if (idx->index2[tid].n > 0) {
591 min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1]
592 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
593 if (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4
594 int n = beg>>BAM_LIDX_SHIFT;
595 if (n > idx->index2[tid].n) n = idx->index2[tid].n;
596 for (i = n - 1; i >= 0; --i)
597 if (idx->index2[tid].offset[i] != 0) break;
598 if (i >= 0) min_off = idx->index2[tid].offset[i];
600 } else min_off = 0; // tabix 0.1.2 may produce such index files
601 for (i = n_off = 0; i < n_bins; ++i) {
602 if ((k = kh_get(i, index, bins[i])) != kh_end(index))
603 n_off += kh_value(index, k).n;
606 free(bins); return iter;
608 off = (pair64_t*)calloc(n_off, 16);
609 for (i = n_off = 0; i < n_bins; ++i) {
610 if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
612 bam_binlist_t *p = &kh_value(index, k);
613 for (j = 0; j < p->n; ++j)
614 if (p->list[j].v > min_off) off[n_off++] = p->list[j];
619 free(off); return iter;
622 bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
624 ks_introsort(off, n_off, off);
625 // resolve completely contained adjacent blocks
626 for (i = 1, l = 0; i < n_off; ++i)
627 if (off[l].v < off[i].v)
630 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
631 for (i = 1; i < n_off; ++i)
632 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
633 { // merge adjacent blocks
634 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
635 for (i = 1, l = 0; i < n_off; ++i) {
636 #ifdef BAM_TRUE_OFFSET
637 if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
639 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
641 else off[++l] = off[i];
648 iter->n_off = n_off; iter->off = off;
652 pair64_t *get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int *cnt_off)
653 { // for pysam compatibility
656 iter = bam_iter_query(idx, tid, beg, end);
657 off = iter->off; *cnt_off = iter->n_off;
662 void bam_iter_destroy(bam_iter_t iter)
664 if (iter) { free(iter->off); free(iter); }
667 int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b)
670 if (iter && iter->finished) return -1;
671 if (iter == 0 || iter->from_first) {
672 ret = bam_read1(fp, b);
673 if (ret < 0 && iter) iter->finished = 1;
676 if (iter->off == 0) return -1;
678 if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
679 if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
680 if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug
681 if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
682 bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
683 iter->curr_off = bam_tell(fp);
687 if ((ret = bam_read1(fp, b)) >= 0) {
688 iter->curr_off = bam_tell(fp);
689 if (b->core.tid != iter->tid || b->core.pos >= iter->end) { // no need to proceed
690 ret = bam_validate1(NULL, b)? -1 : -5; // determine whether end of region or error
693 else if (is_overlap(iter->beg, iter->end, b)) return ret;
694 } else break; // end of file or error
700 int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
706 iter = bam_iter_query(idx, tid, beg, end);
707 while ((ret = bam_iter_read(fp, iter, b)) >= 0) func(b, data);
708 bam_iter_destroy(iter);
710 return (ret == -1)? 0 : ret;