6 #include "bam_endian.h"
14 Alignment indexing. Before indexing, BAM must be sorted based on the
15 leftmost coordinate of alignments. In indexing, BAM uses two indices:
16 a UCSC binning index and a simple linear index. The binning index is
17 efficient for alignments spanning long distance, while the auxiliary
18 linear index helps to reduce unnecessary seek calls especially for
21 The UCSC binning scheme was suggested by Richard Durbin and Lincoln
22 Stein and is explained by Kent et al. (2002). In this scheme, each bin
23 represents a contiguous genomic region which can be fully contained in
24 another bin; each alignment is associated with a bin which represents
25 the smallest region containing the entire alignment. The binning
26 scheme is essentially another representation of R-tree. A distinct bin
27 uniquely corresponds to a distinct internal node in a R-tree. Bin A is
28 a child of Bin B if region A is contained in B.
30 In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
31 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
32 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
33 find the alignments overlapped with a region [rbeg,rend), we need to
34 calculate the list of bins that may be overlapped the region and test
35 the alignments in the bins to confirm the overlaps. If the specified
36 region is short, typically only a few alignments in six bins need to
37 be retrieved. The overlapping alignments can be quickly fetched.
41 #define BAM_MIN_CHUNK_GAP 32768
42 // 1<<14 is the size of minimum bin.
43 #define BAM_LIDX_SHIFT 14
45 #define BAM_MAX_BIN 37450 // =(8^6-1)/7+1
51 #define pair64_lt(a,b) ((a).u < (b).u)
52 KSORT_INIT(off, pair64_t, pair64_lt)
64 KHASH_MAP_INIT_INT(i, bam_binlist_t)
66 struct __bam_index_t {
68 uint64_t n_no_coor; // unmapped reads without coordinate
73 // requirement: len <= LEN_MASK
74 static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
79 k = kh_put(i, h, bin, &ret);
81 if (ret) { // not present
83 l->list = (pair64_t*)calloc(l->m, 16);
87 l->list = (pair64_t*)realloc(l->list, l->m * 16);
89 l->list[l->n].u = beg; l->list[l->n++].v = end;
92 static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
95 beg = b->core.pos >> BAM_LIDX_SHIFT;
96 end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
97 if (index2->m < end + 1) {
98 int old_m = index2->m;
100 kroundup32(index2->m);
101 index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
102 memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
105 if (index2->offset[beg] == 0) index2->offset[beg] = offset;
107 for (i = beg; i <= end; ++i)
108 if (index2->offset[i] == 0) index2->offset[i] = offset;
113 static void merge_chunks(bam_index_t *idx)
115 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
119 for (i = 0; i < idx->n; ++i) {
120 index = idx->index[i];
121 for (k = kh_begin(index); k != kh_end(index); ++k) {
123 if (!kh_exist(index, k) || kh_key(index, k) == BAM_MAX_BIN) continue;
124 p = &kh_value(index, k);
126 for (l = 1; l < p->n; ++l) {
127 #ifdef BAM_TRUE_OFFSET
128 if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
130 if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
132 else p->list[++m] = p->list[l];
137 #endif // defined(BAM_TRUE_OFFSET) || defined(BAM_BGZF)
140 static void fill_missing(bam_index_t *idx)
143 for (i = 0; i < idx->n; ++i) {
144 bam_lidx_t *idx2 = &idx->index2[i];
145 for (j = 1; j < idx2->n; ++j)
146 if (idx2->offset[j] == 0)
147 idx2->offset[j] = idx2->offset[j-1];
151 bam_index_t *bam_index_core(bamFile fp)
157 uint32_t last_bin, save_bin;
158 int32_t last_coor, last_tid, save_tid;
160 uint64_t save_off, last_off, n_mapped, n_unmapped, off_beg, off_end, n_no_coor;
162 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
163 b = (bam1_t*)calloc(1, sizeof(bam1_t));
164 h = bam_header_read(fp);
167 idx->n = h->n_targets;
168 bam_header_destroy(h);
169 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
170 for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
171 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
173 save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
174 save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
175 n_mapped = n_unmapped = n_no_coor = off_end = 0;
176 off_beg = off_end = bam_tell(fp);
177 while ((ret = bam_read1(fp, b)) >= 0) {
178 if (c->tid < 0) ++n_no_coor;
179 if (last_tid < c->tid || (last_tid >= 0 && c->tid < 0)) { // change of chromosomes
181 last_bin = 0xffffffffu;
182 } else if ((uint32_t)last_tid > (uint32_t)c->tid) {
183 fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %d-th chr > %d-th chr\n",
184 bam1_qname(b), last_tid+1, c->tid+1);
186 } else if ((int32_t)c->tid >= 0 && last_coor > c->pos) {
187 fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
188 bam1_qname(b), last_coor, c->pos, c->tid+1);
191 if (c->tid >= 0 && !(c->flag & BAM_FUNMAP)) insert_offset2(&idx->index2[b->core.tid], b, last_off);
192 if (c->bin != last_bin) { // then possibly write the binning index
193 if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
194 insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
195 if (last_bin == 0xffffffffu && save_tid != 0xffffffffu) { // write the meta element
197 insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end);
198 insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
199 n_mapped = n_unmapped = 0;
203 save_bin = last_bin = c->bin;
205 if (save_tid < 0) break;
207 if (bam_tell(fp) <= last_off) {
208 fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
209 (unsigned long long)bam_tell(fp), (unsigned long long)last_off);
212 if (c->flag & BAM_FUNMAP) ++n_unmapped;
214 last_off = bam_tell(fp);
215 last_coor = b->core.pos;
218 insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
219 insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, bam_tell(fp));
220 insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
225 while ((ret = bam_read1(fp, b)) >= 0) {
227 if (c->tid >= 0 && n_no_coor) {
228 fprintf(stderr, "[bam_index_core] the alignment is not sorted: reads without coordinates prior to reads with coordinates.\n");
233 if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
234 free(b->data); free(b);
235 idx->n_no_coor = n_no_coor;
239 void bam_index_destroy(bam_index_t *idx)
243 if (idx == 0) return;
244 for (i = 0; i < idx->n; ++i) {
245 khash_t(i) *index = idx->index[i];
246 bam_lidx_t *index2 = idx->index2 + i;
247 for (k = kh_begin(index); k != kh_end(index); ++k) {
248 if (kh_exist(index, k))
249 free(kh_value(index, k).list);
251 kh_destroy(i, index);
252 free(index2->offset);
254 free(idx->index); free(idx->index2);
258 void bam_index_save(const bam_index_t *idx, FILE *fp)
262 fwrite("BAI\1", 1, 4, fp);
265 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
266 } else fwrite(&idx->n, 4, 1, fp);
267 for (i = 0; i < idx->n; ++i) {
268 khash_t(i) *index = idx->index[i];
269 bam_lidx_t *index2 = idx->index2 + i;
270 // write binning index
271 size = kh_size(index);
272 if (bam_is_be) { // big endian
274 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
275 } else fwrite(&size, 4, 1, fp);
276 for (k = kh_begin(index); k != kh_end(index); ++k) {
277 if (kh_exist(index, k)) {
278 bam_binlist_t *p = &kh_value(index, k);
279 if (bam_is_be) { // big endian
281 x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
282 x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
283 for (x = 0; (int)x < p->n; ++x) {
284 bam_swap_endian_8p(&p->list[x].u);
285 bam_swap_endian_8p(&p->list[x].v);
287 fwrite(p->list, 16, p->n, fp);
288 for (x = 0; (int)x < p->n; ++x) {
289 bam_swap_endian_8p(&p->list[x].u);
290 bam_swap_endian_8p(&p->list[x].v);
293 fwrite(&kh_key(index, k), 4, 1, fp);
294 fwrite(&p->n, 4, 1, fp);
295 fwrite(p->list, 16, p->n, fp);
299 // write linear index (index2)
302 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
303 } else fwrite(&index2->n, 4, 1, fp);
304 if (bam_is_be) { // big endian
306 for (x = 0; (int)x < index2->n; ++x)
307 bam_swap_endian_8p(&index2->offset[x]);
308 fwrite(index2->offset, 8, index2->n, fp);
309 for (x = 0; (int)x < index2->n; ++x)
310 bam_swap_endian_8p(&index2->offset[x]);
311 } else fwrite(index2->offset, 8, index2->n, fp);
313 { // write the number of reads coor-less records.
314 uint64_t x = idx->n_no_coor;
315 if (bam_is_be) bam_swap_endian_8p(&x);
316 fwrite(&x, 8, 1, fp);
321 static bam_index_t *bam_index_load_core(FILE *fp)
327 fprintf(stderr, "[bam_index_load_core] fail to load index.\n");
330 fread(magic, 1, 4, fp);
331 if (strncmp(magic, "BAI\1", 4)) {
332 fprintf(stderr, "[bam_index_load] wrong magic number.\n");
336 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
337 fread(&idx->n, 4, 1, fp);
338 if (bam_is_be) bam_swap_endian_4p(&idx->n);
339 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
340 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
341 for (i = 0; i < idx->n; ++i) {
343 bam_lidx_t *index2 = idx->index2 + i;
348 index = idx->index[i] = kh_init(i);
349 // load binning index
350 fread(&size, 4, 1, fp);
351 if (bam_is_be) bam_swap_endian_4p(&size);
352 for (j = 0; j < (int)size; ++j) {
353 fread(&key, 4, 1, fp);
354 if (bam_is_be) bam_swap_endian_4p(&key);
355 k = kh_put(i, index, key, &ret);
356 p = &kh_value(index, k);
357 fread(&p->n, 4, 1, fp);
358 if (bam_is_be) bam_swap_endian_4p(&p->n);
360 p->list = (pair64_t*)malloc(p->m * 16);
361 fread(p->list, 16, p->n, fp);
364 for (x = 0; x < p->n; ++x) {
365 bam_swap_endian_8p(&p->list[x].u);
366 bam_swap_endian_8p(&p->list[x].v);
371 fread(&index2->n, 4, 1, fp);
372 if (bam_is_be) bam_swap_endian_4p(&index2->n);
373 index2->m = index2->n;
374 index2->offset = (uint64_t*)calloc(index2->m, 8);
375 fread(index2->offset, index2->n, 8, fp);
377 for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
379 if (fread(&idx->n_no_coor, 8, 1, fp) == 0) idx->n_no_coor = 0;
380 if (bam_is_be) bam_swap_endian_8p(&idx->n_no_coor);
384 bam_index_t *bam_index_load_local(const char *_fn)
389 if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) {
392 for (p = _fn + l - 1; p >= _fn; --p)
393 if (*p == '/') break;
395 } else fn = strdup(_fn);
396 fnidx = (char*)calloc(strlen(fn) + 5, 1);
397 strcpy(fnidx, fn); strcat(fnidx, ".bai");
398 fp = fopen(fnidx, "rb");
399 if (fp == 0) { // try "{base}.bai"
400 char *s = strstr(fn, "bam");
401 if (s == fn + strlen(fn) - 3) {
403 fnidx[strlen(fn)-1] = 'i';
404 fp = fopen(fnidx, "rb");
407 free(fnidx); free(fn);
409 bam_index_t *idx = bam_index_load_core(fp);
416 static void download_from_remote(const char *url)
418 const int buf_size = 1 * 1024 * 1024;
424 if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
426 for (fn = (char*)url + l - 1; fn >= url; --fn)
427 if (*fn == '/') break;
428 ++fn; // fn now points to the file name
429 fp_remote = knet_open(url, "r");
430 if (fp_remote == 0) {
431 fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
434 if ((fp = fopen(fn, "wb")) == 0) {
435 fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
436 knet_close(fp_remote);
439 buf = (uint8_t*)calloc(buf_size, 1);
440 while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
441 fwrite(buf, 1, l, fp);
444 knet_close(fp_remote);
447 static void download_from_remote(const char *url)
453 bam_index_t *bam_index_load(const char *fn)
456 idx = bam_index_load_local(fn);
457 if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) {
458 char *fnidx = calloc(strlen(fn) + 5, 1);
459 strcat(strcpy(fnidx, fn), ".bai");
460 fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n");
461 download_from_remote(fnidx);
462 idx = bam_index_load_local(fn);
464 if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n");
468 int bam_index_build2(const char *fn, const char *_fnidx)
474 if ((fp = bam_open(fn, "r")) == 0) {
475 fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n");
478 idx = bam_index_core(fp);
481 fprintf(stderr, "[bam_index_build2] fail to index the BAM file.\n");
485 fnidx = (char*)calloc(strlen(fn) + 5, 1);
486 strcpy(fnidx, fn); strcat(fnidx, ".bai");
487 } else fnidx = strdup(_fnidx);
488 fpidx = fopen(fnidx, "wb");
490 fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
494 bam_index_save(idx, fpidx);
495 bam_index_destroy(idx);
501 int bam_index_build(const char *fn)
503 return bam_index_build2(fn, 0);
506 int bam_index(int argc, char *argv[])
509 fprintf(stderr, "Usage: samtools index <in.bam> [out.index]\n");
512 if (argc >= 3) bam_index_build2(argv[1], argv[2]);
513 else bam_index_build(argv[1]);
517 int bam_idxstats(int argc, char *argv[])
520 bam_header_t *header;
524 fprintf(stderr, "Usage: samtools idxstats <in.bam>\n");
527 fp = bam_open(argv[1], "r");
528 if (fp == 0) { fprintf(stderr, "[%s] fail to open BAM.\n", __func__); return 1; }
529 header = bam_header_read(fp);
531 idx = bam_index_load(argv[1]);
532 if (idx == 0) { fprintf(stderr, "[%s] fail to load the index.\n", __func__); return 1; }
533 for (i = 0; i < idx->n; ++i) {
535 khash_t(i) *h = idx->index[i];
536 printf("%s\t%d", header->target_name[i], header->target_len[i]);
537 k = kh_get(i, h, BAM_MAX_BIN);
539 printf("\t%llu\t%llu", (long long)kh_val(h, k).list[1].u, (long long)kh_val(h, k).list[1].v);
540 else printf("\t0\t0");
543 printf("*\t0\t0\t%llu\n", (long long)idx->n_no_coor);
544 bam_header_destroy(header);
545 bam_index_destroy(idx);
549 static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[BAM_MAX_BIN])
552 if (beg >= end) return 0;
553 if (end >= 1u<<29) end = 1u<<29;
556 for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
557 for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
558 for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
559 for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
560 for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
564 static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
566 uint32_t rbeg = b->core.pos;
567 uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
568 return (rend > beg && rbeg < end);
571 struct __bam_iter_t {
572 int from_first; // read from the first record; no random access
573 int tid, beg, end, n_off, i, finished;
578 // bam_fetch helper function retrieves
579 bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end)
582 int i, n_bins, n_off;
589 if (beg < 0) beg = 0;
590 if (end < beg) return 0;
592 iter = calloc(1, sizeof(struct __bam_iter_t));
593 iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
595 bins = (uint16_t*)calloc(BAM_MAX_BIN, 2);
596 n_bins = reg2bins(beg, end, bins);
597 index = idx->index[tid];
598 if (idx->index2[tid].n > 0) {
599 min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1]
600 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
601 if (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4
602 int n = beg>>BAM_LIDX_SHIFT;
603 if (n > idx->index2[tid].n) n = idx->index2[tid].n;
604 for (i = n - 1; i >= 0; --i)
605 if (idx->index2[tid].offset[i] != 0) break;
606 if (i >= 0) min_off = idx->index2[tid].offset[i];
608 } else min_off = 0; // tabix 0.1.2 may produce such index files
609 for (i = n_off = 0; i < n_bins; ++i) {
610 if ((k = kh_get(i, index, bins[i])) != kh_end(index))
611 n_off += kh_value(index, k).n;
614 free(bins); return iter;
616 off = (pair64_t*)calloc(n_off, 16);
617 for (i = n_off = 0; i < n_bins; ++i) {
618 if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
620 bam_binlist_t *p = &kh_value(index, k);
621 for (j = 0; j < p->n; ++j)
622 if (p->list[j].v > min_off) off[n_off++] = p->list[j];
627 free(off); return iter;
630 bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
632 ks_introsort(off, n_off, off);
633 // resolve completely contained adjacent blocks
634 for (i = 1, l = 0; i < n_off; ++i)
635 if (off[l].v < off[i].v)
638 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
639 for (i = 1; i < n_off; ++i)
640 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
641 { // merge adjacent blocks
642 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
643 for (i = 1, l = 0; i < n_off; ++i) {
644 #ifdef BAM_TRUE_OFFSET
645 if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
647 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
649 else off[++l] = off[i];
656 iter->n_off = n_off; iter->off = off;
660 pair64_t *get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int *cnt_off)
661 { // for pysam compatibility
664 iter = bam_iter_query(idx, tid, beg, end);
665 off = iter->off; *cnt_off = iter->n_off;
670 void bam_iter_destroy(bam_iter_t iter)
672 if (iter) { free(iter->off); free(iter); }
675 int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b)
678 if (iter && iter->finished) return -1;
679 if (iter == 0 || iter->from_first) {
680 ret = bam_read1(fp, b);
681 if (ret < 0 && iter) iter->finished = 1;
684 if (iter->off == 0) return -1;
686 if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
687 if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
688 if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug
689 if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
690 bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
691 iter->curr_off = bam_tell(fp);
695 if ((ret = bam_read1(fp, b)) >= 0) {
696 iter->curr_off = bam_tell(fp);
697 if (b->core.tid != iter->tid || b->core.pos >= iter->end) { // no need to proceed
698 ret = bam_validate1(NULL, b)? -1 : -5; // determine whether end of region or error
701 else if (is_overlap(iter->beg, iter->end, b)) return ret;
702 } else break; // end of file or error
708 int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
714 iter = bam_iter_query(idx, tid, beg, end);
715 while ((ret = bam_iter_read(fp, iter, b)) >= 0) func(b, data);
716 bam_iter_destroy(iter);
718 return (ret == -1)? 0 : ret;