6 #include "bam_endian.h"
14 Alignment indexing. Before indexing, BAM must be sorted based on the
15 leftmost coordinate of alignments. In indexing, BAM uses two indices:
16 a UCSC binning index and a simple linear index. The binning index is
17 efficient for alignments spanning long distances, while the auxiliary
18 linear index helps to reduce unnecessary seek calls especially for
21 The UCSC binning scheme was suggested by Richard Durbin and Lincoln
22 Stein and is explained by Kent et al. (2002). In this scheme, each bin
23 represents a contiguous genomic region which can be fully contained in
24 another bin; each alignment is associated with a bin which represents
25 the smallest region containing the entire alignment. The binning
26 scheme is essentially another representation of an R-tree. A distinct bin
27 uniquely corresponds to a distinct internal node in an R-tree. Bin A is
28 a child of Bin B if region A is contained in B.
30 In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
31 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
32 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
33 find the alignments overlapped with a region [rbeg,rend), we need to
34 calculate the list of bins that may overlap the region and test
35 the alignments in the bins to confirm the overlaps. If the specified
36 region is short, typically only a few alignments in six bins need to
37 be retrieved. The overlapping alignments can be quickly fetched.
41 #define BAM_MIN_CHUNK_GAP 32768
42 // 1<<14 is the size of minimum bin.
43 #define BAM_LIDX_SHIFT 14
49 #define pair64_lt(a,b) ((a).u < (b).u)
50 KSORT_INIT(off, pair64_t, pair64_lt)
62 KHASH_MAP_INIT_INT(i, bam_binlist_t)
64 struct __bam_index_t {
70 // requirement: len <= LEN_MASK
// Append the chunk [beg,end) (a pair of virtual file offsets) to the chunk
// list of `bin` in the binning-index hash `h`. The bin's list is allocated
// on first insertion and grown on demand (16 bytes == one pair64_t).
71 static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
76 k = kh_put(i, h, bin, &ret;);
78 if (ret) { // not present
80 l->list = (pair64_t*)calloc(l->m, 16);
84 l->list = (pair64_t*)realloc(l->list, l->m * 16);
86 l->list[l->n].u = beg; l->list[l->n++].v = end;
// Fill the linear index for alignment `b`: for every 16 kbp window
// (BAM_LIDX_SHIFT) the alignment overlaps, record `offset` if the window
// has no offset yet, so each window keeps the smallest offset seen.
89 static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
92 beg = b->core.pos >> BAM_LIDX_SHIFT;
93 end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
94 if (index2->m < end + 1) {
95 int old_m = index2->m;
97 kroundup32(index2->m); // grow capacity to the next power of two
98 index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
99 memset(index2->offset + old_m, 0, 8 * (index2->m - old_m)); // realloc does not zero the tail
102 if (index2->offset[beg] == 0) index2->offset[beg] = offset;
104 for (i = beg; i <= end; ++i)
105 if (index2->offset[i] == 0) index2->offset[i] = offset;
// Merge chunks within each bin whose file regions are adjacent or nearly
// adjacent, shrinking the index and reducing seeks when fetching a region.
110 static void merge_chunks(bam_index_t *idx)
112 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
116 for (i = 0; i < idx->n; ++i) {
117 index = idx->index[i];
118 for (k = kh_begin(index); k != kh_end(index); ++k) {
120 if (!kh_exist(index, k)) continue;
121 p = &kh_value(index, k);
123 for (l = 1; l < p->n; ++l) {
124 #ifdef BAM_TRUE_OFFSET
125 if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v; // gap small enough: extend current chunk
127 if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v; // same compressed block (virtual offset >> 16): extend
129 else p->list[++m] = p->list[l]; // otherwise keep as a separate chunk
134 #endif // defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
// Backfill the linear index: any window still holding 0 inherits the
// previous window's offset, so every window yields a usable seek position.
137 static void fill_missing(bam_index_t *idx)
140 for (i = 0; i < idx->n; ++i) {
141 bam_lidx_t *idx2 = &idx->index2[i];
142 for (j = 1; j < idx2->n; ++j)
143 if (idx2->offset[j] == 0)
144 idx2->offset[j] = idx2->offset[j-1];
// Build the in-memory index (binning index + linear index per target) by
// scanning a coordinate-sorted BAM stream from its current position.
// Returns a freshly calloc'd bam_index_t; the caller owns it
// (free with bam_index_destroy).
148 bam_index_t *bam_index_core(bamFile fp)
154 uint32_t last_bin, save_bin;
155 int32_t last_coor, last_tid, save_tid;
157 uint64_t save_off, last_off;
159 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
160 b = (bam1_t*)calloc(1, sizeof(bam1_t));
161 h = bam_header_read(fp);
164 idx->n = h->n_targets;
165 bam_header_destroy(h); // only n_targets is needed from the header
166 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
167 for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
168 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
170 save_bin = save_tid = last_tid = last_bin = 0xffffffffu; // sentinels: no record seen yet
171 save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
172 while ((ret = bam_read1(fp, b)) >= 0) {
173 if (last_tid != c->tid) { // change of chromosomes
175 last_bin = 0xffffffffu;
176 } else if (last_coor > c->pos) { // sort-order violation within a chromosome
177 fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
178 bam1_qname(b), last_coor, c->pos, c->tid+1);
181 insert_offset2(&idx->index2[b->core.tid], b, last_off); // linear index uses the record's start offset
182 if (c->bin != last_bin) { // then possibly write the binning index
183 if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
184 insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
186 save_bin = last_bin = c->bin;
188 if (save_tid < 0) break; // unmapped records with tid<0 terminate the scan
190 if (bam_tell(fp) <= last_off) { // file offset must strictly advance per record
191 fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
192 (unsigned long long)bam_tell(fp), (unsigned long long)last_off); // NOTE(review): message prints '<' but the check above is '<='
195 last_off = bam_tell(fp);
196 last_coor = b->core.pos;
198 if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp)); // flush the final bin
201 if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
202 free(b->data); free(b);
// Free an index created by bam_index_core()/bam_index_load(): every bin's
// chunk list, every hash, every linear-index array, then the top-level
// arrays. NULL is a no-op.
206 void bam_index_destroy(bam_index_t *idx)
210 if (idx == 0) return;
211 for (i = 0; i < idx->n; ++i) {
212 khash_t(i) *index = idx->index[i];
213 bam_lidx_t *index2 = idx->index2 + i;
214 for (k = kh_begin(index); k != kh_end(index); ++k) {
215 if (kh_exist(index, k))
216 free(kh_value(index, k).list);
218 kh_destroy(i, index);
219 free(index2->offset);
221 free(idx->index); free(idx->index2);
// Serialize the index to `fp` in BAI format: "BAI\1" magic, n_ref, then
// for each target the binning index (bin id, chunk count, chunks) followed
// by the linear index. On big-endian hosts every multi-byte field is
// byte-swapped before writing (and arrays swapped back after, so the
// in-memory index stays usable).
225 void bam_index_save(const bam_index_t *idx, FILE *fp)
229 fwrite("BAI\1", 1, 4, fp);
232 fwrite(bam_swap_endian_4p(&x;), 4, 1, fp);
233 } else fwrite(&idx-;>n, 4, 1, fp);
234 for (i = 0; i < idx->n; ++i) {
235 khash_t(i) *index = idx->index[i];
236 bam_lidx_t *index2 = idx->index2 + i;
237 // write binning index
238 size = kh_size(index);
239 if (bam_is_be) { // big endian
241 fwrite(bam_swap_endian_4p(&x;), 4, 1, fp);
242 } else fwrite(&size;, 4, 1, fp);
243 for (k = kh_begin(index); k != kh_end(index); ++k) {
244 if (kh_exist(index, k)) {
245 bam_binlist_t *p = &kh;_value(index, k);
246 if (bam_is_be) { // big endian
248 x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x;), 4, 1, fp);
249 x = p->n; fwrite(bam_swap_endian_4p(&x;), 4, 1, fp);
250 for (x = 0; (int)x < p->n; ++x) { // swap chunk offsets in place for writing
251 bam_swap_endian_8p(&p-;>list[x].u);
252 bam_swap_endian_8p(&p-;>list[x].v);
254 fwrite(p->list, 16, p->n, fp);
255 for (x = 0; (int)x < p->n; ++x) { // swap back so the in-memory list is unchanged
256 bam_swap_endian_8p(&p-;>list[x].u);
257 bam_swap_endian_8p(&p-;>list[x].v);
260 fwrite(&kh;_key(index, k), 4, 1, fp);
261 fwrite(&p-;>n, 4, 1, fp);
262 fwrite(p->list, 16, p->n, fp);
266 // write linear index (index2)
269 fwrite(bam_swap_endian_4p(&x;), 4, 1, fp);
270 } else fwrite(&index2-;>n, 4, 1, fp);
271 if (bam_is_be) { // big endian
273 for (x = 0; (int)x < index2->n; ++x) // swap, write, then swap back
274 bam_swap_endian_8p(&index2-;>offset[x]);
275 fwrite(index2->offset, 8, index2->n, fp);
276 for (x = 0; (int)x < index2->n; ++x)
277 bam_swap_endian_8p(&index2-;>offset[x]);
278 } else fwrite(index2->offset, 8, index2->n, fp);
// Deserialize a BAI index from an already-open FILE*. Verifies the magic,
// then reads n_ref binning + linear indices, byte-swapping on big-endian
// hosts. Returns a freshly allocated bam_index_t, or reports failure.
// NOTE(review): fread return values are not checked here, so a truncated
// index file is silently accepted — verify against upstream behavior.
283 static bam_index_t *bam_index_load_core(FILE *fp)
289 fprintf(stderr, "[bam_index_load_core] fail to load index.\n");
292 fread(magic, 1, 4, fp);
293 if (strncmp(magic, "BAI\1", 4)) {
294 fprintf(stderr, "[bam_index_load] wrong magic number.\n");
298 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
299 fread(&idx-;>n, 4, 1, fp);
300 if (bam_is_be) bam_swap_endian_4p(&idx-;>n);
301 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
302 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
303 for (i = 0; i < idx->n; ++i) {
305 bam_lidx_t *index2 = idx->index2 + i;
310 index = idx->index[i] = kh_init(i);
311 // load binning index
312 fread(&size;, 4, 1, fp);
313 if (bam_is_be) bam_swap_endian_4p(&size;);
314 for (j = 0; j < (int)size; ++j) {
315 fread(&key;, 4, 1, fp);
316 if (bam_is_be) bam_swap_endian_4p(&key;);
317 k = kh_put(i, index, key, &ret;);
318 p = &kh;_value(index, k);
319 fread(&p-;>n, 4, 1, fp);
320 if (bam_is_be) bam_swap_endian_4p(&p-;>n);
322 p->list = (pair64_t*)malloc(p->m * 16);
323 fread(p->list, 16, p->n, fp);
326 for (x = 0; x < p->n; ++x) { // convert chunk offsets to host byte order
327 bam_swap_endian_8p(&p-;>list[x].u);
328 bam_swap_endian_8p(&p-;>list[x].v);
333 fread(&index2-;>n, 4, 1, fp);
334 if (bam_is_be) bam_swap_endian_4p(&index2-;>n);
335 index2->m = index2->n;
336 index2->offset = (uint64_t*)calloc(index2->m, 8);
337 fread(index2->offset, index2->n, 8, fp); // NOTE(review): size/nmemb swapped vs. the fwrite counterpart; same byte count, but the return value's unit differs
339 for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2-;>offset[j]);
// Open and parse a local index file for BAM `_fn`. For ftp:// or http://
// URLs, only the basename is used (the index is expected in the working
// directory). Tries "<fn>.bai" first, then the "<base>.bai" spelling when
// the file name ends in "bam".
344 bam_index_t *bam_index_load_local(const char *_fn)
349 if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) {
352 for (p = _fn + l - 1; p >= _fn; --p) // scan backwards for the last '/'
353 if (*p == '/') break;
355 } else fn = strdup(_fn);
356 fnidx = (char*)calloc(strlen(fn) + 5, 1); // room for ".bai" + NUL
357 strcpy(fnidx, fn); strcat(fnidx, ".bai");
358 fp = fopen(fnidx, "rb");
359 if (fp == 0) { // try "{base}.bai"
360 char *s = strstr(fn, "bam");
361 if (s == fn + strlen(fn) - 3) { // name ends in "bam": rewrite suffix to "bai"
363 fnidx[strlen(fn)-1] = 'i'; // NOTE(review): relies on an earlier copy of fn into fnidx — confirm against upstream
364 fp = fopen(fnidx, "rb");
367 free(fnidx); free(fn);
369 bam_index_t *idx = bam_index_load_core(fp);
// Download a remote (ftp:// or http://) file into the current working
// directory under its basename, copying through a 1 MB buffer via knet.
// Non-URL inputs are ignored; failures are reported to stderr.
376 static void download_from_remote(const char *url)
378 const int buf_size = 1 * 1024 * 1024;
384 if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
386 for (fn = (char*)url + l - 1; fn >= url; --fn) // find the last '/'
387 if (*fn == '/') break;
388 ++fn; // fn now points to the file name
389 fp_remote = knet_open(url, "r");
390 if (fp_remote == 0) {
391 fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
394 if ((fp = fopen(fn, "wb")) == 0) {
395 fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
396 knet_close(fp_remote);
399 buf = (uint8_t*)calloc(buf_size, 1);
400 while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
401 fwrite(buf, 1, l, fp);
404 knet_close(fp_remote);
407 static void download_from_remote(const char *url)
// Load the index for BAM `fn`: try locally first; for remote URLs, fall
// back to downloading "<fn>.bai" into the working directory and retrying.
// Returns NULL (with a message) when no index can be loaded.
413 bam_index_t *bam_index_load(const char *fn)
416 idx = bam_index_load_local(fn);
417 if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) {
418 char *fnidx = calloc(strlen(fn) + 5, 1);
419 strcat(strcpy(fnidx, fn), ".bai");
420 fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n");
421 download_from_remote(fnidx);
422 idx = bam_index_load_local(fn); // retry now that the index may exist locally
424 if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n");
// Build a BAI index for BAM file `fn` and write it to `_fnidx`, or to
// "<fn>.bai" when _fnidx is NULL. Returns an int status (non-zero paths
// report errors to stderr).
428 int bam_index_build2(const char *fn, const char *_fnidx)
434 if ((fp = bam_open(fn, "r")) == 0) {
435 fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n");
438 idx = bam_index_core(fp);
441 fnidx = (char*)calloc(strlen(fn) + 5, 1); // room for ".bai" + NUL
442 strcpy(fnidx, fn); strcat(fnidx, ".bai");
443 } else fnidx = strdup(_fnidx);
444 fpidx = fopen(fnidx, "wb");
446 fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
450 bam_index_save(idx, fpidx);
451 bam_index_destroy(idx);
int bam_index_build(const char *fn)
{
	// Convenience wrapper: build the index under the default name "<fn>.bai".
	return bam_index_build2(fn, 0);
}
// CLI entry point for `samtools index`: argv[1] is the input BAM and the
// optional argv[2] overrides the output index file name.
462 int bam_index(int argc, char *argv[])
465 fprintf(stderr, "Usage: samtools index <in.bam> [<out.index>]\n");
468 if (argc >= 3) bam_index_build2(argv[1], argv[2]);
469 else bam_index_build(argv[1]);
473 #define MAX_BIN 37450 // =(8^6-1)/7+1
// Fill `list` with every bin number that may contain alignments overlapping
// [beg,end), one loop per bin level (64 Mbp down to 16 kbp windows).
// `end` is clamped to the 2^29 maximum coordinate. Returns the bin count.
475 static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN])
478 if (beg >= end) return 0; // empty region: no bins
479 if (end >= 1u<<29) end = 1u<<29;
482 for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
483 for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
484 for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
485 for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
486 for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
490 static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
492 uint32_t rbeg = b->core.pos;
493 uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
494 return (rend > beg && rbeg < end);
// Iterator state for region fetches (see bam_iterf_query/bam_iterf_read).
497 struct __bam_iterf_t {
498 int from_first; // read from the first record; no random access
499 int tid, beg, end, n_off, i, finished; // query region, chunk count, current chunk index, end-of-iteration flag
504 // bam_fetch helper: build an iterator over the index chunks that may
// contain alignments overlapping tid:[beg,end). Candidate bins come from
// reg2bins(); the linear index supplies min_off, a lower bound that prunes
// chunks ending before the region could start. The surviving chunks are
// sorted, de-duplicated and merged. Caller frees with bam_iterf_destroy().
505 bam_iterf_t bam_iterf_query(const bam_index_t *idx, int tid, int beg, int end)
508 int i, n_bins, n_off;
513 bam_iterf_t iter = 0;
515 if (beg < 0) beg = 0;
516 if (end < beg) return 0; // degenerate region
518 iter = calloc(1, sizeof(struct __bam_iterf_t));
519 iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1; // i==-1: before the first chunk
521 bins = (uint16_t*)calloc(MAX_BIN, 2);
522 n_bins = reg2bins(beg, end, bins);
523 index = idx->index[tid];
524 if (idx->index2[tid].n > 0) {
525 min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1]
526 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
527 if (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4
528 int n = beg>>BAM_LIDX_SHIFT;
529 if (n > idx->index2[tid].n) n = idx->index2[tid].n;
530 for (i = n - 1; i >= 0; --i) // walk back to the nearest non-zero window
531 if (idx->index2[tid].offset[i] != 0) break;
532 if (i >= 0) min_off = idx->index2[tid].offset[i];
534 } else min_off = 0; // tabix 0.1.2 may produce such index files
535 for (i = n_off = 0; i < n_bins; ++i) { // first pass: count candidate chunks
536 if ((k = kh_get(i, index, bins[i])) != kh_end(index))
537 n_off += kh_value(index, k).n;
540 free(bins); return iter;
542 off = (pair64_t*)calloc(n_off, 16);
543 for (i = n_off = 0; i < n_bins; ++i) { // second pass: collect chunks past min_off
544 if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
546 bam_binlist_t *p = &kh;_value(index, k);
547 for (j = 0; j < p->n; ++j)
548 if (p->list[j].v > min_off) off[n_off++] = p->list[j];
553 bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
555 ks_introsort(off, n_off, off); // sort chunks by start offset
556 // resolve completely contained adjacent blocks
557 for (i = 1, l = 0; i < n_off; ++i)
558 if (off[l].v < off[i].v)
561 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
562 for (i = 1; i < n_off; ++i)
563 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
564 { // merge adjacent blocks
565 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
566 for (i = 1, l = 0; i < n_off; ++i) {
567 #ifdef BAM_TRUE_OFFSET
568 if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
570 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
572 else off[++l] = off[i];
579 iter->n_off = n_off; iter->off = off; // iterator takes ownership of `off`
// pysam compatibility shim: expose the sorted/merged chunk list computed
// by bam_iterf_query() for region tid:[beg,end); the chunk count is
// returned through *cnt_off and the caller owns the returned array.
583 pair64_t *get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int *cnt_off)
584 { // for pysam compatibility
587 iter = bam_iterf_query(idx, tid, beg, end);
588 off = iter->off; *cnt_off = iter->n_off;
593 void bam_iterf_destroy(bam_iterf_t iter)
595 if (iter) { free(iter->off); free(iter); }
// Read the next alignment overlapping the iterator's region into `b`,
// seeking between index chunks as needed. In from_first mode the file is
// read sequentially with no random access. Returns bam_read1()'s positive
// length for each overlapping record; negative when exhausted.
598 int bam_iterf_read(bamFile fp, bam_iterf_t iter, bam1_t *b)
600 if (iter->finished) return -1;
601 if (iter->from_first) {
602 int ret = bam_read1(fp, b);
603 if (ret < 0) iter->finished = 1;
606 if (iter->off == 0) return -1; // no chunks to visit
609 if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
610 if (iter->i == iter->n_off - 1) break; // no more chunks
611 if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug
612 if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
613 bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
614 iter->curr_off = bam_tell(fp);
618 if ((ret = bam_read1(fp, b)) > 0) {
619 iter->curr_off = bam_tell(fp);
620 if (b->core.tid != iter->tid || b->core.pos >= iter->end) break; // no need to proceed
621 else if (is_overlap(iter->beg, iter->end, b)) return ret;
622 } else break; // end of file
// Invoke callback `func(b, data)` for every alignment overlapping
// tid:[beg,end), using an iterator built from the index.
628 int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
633 iter = bam_iterf_query(idx, tid, beg, end);
634 while (bam_iterf_read(fp, iter, b) >= 0) func(b, data);