6 #include "bam_endian.h"
14 Alignment indexing. Before indexing, BAM must be sorted based on the
15 leftmost coordinate of alignments. In indexing, BAM uses two indices:
16 a UCSC binning index and a simple linear index. The binning index is
17 efficient for alignments spanning long distances, while the auxiliary
18 linear index helps to reduce unnecessary seek calls especially for
21 The UCSC binning scheme was suggested by Richard Durbin and Lincoln
22 Stein and is explained by Kent et al. (2002). In this scheme, each bin
23 represents a contiguous genomic region which can be fully contained in
24 another bin; each alignment is associated with a bin which represents
25 the smallest region containing the entire alignment. The binning
26 scheme is essentially another representation of an R-tree. A distinct bin
27 uniquely corresponds to a distinct internal node in an R-tree. Bin A is
28 a child of Bin B if region A is contained in B.
30 In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
31 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
32 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
33 find the alignments overlapped with a region [rbeg,rend), we need to
34 calculate the list of bins that may overlap the region and test
35 the alignments in the bins to confirm the overlaps. If the specified
36 region is short, typically only a few alignments in six bins need to
37 be retrieved. The overlapping alignments can be quickly fetched.
// --- Index construction constants and generated containers ---
// In true-file-offset mode, adjacent chunks closer than this gap are merged.
41 #define BAM_MIN_CHUNK_GAP 32768
42 // 1<<14 is the size of minimum bin.
43 #define BAM_LIDX_SHIFT 14
// Order chunk pairs (u = begin offset, v = end offset) by begin offset.
49 #define pair64_lt(a,b) ((a).u < (b).u)
50 KSORT_INIT(off, pair64_t, pair64_lt)
// Hash map: bin number (int key) -> bam_binlist_t (that bin's chunk list).
62 KHASH_MAP_INIT_INT(i, bam_binlist_t)
// Index object: per-target binning hash plus per-target linear index.
// NOTE(review): struct members are not visible in this excerpt.
64 struct __bam_index_t {
// Append the chunk [beg,end) (virtual file offsets) to the list kept for
// `bin` in hash `h`, creating the bin's list on first insertion and growing
// it when full. 16 == sizeof(pair64_t).
// NOTE(review): declarations, braces and the list-growth condition are
// missing from this excerpt.
70 // requirement: len <= LEN_MASK
71 static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
76 k = kh_put(i, h, bin, &ret);
78 if (ret) { // not present
// first chunk for this bin: allocate the initial list
80 l->list = (pair64_t*)calloc(l->m, 16);
// list full: grow it (capacity update happens on a missing line)
84 l->list = (pair64_t*)realloc(l->list, l->m * 16);
86 l->list[l->n].u = beg; l->list[l->n++].v = end;
// Update the linear index: for each 16kb window spanned by read b, record
// the given file offset if no smaller offset has been recorded yet
// (first-come-wins keeps the minimum since input offsets are increasing).
// NOTE(review): declarations and some lines are missing from this excerpt;
// the loop below starts at beg+1, so window `beg` is presumably handled on
// a missing line — confirm against the full source.
89 static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
92 beg = b->core.pos >> BAM_LIDX_SHIFT;
// end window comes from the rightmost aligned base (inclusive, hence -1)
93 end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
94 if (index2->m < end + 1) {
95 int old_m = index2->m;
// round capacity up to the next power of two, then zero the new tail
97 kroundup32(index2->m);
98 index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
99 memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
101 for (i = beg + 1; i <= end; ++i)
102 if (index2->offset[i] == 0) index2->offset[i] = offset;
// Post-process the binning index: merge adjacent chunks that are nearly
// contiguous (true-offset mode) or whose boundary offsets fall in the same
// BGZF block (virtual-offset mode), compacting each bin's list in place.
// NOTE(review): declarations, loop scaffolding and the list-truncation
// lines are missing from this excerpt.
106 static void merge_chunks(bam_index_t *idx)
108 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
112 for (i = 0; i < idx->n; ++i) {
113 index = idx->index[i];
114 for (k = kh_begin(index); k != kh_end(index); ++k) {
116 if (!kh_exist(index, k)) continue;
117 p = &kh_value(index, k);
// m indexes the last merged chunk; l scans the remaining chunks
119 for (l = 1; l < p->n; ++l) {
120 #ifdef BAM_TRUE_OFFSET
// merge when the gap between consecutive chunks is small
121 if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
// upper 48 bits of a virtual offset identify the compressed BGZF block;
// equal blocks mean the chunks are effectively contiguous
123 if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
125 else p->list[++m] = p->list[l];
130 #endif // defined(BAM_TRUE_OFFSET) || defined(BAM_BGZF)
// Single pass over a coordinate-sorted BAM stream to build an index:
// each record updates the linear index, and whenever the bin changes the
// accumulated chunk [save_off, last_off) is flushed into the binning
// index. Returns the populated bam_index_t.
// NOTE(review): declarations and many braces/lines are missing from this
// excerpt; `c` presumably aliases &b->core — confirm against full source.
133 bam_index_t *bam_index_core(bamFile fp)
139 uint32_t last_bin, save_bin;
140 int32_t last_coor, last_tid, save_tid;
142 uint64_t save_off, last_off;
// allocate index + one reusable record buffer; read header for target count
144 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
145 b = (bam1_t*)calloc(1, sizeof(bam1_t));
146 h = bam_header_read(fp);
149 idx->n = h->n_targets;
150 bam_header_destroy(h);
151 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
152 for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
153 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
// 0xffffffffu is the "no previous record yet" sentinel
155 save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
156 save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
157 while ((ret = bam_read1(fp, b)) >= 0) {
158 if (last_tid != c->tid) { // change of chromosomes
160 last_bin = 0xffffffffu;
161 } else if (last_coor > c->pos) {
// input violates the sorted-by-coordinate precondition
162 fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
163 bam1_qname(b), last_coor, c->pos, c->tid+1);
// only mapped records in bins below the 16kb leaf level (bin < 4681)
// feed the linear index here — TODO(review) confirm rationale upstream
166 if (b->core.tid >= 0 && b->core.bin < 4681) insert_offset2(&idx->index2[b->core.tid], b, last_off);
167 if (c->bin != last_bin) { // then possibly write the binning index
168 if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
169 insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
171 save_bin = last_bin = c->bin;
173 if (save_tid < 0) break;
// file offsets must increase monotonically while reading
175 if (bam_tell(fp) <= last_off) {
176 fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
177 (unsigned long long)bam_tell(fp), (unsigned long long)last_off);
180 last_off = bam_tell(fp);
181 last_coor = b->core.pos;
// flush the final pending chunk once the input is exhausted
183 if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
185 if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
186 free(b->data); free(b);
190 void bam_index_destroy(bam_index_t *idx)
194 if (idx == 0) return;
195 for (i = 0; i < idx->n; ++i) {
196 khash_t(i) *index = idx->index[i];
197 bam_lidx_t *index2 = idx->index2 + i;
198 for (k = kh_begin(index); k != kh_end(index); ++k) {
199 if (kh_exist(index, k))
200 free(kh_value(index, k).list);
202 kh_destroy(i, index);
203 free(index2->offset);
205 free(idx->index); free(idx->index2);
// Serialize `idx` to `fp` in BAI format: magic "BAI\1", n_ref, then per
// target the binning index (n_bin; per bin: bin number, n_chunk, chunk
// begin/end pairs) followed by the linear index (n_intv, 16kb-window
// offsets). On big-endian hosts (bam_is_be) every integer is swapped to
// little-endian before writing; arrays swapped in place are swapped back
// so the in-memory index stays usable.
// NOTE(review): declarations and some braces are missing from this
// excerpt; fwrite return values are unchecked.
209 void bam_index_save(const bam_index_t *idx, FILE *fp)
213 fwrite("BAI\1", 1, 4, fp);
216 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
217 } else fwrite(&idx->n, 4, 1, fp);
218 for (i = 0; i < idx->n; ++i) {
219 khash_t(i) *index = idx->index[i];
220 bam_lidx_t *index2 = idx->index2 + i;
221 // write binning index
222 size = kh_size(index);
223 if (bam_is_be) { // big endian
225 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
226 } else fwrite(&size, 4, 1, fp);
227 for (k = kh_begin(index); k != kh_end(index); ++k) {
228 if (kh_exist(index, k)) {
229 bam_binlist_t *p = &kh_value(index, k);
230 if (bam_is_be) { // big endian
232 x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
233 x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
// swap the 64-bit chunk offsets in place, write, then swap back
234 for (x = 0; (int)x < p->n; ++x) {
235 bam_swap_endian_8p(&p->list[x].u);
236 bam_swap_endian_8p(&p->list[x].v);
238 fwrite(p->list, 16, p->n, fp);
239 for (x = 0; (int)x < p->n; ++x) {
240 bam_swap_endian_8p(&p->list[x].u);
241 bam_swap_endian_8p(&p->list[x].v);
// little-endian host: write key, chunk count and chunks directly
244 fwrite(&kh_key(index, k), 4, 1, fp);
245 fwrite(&p->n, 4, 1, fp);
246 fwrite(p->list, 16, p->n, fp);
250 // write linear index (index2)
253 fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
254 } else fwrite(&index2->n, 4, 1, fp);
255 if (bam_is_be) { // big endian
// same swap/write/swap-back dance for the linear-index offsets
257 for (x = 0; (int)x < index2->n; ++x)
258 bam_swap_endian_8p(&index2->offset[x]);
259 fwrite(index2->offset, 8, index2->n, fp);
260 for (x = 0; (int)x < index2->n; ++x)
261 bam_swap_endian_8p(&index2->offset[x]);
262 } else fwrite(index2->offset, 8, index2->n, fp);
// Read a BAI index from an open FILE and return a freshly allocated
// bam_index_t, or NULL on a bad magic number. Byte-swaps all integers on
// big-endian hosts. Caller owns the result (bam_index_destroy).
// NOTE(review): declarations and some braces/returns are missing from this
// excerpt; fread return values are unchecked, so a truncated index file is
// not detected.
267 static bam_index_t *bam_index_load_core(FILE *fp)
273 fprintf(stderr, "[bam_index_load_core] fail to load index.\n");
276 fread(magic, 1, 4, fp);
277 if (strncmp(magic, "BAI\1", 4)) {
278 fprintf(stderr, "[bam_index_load] wrong magic number.\n");
282 idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
283 fread(&idx->n, 4, 1, fp);
284 if (bam_is_be) bam_swap_endian_4p(&idx->n);
285 idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
286 idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
287 for (i = 0; i < idx->n; ++i) {
289 bam_lidx_t *index2 = idx->index2 + i;
294 index = idx->index[i] = kh_init(i);
295 // load binning index
296 fread(&size, 4, 1, fp);
297 if (bam_is_be) bam_swap_endian_4p(&size);
298 for (j = 0; j < (int)size; ++j) {
299 fread(&key, 4, 1, fp);
300 if (bam_is_be) bam_swap_endian_4p(&key);
301 k = kh_put(i, index, key, &ret);
302 p = &kh_value(index, k);
303 fread(&p->n, 4, 1, fp);
304 if (bam_is_be) bam_swap_endian_4p(&p->n);
306 p->list = (pair64_t*)malloc(p->m * 16);
307 fread(p->list, 16, p->n, fp);
// convert chunk offsets to host order on big-endian machines
310 for (x = 0; x < p->n; ++x) {
311 bam_swap_endian_8p(&p->list[x].u);
312 bam_swap_endian_8p(&p->list[x].v);
// load linear index
317 fread(&index2->n, 4, 1, fp);
318 if (bam_is_be) bam_swap_endian_4p(&index2->n);
319 index2->m = index2->n;
320 index2->offset = (uint64_t*)calloc(index2->m, 8);
// NOTE(review): size/count arguments are swapped relative to the other
// fread calls (same byte count, but inconsistent) — worth normalizing
// to fread(ptr, 8, n, fp)
321 fread(index2->offset, index2->n, 8, fp);
323 for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
// Open a local index for `_fn`: for ftp/http URLs, use only the base file
// name (a previously downloaded copy in the working directory); otherwise
// try "<fn>.bai" and then "<base>.bai" (a trailing "bam" extension
// rewritten to "bai").
// NOTE(review): declarations, the URL-copy line and the return paths are
// missing from this excerpt.
328 bam_index_t *bam_index_load_local(const char *_fn)
// for a URL, keep only the component after the last '/'
333 if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) {
336 for (p = _fn + l - 1; p >= _fn; --p)
337 if (*p == '/') break;
339 } else fn = strdup(_fn);
// +5 leaves room for ".bai" plus the NUL terminator
340 fnidx = (char*)calloc(strlen(fn) + 5, 1);
341 strcpy(fnidx, fn); strcat(fnidx, ".bai");
342 fp = fopen(fnidx, "r");
343 if (fp == 0) { // try "{base}.bai"
344 char *s = strstr(fn, "bam");
// only rewrite when "bam" is the trailing extension of fn
345 if (s == fn + strlen(fn) - 3) {
347 fnidx[strlen(fn)-1] = 'i';
348 fp = fopen(fnidx, "r");
351 free(fnidx); free(fn);
353 bam_index_t *idx = bam_index_load_core(fp);
// Fetch a remote ftp:// or http:// file via knetfile and save it under its
// base name in the current working directory; silently returns for
// non-URL input, and prints a message on open/create failure.
// NOTE(review): declarations, cleanup lines and the closing brace are
// missing from this excerpt; the second declaration at the bottom is
// presumably the no-network stub in the #else branch — confirm.
360 static void download_from_remote(const char *url)
362 const int buf_size = 1 * 1024 * 1024;
// only ftp:// and http:// URLs are handled
368 if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
// scan backwards to the last '/' to find the base file name
370 for (fn = (char*)url + l - 1; fn >= url; --fn)
371 if (*fn == '/') break;
372 ++fn; // fn now points to the file name
373 fp_remote = knet_open(url, "r");
374 if (fp_remote == 0) {
375 fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
378 if ((fp = fopen(fn, "w")) == 0) {
379 fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
380 knet_close(fp_remote);
// stream the remote file in 1MB chunks
383 buf = (uint8_t*)calloc(buf_size, 1);
384 while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
385 fwrite(buf, 1, l, fp);
388 knet_close(fp_remote);
391 static void download_from_remote(const char *url)
397 bam_index_t *bam_index_load(const char *fn)
400 idx = bam_index_load_local(fn);
401 if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) {
402 char *fnidx = calloc(strlen(fn) + 5, 1);
403 strcat(strcpy(fnidx, fn), ".bai");
404 fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n");
405 download_from_remote(fnidx);
406 idx = bam_index_load_local(fn);
408 if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n");
412 int bam_index_build2(const char *fn, const char *_fnidx)
418 if ((fp = bam_open(fn, "r")) == 0) {
419 fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n");
422 idx = bam_index_core(fp);
425 fnidx = (char*)calloc(strlen(fn) + 5, 1);
426 strcpy(fnidx, fn); strcat(fnidx, ".bai");
427 } else fnidx = strdup(_fnidx);
428 fpidx = fopen(fnidx, "w");
430 fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
434 bam_index_save(idx, fpidx);
435 bam_index_destroy(idx);
// Build "<fn>.bai" next to the BAM file: thin wrapper over
// bam_index_build2() with the default (NULL) index file name.
// Returns bam_index_build2()'s status (0 on success, -1 on failure).
int bam_index_build(const char *fn)
{
	return bam_index_build2(fn, 0);
}
// Command-line entry point for `samtools index`. argv[1] is the input BAM
// file; the optional argv[2] overrides the output index file name.
// Returns 1 on usage error, 0 otherwise.
// (Reconstructed from a gapped listing: restores the argc guard, braces
// and return statements.)
int bam_index(int argc, char *argv[])
{
	if (argc < 2) {
		fprintf(stderr, "Usage: samtools index <in.bam> [<out.index>]\n");
		return 1;
	}
	if (argc >= 3) bam_index_build2(argv[1], argv[2]);
	else bam_index_build(argv[1]);
	return 0;
}
#define MAX_BIN 37450 // =(8^6-1)/7+1

// Fill `list` with every bin number whose region may overlap the 0-based,
// half-open interval [beg,end) and return the number of bins written.
// One bin per level contains beg, so a short region touches at most six
// bins; a region can never produce more than MAX_BIN entries.
// Robustness: an empty interval yields 0 bins, and `end` is clamped to the
// 512Mb maximum addressable by the binning scheme (avoids out-of-range
// bin numbers for oversized query intervals).
static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN])
{
	int i = 0, k;
	if (beg >= end) return 0;        // empty interval: nothing can overlap
	if (end >= 1u<<29) end = 1u<<29; // clamp to the addressable range
	--end;                           // make the interval inclusive for shifts
	list[i++] = 0;                   // bin 0 (512Mb) always overlaps
	for (k =    1 + (beg>>26); k <=    1 + (end>>26); ++k) list[i++] = k; // 64Mb bins
	for (k =    9 + (beg>>23); k <=    9 + (end>>23); ++k) list[i++] = k; // 8Mb bins
	for (k =   73 + (beg>>20); k <=   73 + (end>>20); ++k) list[i++] = k; // 1Mb bins
	for (k =  585 + (beg>>17); k <=  585 + (end>>17); ++k) list[i++] = k; // 128kb bins
	for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k; // 16kb bins
	return i;
}
472 static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
474 uint32_t rbeg = b->core.pos;
475 uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1;
476 return (rend > beg && rbeg < end);
// Retrieve all alignments on reference `tid` overlapping [beg,end) and
// invoke func(b, data) on each. Candidate chunks come from the binning
// index; the linear index supplies a lower bound (min_off) to prune
// chunks that end before the region; the surviving chunks are sorted,
// merged, then scanned sequentially with as few seeks as possible.
// Returns 0 (including when the region has no candidate chunks).
// NOTE(review): declarations and several braces/lines are missing from
// this excerpt.
479 int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
482 int i, n_bins, n_off;
488 bins = (uint16_t*)calloc(MAX_BIN, 2);
489 n_bins = reg2bins(beg, end, bins);
490 index = idx->index[tid];
// smallest useful file offset: the linear-index entry for the 16kb
// window containing beg (0 when that window is past the index end)
491 min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? 0 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
// first pass: count candidate chunks across all overlapping bins
492 for (i = n_off = 0; i < n_bins; ++i) {
493 if ((k = kh_get(i, index, bins[i])) != kh_end(index))
494 n_off += kh_value(index, k).n;
497 free(bins); return 0;
499 off = (pair64_t*)calloc(n_off, 16);
// second pass: collect the chunks that end after min_off
500 for (i = n_off = 0; i < n_bins; ++i) {
501 if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
503 bam_binlist_t *p = &kh_value(index, k);
504 for (j = 0; j < p->n; ++j)
505 if (p->list[j].v > min_off) off[n_off++] = p->list[j];
513 b = (bam1_t*)calloc(1, sizeof(bam1_t));
// sort chunks by begin offset so the scan is monotone in the file
514 ks_introsort(off, n_off, off);
515 // resolve completely contained adjacent blocks
516 for (i = 1, l = 0; i < n_off; ++i)
517 if (off[l].v < off[i].v)
520 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
521 for (i = 1; i < n_off; ++i)
522 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
523 { // merge adjacent blocks
524 #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
525 for (i = 1, l = 0; i < n_off; ++i) {
526 #ifdef BAM_TRUE_OFFSET
527 if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
// virtual offsets in the same BGZF block (equal upper 48 bits) merge
529 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
531 else off[++l] = off[i];
536 // retrieve alignments
537 n_seeks = 0; i = -1; curr_off = 0;
539 if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk
540 if (i == n_off - 1) break; // no more chunks
541 if (i >= 0) assert(curr_off == off[i].v); // otherwise bug
542 if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek
543 bam_seek(fp, off[i+1].u, SEEK_SET);
544 curr_off = bam_tell(fp);
549 if ((ret = bam_read1(fp, b)) > 0) {
550 curr_off = bam_tell(fp);
// records are coordinate-sorted: once past `end` we can stop early
551 if (b->core.tid != tid || b->core.pos >= end) break; // no need to proceed
552 else if (is_overlap(beg, end, b)) func(b, data);
553 } else break; // end of file
555 // fprintf(stderr, "[bam_fetch] # seek calls: %d\n", n_seeks);