#include <assert.h>
#include "bam.h"
#include "khash.h"
#include "ksort.h"
#include "bam_endian.h"
/*
  Alignment indexing. Before indexing, a BAM file must be sorted by the
  leftmost coordinate of its alignments. BAM uses two indices: a UCSC
  binning index and a simple linear index. The binning index is
  efficient for alignments spanning long distances, while the auxiliary
  linear index helps to reduce unnecessary seek calls, especially for
  short alignments.

  The UCSC binning scheme was suggested by Richard Durbin and Lincoln
  Stein and is explained by Kent et al. (2002). In this scheme, each bin
  represents a contiguous genomic region which can be fully contained in
  another bin; each alignment is associated with the bin that represents
  the smallest region containing the entire alignment. The binning
  scheme is essentially another representation of an R-tree: a distinct
  bin uniquely corresponds to a distinct internal node of the tree, and
  bin A is a child of bin B if region A is contained in region B.

  In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
  0 spans a 512 Mbp region; bins 1-8 span 64 Mbp; bins 9-72 span 8 Mbp;
  bins 73-584 span 1 Mbp; bins 585-4680 span 128 kbp; and bins 4681-37449
  span 16 kbp regions. To find the alignments overlapping a region
  [rbeg,rend), we calculate the list of bins that may overlap the region
  and then test the alignments in those bins to confirm the overlaps. If
  the specified region is short, typically only a few alignments in at
  most six bins (one per level) need to be retrieved, so the overlapping
  alignments can be fetched quickly.
 */
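/* For reference, the bin assigned to an alignment covering [beg,end)
 * (0-based, half-open) is the smallest bin fully containing it. This is
 * essentially the bam_reg2bin() helper in bam.h (cf. the SAM spec),
 * reproduced here as an illustrative sketch: */
static inline int reg2bin_sketch(uint32_t beg, uint32_t end)
{
	--end; // make the interval inclusive: [beg,end]
	if (beg>>14 == end>>14) return 4681 + (beg>>14); // fits in one 16 kbp bin
	if (beg>>17 == end>>17) return  585 + (beg>>17); // 128 kbp
	if (beg>>20 == end>>20) return   73 + (beg>>20); // 1 Mbp
	if (beg>>23 == end>>23) return    9 + (beg>>23); // 8 Mbp
	if (beg>>26 == end>>26) return    1 + (beg>>26); // 64 Mbp
	return 0; // spans more than one 64 Mbp slot: bin 0
}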
#define BAM_MIN_CHUNK_GAP 32768
// 1<<14 is the size of the minimum bin; each linear-index slot covers 16 kbp
#define BAM_LIDX_SHIFT 14
typedef struct {
	uint64_t u, v; // a chunk: [u,v) in (virtual) file offsets
} pair64_t;

#define pair64_lt(a,b) ((a).u < (b).u)
KSORT_INIT(off, pair64_t, pair64_lt) // instantiates ks_introsort(off, ...)

typedef struct {
	uint32_t m, n; // m: allocated size of list; n: number of chunks
	pair64_t *list;
} bam_binlist_t;

typedef struct {
	int32_t n, m; // n: number of slots in use; m: allocated size
	uint64_t *offset;
} bam_lidx_t;

KHASH_MAP_INIT_INT(i, bam_binlist_t) // maps a bin number to its chunk list

struct __bam_index_t {
	int32_t n;           // number of reference sequences
	khash_t(i) **index;  // binning index, one hash table per reference
	bam_lidx_t *index2;  // linear index, one per reference
};
// append the chunk [beg,end) to the list of the given bin
static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
{
	khint_t k; bam_binlist_t *l; int ret;
	k = kh_put(i, h, bin, &ret);
	l = &kh_value(h, k);
	if (ret) { // not present; initialize the chunk list
		l->m = 1; l->n = 0;
		l->list = (pair64_t*)calloc(l->m, 16);
	}
	if (l->n == l->m) { // double the capacity
		l->m <<= 1;
		l->list = (pair64_t*)realloc(l->list, l->m * 16);
	}
	l->list[l->n].u = beg; l->list[l->n++].v = end;
}
// fill linear-index slots (last,curr] with the given file offset
static inline void insert_offset2(bam_lidx_t *index2, int last, int curr, uint64_t offset)
{
	int i;
	if (index2->m < curr + 1) { // grow the offset array to cover slot `curr`
		index2->m = curr + 1;
		kroundup32(index2->m);
		index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
	}
	if (last > curr) last = -1; // a new reference sequence; fill from the start
	for (i = last + 1; i <= curr; ++i) index2->offset[i] = offset;
	if (index2->n < curr + 1) index2->n = curr + 1; // keep the slot count in sync
}
// merge adjacent chunks in each bin to reduce the number of seek calls later
static void merge_chunks(bam_index_t *idx)
{
#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
	khash_t(i) *index;
	bam_binlist_t *p;
	int i, l, m;
	khint_t k;
	for (i = 0; i < idx->n; ++i) {
		index = idx->index[i];
		for (k = kh_begin(index); k != kh_end(index); ++k) {
			if (!kh_exist(index, k)) continue;
			p = &kh_value(index, k);
			m = 0;
			for (l = 1; l < p->n; ++l) {
#ifdef BAM_TRUE_OFFSET
				if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
#else
				if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
#endif
				else p->list[++m] = p->list[l];
			}
			p->n = m + 1;
		}
	}
#endif // defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
}
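/* Note on the >>16 test above: a BGZF "virtual" offset packs the start of a
 * compressed block into the upper 48 bits and the position within the
 * uncompressed block into the lower 16 bits, roughly:
 *
 *   uint64_t voffset = (coffset << 16) | uoffset;  // illustrative, not used here
 *
 * so u>>16 == v>>16 holds exactly when two offsets point into the same
 * compressed block, in which case merging the chunks costs no extra seek. */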
bam_index_t *bam_index_core(bamFile fp)
{
	bam1_t *b;
	bam_header_t *h;
	int i, ret;
	bam_index_t *idx;
	bam1_core_t *c;
	uint32_t last_coor, last_tid, last_bin, save_bin, save_tid;
	uint64_t save_off, last_off;

	idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
	b = (bam1_t*)calloc(1, sizeof(bam1_t));
	h = bam_header_read(fp);
	c = &b->core;

	idx->n = h->n_targets;
	bam_header_destroy(h);
	idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
	for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
	idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));

	save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
	save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
	while ((ret = bam_read1(fp, b)) >= 0) {
		if (c->tid < 0) break; // unmapped reads are placed at the end of a sorted file
		if (last_tid != c->tid) { // change of chromosomes
			last_tid = c->tid;
			last_bin = 0xffffffffu;
		} else if (last_coor > c->pos) {
			fprintf(stderr, "[bam_index_core] the alignment is not sorted. Abort!\n");
			exit(1);
		}
		if (last_coor>>BAM_LIDX_SHIFT != b->core.pos>>BAM_LIDX_SHIFT) // then write the linear index
			insert_offset2(&idx->index2[b->core.tid], last_coor>>BAM_LIDX_SHIFT, b->core.pos>>BAM_LIDX_SHIFT, last_off);
		if (c->bin != last_bin) { // then possibly write the binning index
			if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
				insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
			save_off = last_off;
			save_bin = last_bin = c->bin;
			save_tid = c->tid;
		}
		if (bam_tell(fp) <= last_off) {
			fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
					(unsigned long long)bam_tell(fp), (unsigned long long)last_off);
			exit(1);
		}
		last_off = bam_tell(fp);
		last_coor = b->core.pos;
	}
	if (save_tid != 0xffffffffu) // at least one mapped record was seen
		insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
	merge_chunks(idx);
	if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
	free(b->data); free(b);
	return idx;
}
void bam_index_destroy(bam_index_t *idx)
{
	khint_t k;
	int i;
	if (idx == 0) return;
	for (i = 0; i < idx->n; ++i) {
		khash_t(i) *index = idx->index[i];
		bam_lidx_t *index2 = idx->index2 + i;
		for (k = kh_begin(index); k != kh_end(index); ++k) {
			if (kh_exist(index, k))
				free(kh_value(index, k).list);
		}
		kh_destroy(i, index);
		free(index2->offset);
	}
	free(idx->index); free(idx->index2);
	free(idx);
}
void bam_index_save(const bam_index_t *idx, FILE *fp)
{
	int32_t i, size;
	khint_t k;
	fwrite("BAI\1", 1, 4, fp);
	if (bam_is_be) { // big endian
		uint32_t x = idx->n;
		fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
	} else fwrite(&idx->n, 4, 1, fp);
	for (i = 0; i < idx->n; ++i) {
		khash_t(i) *index = idx->index[i];
		bam_lidx_t *index2 = idx->index2 + i;
		// write binning index
		size = kh_size(index);
		if (bam_is_be) { // big endian
			uint32_t x = size;
			fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
		} else fwrite(&size, 4, 1, fp);
		for (k = kh_begin(index); k != kh_end(index); ++k) {
			if (kh_exist(index, k)) {
				bam_binlist_t *p = &kh_value(index, k);
				if (bam_is_be) { // big endian
					uint32_t x;
					x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
					x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
					for (x = 0; (int)x < p->n; ++x) { // swap in place before writing
						bam_swap_endian_8p(&p->list[x].u);
						bam_swap_endian_8p(&p->list[x].v);
					}
					fwrite(p->list, 16, p->n, fp);
					for (x = 0; (int)x < p->n; ++x) { // swap back to native order
						bam_swap_endian_8p(&p->list[x].u);
						bam_swap_endian_8p(&p->list[x].v);
					}
				} else {
					fwrite(&kh_key(index, k), 4, 1, fp);
					fwrite(&p->n, 4, 1, fp);
					fwrite(p->list, 16, p->n, fp);
				}
			}
		}
		// write linear index (index2)
		if (bam_is_be) { // big endian
			uint32_t x = index2->n;
			fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
		} else fwrite(&index2->n, 4, 1, fp);
		if (bam_is_be) { // big endian
			uint32_t x;
			for (x = 0; (int)x < index2->n; ++x)
				bam_swap_endian_8p(&index2->offset[x]);
			fwrite(index2->offset, 8, index2->n, fp);
			for (x = 0; (int)x < index2->n; ++x)
				bam_swap_endian_8p(&index2->offset[x]);
		} else fwrite(index2->offset, 8, index2->n, fp);
	}
	fflush(fp);
}
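/* For reference, the .bai layout produced above (all integers little-endian):
 *
 *   magic   char[4]   "BAI\1"
 *   n_ref   int32_t   number of reference sequences
 *   per reference:
 *     n_bin   int32_t
 *     per bin:
 *       bin       uint32_t   bin number
 *       n_chunk   int32_t
 *       per chunk: chunk_beg, chunk_end   uint64_t each
 *     n_intv   int32_t
 *     per 16 kbp interval: ioffset   uint64_t
 */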
bam_index_t *bam_index_load(const char *fn)
{
	bam_index_t *idx;
	FILE *fp;
	int i;
	char *fnidx, magic[4];

	fnidx = (char*)calloc(strlen(fn) + 5, 1);
	strcpy(fnidx, fn); strcat(fnidx, ".bai");
	if ((fp = fopen(fnidx, "r")) == 0) {
		fprintf(stderr, "[bam_index_load] the alignment is not indexed. Please run `index' command first. Abort!\n");
		exit(1);
	}
	free(fnidx);

	fread(magic, 1, 4, fp);
	if (strncmp(magic, "BAI\1", 4)) {
		fprintf(stderr, "[bam_index_load] wrong magic number.\n");
		fclose(fp);
		return 0;
	}
	idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
	fread(&idx->n, 4, 1, fp);
	if (bam_is_be) bam_swap_endian_4p(&idx->n);
	idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
	idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
	for (i = 0; i < idx->n; ++i) {
		khash_t(i) *index;
		bam_lidx_t *index2 = idx->index2 + i;
		uint32_t key, size;
		khint_t k;
		int j, ret;
		bam_binlist_t *p;
		index = idx->index[i] = kh_init(i);
		// load binning index
		fread(&size, 4, 1, fp);
		if (bam_is_be) bam_swap_endian_4p(&size);
		for (j = 0; j < (int)size; ++j) {
			fread(&key, 4, 1, fp);
			if (bam_is_be) bam_swap_endian_4p(&key);
			k = kh_put(i, index, key, &ret);
			p = &kh_value(index, k);
			fread(&p->n, 4, 1, fp);
			if (bam_is_be) bam_swap_endian_4p(&p->n);
			p->m = p->n;
			p->list = (pair64_t*)malloc(p->m * 16);
			fread(p->list, 16, p->n, fp);
			if (bam_is_be) { // swap the chunks to native byte order
				uint32_t x;
				for (x = 0; x < p->n; ++x) {
					bam_swap_endian_8p(&p->list[x].u);
					bam_swap_endian_8p(&p->list[x].v);
				}
			}
		}
		// load linear index
		fread(&index2->n, 4, 1, fp);
		if (bam_is_be) bam_swap_endian_4p(&index2->n);
		index2->m = index2->n;
		index2->offset = (uint64_t*)calloc(index2->m, 8);
		fread(index2->offset, 8, index2->n, fp);
		if (bam_is_be)
			for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
	}
	fclose(fp);
	return idx;
}
int bam_index_build(const char *fn)
{
	char *fnidx; FILE *fpidx; bamFile fp; bam_index_t *idx;

	assert(fp = bam_open(fn, "r"));
	idx = bam_index_core(fp);
	bam_close(fp);
	fnidx = (char*)calloc(strlen(fn) + 5, 1);
	strcpy(fnidx, fn); strcat(fnidx, ".bai");
	assert(fpidx = fopen(fnidx, "w"));
	bam_index_save(idx, fpidx);
	bam_index_destroy(idx);
	fclose(fpidx);
	free(fnidx);
	return 0;
}
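/* e.g. bam_index_build("aln.bam") indexes the file and writes "aln.bam.bai"
 * alongside it ("aln.bam" here is just a hypothetical file name). */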
int bam_index(int argc, char *argv[])
{
	if (argc < 2) {
		fprintf(stderr, "Usage: samtools index <in.bam>\n");
		return 1;
	}
	bam_index_build(argv[1]);
	return 0;
}
#define MAX_BIN 37450 // =(8^6-1)/7+1

// fill `list` with the bins that may overlap [beg,end); return their count
static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN])
{
	int i = 0, k;
	--end; // make the interval inclusive
	list[i++] = 0;
	for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
	for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
	for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
	for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
	for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
	return i;
}
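/* Worked example: reg2bins(100000, 200000, list) yields 13 bins:
 *   0, 1, 9, 73, 585-586, 4687-4693
 * i.e. one bin per level, except at the two finest levels, which the region
 * straddles (2 x 128 kbp bins and 7 x 16 kbp bins). */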
static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
{
	uint32_t rbeg = b->core.pos;
	uint32_t rend = bam_calend(&b->core, bam1_cigar(b));
	return (rend > beg && rbeg < end); // [rbeg,rend) intersects [beg,end)
}
int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
{
	uint16_t *bins;
	int i, n_bins, n_off;
	pair64_t *off;
	khint_t k;
	khash_t(i) *index;
	uint64_t min_off;

	bins = (uint16_t*)calloc(MAX_BIN, 2);
	n_bins = reg2bins(beg, end, bins);
	index = idx->index[tid];
	// the linear index gives a lower bound on the file offset of the first
	// record that may overlap the region
	min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? 0 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
	for (i = n_off = 0; i < n_bins; ++i) { // count the candidate chunks
		if ((k = kh_get(i, index, bins[i])) != kh_end(index))
			n_off += kh_value(index, k).n;
	}
	if (n_off == 0) {
		free(bins); return 0;
	}
	off = (pair64_t*)calloc(n_off, 16);
	for (i = n_off = 0; i < n_bins; ++i) { // collect the chunks ending after min_off
		if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
			uint32_t j;
			bam_binlist_t *p = &kh_value(index, k);
			for (j = 0; j < p->n; ++j)
				if (p->list[j].v > min_off) off[n_off++] = p->list[j];
		}
	}
	free(bins);
	if (n_off == 0) { // all candidate chunks end before the linear-index bound
		free(off); return 0;
	}
	{
		bam1_t *b;
		int l, ret, n_seeks;
		uint64_t curr_off;
		b = (bam1_t*)calloc(1, sizeof(bam1_t));
		ks_introsort(off, n_off, off);
		// resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
		for (i = 1; i < n_off; ++i)
			if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
		{ // merge adjacent blocks
#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
			for (i = 1, l = 0; i < n_off; ++i) {
#ifdef BAM_TRUE_OFFSET
				if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
#else
				if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
#endif
				else off[++l] = off[i];
			}
			n_off = l + 1;
#endif
		}
		// retrieve alignments
		n_seeks = 0; i = -1; curr_off = 0;
		for (;;) {
			if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk
				if (i == n_off - 1) break; // no more chunks
				if (i >= 0) assert(curr_off == off[i].v); // otherwise bug
				if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek
					bam_seek(fp, off[i+1].u, SEEK_SET);
					curr_off = bam_tell(fp);
					++n_seeks;
				}
				++i;
			}
			if ((ret = bam_read1(fp, b)) > 0) {
				curr_off = bam_tell(fp);
				if (b->core.tid != tid || b->core.pos >= end) break; // no need to proceed
				else if (is_overlap(beg, end, b)) func(b, data);
			} else break; // end of file
		}
		// fprintf(stderr, "[bam_fetch] # seek calls: %d\n", n_seeks);
		free(b->data); free(b);
	}
	free(off);
	return 0;
}
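/* A minimal usage sketch (hypothetical, not part of this file): count the
 * records overlapping tid:[beg,end). The callback type bam_fetch_f is assumed
 * to be declared in bam.h as int (*)(const bam1_t*, void*). */
static int count_record(const bam1_t *b, void *data) // hypothetical callback
{
	(void)b; // the record itself is not inspected here
	++*(int*)data;
	return 0;
}
/*
	int n = 0, tid = 0, beg = 100000, end = 200000;
	bamFile fp = bam_open("aln.bam", "r");        // "aln.bam": example file
	bam_index_t *idx = bam_index_load("aln.bam");
	bam_fetch(fp, idx, tid, beg, end, &n, count_record);
	bam_index_destroy(idx); bam_close(fp);
*/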