#include <ctype.h>
+#include <assert.h>
#include "bam.h"
#include "khash.h"
#include "ksort.h"
#include "bam_endian.h"
+#ifdef _USE_KNETFILE
+#include "knetfile.h"
+#endif
/*!
@header
*/
#define BAM_MIN_CHUNK_GAP 32768
+// 1<<14 is the size of minimum bin.
#define BAM_LIDX_SHIFT 14
typedef struct {
l->list[l->n].u = beg; l->list[l->n++].v = end;
}
-static inline void insert_offset2(bam_lidx_t *index2, int last, int curr, uint64_t offset)
+/*
+ * Record `offset` in the linear index for every window of size
+ * 1<<BAM_LIDX_SHIFT that alignment `b` touches, unless that window already
+ * holds an offset (0 means "not set yet").
+ *
+ * index2: linear index of one reference; its offset array is grown as needed
+ * b:      the alignment; its start is b->core.pos and its end comes from bam_calend()
+ * offset: file offset to store for the windows that are still empty
+ */
+static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
{
- int i;
- if (index2->m < curr + 1) {
- index2->m = curr + 1;
+	int i, beg, end;
+	beg = b->core.pos >> BAM_LIDX_SHIFT;
+	end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
+	// keep the window range sane: no negative index, and at least the start window
+	// NOTE(review): bam_calend() is defined elsewhere; this covers the case where it returns a value <= pos
+	if (beg < 0) beg = 0;
+	if (end < beg) end = beg;
+	if (index2->m < end + 1) {
+		int old_m = index2->m;
+		index2->m = end + 1;
kroundup32(index2->m);
index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
+		// newly added windows start out as "not set"
+		memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
}
- if (last > curr) last = -1;
- for (i = last + 1; i <= curr; ++i) index2->offset[i] = offset;
- index2->n = curr + 1;
+	// start at `beg`, not `beg + 1`: the window holding the alignment start overlaps it too,
+	// otherwise alignments that fit inside one window never contribute to the index
+	for (i = beg; i <= end; ++i)
+		if (index2->offset[i] == 0) index2->offset[i] = offset;
+	// only grow n: a later alignment may end before an earlier, longer one
+	if (index2->n < end + 1) index2->n = end + 1;
}
static void merge_chunks(bam_index_t *idx)
bam_header_t *h;
int i, ret;
bam_index_t *idx;
- uint32_t last_coor, last_tid, last_bin, save_bin, save_tid;
+ uint32_t last_bin, save_bin;
+ int32_t last_coor, last_tid, save_tid;
bam1_core_t *c;
uint64_t save_off, last_off;
last_tid = c->tid;
last_bin = 0xffffffffu;
} else if (last_coor > c->pos) {
- fprintf(stderr, "[bam_index_core] the alignment is not sorted. Abort!\n");
+ fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
+ bam1_qname(b), last_coor, c->pos, c->tid+1);
exit(1);
}
- if (last_coor>>BAM_LIDX_SHIFT != b->core.pos>>BAM_LIDX_SHIFT) // then write the linear index
- insert_offset2(&idx->index2[b->core.tid], last_coor>>BAM_LIDX_SHIFT, b->core.pos>>BAM_LIDX_SHIFT, last_off);
+ if (b->core.tid >= 0 && b->core.bin < 4681) insert_offset2(&idx->index2[b->core.tid], b, last_off);
if (c->bin != last_bin) { // then possibly write the binning index
if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
save_off = last_off;
save_bin = last_bin = c->bin;
save_tid = c->tid;
+ if (save_tid < 0) break;
}
if (bam_tell(fp) <= last_off) {
fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
last_off = bam_tell(fp);
last_coor = b->core.pos;
}
- insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
+ if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
merge_chunks(idx);
if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
free(b->data); free(b);
fflush(fp);
}
-bam_index_t *bam_index_load(const char *fn)
+static bam_index_t *bam_index_load_core(FILE *fp)
{
- bam_index_t *idx;
- FILE *fp;
int i;
- char *fnidx, magic[4];
-
- fnidx = (char*)calloc(strlen(fn) + 5, 1);
- strcpy(fnidx, fn); strcat(fnidx, ".bai");
- if ((fp = fopen(fnidx, "r")) == 0) {
- fprintf(stderr, "[bam_index_load] the alignment is not indexed. Please run `index' command first. Abort!\n");
- exit(1);
+ char magic[4];
+ bam_index_t *idx;
+ if (fp == 0) {
+ fprintf(stderr, "[bam_index_load_core] fail to load index.\n");
+ return 0;
}
- free(fnidx);
-
fread(magic, 1, 4, fp);
if (strncmp(magic, "BAI\1", 4)) {
fprintf(stderr, "[bam_index_load] wrong magic number.\n");
if (bam_is_be)
for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
}
+ return idx;
+}
+
+/*
+ * Load the index belonging to BAM file `_fn` from the local file system.
+ *
+ * When `_fn` starts with "ftp://" or "http://", only the part after the last
+ * '/' is used, so the index is looked up under that bare name. "<name>.bai"
+ * is tried first; if it cannot be opened and <name> ends in "bam",
+ * "<name with the final 'm' replaced by 'i'>" is tried next.
+ *
+ * Returns what bam_index_load_core() returns, or 0 when no index file could
+ * be opened or memory for the file names could not be allocated.
+ */
+bam_index_t *bam_index_load_local(const char *_fn)
+{
+	FILE *fp;
+	char *fnidx, *fn;
+	size_t l;
+	bam_index_t *idx;
+
+	if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) {
+		// the scheme prefix guarantees that at least one '/' exists
+		const char *p = strrchr(_fn, '/');
+		fn = strdup(p + 1);
+	} else fn = strdup(_fn);
+	if (fn == 0) return 0;
+	l = strlen(fn);
+	fnidx = (char*)calloc(l + 5, 1);
+	if (fnidx == 0) {
+		free(fn);
+		return 0;
+	}
+	strcpy(fnidx, fn); strcat(fnidx, ".bai");
+	fp = fopen(fnidx, "rb");
+	// try "{base}.bai"; compare the suffix itself, because strstr() stops at the
+	// first "bam" in the name and would miss e.g. "bam_run.bam"
+	if (fp == 0 && l >= 3 && strcmp(fn + l - 3, "bam") == 0) {
+		strcpy(fnidx, fn);
+		fnidx[l - 1] = 'i';
+		fp = fopen(fnidx, "rb");
+	}
+	free(fnidx); free(fn);
+	if (fp == 0) return 0;
+	idx = bam_index_load_core(fp);
+	fclose(fp);
+	return idx;
+}
+
+#ifdef _USE_KNETFILE
+/*
+ * Copy the remote file at `url` into the current working directory, using
+ * the part of the URL after the last '/' as the local file name.
+ * Does nothing unless `url` starts with "ftp://" or "http://". Failures are
+ * reported on stderr; nothing is returned to the caller.
+ */
+static void download_from_remote(const char *url)
+{
+	const int buf_size = 1 * 1024 * 1024;
+	const char *fn;
+	FILE *fp;
+	uint8_t *buf;
+	knetFile *fp_remote;
+	int l;
+	if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
+	fn = strrchr(url, '/') + 1; // file name; the scheme prefix guarantees a '/'
+	fp_remote = knet_open(url, "r");
+	if (fp_remote == 0) {
+		fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
+		return;
+	}
+	if ((fp = fopen(fn, "wb")) == 0) {
+		fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
+		knet_close(fp_remote);
+		return;
+	}
+	buf = (uint8_t*)calloc(buf_size, 1);
+	if (buf == 0) {
+		fprintf(stderr, "[download_from_remote] fail to allocate the download buffer.\n");
+	} else {
+		// stop on end of data AND on a negative result, which must never reach fwrite() as a size
+		// NOTE(review): knet_read() is declared in knetfile.h; a negative return is assumed to signal an error
+		while ((l = knet_read(fp_remote, buf, buf_size)) > 0) {
+			if (fwrite(buf, 1, l, fp) != (size_t)l) {
+				fprintf(stderr, "[download_from_remote] fail to write to the local file.\n");
+				break;
+			}
+		}
+		if (l < 0) fprintf(stderr, "[download_from_remote] error while reading the remote file.\n");
+	}
+	free(buf);
fclose(fp);
+	knet_close(fp_remote);
+}
+#else
+/* Built without _USE_KNETFILE: remote download is unavailable, so this does nothing. */
+static void download_from_remote(const char *url)
+{
+	(void)url;
+}
+#endif
+
+/*
+ * Load the index for BAM file `fn`.
+ *
+ * A local index is tried first via bam_index_load_local(). If that fails and
+ * `fn` starts with "ftp://" or "http://", "<fn>.bai" is passed to
+ * download_from_remote() and the local load is attempted once more.
+ * Returns the index, or 0 (after a message on stderr) when it cannot be loaded.
+ */
+bam_index_t *bam_index_load(const char *fn)
+{
+	bam_index_t *idx;
+	idx = bam_index_load_local(fn);
+	if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) {
+		char *fnidx = calloc(strlen(fn) + 5, 1);
+		if (fnidx) {
+			strcat(strcpy(fnidx, fn), ".bai");
+			fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n");
+			download_from_remote(fnidx);
+			free(fnidx); // the URL string is only needed for the download
+			idx = bam_index_load_local(fn);
+		}
+	}
+	if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n");
return idx;
}
-int bam_index_build(const char *fn)
+/*
+ * Build the index of BAM file `fn` and write it to `_fnidx`.
+ * When `_fnidx` is 0 the index is written to "<fn>.bai".
+ * Returns 0 on success and -1 when the BAM file cannot be opened, the index
+ * file name cannot be allocated, or the index file cannot be created.
+ */
+int bam_index_build2(const char *fn, const char *_fnidx)
{
char *fnidx;
FILE *fpidx;
bamFile fp;
bam_index_t *idx;
- assert(fp = bam_open(fn, "r"));
+	if ((fp = bam_open(fn, "r")) == 0) {
+		fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n");
+		return -1;
+	}
idx = bam_index_core(fp);
bam_close(fp);
- fnidx = (char*)calloc(strlen(fn) + 5, 1);
- strcpy(fnidx, fn); strcat(fnidx, ".bai");
- assert(fpidx = fopen(fnidx, "w"));
+	if (_fnidx == 0) {
+		fnidx = (char*)calloc(strlen(fn) + 5, 1);
+		if (fnidx) { strcpy(fnidx, fn); strcat(fnidx, ".bai"); }
+	} else fnidx = strdup(_fnidx);
+	// fopen() must not be handed a null name; release the index on every failure path
+	fpidx = fnidx? fopen(fnidx, "wb") : 0;
+	if (fpidx == 0) {
+		fprintf(stderr, "[bam_index_build2] fail to create the index file.\n");
+		free(fnidx);
+		bam_index_destroy(idx);
+		return -1;
+	}
bam_index_save(idx, fpidx);
bam_index_destroy(idx);
fclose(fpidx);
+	free(fnidx); // was leaked on the success path
return 0;
}
+/*
+ * Build the index of BAM file `fn` without naming an output file: forwards
+ * to bam_index_build2() with a null index file name and returns its result.
+ */
+int bam_index_build(const char *fn)
+{
+	const char *no_explicit_name = 0;
+	return bam_index_build2(fn, no_explicit_name);
+}
+
+/*
+ * Entry point of the `index` command.
+ * argv[1] is the BAM file; the optional argv[2] names the index file to write.
+ * Returns 0 on success and 1 on a usage error or when building the index fails.
+ */
int bam_index(int argc, char *argv[])
{
+	int ret;
if (argc < 2) {
- fprintf(stderr, "Usage: samtools index <in.bam>\n");
+	fprintf(stderr, "Usage: samtools index <in.bam> [<out.index>]\n");
return 1;
}
- bam_index_build(argv[1]);
+	if (argc >= 3) ret = bam_index_build2(argv[1], argv[2]);
+	else ret = bam_index_build(argv[1]);
+	// surface a failed build through the exit status instead of always reporting success
+	if (ret != 0) return 1;
return 0;
}
+/*
+ * Return non-zero when alignment `b` overlaps the query interval, i.e. when
+ * its end lies beyond `beg` and its start lies before `end`.
+ */
static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
{
uint32_t rbeg = b->core.pos;
- uint32_t rend = bam_calend(&b->core, bam1_cigar(b));
+	uint32_t rend;
+	// with a CIGAR the end comes from bam_calend(); without one the end is pos + 1
+	if (b->core.n_cigar) rend = bam_calend(&b->core, bam1_cigar(b));
+	else rend = b->core.pos + 1;
return (rend > beg && rbeg < end);
}
-int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
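+// bam_fetch helper: returns the sorted and merged list of file chunks for the region [beg,end) on tid; the number of chunks is stored in *cnt_off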
+pair64_t * get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int* cnt_off)
{
uint16_t *bins;
int i, n_bins, n_off;
}
free(bins);
{
- bam1_t *b;
- int ret, n_seeks;
- uint64_t curr_off;
- b = (bam1_t*)calloc(1, sizeof(bam1_t));
+ bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
+ int l;
ks_introsort(off, n_off, off);
- // resolve overlaps between adjecent blocks; this may happen due to the merge in indexing
+ // resolve completely contained adjacent blocks
+ for (i = 1, l = 0; i < n_off; ++i)
+ if (off[l].v < off[i].v)
+ off[++l] = off[i];
+ n_off = l + 1;
+ // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
for (i = 1; i < n_off; ++i)
if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
{ // merge adjacent blocks
#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
- int l;
for (i = 1, l = 0; i < n_off; ++i) {
#ifdef BAM_TRUE_OFFSET
if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
n_off = l + 1;
#endif
}
+ bam_destroy1(b);
+ }
+ *cnt_off = n_off;
+ return off;
+}
+
+int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
+{
+ int n_off;
+ pair64_t *off = get_chunk_coordinates(idx, tid, beg, end, &n_off);
+ if (off == 0) return 0;
+ {
// retrive alignments
+ uint64_t curr_off;
+ int i, ret, n_seeks;
n_seeks = 0; i = -1; curr_off = 0;
+ bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
for (;;) {
if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk
if (i == n_off - 1) break; // no more chunks