X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bam_index.c;h=72ef270f9c7afd143759e1f98d681ce69147bb13;hb=9f30c9ba76e0266e896f1199b41e0e7ddf6bd648;hp=8c2555232772ee6571f0da19593c5b2764b04898;hpb=47c65ef17d8949832a0c79c552118b31154bc2ad;p=samtools.git diff --git a/bam_index.c b/bam_index.c index 8c25552..72ef270 100644 --- a/bam_index.c +++ b/bam_index.c @@ -1,8 +1,10 @@ #include +#include #include "bam.h" #include "khash.h" #include "ksort.h" #include "bam_endian.h" +#include "knetfile.h" /*! @header @@ -155,7 +157,8 @@ bam_index_t *bam_index_core(bamFile fp) last_tid = c->tid; last_bin = 0xffffffffu; } else if (last_coor > c->pos) { - fprintf(stderr, "[bam_index_core] the alignment is not sorted. Abort!\n"); + fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n", + bam1_qname(b), last_coor, c->pos, c->tid+1); exit(1); } if (b->core.tid >= 0 && b->core.bin < 4681) insert_offset2(&idx->index2[b->core.tid], b, last_off); @@ -259,21 +262,15 @@ void bam_index_save(const bam_index_t *idx, FILE *fp) fflush(fp); } -bam_index_t *bam_index_load(const char *fn) +static bam_index_t *bam_index_load_core(FILE *fp) { - bam_index_t *idx; - FILE *fp; int i; - char *fnidx, magic[4]; - - fnidx = (char*)calloc(strlen(fn) + 5, 1); - strcpy(fnidx, fn); strcat(fnidx, ".bai"); - if ((fp = fopen(fnidx, "r")) == 0) { - fprintf(stderr, "[bam_index_load] the alignment is not indexed. Please run `index' command first. Abort!\n"); - exit(1); + char magic[4]; + bam_index_t *idx; + if (fp == 0) { + fprintf(stderr, "[bam_index_load_core] fail to load index.\n"); + return 0; } - free(fnidx); - fread(magic, 1, 4, fp); if (strncmp(magic, "BAI\1", 4)) { fprintf(stderr, "[bam_index_load] wrong magic number.\n"); @@ -323,22 +320,108 @@ bam_index_t *bam_index_load(const char *fn) if (bam_is_be) for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]); } + return idx; +} + +bam_index_t *bam_index_load_local(const char *_fn) +{ + FILE *fp; + char *fnidx, *fn; + + if (strstr(_fn, "ftp://") == _fn) { + const char *p; + int l = strlen(_fn); + for (p = _fn + l - 1; p >= _fn; --p) + if (*p == '/') break; + fn = strdup(p + 1); + } else fn = strdup(_fn); + fnidx = (char*)calloc(strlen(fn) + 5, 1); + strcpy(fnidx, fn); strcat(fnidx, ".bai"); + fp = fopen(fnidx, "r"); + if (fp == 0) { // try "{base}.bai" + char *s = strstr(fn, "bam"); + if (s == fn + strlen(fn) - 3) { + strcpy(fnidx, fn); + fnidx[strlen(fn)-1] = 'i'; + fp = fopen(fnidx, "r"); + } + } + free(fnidx); free(fn); + if (fp) { + bam_index_t *idx = bam_index_load_core(fp); + fclose(fp); + return idx; + } else return 0; +} + +static void download_from_remote(const char *url) +{ + const int buf_size = 1 * 1024 * 1024; + char *fn; + FILE *fp; + uint8_t *buf; + knetFile *fp_remote; + int l; + if (strstr(url, "ftp://") != url) return; + l = strlen(url); + for (fn = (char*)url + l - 1; fn >= url; --fn) + if (*fn == '/') break; + ++fn; // fn now points to the file name + fp_remote = knet_open(url, "r"); + if (fp_remote == 0) { + fprintf(stderr, "[download_from_remote] fail to open remote file.\n"); + return; + } + if ((fp = fopen(fn, "w")) == 0) { + fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n"); + knet_close(fp_remote); + return; + } + buf = (uint8_t*)calloc(buf_size, 1); + while ((l = knet_read(fp_remote, buf, buf_size)) != 0) + fwrite(buf, 1, l, fp); + free(buf); fclose(fp); + knet_close(fp_remote); +} + +bam_index_t *bam_index_load(const char *fn) +{ + bam_index_t *idx; + idx = bam_index_load_local(fn); + if (idx == 0 && strstr(fn, "ftp://") == fn) { + char *fnidx = calloc(strlen(fn) + 5, 1); + strcat(strcpy(fnidx, fn), ".bai"); + fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n"); + download_from_remote(fnidx); + idx = bam_index_load_local(fn); + } + if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n"); return idx; } -int bam_index_build(const char *fn) +int bam_index_build2(const char *fn, const char *_fnidx) { char *fnidx; FILE *fpidx; bamFile fp; bam_index_t *idx; - assert(fp = bam_open(fn, "r")); + if ((fp = bam_open(fn, "r")) == 0) { + fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n"); + return -1; + } idx = bam_index_core(fp); bam_close(fp); - fnidx = (char*)calloc(strlen(fn) + 5, 1); - strcpy(fnidx, fn); strcat(fnidx, ".bai"); - assert(fpidx = fopen(fnidx, "w")); + if (_fnidx == 0) { + fnidx = (char*)calloc(strlen(fn) + 5, 1); + strcpy(fnidx, fn); strcat(fnidx, ".bai"); + } else fnidx = strdup(_fnidx); + fpidx = fopen(fnidx, "w"); + if (fpidx == 0) { + fprintf(stderr, "[bam_index_build2] fail to create the index file.\n"); + free(fnidx); + return -1; + } bam_index_save(idx, fpidx); bam_index_destroy(idx); fclose(fpidx); @@ -346,13 +429,19 @@ int bam_index_build(const char *fn) return 0; } +int bam_index_build(const char *fn) +{ + return bam_index_build2(fn, 0); +} + int bam_index(int argc, char *argv[]) { if (argc < 2) { - fprintf(stderr, "Usage: samtools index \n"); + fprintf(stderr, "Usage: samtools index []\n"); return 1; } - bam_index_build(argv[1]); + if (argc >= 3) bam_index_build2(argv[1], argv[2]); + else bam_index_build(argv[1]); return 0; } @@ -374,7 +463,7 @@ static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN]) static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b) { uint32_t rbeg = b->core.pos; - uint32_t rend = bam_calend(&b->core, bam1_cigar(b)); + uint32_t rend = b->core.n_cigar? bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1; return (rend > beg && rbeg < end); } @@ -410,16 +499,20 @@ int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, voi free(bins); { bam1_t *b; - int ret, n_seeks; + int l, ret, n_seeks; uint64_t curr_off; b = (bam1_t*)calloc(1, sizeof(bam1_t)); ks_introsort(off, n_off, off); - // resolve overlaps between adjecent blocks; this may happen due to the merge in indexing + // resolve completely contained adjacent blocks + for (i = 1, l = 0; i < n_off; ++i) + if (off[l].v < off[i].v) + off[++l] = off[i]; + n_off = l + 1; + // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing for (i = 1; i < n_off; ++i) if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u; { // merge adjacent blocks #if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16) - int l; for (i = 1, l = 0; i < n_off; ++i) { #ifdef BAM_TRUE_OFFSET if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;