From 6dd512ba2a4a68c07cd08698a8e972bb553d0c38 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 29 Jun 2009 22:23:32 +0000 Subject: [PATCH] * samtools-0.1.4-18 (r363) * knetfile: do not trigger network communication in FTP seek (lazy seek) * bgzf: cache recent blocks (disabled by default) --- bam_tview.c | 1 + bamtk.c | 2 +- bgzf.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++ bgzf.h | 9 ++++++ knetfile.c | 20 +++++++++----- knetfile.h | 3 +- 6 files changed, 106 insertions(+), 9 deletions(-) diff --git a/bam_tview.c b/bam_tview.c index 60b7350..e5b83a1 100644 --- a/bam_tview.c +++ b/bam_tview.c @@ -162,6 +162,7 @@ tview_t *tv_init(const char *fn, const char *fn_fa) tv->is_dot = 1; tv->idx = bam_index_load(fn); tv->fp = bam_open(fn, "r"); + bgzf_set_cache_size(tv->fp, 8 * 1024 *1024); assert(tv->fp); tv->header = bam_header_read(tv->fp); tv->lplbuf = bam_lplbuf_init(tv_pl_func, tv); diff --git a/bamtk.c b/bamtk.c index 9474e83..a64a929 100644 --- a/bamtk.c +++ b/bamtk.c @@ -3,7 +3,7 @@ #include "bam.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.1.4-17 (r361)" +#define PACKAGE_VERSION "0.1.4-18 (r363)" #endif int bam_taf2baf(int argc, char *argv[]); diff --git a/bgzf.c b/bgzf.c index c49215c..fe4e31d 100644 --- a/bgzf.c +++ b/bgzf.c @@ -10,6 +10,7 @@ */ /* + 2009-06-29 by lh3: cache recent uncompressed blocks. 2009-06-25 by lh3: optionally use my knetfile library to access file on a FTP. 2009-06-12 by lh3: support a mode string like "wu" where 'u' for uncompressed output */ @@ -22,6 +23,14 @@ #include #include "bgzf.h" +#include "khash.h" +typedef struct { + int size; + uint8_t *block; + int64_t end_offset; +} cache_t; +KHASH_MAP_INIT_INT64(cache, cache_t) + extern off_t ftello(FILE *stream); extern int fseeko(FILE *stream, off_t offset, int whence); @@ -93,6 +102,8 @@ static BGZF *bgzf_read_init() fp->uncompressed_block = malloc(MAX_BLOCK_SIZE); fp->compressed_block_size = MAX_BLOCK_SIZE; fp->compressed_block = malloc(MAX_BLOCK_SIZE); + fp->cache_size = 0; + fp->cache = kh_init(cache); return fp; } @@ -338,22 +349,83 @@ check_header(const byte* header) unpackInt16((uint8_t*)&header[14]) == BGZF_LEN); } +static void free_cache(BGZF *fp) +{ + khint_t k; + khash_t(cache) *h = (khash_t(cache)*)fp->cache; + if (fp->open_mode != 'r') return; + for (k = kh_begin(h); k < kh_end(h); ++k) + if (kh_exist(h, k)) free(kh_val(h, k).block); + kh_destroy(cache, h); +} + +static int load_block_from_cache(BGZF *fp, int64_t block_address) +{ + khint_t k; + cache_t *p; + khash_t(cache) *h = (khash_t(cache)*)fp->cache; + k = kh_get(cache, h, block_address); + if (k == kh_end(h)) return 0; + p = &kh_val(h, k); + if (fp->block_length != 0) fp->block_offset = 0; + fp->block_address = block_address; + fp->block_length = p->size; + memcpy(fp->uncompressed_block, p->block, MAX_BLOCK_SIZE); +#ifdef _USE_KNETFILE + knet_seek(fp->x.fpr, p->end_offset, SEEK_SET); +#else + fseeko(fp->file, p->end_offset, SEEK_SET); +#endif + return p->size; +} + +static void cache_block(BGZF *fp, int size) +{ + int ret; + khint_t k; + cache_t *p; + khash_t(cache) *h = (khash_t(cache)*)fp->cache; + if (MAX_BLOCK_SIZE >= fp->cache_size) return; + if ((kh_size(h) + 1) * MAX_BLOCK_SIZE > fp->cache_size) { + /* A better way would be to remove the oldest block in the + * cache, but here we remove a random one for simplicity. This + * should not have a big impact on performance. */ + for (k = kh_begin(h); k < kh_end(h); ++k) + if (kh_exist(h, k)) break; + if (k < kh_end(h)) { + free(kh_val(h, k).block); + kh_del(cache, h, k); + } + } + k = kh_put(cache, h, fp->block_address, &ret); + if (ret == 0) return; // if this happens, a bug! + p = &kh_val(h, k); + p->size = fp->block_length; + p->end_offset = fp->block_address + size; + p->block = malloc(MAX_BLOCK_SIZE); + memcpy(kh_val(h, k).block, fp->uncompressed_block, MAX_BLOCK_SIZE); +} + static int read_block(BGZF* fp) { byte header[BLOCK_HEADER_LENGTH]; + int size = 0; #ifdef _USE_KNETFILE int64_t block_address = knet_tell(fp->x.fpr); + if (load_block_from_cache(fp, block_address)) return 0; int count = knet_read(fp->x.fpr, header, sizeof(header)); #else int64_t block_address = ftello(fp->file); + if (load_block_from_cache(fp, block_address)) return 0; int count = fread(header, 1, sizeof(header), fp->file); #endif if (count == 0) { fp->block_length = 0; return 0; } + size = count; if (count != sizeof(header)) { report_error(fp, "read failed"); return -1; @@ -375,6 +447,7 @@ read_block(BGZF* fp) report_error(fp, "read failed"); return -1; } + size += count; count = inflate_block(fp, block_length); if (count < 0) { return -1; @@ -385,6 +458,7 @@ read_block(BGZF* fp) } fp->block_address = block_address; fp->block_length = count; + cache_block(fp, size); return 0; } @@ -515,6 +589,7 @@ bgzf_close(BGZF* fp) } free(fp->uncompressed_block); free(fp->compressed_block); + free_cache(fp); free(fp); return 0; } @@ -525,6 +600,11 @@ bgzf_tell(BGZF* fp) return ((fp->block_address << 16) | (fp->block_offset & 0xFFFF)); } +void bgzf_set_cache_size(BGZF *fp, int cache_size) +{ + if (fp) fp->cache_size = cache_size; +} + int64_t bgzf_seek(BGZF* fp, int64_t pos, int where) { diff --git a/bgzf.h b/bgzf.h index 68375b4..d5eeafe 100644 --- a/bgzf.h +++ b/bgzf.h @@ -41,7 +41,9 @@ typedef struct { int64_t block_address; int block_length; int block_offset; + int cache_size; const char* error; + void *cache; // a pointer to a hash table } BGZF; #ifdef __cplusplus @@ -104,6 +106,13 @@ int64_t bgzf_tell(BGZF* fp); */ int64_t bgzf_seek(BGZF* fp, int64_t pos, int where); +/* + * Set the cache size. Zero to disable. By default, caching is + * disabled. The recommended cache size for frequent random access is + * about 8M bytes. + */ +void bgzf_set_cache_size(BGZF *fp, int cache_size); + #ifdef __cplusplus } #endif diff --git a/knetfile.c b/knetfile.c index 322885a..9a6878d 100644 --- a/knetfile.c +++ b/knetfile.c @@ -132,25 +132,26 @@ knetFile *kftp_prep(const char *fn, const char *mode) strncpy(fp->host, fn + 6, l); fp->retr = calloc(strlen(p) + 8, 1); sprintf(fp->retr, "RETR %s\r\n", p); + fp->seek_offset = -1; return fp; } // place ->fd at offset off -int kftp_connect_file(knetFile *fp, off_t off) +int kftp_connect_file(knetFile *fp) { if (fp->fd) { close(fp->fd); if (fp->no_reconnect) kftp_get_response(fp); } kftp_pasv_prep(fp); - if (off) { + if (fp->offset) { char tmp[32]; - sprintf(tmp, "REST %lld\r\n", (long long)off); + sprintf(tmp, "REST %lld\r\n", (long long)fp->offset); kftp_send_cmd(fp, tmp, 1); } kftp_send_cmd(fp, fp->retr, 0); kftp_pasv_connect(fp); kftp_get_response(fp); - fp->offset = off; + fp->is_ready = 1; return 0; } @@ -168,7 +169,7 @@ knetFile *knet_open(const char *fn, const char *mode) knet_close(fp); return 0; } - kftp_connect_file(fp, 0); + kftp_connect_file(fp); } else { int fd = open(fn, O_RDONLY); if (fd == -1) { @@ -198,6 +199,11 @@ off_t knet_read(knetFile *fp, void *buf, off_t len) fp->offset += l; } else { off_t rest = len, curr; + if (fp->is_ready == 0) { + if (!fp->no_reconnect) kftp_reconnect(fp); + kftp_connect_file(fp); + fp->is_ready = 1; + } while (rest) { curr = read(fp->fd, buf + l, rest); if (curr == 0) break; // FIXME: end of file or bad network? I do not know... @@ -223,8 +229,8 @@ int knet_seek(knetFile *fp, off_t off, int whence) fprintf(stderr, "[knet_seek] only SEEK_SET is supported for FTP. Offset is unchanged.\n"); return -1; } - if (!fp->no_reconnect) kftp_reconnect(fp); - kftp_connect_file(fp, off); + fp->offset = off; + fp->is_ready = 0; return 0; } return -1; diff --git a/knetfile.h b/knetfile.h index 7fc86c2..85f348c 100644 --- a/knetfile.h +++ b/knetfile.h @@ -16,8 +16,9 @@ typedef struct knetFile_s { char *host; // the following are for FTP only - int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect; + int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; char *response, *retr; + int64_t seek_offset; // for lazy seek } knetFile; #define knet_tell(fp) ((fp)->offset) -- 2.39.2