X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bgzf.c;h=2cd1c5e514a94c5569f5819c7f6b23af8985d444;hb=2ec779405b1f89e1407c24956bbb3af584234bc9;hp=b3d25f781c0b18e53fc9e9707ccb6350e175a627;hpb=3ddb3942053df00fdae714e77cbc2f5618db617e;p=samtools.git diff --git a/bgzf.c b/bgzf.c index b3d25f7..2cd1c5e 100644 --- a/bgzf.c +++ b/bgzf.c @@ -9,7 +9,10 @@ * or functionality. */ -/* 2009-06-12 by lh3: support a mode string like "wu" where 'u' for uncompressed output */ +/* + 2009-06-29 by lh3: cache recent uncompressed blocks. + 2009-06-25 by lh3: optionally use my knetfile library to access file on a FTP. + 2009-06-12 by lh3: support a mode string like "wu" where 'u' for uncompressed output */ #include #include @@ -20,10 +23,23 @@ #include #include "bgzf.h" +#include "khash.h" +typedef struct { + int size; + uint8_t *block; + int64_t end_offset; +} cache_t; +KHASH_MAP_INIT_INT64(cache, cache_t) + +#ifdef _NO_LFS +#define ftello(fp) ftell(fp) +#define fseeko(fp, offset, whence) fseek(fp, offset, whence) +#else extern off_t ftello(FILE *stream); extern int fseeko(FILE *stream, off_t offset, int whence); +#endif -typedef int8_t byte; +typedef int8_t bgzf_byte_t; static const int DEFAULT_BLOCK_SIZE = 64 * 1024; static const int MAX_BLOCK_SIZE = 64 * 1024; @@ -70,9 +86,9 @@ packInt32(uint8_t* buffer, uint32_t value) buffer[3] = value >> 24; } -inline +static inline int -min(int x, int y) +bgzf_min(int x, int y) { return (x < y) ? x : y; } @@ -91,6 +107,8 @@ static BGZF *bgzf_read_init() fp->uncompressed_block = malloc(MAX_BLOCK_SIZE); fp->compressed_block_size = MAX_BLOCK_SIZE; fp->compressed_block = malloc(MAX_BLOCK_SIZE); + fp->cache_size = 0; + fp->cache = kh_init(cache); return fp; } @@ -193,7 +211,7 @@ deflate_block(BGZF* fp, int block_length) // Deflate the block in fp->uncompressed_block into fp->compressed_block. // Also adds an extra field that stores the compressed block length. - byte* buffer = fp->compressed_block; + bgzf_byte_t* buffer = fp->compressed_block; int buffer_size = fp->compressed_block_size; // Init gzip header @@ -324,10 +342,10 @@ inflate_block(BGZF* fp, int block_length) static int -check_header(const byte* header) +check_header(const bgzf_byte_t* header) { return (header[0] == GZIP_ID1 && - header[1] == (byte) GZIP_ID2 && + header[1] == (bgzf_byte_t) GZIP_ID2 && header[2] == Z_DEFLATED && (header[3] & FLG_FEXTRA) != 0 && unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN && @@ -336,22 +354,83 @@ check_header(const byte* header) unpackInt16((uint8_t*)&header[14]) == BGZF_LEN); } +static void free_cache(BGZF *fp) +{ + khint_t k; + khash_t(cache) *h = (khash_t(cache)*)fp->cache; + if (fp->open_mode != 'r') return; + for (k = kh_begin(h); k < kh_end(h); ++k) + if (kh_exist(h, k)) free(kh_val(h, k).block); + kh_destroy(cache, h); +} + +static int load_block_from_cache(BGZF *fp, int64_t block_address) +{ + khint_t k; + cache_t *p; + khash_t(cache) *h = (khash_t(cache)*)fp->cache; + k = kh_get(cache, h, block_address); + if (k == kh_end(h)) return 0; + p = &kh_val(h, k); + if (fp->block_length != 0) fp->block_offset = 0; + fp->block_address = block_address; + fp->block_length = p->size; + memcpy(fp->uncompressed_block, p->block, MAX_BLOCK_SIZE); +#ifdef _USE_KNETFILE + knet_seek(fp->x.fpr, p->end_offset, SEEK_SET); +#else + fseeko(fp->file, p->end_offset, SEEK_SET); +#endif + return p->size; +} + +static void cache_block(BGZF *fp, int size) +{ + int ret; + khint_t k; + cache_t *p; + khash_t(cache) *h = (khash_t(cache)*)fp->cache; + if (MAX_BLOCK_SIZE >= fp->cache_size) return; + if ((kh_size(h) + 1) * MAX_BLOCK_SIZE > fp->cache_size) { + /* A better way would be to remove the oldest block in the + * cache, but here we remove a random one for simplicity. This + * should not have a big impact on performance. */ + for (k = kh_begin(h); k < kh_end(h); ++k) + if (kh_exist(h, k)) break; + if (k < kh_end(h)) { + free(kh_val(h, k).block); + kh_del(cache, h, k); + } + } + k = kh_put(cache, h, fp->block_address, &ret); + if (ret == 0) return; // if this happens, a bug! + p = &kh_val(h, k); + p->size = fp->block_length; + p->end_offset = fp->block_address + size; + p->block = malloc(MAX_BLOCK_SIZE); + memcpy(kh_val(h, k).block, fp->uncompressed_block, MAX_BLOCK_SIZE); +} + static int read_block(BGZF* fp) { - byte header[BLOCK_HEADER_LENGTH]; + bgzf_byte_t header[BLOCK_HEADER_LENGTH]; + int size = 0; #ifdef _USE_KNETFILE int64_t block_address = knet_tell(fp->x.fpr); + if (load_block_from_cache(fp, block_address)) return 0; int count = knet_read(fp->x.fpr, header, sizeof(header)); #else int64_t block_address = ftello(fp->file); + if (load_block_from_cache(fp, block_address)) return 0; int count = fread(header, 1, sizeof(header), fp->file); #endif if (count == 0) { fp->block_length = 0; return 0; } + size = count; if (count != sizeof(header)) { report_error(fp, "read failed"); return -1; @@ -361,7 +440,7 @@ read_block(BGZF* fp) return -1; } int block_length = unpackInt16((uint8_t*)&header[16]) + 1; - byte* compressed_block = (byte*) fp->compressed_block; + bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block; memcpy(compressed_block, header, BLOCK_HEADER_LENGTH); int remaining = block_length - BLOCK_HEADER_LENGTH; #ifdef _USE_KNETFILE @@ -373,6 +452,7 @@ read_block(BGZF* fp) report_error(fp, "read failed"); return -1; } + size += count; count = inflate_block(fp, block_length); if (count < 0) { return -1; @@ -383,6 +463,7 @@ read_block(BGZF* fp) } fp->block_address = block_address; fp->block_length = count; + cache_block(fp, size); return 0; } @@ -398,7 +479,7 @@ bgzf_read(BGZF* fp, void* data, int length) } int bytes_read = 0; - byte* output = data; + bgzf_byte_t* output = data; while (bytes_read < length) { int available = fp->block_length - fp->block_offset; if (available <= 0) { @@ -410,8 +491,8 @@ bgzf_read(BGZF* fp, void* data, int length) break; } } - int copy_length = min(length-bytes_read, available); - byte* buffer = fp->uncompressed_block; + int copy_length = bgzf_min(length-bytes_read, available); + bgzf_byte_t* buffer = fp->uncompressed_block; memcpy(output, buffer + fp->block_offset, copy_length); fp->block_offset += copy_length; output += copy_length; @@ -464,12 +545,12 @@ bgzf_write(BGZF* fp, const void* data, int length) fp->uncompressed_block = malloc(fp->uncompressed_block_size); } - const byte* input = data; + const bgzf_byte_t* input = data; int block_length = fp->uncompressed_block_size; int bytes_written = 0; while (bytes_written < length) { - int copy_length = min(block_length - fp->block_offset, length - bytes_written); - byte* buffer = fp->uncompressed_block; + int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written); + bgzf_byte_t* buffer = fp->uncompressed_block; memcpy(buffer + fp->block_offset, input, copy_length); fp->block_offset += copy_length; input += copy_length; @@ -490,6 +571,14 @@ bgzf_close(BGZF* fp) if (flush_block(fp) != 0) { return -1; } + { // add an empty block + int count, block_length = deflate_block(fp, 0); +#ifdef _USE_KNETFILE + count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw); +#else + count = fwrite(fp->compressed_block, 1, block_length, fp->file); +#endif + } #ifdef _USE_KNETFILE if (fflush(fp->x.fpw) != 0) { #else @@ -513,6 +602,7 @@ bgzf_close(BGZF* fp) } free(fp->uncompressed_block); free(fp->compressed_block); + free_cache(fp); free(fp); return 0; } @@ -523,6 +613,30 @@ bgzf_tell(BGZF* fp) return ((fp->block_address << 16) | (fp->block_offset & 0xFFFF)); } +void bgzf_set_cache_size(BGZF *fp, int cache_size) +{ + if (fp) fp->cache_size = cache_size; +} + +int bgzf_check_EOF(BGZF *fp) +{ + static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0"; + uint8_t buf[28]; + off_t offset; +#ifdef _USE_KNETFILE + offset = knet_tell(fp->x.fpr); + if (knet_seek(fp->x.fpr, -28, SEEK_END) != 0) return -1; + knet_read(fp->x.fpr, buf, 28); + knet_seek(fp->x.fpr, offset, SEEK_SET); +#else + offset = ftello(fp->file); + if (fseeko(fp->file, -28, SEEK_END) != 0) return -1; + fread(buf, 1, 28, fp->file); + fseeko(fp->file, offset, SEEK_SET); +#endif + return (memcmp(magic, buf, 28) == 0)? 1 : 0; +} + int64_t bgzf_seek(BGZF* fp, int64_t pos, int where) { @@ -549,4 +663,3 @@ bgzf_seek(BGZF* fp, int64_t pos, int where) fp->block_offset = block_offset; return 0; } -