3 Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 2009-06-29 by lh3: cache recent uncompressed blocks.
26 2009-06-25 by lh3: optionally use my knetfile library to access file on a FTP.
27 2009-06-12 by lh3: support a mode string like "wu" where 'u' for uncompressed output */
34 #include <sys/types.h>
44 KHASH_MAP_INIT_INT64(cache, cache_t)
46 #if defined(_WIN32) || defined(_MSC_VER)
47 #define ftello(fp) ftell(fp)
48 #define fseeko(fp, offset, whence) fseek(fp, offset, whence)
50 extern off_t ftello(FILE *stream);
51 extern int fseeko(FILE *stream, off_t offset, int whence);
54 typedef int8_t bgzf_byte_t;
56 static const int DEFAULT_BLOCK_SIZE = 64 * 1024;
57 static const int MAX_BLOCK_SIZE = 64 * 1024;
59 static const int BLOCK_HEADER_LENGTH = 18;
60 static const int BLOCK_FOOTER_LENGTH = 8;
62 static const int GZIP_ID1 = 31;
63 static const int GZIP_ID2 = 139;
64 static const int CM_DEFLATE = 8;
65 static const int FLG_FEXTRA = 4;
66 static const int OS_UNKNOWN = 255;
67 static const int BGZF_ID1 = 66; // 'B'
68 static const int BGZF_ID2 = 67; // 'C'
69 static const int BGZF_LEN = 2;
70 static const int BGZF_XLEN = 6; // BGZF_LEN+4
72 static const int GZIP_WINDOW_BITS = -15; // no zlib header
73 static const int Z_DEFAULT_MEM_LEVEL = 8;
78 packInt16(uint8_t* buffer, uint16_t value)
81 buffer[1] = value >> 8;
86 unpackInt16(const uint8_t* buffer)
88 return (buffer[0] | (buffer[1] << 8));
93 packInt32(uint8_t* buffer, uint32_t value)
96 buffer[1] = value >> 8;
97 buffer[2] = value >> 16;
98 buffer[3] = value >> 24;
103 bgzf_min(int x, int y)
105 return (x < y) ? x : y;
110 report_error(BGZF* fp, const char* message) {
114 static BGZF *bgzf_read_init()
117 fp = calloc(1, sizeof(BGZF));
118 fp->uncompressed_block_size = MAX_BLOCK_SIZE;
119 fp->uncompressed_block = malloc(MAX_BLOCK_SIZE);
120 fp->compressed_block_size = MAX_BLOCK_SIZE;
121 fp->compressed_block = malloc(MAX_BLOCK_SIZE);
123 fp->cache = kh_init(cache);
132 knetFile *file = knet_dopen(fd, "r");
134 FILE* file = fdopen(fd, "r");
137 if (file == 0) return 0;
138 fp = bgzf_read_init();
139 fp->file_descriptor = fd;
151 open_write(int fd, bool is_uncompressed)
153 FILE* file = fdopen(fd, "w");
155 if (file == 0) return 0;
156 fp = malloc(sizeof(BGZF));
157 fp->file_descriptor = fd;
159 fp->owned_file = 0; fp->is_uncompressed = is_uncompressed;
165 fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE;
166 fp->uncompressed_block = NULL;
167 fp->compressed_block_size = MAX_BLOCK_SIZE;
168 fp->compressed_block = malloc(MAX_BLOCK_SIZE);
169 fp->block_address = 0;
170 fp->block_offset = 0;
171 fp->block_length = 0;
177 bgzf_open(const char* __restrict path, const char* __restrict mode)
180 if (mode[0] == 'r' || mode[0] == 'R') { /* The reading mode is preferred. */
182 knetFile *file = knet_open(path, mode);
183 if (file == 0) return 0;
184 fp = bgzf_read_init();
185 fp->file_descriptor = -1;
189 int fd, oflag = O_RDONLY;
193 fd = open(path, oflag);
194 if (fd == -1) return 0;
197 } else if (mode[0] == 'w' || mode[0] == 'W') {
198 int fd, oflag = O_WRONLY | O_CREAT | O_TRUNC;
202 fd = open(path, oflag, 0644);
203 if (fd == -1) return 0;
204 fp = open_write(fd, strstr(mode, "u")? 1 : 0);
213 bgzf_fdopen(int fd, const char * __restrict mode)
215 if (fd == -1) return 0;
216 if (mode[0] == 'r' || mode[0] == 'R') {
217 return open_read(fd);
218 } else if (mode[0] == 'w' || mode[0] == 'W') {
219 return open_write(fd, strstr(mode, "u")? 1 : 0);
227 deflate_block(BGZF* fp, int block_length)
229 // Deflate the block in fp->uncompressed_block into fp->compressed_block.
230 // Also adds an extra field that stores the compressed block length.
232 bgzf_byte_t* buffer = fp->compressed_block;
233 int buffer_size = fp->compressed_block_size;
236 buffer[0] = GZIP_ID1;
237 buffer[1] = GZIP_ID2;
238 buffer[2] = CM_DEFLATE;
239 buffer[3] = FLG_FEXTRA;
240 buffer[4] = 0; // mtime
245 buffer[9] = OS_UNKNOWN;
246 buffer[10] = BGZF_XLEN;
248 buffer[12] = BGZF_ID1;
249 buffer[13] = BGZF_ID2;
250 buffer[14] = BGZF_LEN;
252 buffer[16] = 0; // placeholder for block length
255 // loop to retry for blocks that do not compress enough
256 int input_length = block_length;
257 int compressed_length = 0;
259 int compress_level = fp->is_uncompressed? 0 : Z_DEFAULT_COMPRESSION;
263 zs.next_in = fp->uncompressed_block;
264 zs.avail_in = input_length;
265 zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
266 zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
268 int status = deflateInit2(&zs, compress_level, Z_DEFLATED,
269 GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
270 if (status != Z_OK) {
271 report_error(fp, "deflate init failed");
274 status = deflate(&zs, Z_FINISH);
275 if (status != Z_STREAM_END) {
277 if (status == Z_OK) {
278 // Not enough space in buffer.
279 // Can happen in the rare case the input doesn't compress enough.
280 // Reduce the amount of input until it fits.
281 input_length -= 1024;
282 if (input_length <= 0) {
283 // should never happen
284 report_error(fp, "input reduction failed");
289 report_error(fp, "deflate failed");
292 status = deflateEnd(&zs);
293 if (status != Z_OK) {
294 report_error(fp, "deflate end failed");
297 compressed_length = zs.total_out;
298 compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
299 if (compressed_length > MAX_BLOCK_SIZE) {
300 // should never happen
301 report_error(fp, "deflate overflow");
307 packInt16((uint8_t*)&buffer[16], compressed_length-1);
308 uint32_t crc = crc32(0L, NULL, 0L);
309 crc = crc32(crc, fp->uncompressed_block, input_length);
310 packInt32((uint8_t*)&buffer[compressed_length-8], crc);
311 packInt32((uint8_t*)&buffer[compressed_length-4], input_length);
313 int remaining = block_length - input_length;
315 if (remaining > input_length) {
316 // should never happen (check so we can use memcpy)
317 report_error(fp, "remainder too large");
320 memcpy(fp->uncompressed_block,
321 fp->uncompressed_block + input_length,
324 fp->block_offset = remaining;
325 return compressed_length;
330 inflate_block(BGZF* fp, int block_length)
332 // Inflate the block in fp->compressed_block into fp->uncompressed_block
337 zs.next_in = fp->compressed_block + 18;
338 zs.avail_in = block_length - 16;
339 zs.next_out = fp->uncompressed_block;
340 zs.avail_out = fp->uncompressed_block_size;
342 int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
343 if (status != Z_OK) {
344 report_error(fp, "inflate init failed");
347 status = inflate(&zs, Z_FINISH);
348 if (status != Z_STREAM_END) {
350 report_error(fp, "inflate failed");
353 status = inflateEnd(&zs);
354 if (status != Z_OK) {
355 report_error(fp, "inflate failed");
363 check_header(const bgzf_byte_t* header)
365 return (header[0] == GZIP_ID1 &&
366 header[1] == (bgzf_byte_t) GZIP_ID2 &&
367 header[2] == Z_DEFLATED &&
368 (header[3] & FLG_FEXTRA) != 0 &&
369 unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN &&
370 header[12] == BGZF_ID1 &&
371 header[13] == BGZF_ID2 &&
372 unpackInt16((uint8_t*)&header[14]) == BGZF_LEN);
375 static void free_cache(BGZF *fp)
378 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
379 if (fp->open_mode != 'r') return;
380 for (k = kh_begin(h); k < kh_end(h); ++k)
381 if (kh_exist(h, k)) free(kh_val(h, k).block);
382 kh_destroy(cache, h);
385 static int load_block_from_cache(BGZF *fp, int64_t block_address)
389 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
390 k = kh_get(cache, h, block_address);
391 if (k == kh_end(h)) return 0;
393 if (fp->block_length != 0) fp->block_offset = 0;
394 fp->block_address = block_address;
395 fp->block_length = p->size;
396 memcpy(fp->uncompressed_block, p->block, MAX_BLOCK_SIZE);
398 knet_seek(fp->x.fpr, p->end_offset, SEEK_SET);
400 fseeko(fp->file, p->end_offset, SEEK_SET);
405 static void cache_block(BGZF *fp, int size)
410 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
411 if (MAX_BLOCK_SIZE >= fp->cache_size) return;
412 if ((kh_size(h) + 1) * MAX_BLOCK_SIZE > fp->cache_size) {
413 /* A better way would be to remove the oldest block in the
414 * cache, but here we remove a random one for simplicity. This
415 * should not have a big impact on performance. */
416 for (k = kh_begin(h); k < kh_end(h); ++k)
417 if (kh_exist(h, k)) break;
419 free(kh_val(h, k).block);
423 k = kh_put(cache, h, fp->block_address, &ret);
424 if (ret == 0) return; // if this happens, a bug!
426 p->size = fp->block_length;
427 p->end_offset = fp->block_address + size;
428 p->block = malloc(MAX_BLOCK_SIZE);
429 memcpy(kh_val(h, k).block, fp->uncompressed_block, MAX_BLOCK_SIZE);
436 bgzf_byte_t header[BLOCK_HEADER_LENGTH];
439 int64_t block_address = knet_tell(fp->x.fpr);
440 if (load_block_from_cache(fp, block_address)) return 0;
441 int count = knet_read(fp->x.fpr, header, sizeof(header));
443 int64_t block_address = ftello(fp->file);
444 if (load_block_from_cache(fp, block_address)) return 0;
445 int count = fread(header, 1, sizeof(header), fp->file);
448 fp->block_length = 0;
452 if (count != sizeof(header)) {
453 report_error(fp, "read failed");
456 if (!check_header(header)) {
457 report_error(fp, "invalid block header");
460 int block_length = unpackInt16((uint8_t*)&header[16]) + 1;
461 bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block;
462 memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
463 int remaining = block_length - BLOCK_HEADER_LENGTH;
465 count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
467 count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file);
469 if (count != remaining) {
470 report_error(fp, "read failed");
474 count = inflate_block(fp, block_length);
478 if (fp->block_length != 0) {
479 // Do not reset offset if this read follows a seek.
480 fp->block_offset = 0;
482 fp->block_address = block_address;
483 fp->block_length = count;
484 cache_block(fp, size);
489 bgzf_read(BGZF* fp, void* data, int length)
494 if (fp->open_mode != 'r') {
495 report_error(fp, "file not open for reading");
500 bgzf_byte_t* output = data;
501 while (bytes_read < length) {
502 int available = fp->block_length - fp->block_offset;
503 if (available <= 0) {
504 if (read_block(fp) != 0) {
507 available = fp->block_length - fp->block_offset;
508 if (available <= 0) {
512 int copy_length = bgzf_min(length-bytes_read, available);
513 bgzf_byte_t* buffer = fp->uncompressed_block;
514 memcpy(output, buffer + fp->block_offset, copy_length);
515 fp->block_offset += copy_length;
516 output += copy_length;
517 bytes_read += copy_length;
519 if (fp->block_offset == fp->block_length) {
521 fp->block_address = knet_tell(fp->x.fpr);
523 fp->block_address = ftello(fp->file);
525 fp->block_offset = 0;
526 fp->block_length = 0;
533 flush_block(BGZF* fp)
535 while (fp->block_offset > 0) {
536 int block_length = deflate_block(fp, fp->block_offset);
537 if (block_length < 0) {
541 int count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
543 int count = fwrite(fp->compressed_block, 1, block_length, fp->file);
545 if (count != block_length) {
546 report_error(fp, "write failed");
549 fp->block_address += block_length;
555 bgzf_write(BGZF* fp, const void* data, int length)
557 if (fp->open_mode != 'w') {
558 report_error(fp, "file not open for writing");
562 if (fp->uncompressed_block == NULL) {
563 fp->uncompressed_block = malloc(fp->uncompressed_block_size);
566 const bgzf_byte_t* input = data;
567 int block_length = fp->uncompressed_block_size;
568 int bytes_written = 0;
569 while (bytes_written < length) {
570 int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written);
571 bgzf_byte_t* buffer = fp->uncompressed_block;
572 memcpy(buffer + fp->block_offset, input, copy_length);
573 fp->block_offset += copy_length;
574 input += copy_length;
575 bytes_written += copy_length;
576 if (fp->block_offset == block_length) {
577 if (flush_block(fp) != 0) {
582 return bytes_written;
588 if (fp->open_mode == 'w') {
589 if (flush_block(fp) != 0) {
592 { // add an empty block
593 int count, block_length = deflate_block(fp, 0);
595 count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
597 count = fwrite(fp->compressed_block, 1, block_length, fp->file);
601 if (fflush(fp->x.fpw) != 0) {
603 if (fflush(fp->file) != 0) {
605 report_error(fp, "flush failed");
609 if (fp->owned_file) {
612 if (fp->open_mode == 'w') ret = fclose(fp->x.fpw);
613 else ret = knet_close(fp->x.fpr);
614 if (ret != 0) return -1;
616 if (fclose(fp->file) != 0) {
621 free(fp->uncompressed_block);
622 free(fp->compressed_block);
631 return ((fp->block_address << 16) | (fp->block_offset & 0xFFFF));
634 void bgzf_set_cache_size(BGZF *fp, int cache_size)
636 if (fp) fp->cache_size = cache_size;
639 int bgzf_check_EOF(BGZF *fp)
641 static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
645 offset = knet_tell(fp->x.fpr);
646 if (knet_seek(fp->x.fpr, -28, SEEK_END) != 0) return -1;
647 knet_read(fp->x.fpr, buf, 28);
648 knet_seek(fp->x.fpr, offset, SEEK_SET);
650 offset = ftello(fp->file);
651 if (fseeko(fp->file, -28, SEEK_END) != 0) return -1;
652 fread(buf, 1, 28, fp->file);
653 fseeko(fp->file, offset, SEEK_SET);
655 return (memcmp(magic, buf, 28) == 0)? 1 : 0;
659 bgzf_seek(BGZF* fp, int64_t pos, int where)
661 if (fp->open_mode != 'r') {
662 report_error(fp, "file not open for read");
665 if (where != SEEK_SET) {
666 report_error(fp, "unimplemented seek option");
669 int block_offset = pos & 0xFFFF;
670 int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
672 if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) {
674 if (fseeko(fp->file, block_address, SEEK_SET) != 0) {
676 report_error(fp, "seek failed");
679 fp->block_length = 0; // indicates current block is not loaded
680 fp->block_address = block_address;
681 fp->block_offset = block_offset;