3 Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 2009-06-29 by lh3: cache recent uncompressed blocks.
26 2009-06-25 by lh3: optionally use my knetfile library to access file on a FTP.
27 2009-06-12 by lh3: support a mode string like "wu" where 'u' for uncompressed output */
34 #include <sys/types.h>
44 KHASH_MAP_INIT_INT64(cache, cache_t)
46 #if defined(_WIN32) || defined(_MSC_VER)
47 #define ftello(fp) ftell(fp)
48 #define fseeko(fp, offset, whence) fseek(fp, offset, whence)
50 extern off_t ftello(FILE *stream);
51 extern int fseeko(FILE *stream, off_t offset, int whence);
54 typedef int8_t bgzf_byte_t;
56 static const int DEFAULT_BLOCK_SIZE = 64 * 1024;
57 static const int MAX_BLOCK_SIZE = 64 * 1024;
59 static const int BLOCK_HEADER_LENGTH = 18;
60 static const int BLOCK_FOOTER_LENGTH = 8;
62 static const int GZIP_ID1 = 31;
63 static const int GZIP_ID2 = 139;
64 static const int CM_DEFLATE = 8;
65 static const int FLG_FEXTRA = 4;
66 static const int OS_UNKNOWN = 255;
67 static const int BGZF_ID1 = 66; // 'B'
68 static const int BGZF_ID2 = 67; // 'C'
69 static const int BGZF_LEN = 2;
70 static const int BGZF_XLEN = 6; // BGZF_LEN+4
72 static const int GZIP_WINDOW_BITS = -15; // no zlib header
73 static const int Z_DEFAULT_MEM_LEVEL = 8;
78 packInt16(uint8_t* buffer, uint16_t value)
81 buffer[1] = value >> 8;
86 unpackInt16(const uint8_t* buffer)
88 return (buffer[0] | (buffer[1] << 8));
93 packInt32(uint8_t* buffer, uint32_t value)
96 buffer[1] = value >> 8;
97 buffer[2] = value >> 16;
98 buffer[3] = value >> 24;
103 bgzf_min(int x, int y)
105 return (x < y) ? x : y;
110 report_error(BGZF* fp, const char* message) {
114 static BGZF *bgzf_read_init()
117 fp = calloc(1, sizeof(BGZF));
118 fp->uncompressed_block_size = MAX_BLOCK_SIZE;
119 fp->uncompressed_block = malloc(MAX_BLOCK_SIZE);
120 fp->compressed_block_size = MAX_BLOCK_SIZE;
121 fp->compressed_block = malloc(MAX_BLOCK_SIZE);
123 fp->cache = kh_init(cache);
132 knetFile *file = knet_dopen(fd, "r");
134 FILE* file = fdopen(fd, "r");
137 if (file == 0) return 0;
138 fp = bgzf_read_init();
139 fp->file_descriptor = fd;
151 open_write(int fd, bool is_uncompressed)
153 FILE* file = fdopen(fd, "w");
155 if (file == 0) return 0;
156 fp = malloc(sizeof(BGZF));
157 fp->file_descriptor = fd;
159 fp->owned_file = 0; fp->is_uncompressed = is_uncompressed;
165 fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE;
166 fp->uncompressed_block = NULL;
167 fp->compressed_block_size = MAX_BLOCK_SIZE;
168 fp->compressed_block = malloc(MAX_BLOCK_SIZE);
169 fp->block_address = 0;
170 fp->block_offset = 0;
171 fp->block_length = 0;
177 bgzf_open(const char* __restrict path, const char* __restrict mode)
180 if (mode[0] == 'r' || mode[0] == 'R') { /* The reading mode is preferred. */
182 knetFile *file = knet_open(path, mode);
183 if (file == 0) return 0;
184 fp = bgzf_read_init();
185 fp->file_descriptor = -1;
189 int fd, oflag = O_RDONLY;
193 fd = open(path, oflag);
194 if (fd == -1) return 0;
197 } else if (mode[0] == 'w' || mode[0] == 'W') {
198 int fd, oflag = O_WRONLY | O_CREAT | O_TRUNC;
202 fd = open(path, oflag, 0666);
203 if (fd == -1) return 0;
204 fp = open_write(fd, strstr(mode, "u")? 1 : 0);
213 bgzf_fdopen(int fd, const char * __restrict mode)
215 if (fd == -1) return 0;
216 if (mode[0] == 'r' || mode[0] == 'R') {
217 return open_read(fd);
218 } else if (mode[0] == 'w' || mode[0] == 'W') {
219 return open_write(fd, strstr(mode, "u")? 1 : 0);
227 deflate_block(BGZF* fp, int block_length)
229 // Deflate the block in fp->uncompressed_block into fp->compressed_block.
230 // Also adds an extra field that stores the compressed block length.
232 bgzf_byte_t* buffer = fp->compressed_block;
233 int buffer_size = fp->compressed_block_size;
236 buffer[0] = GZIP_ID1;
237 buffer[1] = GZIP_ID2;
238 buffer[2] = CM_DEFLATE;
239 buffer[3] = FLG_FEXTRA;
240 buffer[4] = 0; // mtime
245 buffer[9] = OS_UNKNOWN;
246 buffer[10] = BGZF_XLEN;
248 buffer[12] = BGZF_ID1;
249 buffer[13] = BGZF_ID2;
250 buffer[14] = BGZF_LEN;
252 buffer[16] = 0; // placeholder for block length
255 // loop to retry for blocks that do not compress enough
256 int input_length = block_length;
257 int compressed_length = 0;
259 int compress_level = fp->is_uncompressed? 0 : Z_DEFAULT_COMPRESSION;
263 zs.next_in = fp->uncompressed_block;
264 zs.avail_in = input_length;
265 zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
266 zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
268 int status = deflateInit2(&zs, compress_level, Z_DEFLATED,
269 GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
270 if (status != Z_OK) {
271 report_error(fp, "deflate init failed");
274 status = deflate(&zs, Z_FINISH);
275 if (status != Z_STREAM_END) {
277 if (status == Z_OK) {
278 // Not enough space in buffer.
279 // Can happen in the rare case the input doesn't compress enough.
280 // Reduce the amount of input until it fits.
281 input_length -= 1024;
282 if (input_length <= 0) {
283 // should never happen
284 report_error(fp, "input reduction failed");
289 report_error(fp, "deflate failed");
292 status = deflateEnd(&zs);
293 if (status != Z_OK) {
294 report_error(fp, "deflate end failed");
297 compressed_length = zs.total_out;
298 compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
299 if (compressed_length > MAX_BLOCK_SIZE) {
300 // should never happen
301 report_error(fp, "deflate overflow");
307 packInt16((uint8_t*)&buffer[16], compressed_length-1);
308 uint32_t crc = crc32(0L, NULL, 0L);
309 crc = crc32(crc, fp->uncompressed_block, input_length);
310 packInt32((uint8_t*)&buffer[compressed_length-8], crc);
311 packInt32((uint8_t*)&buffer[compressed_length-4], input_length);
313 int remaining = block_length - input_length;
315 if (remaining > input_length) {
316 // should never happen (check so we can use memcpy)
317 report_error(fp, "remainder too large");
320 memcpy(fp->uncompressed_block,
321 fp->uncompressed_block + input_length,
324 fp->block_offset = remaining;
325 return compressed_length;
330 inflate_block(BGZF* fp, int block_length)
332 // Inflate the block in fp->compressed_block into fp->uncompressed_block
337 zs.next_in = fp->compressed_block + 18;
338 zs.avail_in = block_length - 16;
339 zs.next_out = fp->uncompressed_block;
340 zs.avail_out = fp->uncompressed_block_size;
342 int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
343 if (status != Z_OK) {
344 report_error(fp, "inflate init failed");
347 status = inflate(&zs, Z_FINISH);
348 if (status != Z_STREAM_END) {
350 report_error(fp, "inflate failed");
353 status = inflateEnd(&zs);
354 if (status != Z_OK) {
355 report_error(fp, "inflate failed");
363 check_header(const bgzf_byte_t* header)
365 return (header[0] == GZIP_ID1 &&
366 header[1] == (bgzf_byte_t) GZIP_ID2 &&
367 header[2] == Z_DEFLATED &&
368 (header[3] & FLG_FEXTRA) != 0 &&
369 unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN &&
370 header[12] == BGZF_ID1 &&
371 header[13] == BGZF_ID2 &&
372 unpackInt16((uint8_t*)&header[14]) == BGZF_LEN);
375 static void free_cache(BGZF *fp)
378 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
379 if (fp->open_mode != 'r') return;
380 for (k = kh_begin(h); k < kh_end(h); ++k)
381 if (kh_exist(h, k)) free(kh_val(h, k).block);
382 kh_destroy(cache, h);
385 static int load_block_from_cache(BGZF *fp, int64_t block_address)
389 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
390 k = kh_get(cache, h, block_address);
391 if (k == kh_end(h)) return 0;
393 if (fp->block_length != 0) fp->block_offset = 0;
394 fp->block_address = block_address;
395 fp->block_length = p->size;
396 memcpy(fp->uncompressed_block, p->block, MAX_BLOCK_SIZE);
398 knet_seek(fp->x.fpr, p->end_offset, SEEK_SET);
400 fseeko(fp->file, p->end_offset, SEEK_SET);
405 static void cache_block(BGZF *fp, int size)
410 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
411 if (MAX_BLOCK_SIZE >= fp->cache_size) return;
412 if ((kh_size(h) + 1) * MAX_BLOCK_SIZE > fp->cache_size) {
413 /* A better way would be to remove the oldest block in the
414 * cache, but here we remove a random one for simplicity. This
415 * should not have a big impact on performance. */
416 for (k = kh_begin(h); k < kh_end(h); ++k)
417 if (kh_exist(h, k)) break;
419 free(kh_val(h, k).block);
423 k = kh_put(cache, h, fp->block_address, &ret);
424 if (ret == 0) return; // if this happens, a bug!
426 p->size = fp->block_length;
427 p->end_offset = fp->block_address + size;
428 p->block = malloc(MAX_BLOCK_SIZE);
429 memcpy(kh_val(h, k).block, fp->uncompressed_block, MAX_BLOCK_SIZE);
433 bgzf_read_block(BGZF* fp)
435 bgzf_byte_t header[BLOCK_HEADER_LENGTH];
438 int64_t block_address = knet_tell(fp->x.fpr);
439 if (load_block_from_cache(fp, block_address)) return 0;
440 int count = knet_read(fp->x.fpr, header, sizeof(header));
442 int64_t block_address = ftello(fp->file);
443 if (load_block_from_cache(fp, block_address)) return 0;
444 int count = fread(header, 1, sizeof(header), fp->file);
447 fp->block_length = 0;
451 if (count != sizeof(header)) {
452 report_error(fp, "read failed");
455 if (!check_header(header)) {
456 report_error(fp, "invalid block header");
459 int block_length = unpackInt16((uint8_t*)&header[16]) + 1;
460 bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block;
461 memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
462 int remaining = block_length - BLOCK_HEADER_LENGTH;
464 count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
466 count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file);
468 if (count != remaining) {
469 report_error(fp, "read failed");
473 count = inflate_block(fp, block_length);
477 if (fp->block_length != 0) {
478 // Do not reset offset if this read follows a seek.
479 fp->block_offset = 0;
481 fp->block_address = block_address;
482 fp->block_length = count;
483 cache_block(fp, size);
488 bgzf_read(BGZF* fp, void* data, int length)
493 if (fp->open_mode != 'r') {
494 report_error(fp, "file not open for reading");
499 bgzf_byte_t* output = data;
500 while (bytes_read < length) {
501 int available = fp->block_length - fp->block_offset;
502 if (available <= 0) {
503 if (bgzf_read_block(fp) != 0) {
506 available = fp->block_length - fp->block_offset;
507 if (available <= 0) {
511 int copy_length = bgzf_min(length-bytes_read, available);
512 bgzf_byte_t* buffer = fp->uncompressed_block;
513 memcpy(output, buffer + fp->block_offset, copy_length);
514 fp->block_offset += copy_length;
515 output += copy_length;
516 bytes_read += copy_length;
518 if (fp->block_offset == fp->block_length) {
520 fp->block_address = knet_tell(fp->x.fpr);
522 fp->block_address = ftello(fp->file);
524 fp->block_offset = 0;
525 fp->block_length = 0;
530 int bgzf_flush(BGZF* fp)
532 while (fp->block_offset > 0) {
533 int count, block_length;
534 block_length = deflate_block(fp, fp->block_offset);
535 if (block_length < 0) return -1;
537 count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
539 count = fwrite(fp->compressed_block, 1, block_length, fp->file);
541 if (count != block_length) {
542 report_error(fp, "write failed");
545 fp->block_address += block_length;
550 int bgzf_flush_try(BGZF *fp, int size)
552 if (fp->block_offset + size > fp->uncompressed_block_size)
553 return bgzf_flush(fp);
557 int bgzf_write(BGZF* fp, const void* data, int length)
559 if (fp->open_mode != 'w') {
560 report_error(fp, "file not open for writing");
564 if (fp->uncompressed_block == NULL)
565 fp->uncompressed_block = malloc(fp->uncompressed_block_size);
567 const bgzf_byte_t* input = data;
568 int block_length = fp->uncompressed_block_size;
569 int bytes_written = 0;
570 while (bytes_written < length) {
571 int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written);
572 bgzf_byte_t* buffer = fp->uncompressed_block;
573 memcpy(buffer + fp->block_offset, input, copy_length);
574 fp->block_offset += copy_length;
575 input += copy_length;
576 bytes_written += copy_length;
577 if (fp->block_offset == block_length) {
578 if (bgzf_flush(fp) != 0) {
583 return bytes_written;
586 int bgzf_close(BGZF* fp)
588 if (fp->open_mode == 'w') {
589 if (bgzf_flush(fp) != 0) return -1;
590 { // add an empty block
591 int count, block_length = deflate_block(fp, 0);
593 count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
595 count = fwrite(fp->compressed_block, 1, block_length, fp->file);
599 if (fflush(fp->x.fpw) != 0) {
601 if (fflush(fp->file) != 0) {
603 report_error(fp, "flush failed");
607 if (fp->owned_file) {
610 if (fp->open_mode == 'w') ret = fclose(fp->x.fpw);
611 else ret = knet_close(fp->x.fpr);
612 if (ret != 0) return -1;
614 if (fclose(fp->file) != 0) {
619 free(fp->uncompressed_block);
620 free(fp->compressed_block);
626 void bgzf_set_cache_size(BGZF *fp, int cache_size)
628 if (fp) fp->cache_size = cache_size;
631 int bgzf_check_EOF(BGZF *fp)
633 static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
637 offset = knet_tell(fp->x.fpr);
638 if (knet_seek(fp->x.fpr, -28, SEEK_END) != 0) return -1;
639 knet_read(fp->x.fpr, buf, 28);
640 knet_seek(fp->x.fpr, offset, SEEK_SET);
642 offset = ftello(fp->file);
643 if (fseeko(fp->file, -28, SEEK_END) != 0) return -1;
644 fread(buf, 1, 28, fp->file);
645 fseeko(fp->file, offset, SEEK_SET);
647 return (memcmp(magic, buf, 28) == 0)? 1 : 0;
650 int64_t bgzf_seek(BGZF* fp, int64_t pos, int where)
653 int64_t block_address;
655 if (fp->open_mode != 'r') {
656 report_error(fp, "file not open for read");
659 if (where != SEEK_SET) {
660 report_error(fp, "unimplemented seek option");
663 block_offset = pos & 0xFFFF;
664 block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
666 if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) {
668 if (fseeko(fp->file, block_address, SEEK_SET) != 0) {
670 report_error(fp, "seek failed");
673 fp->block_length = 0; // indicates current block is not loaded
674 fp->block_address = block_address;
675 fp->block_offset = block_offset;