3 Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to deal
7 in the Software without restriction, including without limitation the rights
8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the Software is
10 furnished to do so, subject to the following conditions:
12 The above copyright notice and this permission notice shall be included in
13 all copies or substantial portions of the Software.
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 2009-06-29 by lh3: cache recent uncompressed blocks.
26 2009-06-25 by lh3: optionally use my knetfile library to access file on a FTP.
27 2009-06-12 by lh3: support a mode string like "wu" where 'u' for uncompressed output */
34 #include <sys/types.h>
44 KHASH_MAP_INIT_INT64(cache, cache_t)
46 #if defined(_WIN32) || defined(_MSC_VER)
47 #define ftello(fp) ftell(fp)
48 #define fseeko(fp, offset, whence) fseek(fp, offset, whence)
50 extern off_t ftello(FILE *stream);
51 extern int fseeko(FILE *stream, off_t offset, int whence);
54 typedef int8_t bgzf_byte_t;
56 static const int DEFAULT_BLOCK_SIZE = 64 * 1024;
57 static const int MAX_BLOCK_SIZE = 64 * 1024;
59 static const int BLOCK_HEADER_LENGTH = 18;
60 static const int BLOCK_FOOTER_LENGTH = 8;
62 static const int GZIP_ID1 = 31;
63 static const int GZIP_ID2 = 139;
64 static const int CM_DEFLATE = 8;
65 static const int FLG_FEXTRA = 4;
66 static const int OS_UNKNOWN = 255;
67 static const int BGZF_ID1 = 66; // 'B'
68 static const int BGZF_ID2 = 67; // 'C'
69 static const int BGZF_LEN = 2;
70 static const int BGZF_XLEN = 6; // BGZF_LEN+4
72 static const int GZIP_WINDOW_BITS = -15; // no zlib header
73 static const int Z_DEFAULT_MEM_LEVEL = 8;
78 packInt16(uint8_t* buffer, uint16_t value)
81 buffer[1] = value >> 8;
86 unpackInt16(const uint8_t* buffer)
88 return (buffer[0] | (buffer[1] << 8));
93 packInt32(uint8_t* buffer, uint32_t value)
96 buffer[1] = value >> 8;
97 buffer[2] = value >> 16;
98 buffer[3] = value >> 24;
103 bgzf_min(int x, int y)
105 return (x < y) ? x : y;
110 report_error(BGZF* fp, const char* message) {
114 static BGZF *bgzf_read_init()
117 fp = calloc(1, sizeof(BGZF));
118 fp->uncompressed_block_size = MAX_BLOCK_SIZE;
119 fp->uncompressed_block = malloc(MAX_BLOCK_SIZE);
120 fp->compressed_block_size = MAX_BLOCK_SIZE;
121 fp->compressed_block = malloc(MAX_BLOCK_SIZE);
123 fp->cache = kh_init(cache);
132 knetFile *file = knet_dopen(fd, "r");
134 FILE* file = fdopen(fd, "r");
137 if (file == 0) return 0;
138 fp = bgzf_read_init();
139 fp->file_descriptor = fd;
151 open_write(int fd, int compress_level) // compress_level==-1 for the default level
153 FILE* file = fdopen(fd, "w");
155 if (file == 0) return 0;
156 fp = malloc(sizeof(BGZF));
157 fp->file_descriptor = fd;
160 fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1
161 if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION;
167 fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE;
168 fp->uncompressed_block = NULL;
169 fp->compressed_block_size = MAX_BLOCK_SIZE;
170 fp->compressed_block = malloc(MAX_BLOCK_SIZE);
171 fp->block_address = 0;
172 fp->block_offset = 0;
173 fp->block_length = 0;
179 bgzf_open(const char* __restrict path, const char* __restrict mode)
182 if (strchr(mode, 'r') || strchr(mode, 'R')) { /* The reading mode is preferred. */
184 knetFile *file = knet_open(path, mode);
185 if (file == 0) return 0;
186 fp = bgzf_read_init();
187 fp->file_descriptor = -1;
191 int fd, oflag = O_RDONLY;
195 fd = open(path, oflag);
196 if (fd == -1) return 0;
199 } else if (strchr(mode, 'w') || strchr(mode, 'W')) {
200 int fd, compress_level = -1, oflag = O_WRONLY | O_CREAT | O_TRUNC;
204 fd = open(path, oflag, 0666);
205 if (fd == -1) return 0;
206 { // set compress_level
208 for (i = 0; mode[i]; ++i)
209 if (mode[i] >= '0' && mode[i] <= '9') break;
210 if (mode[i]) compress_level = (int)mode[i] - '0';
211 if (strchr(mode, 'u')) compress_level = 0;
213 fp = open_write(fd, compress_level);
215 if (fp != NULL) fp->owned_file = 1;
220 bgzf_fdopen(int fd, const char * __restrict mode)
222 if (fd == -1) return 0;
223 if (mode[0] == 'r' || mode[0] == 'R') {
224 return open_read(fd);
225 } else if (mode[0] == 'w' || mode[0] == 'W') {
226 int i, compress_level = -1;
227 for (i = 0; mode[i]; ++i)
228 if (mode[i] >= '0' && mode[i] <= '9') break;
229 if (mode[i]) compress_level = (int)mode[i] - '0';
230 if (strchr(mode, 'u')) compress_level = 0;
231 return open_write(fd, compress_level);
239 deflate_block(BGZF* fp, int block_length)
241 // Deflate the block in fp->uncompressed_block into fp->compressed_block.
242 // Also adds an extra field that stores the compressed block length.
244 bgzf_byte_t* buffer = fp->compressed_block;
245 int buffer_size = fp->compressed_block_size;
248 buffer[0] = GZIP_ID1;
249 buffer[1] = GZIP_ID2;
250 buffer[2] = CM_DEFLATE;
251 buffer[3] = FLG_FEXTRA;
252 buffer[4] = 0; // mtime
257 buffer[9] = OS_UNKNOWN;
258 buffer[10] = BGZF_XLEN;
260 buffer[12] = BGZF_ID1;
261 buffer[13] = BGZF_ID2;
262 buffer[14] = BGZF_LEN;
264 buffer[16] = 0; // placeholder for block length
267 // loop to retry for blocks that do not compress enough
268 int input_length = block_length;
269 int compressed_length = 0;
274 zs.next_in = fp->uncompressed_block;
275 zs.avail_in = input_length;
276 zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
277 zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
279 int status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED,
280 GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
281 if (status != Z_OK) {
282 report_error(fp, "deflate init failed");
285 status = deflate(&zs, Z_FINISH);
286 if (status != Z_STREAM_END) {
288 if (status == Z_OK) {
289 // Not enough space in buffer.
290 // Can happen in the rare case the input doesn't compress enough.
291 // Reduce the amount of input until it fits.
292 input_length -= 1024;
293 if (input_length <= 0) {
294 // should never happen
295 report_error(fp, "input reduction failed");
300 report_error(fp, "deflate failed");
303 status = deflateEnd(&zs);
304 if (status != Z_OK) {
305 report_error(fp, "deflate end failed");
308 compressed_length = zs.total_out;
309 compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
310 if (compressed_length > MAX_BLOCK_SIZE) {
311 // should never happen
312 report_error(fp, "deflate overflow");
318 packInt16((uint8_t*)&buffer[16], compressed_length-1);
319 uint32_t crc = crc32(0L, NULL, 0L);
320 crc = crc32(crc, fp->uncompressed_block, input_length);
321 packInt32((uint8_t*)&buffer[compressed_length-8], crc);
322 packInt32((uint8_t*)&buffer[compressed_length-4], input_length);
324 int remaining = block_length - input_length;
326 if (remaining > input_length) {
327 // should never happen (check so we can use memcpy)
328 report_error(fp, "remainder too large");
331 memcpy(fp->uncompressed_block,
332 fp->uncompressed_block + input_length,
335 fp->block_offset = remaining;
336 return compressed_length;
341 inflate_block(BGZF* fp, int block_length)
343 // Inflate the block in fp->compressed_block into fp->uncompressed_block
349 zs.next_in = fp->compressed_block + 18;
350 zs.avail_in = block_length - 16;
351 zs.next_out = fp->uncompressed_block;
352 zs.avail_out = fp->uncompressed_block_size;
354 status = inflateInit2(&zs, GZIP_WINDOW_BITS);
355 if (status != Z_OK) {
356 report_error(fp, "inflate init failed");
359 status = inflate(&zs, Z_FINISH);
360 if (status != Z_STREAM_END) {
362 report_error(fp, "inflate failed");
365 status = inflateEnd(&zs);
366 if (status != Z_OK) {
367 report_error(fp, "inflate failed");
375 check_header(const bgzf_byte_t* header)
377 return (header[0] == GZIP_ID1 &&
378 header[1] == (bgzf_byte_t) GZIP_ID2 &&
379 header[2] == Z_DEFLATED &&
380 (header[3] & FLG_FEXTRA) != 0 &&
381 unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN &&
382 header[12] == BGZF_ID1 &&
383 header[13] == BGZF_ID2 &&
384 unpackInt16((uint8_t*)&header[14]) == BGZF_LEN);
387 static void free_cache(BGZF *fp)
390 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
391 if (fp->open_mode != 'r') return;
392 for (k = kh_begin(h); k < kh_end(h); ++k)
393 if (kh_exist(h, k)) free(kh_val(h, k).block);
394 kh_destroy(cache, h);
397 static int load_block_from_cache(BGZF *fp, int64_t block_address)
401 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
402 k = kh_get(cache, h, block_address);
403 if (k == kh_end(h)) return 0;
405 if (fp->block_length != 0) fp->block_offset = 0;
406 fp->block_address = block_address;
407 fp->block_length = p->size;
408 memcpy(fp->uncompressed_block, p->block, MAX_BLOCK_SIZE);
410 knet_seek(fp->x.fpr, p->end_offset, SEEK_SET);
412 fseeko(fp->file, p->end_offset, SEEK_SET);
417 static void cache_block(BGZF *fp, int size)
422 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
423 if (MAX_BLOCK_SIZE >= fp->cache_size) return;
424 if ((kh_size(h) + 1) * MAX_BLOCK_SIZE > fp->cache_size) {
425 /* A better way would be to remove the oldest block in the
426 * cache, but here we remove a random one for simplicity. This
427 * should not have a big impact on performance. */
428 for (k = kh_begin(h); k < kh_end(h); ++k)
429 if (kh_exist(h, k)) break;
431 free(kh_val(h, k).block);
435 k = kh_put(cache, h, fp->block_address, &ret);
436 if (ret == 0) return; // if this happens, a bug!
438 p->size = fp->block_length;
439 p->end_offset = fp->block_address + size;
440 p->block = malloc(MAX_BLOCK_SIZE);
441 memcpy(kh_val(h, k).block, fp->uncompressed_block, MAX_BLOCK_SIZE);
445 bgzf_read_block(BGZF* fp)
447 bgzf_byte_t header[BLOCK_HEADER_LENGTH];
448 int count, size = 0, block_length, remaining;
450 int64_t block_address = knet_tell(fp->x.fpr);
451 if (load_block_from_cache(fp, block_address)) return 0;
452 count = knet_read(fp->x.fpr, header, sizeof(header));
454 int64_t block_address = ftello(fp->file);
455 if (load_block_from_cache(fp, block_address)) return 0;
456 count = fread(header, 1, sizeof(header), fp->file);
459 fp->block_length = 0;
463 if (count != sizeof(header)) {
464 report_error(fp, "read failed");
467 if (!check_header(header)) {
468 report_error(fp, "invalid block header");
471 block_length = unpackInt16((uint8_t*)&header[16]) + 1;
472 bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block;
473 memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
474 remaining = block_length - BLOCK_HEADER_LENGTH;
476 count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
478 count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file);
480 if (count != remaining) {
481 report_error(fp, "read failed");
485 count = inflate_block(fp, block_length);
486 if (count < 0) return -1;
487 if (fp->block_length != 0) {
488 // Do not reset offset if this read follows a seek.
489 fp->block_offset = 0;
491 fp->block_address = block_address;
492 fp->block_length = count;
493 cache_block(fp, size);
498 bgzf_read(BGZF* fp, void* data, int length)
503 if (fp->open_mode != 'r') {
504 report_error(fp, "file not open for reading");
509 bgzf_byte_t* output = data;
510 while (bytes_read < length) {
511 int copy_length, available = fp->block_length - fp->block_offset;
513 if (available <= 0) {
514 if (bgzf_read_block(fp) != 0) {
517 available = fp->block_length - fp->block_offset;
518 if (available <= 0) {
522 copy_length = bgzf_min(length-bytes_read, available);
523 buffer = fp->uncompressed_block;
524 memcpy(output, buffer + fp->block_offset, copy_length);
525 fp->block_offset += copy_length;
526 output += copy_length;
527 bytes_read += copy_length;
529 if (fp->block_offset == fp->block_length) {
531 fp->block_address = knet_tell(fp->x.fpr);
533 fp->block_address = ftello(fp->file);
535 fp->block_offset = 0;
536 fp->block_length = 0;
541 int bgzf_flush(BGZF* fp)
543 while (fp->block_offset > 0) {
544 int count, block_length;
545 block_length = deflate_block(fp, fp->block_offset);
546 if (block_length < 0) return -1;
548 count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
550 count = fwrite(fp->compressed_block, 1, block_length, fp->file);
552 if (count != block_length) {
553 report_error(fp, "write failed");
556 fp->block_address += block_length;
561 int bgzf_flush_try(BGZF *fp, int size)
563 if (fp->block_offset + size > fp->uncompressed_block_size)
564 return bgzf_flush(fp);
568 int bgzf_write(BGZF* fp, const void* data, int length)
570 const bgzf_byte_t *input = data;
571 int block_length, bytes_written;
572 if (fp->open_mode != 'w') {
573 report_error(fp, "file not open for writing");
577 if (fp->uncompressed_block == NULL)
578 fp->uncompressed_block = malloc(fp->uncompressed_block_size);
581 block_length = fp->uncompressed_block_size;
583 while (bytes_written < length) {
584 int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written);
585 bgzf_byte_t* buffer = fp->uncompressed_block;
586 memcpy(buffer + fp->block_offset, input, copy_length);
587 fp->block_offset += copy_length;
588 input += copy_length;
589 bytes_written += copy_length;
590 if (fp->block_offset == block_length) {
591 if (bgzf_flush(fp) != 0) {
596 return bytes_written;
599 int bgzf_close(BGZF* fp)
601 if (fp->open_mode == 'w') {
602 if (bgzf_flush(fp) != 0) return -1;
603 { // add an empty block
604 int count, block_length = deflate_block(fp, 0);
606 count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
608 count = fwrite(fp->compressed_block, 1, block_length, fp->file);
612 if (fflush(fp->x.fpw) != 0) {
614 if (fflush(fp->file) != 0) {
616 report_error(fp, "flush failed");
620 if (fp->owned_file) {
623 if (fp->open_mode == 'w') ret = fclose(fp->x.fpw);
624 else ret = knet_close(fp->x.fpr);
625 if (ret != 0) return -1;
627 if (fclose(fp->file) != 0) return -1;
630 free(fp->uncompressed_block);
631 free(fp->compressed_block);
637 void bgzf_set_cache_size(BGZF *fp, int cache_size)
639 if (fp) fp->cache_size = cache_size;
642 int bgzf_check_EOF(BGZF *fp)
644 static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
648 offset = knet_tell(fp->x.fpr);
649 if (knet_seek(fp->x.fpr, -28, SEEK_END) != 0) return -1;
650 knet_read(fp->x.fpr, buf, 28);
651 knet_seek(fp->x.fpr, offset, SEEK_SET);
653 offset = ftello(fp->file);
654 if (fseeko(fp->file, -28, SEEK_END) != 0) return -1;
655 fread(buf, 1, 28, fp->file);
656 fseeko(fp->file, offset, SEEK_SET);
658 return (memcmp(magic, buf, 28) == 0)? 1 : 0;
661 int64_t bgzf_seek(BGZF* fp, int64_t pos, int where)
664 int64_t block_address;
666 if (fp->open_mode != 'r') {
667 report_error(fp, "file not open for read");
670 if (where != SEEK_SET) {
671 report_error(fp, "unimplemented seek option");
674 block_offset = pos & 0xFFFF;
675 block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
677 if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) {
679 if (fseeko(fp->file, block_address, SEEK_SET) != 0) {
681 report_error(fp, "seek failed");
684 fp->block_length = 0; // indicates current block is not loaded
685 fp->block_address = block_address;
686 fp->block_offset = block_offset;