3 Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
4 2011 Attractive Chaos <attractor@live.co.uk>
6 Permission is hereby granted, free of charge, to any person obtaining a copy
7 of this software and associated documentation files (the "Software"), to deal
8 in the Software without restriction, including without limitation the rights
9 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 copies of the Software, and to permit persons to whom the Software is
11 furnished to do so, subject to the following conditions:
13 The above copyright notice and this permission notice shall be included in
14 all copies or substantial portions of the Software.
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
30 #include <sys/types.h>
35 typedef knetFile *_bgzf_file_t;
36 #define _bgzf_open(fn, mode) knet_open(fn, mode)
37 #define _bgzf_dopen(fp, mode) knet_dopen(fp, mode)
38 #define _bgzf_close(fp) knet_close(fp)
39 #define _bgzf_fileno(fp) ((fp)->fd)
40 #define _bgzf_tell(fp) knet_tell(fp)
41 #define _bgzf_seek(fp, offset, whence) knet_seek(fp, offset, whence)
42 #define _bgzf_read(fp, buf, len) knet_read(fp, buf, len)
43 #define _bgzf_write(fp, buf, len) knet_write(fp, buf, len)
44 #else // ~defined(_USE_KNETFILE)
45 #if defined(_WIN32) || defined(_MSC_VER)
46 #define ftello(fp) ftell(fp)
47 #define fseeko(fp, offset, whence) fseek(fp, offset, whence)
48 #else // ~defined(_WIN32)
49 extern off_t ftello(FILE *stream);
50 extern int fseeko(FILE *stream, off_t offset, int whence);
51 #endif // ~defined(_WIN32)
52 typedef FILE *_bgzf_file_t;
53 #define _bgzf_open(fn, mode) fopen(fn, mode)
54 #define _bgzf_dopen(fp, mode) fdopen(fp, mode)
55 #define _bgzf_close(fp) fclose(fp)
56 #define _bgzf_fileno(fp) fileno(fp)
57 #define _bgzf_tell(fp) ftello(fp)
58 #define _bgzf_seek(fp, offset, whence) fseeko(fp, offset, whence)
59 #define _bgzf_read(fp, buf, len) fread(buf, 1, len, fp)
60 #define _bgzf_write(fp, buf, len) fwrite(buf, 1, len, fp)
61 #endif // ~define(_USE_KNETFILE)
63 #define BLOCK_HEADER_LENGTH 18
64 #define BLOCK_FOOTER_LENGTH 8
67 /* BGZF/GZIP header (speciallized from RFC 1952; little endian):
68 +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
69 | 31|139| 8| 4| 0| 0|255| 6| 66| 67| 2|BLK_LEN|
70 +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
72 static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0";
81 KHASH_MAP_INIT_INT64(cache, cache_t)
84 static inline void packInt16(uint8_t *buffer, uint16_t value)
87 buffer[1] = value >> 8;
90 static inline int unpackInt16(const uint8_t *buffer)
92 return buffer[0] | buffer[1] << 8;
95 static inline void packInt32(uint8_t *buffer, uint32_t value)
98 buffer[1] = value >> 8;
99 buffer[2] = value >> 16;
100 buffer[3] = value >> 24;
103 static BGZF *bgzf_read_init()
106 fp = calloc(1, sizeof(BGZF));
108 fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
109 fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
111 fp->cache = kh_init(cache);
116 static BGZF *bgzf_write_init(int compress_level) // compress_level==-1 for the default level
119 fp = calloc(1, sizeof(BGZF));
121 fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
122 fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
123 fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1
124 if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION;
127 // get the compress level from the mode string
128 static int mode2level(const char *__restrict mode)
130 int i, compress_level = -1;
131 for (i = 0; mode[i]; ++i)
132 if (mode[i] >= '0' && mode[i] <= '9') break;
133 if (mode[i]) compress_level = (int)mode[i] - '0';
134 if (strchr(mode, 'u')) compress_level = 0;
135 return compress_level;
138 BGZF *bgzf_open(const char *path, const char *mode)
141 assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE);
142 if (strchr(mode, 'r') || strchr(mode, 'R')) {
144 if ((fpr = _bgzf_open(path, "r")) == 0) return 0;
145 fp = bgzf_read_init();
147 } else if (strchr(mode, 'w') || strchr(mode, 'W')) {
149 if ((fpw = fopen(path, "w")) == 0) return 0;
150 fp = bgzf_write_init(mode2level(mode));
156 BGZF *bgzf_dopen(int fd, const char *mode)
159 assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE);
160 if (strchr(mode, 'r') || strchr(mode, 'R')) {
162 if ((fpr = _bgzf_dopen(fd, "r")) == 0) return 0;
163 fp = bgzf_read_init();
165 } else if (strchr(mode, 'w') || strchr(mode, 'W')) {
167 if ((fpw = fdopen(fd, "w")) == 0) return 0;
168 fp = bgzf_write_init(mode2level(mode));
174 static int bgzf_compress(void *_dst, int *dlen, void *src, int slen, int level)
178 uint8_t *dst = (uint8_t*)_dst;
181 zs.zalloc = NULL; zs.zfree = NULL;
184 zs.next_out = dst + BLOCK_HEADER_LENGTH;
185 zs.avail_out = *dlen - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
186 if (deflateInit2(&zs, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) != Z_OK) return -1; // -15 to disable zlib header/footer
187 if (deflate(&zs, Z_FINISH) != Z_STREAM_END) return -1;
188 if (deflateEnd(&zs) != Z_OK) return -1;
189 *dlen = zs.total_out + BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
191 memcpy(dst, g_magic, BLOCK_HEADER_LENGTH); // the last two bytes are a place holder for the length of the block
192 packInt16(&dst[16], *dlen - 1); // write the compressed length; -1 to fit 2 bytes
194 crc = crc32(crc32(0L, NULL, 0L), src, slen);
195 packInt32((uint8_t*)&dst[*dlen - 8], crc);
196 packInt32((uint8_t*)&dst[*dlen - 4], slen);
200 // Deflate the block in fp->uncompressed_block into fp->compressed_block. Also adds an extra field that stores the compressed block length.
201 static int deflate_block(BGZF *fp, int block_length)
203 int comp_size = BGZF_MAX_BLOCK_SIZE;
204 if (bgzf_compress(fp->compressed_block, &comp_size, fp->uncompressed_block, block_length, fp->compress_level) != 0) {
205 fp->errcode |= BGZF_ERR_ZLIB;
208 fp->block_offset = 0;
212 // Inflate the block in fp->compressed_block into fp->uncompressed_block
213 static int inflate_block(BGZF* fp, int block_length)
218 zs.next_in = fp->compressed_block + 18;
219 zs.avail_in = block_length - 16;
220 zs.next_out = fp->uncompressed_block;
221 zs.avail_out = BGZF_BLOCK_SIZE;
223 if (inflateInit2(&zs, -15) != Z_OK) {
224 fp->errcode |= BGZF_ERR_ZLIB;
227 if (inflate(&zs, Z_FINISH) != Z_STREAM_END) {
229 fp->errcode |= BGZF_ERR_ZLIB;
232 if (inflateEnd(&zs) != Z_OK) {
233 fp->errcode |= BGZF_ERR_ZLIB;
239 static int check_header(const uint8_t *header)
241 return (header[0] == 31 && header[1] == 139 && header[2] == 8 && (header[3] & 4) != 0
242 && unpackInt16((uint8_t*)&header[10]) == 6
243 && header[12] == 'B' && header[13] == 'C'
244 && unpackInt16((uint8_t*)&header[14]) == 2);
248 static void free_cache(BGZF *fp)
251 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
252 if (fp->is_write) return;
253 for (k = kh_begin(h); k < kh_end(h); ++k)
254 if (kh_exist(h, k)) free(kh_val(h, k).block);
255 kh_destroy(cache, h);
258 static int load_block_from_cache(BGZF *fp, int64_t block_address)
262 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
263 k = kh_get(cache, h, block_address);
264 if (k == kh_end(h)) return 0;
266 if (fp->block_length != 0) fp->block_offset = 0;
267 fp->block_address = block_address;
268 fp->block_length = p->size;
269 memcpy(fp->uncompressed_block, p->block, BGZF_BLOCK_SIZE);
270 _bgzf_seek((_bgzf_file_t)fp->fp, p->end_offset, SEEK_SET);
274 static void cache_block(BGZF *fp, int size)
279 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
280 if (BGZF_BLOCK_SIZE >= fp->cache_size) return;
281 if ((kh_size(h) + 1) * BGZF_BLOCK_SIZE > fp->cache_size) {
282 /* A better way would be to remove the oldest block in the
283 * cache, but here we remove a random one for simplicity. This
284 * should not have a big impact on performance. */
285 for (k = kh_begin(h); k < kh_end(h); ++k)
286 if (kh_exist(h, k)) break;
288 free(kh_val(h, k).block);
292 k = kh_put(cache, h, fp->block_address, &ret);
293 if (ret == 0) return; // if this happens, a bug!
295 p->size = fp->block_length;
296 p->end_offset = fp->block_address + size;
297 p->block = malloc(BGZF_BLOCK_SIZE);
298 memcpy(kh_val(h, k).block, fp->uncompressed_block, BGZF_BLOCK_SIZE);
301 static void free_cache(BGZF *fp) {}
302 static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;}
303 static void cache_block(BGZF *fp, int size) {}
306 int bgzf_read_block(BGZF *fp)
308 uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block;
309 int count, size = 0, block_length, remaining;
310 int64_t block_address;
311 block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
312 if (load_block_from_cache(fp, block_address)) return 0;
313 count = _bgzf_read(fp->fp, header, sizeof(header));
314 if (count == 0) { // no data read
315 fp->block_length = 0;
318 if (count != sizeof(header) || !check_header(header)) {
319 fp->errcode |= BGZF_ERR_HEADER;
323 block_length = unpackInt16((uint8_t*)&header[16]) + 1; // +1 because when writing this number, we used "-1"
324 compressed_block = (uint8_t*)fp->compressed_block;
325 memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
326 remaining = block_length - BLOCK_HEADER_LENGTH;
327 count = _bgzf_read(fp->fp, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
328 if (count != remaining) {
329 fp->errcode |= BGZF_ERR_IO;
333 if ((count = inflate_block(fp, block_length)) < 0) return -1;
334 if (fp->block_length != 0) fp->block_offset = 0; // Do not reset offset if this read follows a seek.
335 fp->block_address = block_address;
336 fp->block_length = count;
337 cache_block(fp, size);
341 ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length)
343 ssize_t bytes_read = 0;
344 uint8_t *output = data;
345 if (length <= 0) return 0;
346 assert(fp->is_write == 0);
347 while (bytes_read < length) {
348 int copy_length, available = fp->block_length - fp->block_offset;
350 if (available <= 0) {
351 if (bgzf_read_block(fp) != 0) return -1;
352 available = fp->block_length - fp->block_offset;
353 if (available <= 0) break;
355 copy_length = length - bytes_read < available? length - bytes_read : available;
356 buffer = fp->uncompressed_block;
357 memcpy(output, buffer + fp->block_offset, copy_length);
358 fp->block_offset += copy_length;
359 output += copy_length;
360 bytes_read += copy_length;
362 if (fp->block_offset == fp->block_length) {
363 fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
364 fp->block_offset = fp->block_length = 0;
369 int bgzf_flush(BGZF *fp)
371 if (!fp->is_write) return 0;
372 while (fp->block_offset > 0) {
374 block_length = deflate_block(fp, fp->block_offset);
375 if (block_length < 0) return -1;
376 if (fwrite(fp->compressed_block, 1, block_length, fp->fp) != block_length) {
377 fp->errcode |= BGZF_ERR_IO; // possibly truncated file
380 fp->block_address += block_length;
385 int bgzf_flush_try(BGZF *fp, ssize_t size)
387 if (fp->block_offset + size > BGZF_BLOCK_SIZE)
388 return bgzf_flush(fp);
392 ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length)
394 const uint8_t *input = data;
395 int block_length = BGZF_BLOCK_SIZE, bytes_written;
396 assert(fp->is_write);
399 while (bytes_written < length) {
400 uint8_t* buffer = fp->uncompressed_block;
401 int copy_length = block_length - fp->block_offset < length - bytes_written? block_length - fp->block_offset : length - bytes_written;
402 memcpy(buffer + fp->block_offset, input, copy_length);
403 fp->block_offset += copy_length;
404 input += copy_length;
405 bytes_written += copy_length;
406 if (fp->block_offset == block_length && bgzf_flush(fp)) break;
408 return bytes_written;
411 int bgzf_close(BGZF* fp)
413 int ret, count, block_length;
414 if (fp == 0) return -1;
416 if (bgzf_flush(fp) != 0) return -1;
417 block_length = deflate_block(fp, 0); // write an empty block
418 count = fwrite(fp->compressed_block, 1, block_length, fp->fp);
419 if (fflush(fp->fp) != 0) {
420 fp->errcode |= BGZF_ERR_IO;
424 ret = fp->is_write? fclose(fp->fp) : _bgzf_close(fp->fp);
425 if (ret != 0) return -1;
426 free(fp->uncompressed_block);
427 free(fp->compressed_block);
433 void bgzf_set_cache_size(BGZF *fp, int cache_size)
435 if (fp) fp->cache_size = cache_size;
438 int bgzf_check_EOF(BGZF *fp)
440 static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
443 offset = _bgzf_tell((_bgzf_file_t)fp->fp);
444 if (_bgzf_seek(fp->fp, -28, SEEK_END) < 0) return 0;
445 _bgzf_read(fp->fp, buf, 28);
446 _bgzf_seek(fp->fp, offset, SEEK_SET);
447 return (memcmp(magic, buf, 28) == 0)? 1 : 0;
450 int64_t bgzf_seek(BGZF* fp, int64_t pos, int where)
453 int64_t block_address;
455 if (fp->is_write || where != SEEK_SET) {
456 fp->errcode |= BGZF_ERR_MISUSE;
459 block_offset = pos & 0xFFFF;
460 block_address = pos >> 16;
461 if (_bgzf_seek(fp->fp, block_address, SEEK_SET) < 0) {
462 fp->errcode |= BGZF_ERR_IO;
465 fp->block_length = 0; // indicates current block has not been loaded
466 fp->block_address = block_address;
467 fp->block_offset = block_offset;
471 int bgzf_is_bgzf(const char *fn)
476 if ((fp = _bgzf_open(fn, "r")) == 0) return 0;
477 n = _bgzf_read(fp, buf, 16);
479 if (n != 16) return 0;
480 return memcmp(g_magic, buf, 16) == 0? 1 : 0;
483 int bgzf_getc(BGZF *fp)
486 if (fp->block_offset >= fp->block_length) {
487 if (bgzf_read_block(fp) != 0) return -2; /* error */
488 if (fp->block_length == 0) return -1; /* end-of-file */
490 c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++];
491 if (fp->block_offset == fp->block_length) {
492 fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
493 fp->block_offset = 0;
494 fp->block_length = 0;
500 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
503 int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
506 unsigned char *buf = (unsigned char*)fp->uncompressed_block;
509 if (fp->block_offset >= fp->block_length) {
510 if (bgzf_read_block(fp) != 0) { state = -2; break; }
511 if (fp->block_length == 0) { state = -1; break; }
513 for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l);
514 if (l < fp->block_length) state = 1;
515 l -= fp->block_offset;
516 if (str->l + l + 1 >= str->m) {
517 str->m = str->l + l + 2;
519 str->s = (char*)realloc(str->s, str->m);
521 memcpy(str->s + str->l, buf + fp->block_offset, l);
523 fp->block_offset += l + 1;
524 if (fp->block_offset >= fp->block_length) {
525 fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
526 fp->block_offset = 0;
527 fp->block_length = 0;
529 } while (state == 0);
530 if (str->l == 0 && state < 0) return state;