3 * SOFTWARE COPYRIGHT NOTICE AGREEMENT
4 * This software and its documentation are copyright 2008 by the
5 * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
7 * This software is supplied without any warranty or guaranteed support whatsoever.
8 * Neither the Broad Institute nor MIT can be responsible for its use, misuse,
13 2009-06-25 by lh3: optionally use my knetfile library to access file on a FTP.
14 2009-06-12 by lh3: support a mode string like "wu" where 'u' for uncompressed output */
21 #include <sys/types.h>
25 extern off_t ftello(FILE *stream);
26 extern int fseeko(FILE *stream, off_t offset, int whence);
30 static const int DEFAULT_BLOCK_SIZE = 64 * 1024;
31 static const int MAX_BLOCK_SIZE = 64 * 1024;
33 static const int BLOCK_HEADER_LENGTH = 18;
34 static const int BLOCK_FOOTER_LENGTH = 8;
36 static const int GZIP_ID1 = 31;
37 static const int GZIP_ID2 = 139;
38 static const int CM_DEFLATE = 8;
39 static const int FLG_FEXTRA = 4;
40 static const int OS_UNKNOWN = 255;
41 static const int BGZF_ID1 = 66; // 'B'
42 static const int BGZF_ID2 = 67; // 'C'
43 static const int BGZF_LEN = 2;
44 static const int BGZF_XLEN = 6; // BGZF_LEN+4
46 static const int GZIP_WINDOW_BITS = -15; // no zlib header
47 static const int Z_DEFAULT_MEM_LEVEL = 8;
52 packInt16(uint8_t* buffer, uint16_t value)
55 buffer[1] = value >> 8;
60 unpackInt16(const uint8_t* buffer)
62 return (buffer[0] | (buffer[1] << 8));
67 packInt32(uint8_t* buffer, uint32_t value)
70 buffer[1] = value >> 8;
71 buffer[2] = value >> 16;
72 buffer[3] = value >> 24;
79 return (x < y) ? x : y;
84 report_error(BGZF* fp, const char* message) {
88 static BGZF *bgzf_read_init()
91 fp = calloc(1, sizeof(BGZF));
92 fp->uncompressed_block_size = MAX_BLOCK_SIZE;
93 fp->uncompressed_block = malloc(MAX_BLOCK_SIZE);
94 fp->compressed_block_size = MAX_BLOCK_SIZE;
95 fp->compressed_block = malloc(MAX_BLOCK_SIZE);
104 knetFile *file = knet_dopen(fd, "r");
106 FILE* file = fdopen(fd, "r");
109 if (file == 0) return 0;
110 fp = bgzf_read_init();
111 fp->file_descriptor = fd;
123 open_write(int fd, bool is_uncompressed)
125 FILE* file = fdopen(fd, "w");
127 if (file == 0) return 0;
128 fp = malloc(sizeof(BGZF));
129 fp->file_descriptor = fd;
131 fp->owned_file = 0; fp->is_uncompressed = is_uncompressed;
137 fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE;
138 fp->uncompressed_block = NULL;
139 fp->compressed_block_size = MAX_BLOCK_SIZE;
140 fp->compressed_block = malloc(MAX_BLOCK_SIZE);
141 fp->block_address = 0;
142 fp->block_offset = 0;
143 fp->block_length = 0;
149 bgzf_open(const char* __restrict path, const char* __restrict mode)
152 if (mode[0] == 'r' || mode[0] == 'R') { /* The reading mode is preferred. */
154 knetFile *file = knet_open(path, mode);
155 if (file == 0) return 0;
156 fp = bgzf_read_init();
157 fp->file_descriptor = -1;
161 int oflag = O_RDONLY;
162 int fd = open(path, oflag);
163 if (fd == -1) return 0;
166 } else if (mode[0] == 'w' || mode[0] == 'W') {
167 int oflag = O_WRONLY | O_CREAT | O_TRUNC;
168 int fd = open(path, oflag, 0644);
169 if (fd == -1) return 0;
170 fp = open_write(fd, strstr(mode, "u")? 1 : 0);
179 bgzf_fdopen(int fd, const char * __restrict mode)
181 if (fd == -1) return 0;
182 if (mode[0] == 'r' || mode[0] == 'R') {
183 return open_read(fd);
184 } else if (mode[0] == 'w' || mode[0] == 'W') {
185 return open_write(fd, strstr(mode, "u")? 1 : 0);
193 deflate_block(BGZF* fp, int block_length)
195 // Deflate the block in fp->uncompressed_block into fp->compressed_block.
196 // Also adds an extra field that stores the compressed block length.
198 byte* buffer = fp->compressed_block;
199 int buffer_size = fp->compressed_block_size;
202 buffer[0] = GZIP_ID1;
203 buffer[1] = GZIP_ID2;
204 buffer[2] = CM_DEFLATE;
205 buffer[3] = FLG_FEXTRA;
206 buffer[4] = 0; // mtime
211 buffer[9] = OS_UNKNOWN;
212 buffer[10] = BGZF_XLEN;
214 buffer[12] = BGZF_ID1;
215 buffer[13] = BGZF_ID2;
216 buffer[14] = BGZF_LEN;
218 buffer[16] = 0; // placeholder for block length
221 // loop to retry for blocks that do not compress enough
222 int input_length = block_length;
223 int compressed_length = 0;
225 int compress_level = fp->is_uncompressed? 0 : Z_DEFAULT_COMPRESSION;
229 zs.next_in = fp->uncompressed_block;
230 zs.avail_in = input_length;
231 zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
232 zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
234 int status = deflateInit2(&zs, compress_level, Z_DEFLATED,
235 GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
236 if (status != Z_OK) {
237 report_error(fp, "deflate init failed");
240 status = deflate(&zs, Z_FINISH);
241 if (status != Z_STREAM_END) {
243 if (status == Z_OK) {
244 // Not enough space in buffer.
245 // Can happen in the rare case the input doesn't compress enough.
246 // Reduce the amount of input until it fits.
247 input_length -= 1024;
248 if (input_length <= 0) {
249 // should never happen
250 report_error(fp, "input reduction failed");
255 report_error(fp, "deflate failed");
258 status = deflateEnd(&zs);
259 if (status != Z_OK) {
260 report_error(fp, "deflate end failed");
263 compressed_length = zs.total_out;
264 compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
265 if (compressed_length > MAX_BLOCK_SIZE) {
266 // should never happen
267 report_error(fp, "deflate overflow");
273 packInt16((uint8_t*)&buffer[16], compressed_length-1);
274 uint32_t crc = crc32(0L, NULL, 0L);
275 crc = crc32(crc, fp->uncompressed_block, input_length);
276 packInt32((uint8_t*)&buffer[compressed_length-8], crc);
277 packInt32((uint8_t*)&buffer[compressed_length-4], input_length);
279 int remaining = block_length - input_length;
281 if (remaining > input_length) {
282 // should never happen (check so we can use memcpy)
283 report_error(fp, "remainder too large");
286 memcpy(fp->uncompressed_block,
287 fp->uncompressed_block + input_length,
290 fp->block_offset = remaining;
291 return compressed_length;
296 inflate_block(BGZF* fp, int block_length)
298 // Inflate the block in fp->compressed_block into fp->uncompressed_block
303 zs.next_in = fp->compressed_block + 18;
304 zs.avail_in = block_length - 16;
305 zs.next_out = fp->uncompressed_block;
306 zs.avail_out = fp->uncompressed_block_size;
308 int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
309 if (status != Z_OK) {
310 report_error(fp, "inflate init failed");
313 status = inflate(&zs, Z_FINISH);
314 if (status != Z_STREAM_END) {
316 report_error(fp, "inflate failed");
319 status = inflateEnd(&zs);
320 if (status != Z_OK) {
321 report_error(fp, "inflate failed");
329 check_header(const byte* header)
331 return (header[0] == GZIP_ID1 &&
332 header[1] == (byte) GZIP_ID2 &&
333 header[2] == Z_DEFLATED &&
334 (header[3] & FLG_FEXTRA) != 0 &&
335 unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN &&
336 header[12] == BGZF_ID1 &&
337 header[13] == BGZF_ID2 &&
338 unpackInt16((uint8_t*)&header[14]) == BGZF_LEN);
345 byte header[BLOCK_HEADER_LENGTH];
347 int64_t block_address = knet_tell(fp->x.fpr);
348 int count = knet_read(fp->x.fpr, header, sizeof(header));
350 int64_t block_address = ftello(fp->file);
351 int count = fread(header, 1, sizeof(header), fp->file);
354 fp->block_length = 0;
357 if (count != sizeof(header)) {
358 report_error(fp, "read failed");
361 if (!check_header(header)) {
362 report_error(fp, "invalid block header");
365 int block_length = unpackInt16((uint8_t*)&header[16]) + 1;
366 byte* compressed_block = (byte*) fp->compressed_block;
367 memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
368 int remaining = block_length - BLOCK_HEADER_LENGTH;
370 count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
372 count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file);
374 if (count != remaining) {
375 report_error(fp, "read failed");
378 count = inflate_block(fp, block_length);
382 if (fp->block_length != 0) {
383 // Do not reset offset if this read follows a seek.
384 fp->block_offset = 0;
386 fp->block_address = block_address;
387 fp->block_length = count;
392 bgzf_read(BGZF* fp, void* data, int length)
397 if (fp->open_mode != 'r') {
398 report_error(fp, "file not open for reading");
404 while (bytes_read < length) {
405 int available = fp->block_length - fp->block_offset;
406 if (available <= 0) {
407 if (read_block(fp) != 0) {
410 available = fp->block_length - fp->block_offset;
411 if (available <= 0) {
415 int copy_length = min(length-bytes_read, available);
416 byte* buffer = fp->uncompressed_block;
417 memcpy(output, buffer + fp->block_offset, copy_length);
418 fp->block_offset += copy_length;
419 output += copy_length;
420 bytes_read += copy_length;
422 if (fp->block_offset == fp->block_length) {
424 fp->block_address = knet_tell(fp->x.fpr);
426 fp->block_address = ftello(fp->file);
428 fp->block_offset = 0;
429 fp->block_length = 0;
436 flush_block(BGZF* fp)
438 while (fp->block_offset > 0) {
439 int block_length = deflate_block(fp, fp->block_offset);
440 if (block_length < 0) {
444 int count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
446 int count = fwrite(fp->compressed_block, 1, block_length, fp->file);
448 if (count != block_length) {
449 report_error(fp, "write failed");
452 fp->block_address += block_length;
458 bgzf_write(BGZF* fp, const void* data, int length)
460 if (fp->open_mode != 'w') {
461 report_error(fp, "file not open for writing");
465 if (fp->uncompressed_block == NULL) {
466 fp->uncompressed_block = malloc(fp->uncompressed_block_size);
469 const byte* input = data;
470 int block_length = fp->uncompressed_block_size;
471 int bytes_written = 0;
472 while (bytes_written < length) {
473 int copy_length = min(block_length - fp->block_offset, length - bytes_written);
474 byte* buffer = fp->uncompressed_block;
475 memcpy(buffer + fp->block_offset, input, copy_length);
476 fp->block_offset += copy_length;
477 input += copy_length;
478 bytes_written += copy_length;
479 if (fp->block_offset == block_length) {
480 if (flush_block(fp) != 0) {
485 return bytes_written;
491 if (fp->open_mode == 'w') {
492 if (flush_block(fp) != 0) {
496 if (fflush(fp->x.fpw) != 0) {
498 if (fflush(fp->file) != 0) {
500 report_error(fp, "flush failed");
504 if (fp->owned_file) {
507 if (fp->open_mode == 'w') ret = fclose(fp->x.fpw);
508 else ret = knet_close(fp->x.fpr);
509 if (ret != 0) return -1;
511 if (fclose(fp->file) != 0) {
516 free(fp->uncompressed_block);
517 free(fp->compressed_block);
525 return ((fp->block_address << 16) | (fp->block_offset & 0xFFFF));
529 bgzf_seek(BGZF* fp, int64_t pos, int where)
531 if (fp->open_mode != 'r') {
532 report_error(fp, "file not open for read");
535 if (where != SEEK_SET) {
536 report_error(fp, "unimplemented seek option");
539 int block_offset = pos & 0xFFFF;
540 int64_t block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
542 if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) {
544 if (fseeko(fp->file, block_address, SEEK_SET) != 0) {
546 report_error(fp, "seek failed");
549 fp->block_length = 0; // indicates current block is not loaded
550 fp->block_address = block_address;
551 fp->block_offset = block_offset;