--- /dev/null
+// ***************************************************************************\r
+// BGZF.cpp (c) 2009 Derek Barnett, Michael Str�mberg\r
+// Marth Lab, Department of Biology, Boston College\r
+// All rights reserved.\r
+// ---------------------------------------------------------------------------\r
+// Last modified: 16 August 2010 (DB)\r
+// ---------------------------------------------------------------------------\r
+// BGZF routines were adapted from the bgzf.c code developed at the Broad\r
+// Institute.\r
+// ---------------------------------------------------------------------------\r
+// Provides the basic functionality for reading & writing BGZF files\r
+// ***************************************************************************\r
+\r
+#include <algorithm>\r
+#include "BGZF.h"\r
+using namespace BamTools;\r
+using std::string;\r
+using std::min;\r
+\r
+BgzfData::BgzfData(void)\r
+ : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)\r
+ , CompressedBlockSize(MAX_BLOCK_SIZE)\r
+ , BlockLength(0)\r
+ , BlockOffset(0)\r
+ , BlockAddress(0)\r
+ , IsOpen(false)\r
+ , IsWriteOnly(false)\r
+ , IsWriteUncompressed(false)\r
+ , Stream(NULL)\r
+ , UncompressedBlock(NULL)\r
+ , CompressedBlock(NULL)\r
+{\r
+ try {\r
+ CompressedBlock = new char[CompressedBlockSize];\r
+ UncompressedBlock = new char[UncompressedBlockSize];\r
+ } catch( std::bad_alloc& ba ) {\r
+ printf("BGZF ERROR: unable to allocate memory for our BGZF object.\n");\r
+ exit(1);\r
+ }\r
+}\r
+\r
+// destructor\r
+BgzfData::~BgzfData(void) {\r
+ if( CompressedBlock ) delete[] CompressedBlock;\r
+ if( UncompressedBlock ) delete[] UncompressedBlock;\r
+}\r
+\r
+// closes BGZF file\r
+void BgzfData::Close(void) {\r
+\r
+ // skip if file not open, otherwise set flag\r
+ if ( !IsOpen ) return;\r
+\r
+ // if writing to file, flush the current BGZF block,\r
+ // then write an empty block (as EOF marker)\r
+ if ( IsWriteOnly ) {\r
+ FlushBlock();\r
+ int blockLength = DeflateBlock();\r
+ fwrite(CompressedBlock, 1, blockLength, Stream);\r
+ }\r
+ \r
+ // flush and close\r
+ fflush(Stream);\r
+ fclose(Stream);\r
+ IsWriteUncompressed = false;\r
+ IsOpen = false;\r
+}\r
+\r
+// compresses the current block\r
+int BgzfData::DeflateBlock(void) {\r
+\r
+ // initialize the gzip header\r
+ char* buffer = CompressedBlock;\r
+ memset(buffer, 0, 18);\r
+ buffer[0] = GZIP_ID1;\r
+ buffer[1] = (char)GZIP_ID2;\r
+ buffer[2] = CM_DEFLATE;\r
+ buffer[3] = FLG_FEXTRA;\r
+ buffer[9] = (char)OS_UNKNOWN;\r
+ buffer[10] = BGZF_XLEN;\r
+ buffer[12] = BGZF_ID1;\r
+ buffer[13] = BGZF_ID2;\r
+ buffer[14] = BGZF_LEN;\r
+\r
+ // set compression level\r
+ const int compressionLevel = ( IsWriteUncompressed ? 0 : Z_DEFAULT_COMPRESSION );\r
+ \r
+ // loop to retry for blocks that do not compress enough\r
+ int inputLength = BlockOffset;\r
+ int compressedLength = 0;\r
+ unsigned int bufferSize = CompressedBlockSize;\r
+\r
+ while ( true ) {\r
+ \r
+ // initialize zstream values\r
+ z_stream zs;\r
+ zs.zalloc = NULL;\r
+ zs.zfree = NULL;\r
+ zs.next_in = (Bytef*)UncompressedBlock;\r
+ zs.avail_in = inputLength;\r
+ zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];\r
+ zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;\r
+\r
+ // initialize the zlib compression algorithm\r
+ if ( deflateInit2(&zs, compressionLevel, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK ) {\r
+ printf("BGZF ERROR: zlib deflate initialization failed.\n");\r
+ exit(1);\r
+ }\r
+\r
+ // compress the data\r
+ int status = deflate(&zs, Z_FINISH);\r
+ if ( status != Z_STREAM_END ) {\r
+\r
+ deflateEnd(&zs);\r
+\r
+ // reduce the input length and try again\r
+ if ( status == Z_OK ) {\r
+ inputLength -= 1024;\r
+ if( inputLength < 0 ) {\r
+ printf("BGZF ERROR: input reduction failed.\n");\r
+ exit(1);\r
+ }\r
+ continue;\r
+ }\r
+\r
+ printf("BGZF ERROR: zlib::deflateEnd() failed.\n");\r
+ exit(1);\r
+ }\r
+\r
+ // finalize the compression routine\r
+ if ( deflateEnd(&zs) != Z_OK ) {\r
+ printf("BGZF ERROR: zlib::deflateEnd() failed.\n");\r
+ exit(1);\r
+ }\r
+\r
+ compressedLength = zs.total_out;\r
+ compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;\r
+ if ( compressedLength > MAX_BLOCK_SIZE ) {\r
+ printf("BGZF ERROR: deflate overflow.\n");\r
+ exit(1);\r
+ }\r
+\r
+ break;\r
+ }\r
+\r
+ // store the compressed length\r
+ BgzfData::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));\r
+\r
+ // store the CRC32 checksum\r
+ unsigned int crc = crc32(0, NULL, 0);\r
+ crc = crc32(crc, (Bytef*)UncompressedBlock, inputLength);\r
+ BgzfData::PackUnsignedInt(&buffer[compressedLength - 8], crc);\r
+ BgzfData::PackUnsignedInt(&buffer[compressedLength - 4], inputLength);\r
+\r
+ // ensure that we have less than a block of data left\r
+ int remaining = BlockOffset - inputLength;\r
+ if ( remaining > 0 ) {\r
+ if ( remaining > inputLength ) {\r
+ printf("BGZF ERROR: after deflate, remainder too large.\n");\r
+ exit(1);\r
+ }\r
+ memcpy(UncompressedBlock, UncompressedBlock + inputLength, remaining);\r
+ }\r
+\r
+ BlockOffset = remaining;\r
+ return compressedLength;\r
+}\r
+\r
+// flushes the data in the BGZF block\r
+void BgzfData::FlushBlock(void) {\r
+\r
+ // flush all of the remaining blocks\r
+ while ( BlockOffset > 0 ) {\r
+\r
+ // compress the data block\r
+ int blockLength = DeflateBlock();\r
+\r
+ // flush the data to our output stream\r
+ int numBytesWritten = fwrite(CompressedBlock, 1, blockLength, Stream);\r
+\r
+ if ( numBytesWritten != blockLength ) {\r
+ printf("BGZF ERROR: expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten);\r
+ exit(1);\r
+ }\r
+ \r
+ BlockAddress += blockLength;\r
+ }\r
+}\r
+\r
+// de-compresses the current block\r
+int BgzfData::InflateBlock(const int& blockLength) {\r
+\r
+ // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock\r
+ z_stream zs;\r
+ zs.zalloc = NULL;\r
+ zs.zfree = NULL;\r
+ zs.next_in = (Bytef*)CompressedBlock + 18;\r
+ zs.avail_in = blockLength - 16;\r
+ zs.next_out = (Bytef*)UncompressedBlock;\r
+ zs.avail_out = UncompressedBlockSize;\r
+\r
+ int status = inflateInit2(&zs, GZIP_WINDOW_BITS);\r
+ if ( status != Z_OK ) {\r
+ printf("BGZF ERROR: could not decompress block - zlib::inflateInit() failed\n");\r
+ return -1;\r
+ }\r
+\r
+ status = inflate(&zs, Z_FINISH);\r
+ if ( status != Z_STREAM_END ) {\r
+ inflateEnd(&zs);\r
+ printf("BGZF ERROR: could not decompress block - zlib::inflate() failed\n");\r
+ return -1;\r
+ }\r
+\r
+ status = inflateEnd(&zs);\r
+ if ( status != Z_OK ) {\r
+ printf("BGZF ERROR: could not decompress block - zlib::inflateEnd() failed\n");\r
+ return -1;\r
+ }\r
+\r
+ return zs.total_out;\r
+}\r
+\r
+// opens the BGZF file for reading (mode is either "rb" for reading, or "wb" for writing)\r
+bool BgzfData::Open(const string& filename, const char* mode, bool isWriteUncompressed ) {\r
+\r
+ // determine open mode\r
+ if ( strcmp(mode, "rb") == 0 )\r
+ IsWriteOnly = false;\r
+ else if ( strcmp(mode, "wb") == 0) \r
+ IsWriteOnly = true;\r
+ else {\r
+ printf("BGZF ERROR: unknown file mode: %s\n", mode);\r
+ return false; \r
+ }\r
+\r
+ // ----------------------------------------------------------------\r
+ // open Stream to read to/write from file, stdin, or stdout\r
+ // stdin/stdout option contributed by Aaron Quinlan (2010-Jan-03)\r
+ \r
+ // read/write BGZF data to/from a file\r
+ if ( (filename != "stdin") && (filename != "stdout") )\r
+ Stream = fopen(filename.c_str(), mode);\r
+ \r
+ // read BGZF data from stdin\r
+ else if ( (filename == "stdin") && (strcmp(mode, "rb") == 0 ) )\r
+ Stream = freopen(NULL, mode, stdin);\r
+ \r
+ // write BGZF data to stdout\r
+ else if ( (filename == "stdout") && (strcmp(mode, "wb") == 0) )\r
+ Stream = freopen(NULL, mode, stdout);\r
+\r
+ if ( !Stream ) {\r
+ printf("BGZF ERROR: unable to open file %s\n", filename.c_str() );\r
+ return false;\r
+ }\r
+ \r
+ // set flags, return success\r
+ IsOpen = true;\r
+ IsWriteUncompressed = isWriteUncompressed;\r
+ return true;\r
+}\r
+\r
+// reads BGZF data into a byte buffer\r
+int BgzfData::Read(char* data, const unsigned int dataLength) {\r
+\r
+ if ( !IsOpen || IsWriteOnly || dataLength == 0 ) return 0;\r
+\r
+ char* output = data;\r
+ unsigned int numBytesRead = 0;\r
+ while ( numBytesRead < dataLength ) {\r
+\r
+ int bytesAvailable = BlockLength - BlockOffset;\r
+ if ( bytesAvailable <= 0 ) {\r
+ if ( !ReadBlock() ) return -1; \r
+ bytesAvailable = BlockLength - BlockOffset;\r
+ if ( bytesAvailable <= 0 ) break;\r
+ }\r
+\r
+ char* buffer = UncompressedBlock;\r
+ int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );\r
+ memcpy(output, buffer + BlockOffset, copyLength);\r
+\r
+ BlockOffset += copyLength;\r
+ output += copyLength;\r
+ numBytesRead += copyLength;\r
+ }\r
+\r
+ if ( BlockOffset == BlockLength ) {\r
+ BlockAddress = ftell64(Stream);\r
+ BlockOffset = 0;\r
+ BlockLength = 0;\r
+ }\r
+\r
+ return numBytesRead;\r
+}\r
+\r
+// reads a BGZF block\r
+bool BgzfData::ReadBlock(void) {\r
+\r
+ char header[BLOCK_HEADER_LENGTH];\r
+ int64_t blockAddress = ftell64(Stream);\r
+ \r
+ int count = fread(header, 1, sizeof(header), Stream);\r
+ if ( count == 0 ) {\r
+ BlockLength = 0;\r
+ return true;\r
+ }\r
+\r
+ if ( count != sizeof(header) ) {\r
+ printf("BGZF ERROR: read block failed - could not read block header\n");\r
+ return false;\r
+ }\r
+\r
+ if ( !BgzfData::CheckBlockHeader(header) ) {\r
+ printf("BGZF ERROR: read block failed - invalid block header\n");\r
+ return false;\r
+ }\r
+\r
+ int blockLength = BgzfData::UnpackUnsignedShort(&header[16]) + 1;\r
+ char* compressedBlock = CompressedBlock;\r
+ memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH);\r
+ int remaining = blockLength - BLOCK_HEADER_LENGTH;\r
+\r
+ count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, Stream);\r
+ if ( count != remaining ) {\r
+ printf("BGZF ERROR: read block failed - could not read data from block\n");\r
+ return false;\r
+ }\r
+\r
+ count = InflateBlock(blockLength);\r
+ if ( count < 0 ) { \r
+ printf("BGZF ERROR: read block failed - could not decompress block data\n");\r
+ return false;\r
+ }\r
+\r
+ if ( BlockLength != 0 )\r
+ BlockOffset = 0;\r
+\r
+ BlockAddress = blockAddress;\r
+ BlockLength = count;\r
+ return true;\r
+}\r
+\r
+// seek to position in BGZF file\r
+bool BgzfData::Seek(int64_t position) {\r
+\r
+ if ( !IsOpen ) return false;\r
+ \r
+ int blockOffset = (position & 0xFFFF);\r
+ int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;\r
+\r
+ if ( fseek64(Stream, blockAddress, SEEK_SET) != 0 ) {\r
+ printf("BGZF ERROR: unable to seek in file\n");\r
+ return false;\r
+ }\r
+\r
+ BlockLength = 0;\r
+ BlockAddress = blockAddress;\r
+ BlockOffset = blockOffset;\r
+ return true;\r
+}\r
+\r
+// get file position in BGZF file\r
+int64_t BgzfData::Tell(void) {\r
+ if ( !IsOpen ) \r
+ return false;\r
+ else \r
+ return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) );\r
+}\r
+\r
+// writes the supplied data into the BGZF buffer\r
+unsigned int BgzfData::Write(const char* data, const unsigned int dataLen) {\r
+\r
+ if ( !IsOpen || !IsWriteOnly ) return false;\r
+ \r
+ // initialize\r
+ unsigned int numBytesWritten = 0;\r
+ const char* input = data;\r
+ unsigned int blockLength = UncompressedBlockSize;\r
+\r
+ // copy the data to the buffer\r
+ while ( numBytesWritten < dataLen ) {\r
+ \r
+ unsigned int copyLength = min(blockLength - BlockOffset, dataLen - numBytesWritten);\r
+ char* buffer = UncompressedBlock;\r
+ memcpy(buffer + BlockOffset, input, copyLength);\r
+\r
+ BlockOffset += copyLength;\r
+ input += copyLength;\r
+ numBytesWritten += copyLength;\r
+\r
+ if ( BlockOffset == blockLength )\r
+ FlushBlock();\r
+ }\r
+\r
+ return numBytesWritten;\r
+}\r