X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=src%2Fapi%2Finternal%2FBgzfStream_p.cpp;h=f70b97eac1eb692035040ebf26a23bdc3686f75b;hb=2e1822c9ed231b25fd474117a01a1492d4209fa4;hp=b79164decabcf7d49feba9c6455da4eed96fa979;hpb=88577e25bbf4b6b43642cb679c5f9f5cba026fec;p=bamtools.git diff --git a/src/api/internal/BgzfStream_p.cpp b/src/api/internal/BgzfStream_p.cpp index b79164d..f70b97e 100644 --- a/src/api/internal/BgzfStream_p.cpp +++ b/src/api/internal/BgzfStream_p.cpp @@ -1,9 +1,8 @@ // *************************************************************************** // BgzfStream_p.cpp (c) 2011 Derek Barnett // Marth Lab, Department of Biology, Boston College -// All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 9 September 2011(DB) +// Last modified: 7 October 2011(DB) // --------------------------------------------------------------------------- // Based on BGZF routines developed at the Broad Institute. // Provides the basic functionality for reading & writing BGZF files @@ -11,6 +10,7 @@ // *************************************************************************** #include +#include #include using namespace BamTools; using namespace BamTools::Internal; @@ -18,74 +18,94 @@ using namespace BamTools::Internal; #include #include #include +#include using namespace std; +// ---------------------------- +// RaiiWrapper implementation +// ---------------------------- + +BgzfStream::RaiiWrapper::RaiiWrapper(void) { + CompressedBlock = new char[Constants::BGZF_MAX_BLOCK_SIZE]; + UncompressedBlock = new char[Constants::BGZF_DEFAULT_BLOCK_SIZE]; +} + +BgzfStream::RaiiWrapper::~RaiiWrapper(void) { + + // clean up buffers + delete[] CompressedBlock; + delete[] UncompressedBlock; + CompressedBlock = 0; + UncompressedBlock = 0; +} + +// --------------------------- +// BgzfStream implementation +// --------------------------- + // constructor BgzfStream::BgzfStream(void) - : m_uncompressedBlockSize(Constants::BGZF_DEFAULT_BLOCK_SIZE) - , m_compressedBlockSize(Constants::BGZF_MAX_BLOCK_SIZE) - , m_blockLength(0) - , m_blockOffset(0) - , m_blockAddress(0) - , m_uncompressedBlock(NULL) - , m_compressedBlock(NULL) - , m_isOpen(false) - , m_isWriteOnly(false) - , m_isWriteCompressed(true) - , m_device(0) - , m_stream(NULL) -{ - try { - m_compressedBlock = new char[m_compressedBlockSize]; - m_uncompressedBlock = new char[m_uncompressedBlockSize]; - } catch( std::bad_alloc& ba ) { - fprintf(stderr, "BgzfStream ERROR: unable to allocate memory\n"); - exit(1); - } -} + : m_blockLength(0) + , m_blockOffset(0) + , m_blockAddress(0) + , m_isWriteCompressed(true) + , m_device(0) +{ } // destructor BgzfStream::~BgzfStream(void) { - if( m_compressedBlock ) delete[] m_compressedBlock; - if( m_uncompressedBlock ) delete[] m_uncompressedBlock; + Close(); +} + +// checks BGZF block header +bool BgzfStream::CheckBlockHeader(char* header) { + return (header[0] == Constants::GZIP_ID1 && + header[1] == Constants::GZIP_ID2 && + header[2] == Z_DEFLATED && + (header[3] & Constants::FLG_FEXTRA) != 0 && + BamTools::UnpackUnsignedShort(&header[10]) == Constants::BGZF_XLEN && + header[12] == Constants::BGZF_ID1 && + header[13] == Constants::BGZF_ID2 && + BamTools::UnpackUnsignedShort(&header[14]) == Constants::BGZF_LEN ); } // closes BGZF file void BgzfStream::Close(void) { + // reset state + m_blockLength = 0; + m_blockOffset = 0; + m_blockAddress = 0; + m_isWriteCompressed = true; + // skip if no device open - if ( m_device == 0 ) - return; + if ( m_device == 0 ) return; // if writing to file, flush the current BGZF block, // then write an empty block (as EOF marker) if ( m_device->IsOpen() && (m_device->Mode() == IBamIODevice::WriteOnly) ) { FlushBlock(); - int blockLength = DeflateBlock(); - m_device->Write(m_compressedBlock, blockLength); + const size_t blockLength = DeflateBlock(); + m_device->Write(Resources.CompressedBlock, blockLength); } // close device m_device->Close(); - - // clean up & reset flags delete m_device; m_device = 0; - m_isWriteCompressed = true; - m_isOpen = false; } // compresses the current block -unsigned int BgzfStream::DeflateBlock(void) { +size_t BgzfStream::DeflateBlock(void) { // initialize the gzip header - char* buffer = m_compressedBlock; + char* buffer = Resources.CompressedBlock; memset(buffer, 0, 18); buffer[0] = Constants::GZIP_ID1; - buffer[1] = (char)Constants::GZIP_ID2; + buffer[1] = Constants::GZIP_ID2; buffer[2] = Constants::CM_DEFLATE; buffer[3] = Constants::FLG_FEXTRA; - buffer[9] = (char)Constants::OS_UNKNOWN; + buffer[9] = Constants::OS_UNKNOWN; buffer[10] = Constants::BGZF_XLEN; buffer[12] = Constants::BGZF_ID1; buffer[13] = Constants::BGZF_ID2; @@ -96,8 +116,8 @@ unsigned int BgzfStream::DeflateBlock(void) { // loop to retry for blocks that do not compress enough int inputLength = m_blockOffset; - unsigned int compressedLength = 0; - unsigned int bufferSize = m_compressedBlockSize; + size_t compressedLength = 0; + const unsigned int bufferSize = Constants::BGZF_MAX_BLOCK_SIZE; while ( true ) { @@ -105,76 +125,74 @@ unsigned int BgzfStream::DeflateBlock(void) { z_stream zs; zs.zalloc = NULL; zs.zfree = NULL; - zs.next_in = (Bytef*)m_uncompressedBlock; + zs.next_in = (Bytef*)Resources.UncompressedBlock; zs.avail_in = inputLength; zs.next_out = (Bytef*)&buffer[Constants::BGZF_BLOCK_HEADER_LENGTH]; - zs.avail_out = bufferSize - Constants::BGZF_BLOCK_HEADER_LENGTH - Constants::BGZF_BLOCK_FOOTER_LENGTH; + zs.avail_out = bufferSize - + Constants::BGZF_BLOCK_HEADER_LENGTH - + Constants::BGZF_BLOCK_FOOTER_LENGTH; // initialize the zlib compression algorithm - if ( deflateInit2(&zs, - compressionLevel, - Z_DEFLATED, - Constants::GZIP_WINDOW_BITS, - Constants::Z_DEFAULT_MEM_LEVEL, - Z_DEFAULT_STRATEGY) != Z_OK ) - { - fprintf(stderr, "BgzfStream ERROR: zlib deflate initialization failed\n"); - exit(1); - } + int status = deflateInit2(&zs, + compressionLevel, + Z_DEFLATED, + Constants::GZIP_WINDOW_BITS, + Constants::Z_DEFAULT_MEM_LEVEL, + Z_DEFAULT_STRATEGY); + if ( status != Z_OK ) + throw BamException("BgzfStream::DeflateBlock", "zlib deflateInit2 failed"); // compress the data - int status = deflate(&zs, Z_FINISH); + status = deflate(&zs, Z_FINISH); + + // if not at stream end if ( status != Z_STREAM_END ) { deflateEnd(&zs); - // reduce the input length and try again - if ( status == Z_OK ) { - inputLength -= 1024; - if ( inputLength < 0 ) { - fprintf(stderr, "BgzfStream ERROR: input reduction failed\n"); - exit(1); - } - continue; - } - - fprintf(stderr, "BgzfStream ERROR: zlib::deflateEnd() failed\n"); - exit(1); - } + // if error status + if ( status != Z_OK ) + throw BamException("BgzfStream::DeflateBlock", "zlib deflate failed"); - // finalize the compression routine - if ( deflateEnd(&zs) != Z_OK ) { - fprintf(stderr, "BgzfStream ERROR: zlib::deflateEnd() failed\n"); - exit(1); - } - - compressedLength = zs.total_out; - compressedLength += Constants::BGZF_BLOCK_HEADER_LENGTH + Constants::BGZF_BLOCK_FOOTER_LENGTH; - if ( compressedLength > Constants::BGZF_MAX_BLOCK_SIZE ) { - fprintf(stderr, "BgzfStream ERROR: deflate overflow\n"); - exit(1); + // not enough space available in buffer + // try to reduce the input length & re-start loop + inputLength -= 1024; + if ( inputLength <= 0 ) + throw BamException("BgzfStream::DeflateBlock", "input reduction failed"); + continue; } + // finalize the compression routine + status = deflateEnd(&zs); + if ( status != Z_OK ) + throw BamException("BgzfStream::DeflateBlock", "zlib deflateEnd failed"); + + // update compressedLength + compressedLength = zs.total_out + + Constants::BGZF_BLOCK_HEADER_LENGTH + + Constants::BGZF_BLOCK_FOOTER_LENGTH; + if ( compressedLength > Constants::BGZF_MAX_BLOCK_SIZE ) + throw BamException("BgzfStream::DeflateBlock", "deflate overflow"); + + // quit while loop break; } // store the compressed length - BamTools::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1)); + BamTools::PackUnsignedShort(&buffer[16], static_cast(compressedLength - 1)); // store the CRC32 checksum - unsigned int crc = crc32(0, NULL, 0); - crc = crc32(crc, (Bytef*)m_uncompressedBlock, inputLength); + uint32_t crc = crc32(0, NULL, 0); + crc = crc32(crc, (Bytef*)Resources.UncompressedBlock, inputLength); BamTools::PackUnsignedInt(&buffer[compressedLength - 8], crc); BamTools::PackUnsignedInt(&buffer[compressedLength - 4], inputLength); // ensure that we have less than a block of data left int remaining = m_blockOffset - inputLength; if ( remaining > 0 ) { - if ( remaining > inputLength ) { - fprintf(stderr, "BgzfStream ERROR: after deflate, remainder too large\n"); - exit(1); - } - memcpy(m_uncompressedBlock, m_uncompressedBlock + inputLength, remaining); + if ( remaining > inputLength ) + throw BamException("BgzfStream::DeflateBlock", "after deflate, remainder too large"); + memcpy(Resources.UncompressedBlock, Resources.UncompressedBlock + inputLength, remaining); } // update block data @@ -193,14 +211,15 @@ void BgzfStream::FlushBlock(void) { while ( m_blockOffset > 0 ) { // compress the data block - unsigned int blockLength = DeflateBlock(); + const size_t blockLength = DeflateBlock(); - // flush the data to our output stream - unsigned int numBytesWritten = m_device->Write(m_compressedBlock, blockLength); + // flush the data to our output device + const size_t numBytesWritten = m_device->Write(Resources.CompressedBlock, blockLength); if ( numBytesWritten != blockLength ) { - fprintf(stderr, "BgzfStream ERROR: expected to write %u bytes during flushing, but wrote %u bytes\n", - blockLength, numBytesWritten); - exit(1); + stringstream s(""); + s << "expected to write " << blockLength + << " bytes during flushing, but wrote " << numBytesWritten; + throw BamException("BgzfStream::FlushBlock", s.str()); } // update block data @@ -209,34 +228,34 @@ void BgzfStream::FlushBlock(void) { } // decompresses the current block -int BgzfStream::InflateBlock(const int& blockLength) { +size_t BgzfStream::InflateBlock(const size_t& blockLength) { - // inflate the data from compressed buffer into uncompressed buffer + // setup zlib stream object z_stream zs; zs.zalloc = NULL; zs.zfree = NULL; - zs.next_in = (Bytef*)m_compressedBlock + 18; + zs.next_in = (Bytef*)Resources.CompressedBlock + 18; zs.avail_in = blockLength - 16; - zs.next_out = (Bytef*)m_uncompressedBlock; - zs.avail_out = m_uncompressedBlockSize; + zs.next_out = (Bytef*)Resources.UncompressedBlock; + zs.avail_out = Constants::BGZF_DEFAULT_BLOCK_SIZE; + // initialize int status = inflateInit2(&zs, Constants::GZIP_WINDOW_BITS); - if ( status != Z_OK ) { - fprintf(stderr, "BgzfStream ERROR: could not decompress block - zlib::inflateInit() failed\n"); - return -1; - } + if ( status != Z_OK ) + throw BamException("BgzfStream::InflateBlock", "zlib inflateInit failed"); + // decompress status = inflate(&zs, Z_FINISH); if ( status != Z_STREAM_END ) { inflateEnd(&zs); - fprintf(stderr, "BgzfStream ERROR: could not decompress block - zlib::inflate() failed\n"); - return -1; + throw BamException("BgzfStream::InflateBlock", "zlib inflate failed"); } + // finalize status = inflateEnd(&zs); if ( status != Z_OK ) { - fprintf(stderr, "BgzfStream ERROR: could not decompress block - zlib::inflateEnd() failed\n"); - return -1; + inflateEnd(&zs); + throw BamException("BgzfStream::InflateBlock", "zlib inflateEnd failed"); } // return result @@ -249,74 +268,26 @@ bool BgzfStream::IsOpen(void) const { return m_device->IsOpen(); } -bool BgzfStream::Open(const string& filename, const IBamIODevice::OpenMode mode) { +void BgzfStream::Open(const string& filename, const IBamIODevice::OpenMode mode) { // close current device if necessary Close(); - - // sanity check BT_ASSERT_X( (m_device == 0), "BgzfStream::Open() - unable to properly close previous IO device" ); // retrieve new IO device depending on filename m_device = BamDeviceFactory::CreateDevice(filename); - - // sanity check BT_ASSERT_X( m_device, "BgzfStream::Open() - unable to create IO device from filename" ); // if device fails to open if ( !m_device->Open(mode) ) { - cerr << "BgzfStream::Open() - unable to open IO device:" << endl; - cerr << m_device->ErrorString(); - return false; + const string deviceError = m_device->GetErrorString(); + const string message = string("could not open BGZF stream: \n\t") + deviceError; + throw BamException("BgzfStream::Open", message); } - - // otherwise, set flag & return true - m_isOpen = true; - m_isWriteOnly = ( mode == IBamIODevice::WriteOnly ); - return true; - -} - -// opens the BGZF file for reading (mode is either "rb" for reading, or "wb" for writing) -bool BgzfStream::Open(const string& filename, const char* mode) { - - // close current stream, if necessary, before opening next - if ( m_isOpen ) Close(); - - // determine open mode - if ( strcmp(mode, "rb") == 0 ) - m_isWriteOnly = false; - else if ( strcmp(mode, "wb") == 0) - m_isWriteOnly = true; - else { - fprintf(stderr, "BgzfStream ERROR: unknown file mode: %s\n", mode); - return false; - } - - // open BGZF stream on a file - if ( (filename != "stdin") && (filename != "stdout") && (filename != "-")) - m_stream = fopen(filename.c_str(), mode); - - // open BGZF stream on stdin - else if ( (filename == "stdin" || filename == "-") && (strcmp(mode, "rb") == 0 ) ) - m_stream = freopen(NULL, mode, stdin); - - // open BGZF stream on stdout - else if ( (filename == "stdout" || filename == "-") && (strcmp(mode, "wb") == 0) ) - m_stream = freopen(NULL, mode, stdout); - - if ( !m_stream ) { - fprintf(stderr, "BgzfStream ERROR: unable to open file %s\n", filename.c_str() ); - return false; - } - - // set flag & return success - m_isOpen = true; - return true; } // reads BGZF data into a byte buffer -unsigned int BgzfStream::Read(char* data, const unsigned int dataLength) { +size_t BgzfStream::Read(char* data, const size_t dataLength) { if ( dataLength == 0 ) return 0; @@ -327,8 +298,7 @@ unsigned int BgzfStream::Read(char* data, const unsigned int dataLength) { return 0; // read blocks as needed until desired data length is retrieved - char* output = data; - unsigned int numBytesRead = 0; + size_t numBytesRead = 0; while ( numBytesRead < dataLength ) { // determine bytes available in current block @@ -336,20 +306,20 @@ unsigned int BgzfStream::Read(char* data, const unsigned int dataLength) { // read (and decompress) next block if needed if ( bytesAvailable <= 0 ) { - if ( !ReadBlock() ) return -1; + ReadBlock(); bytesAvailable = m_blockLength - m_blockOffset; - if ( bytesAvailable <= 0 ) break; + if ( bytesAvailable <= 0 ) + break; } // copy data from uncompressed source buffer into data destination buffer - char* buffer = m_uncompressedBlock; - int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable ); - memcpy(output, buffer + m_blockOffset, copyLength); + const size_t copyLength = min( (dataLength-numBytesRead), (size_t)bytesAvailable ); + memcpy(data, Resources.UncompressedBlock + m_blockOffset, copyLength); // update counters - m_blockOffset += copyLength; - output += copyLength; - numBytesRead += copyLength; + m_blockOffset += copyLength; + data += copyLength; + numBytesRead += copyLength; } // update block data @@ -357,13 +327,15 @@ unsigned int BgzfStream::Read(char* data, const unsigned int dataLength) { m_blockAddress = m_device->Tell(); m_blockOffset = 0; m_blockLength = 0; + } + // return actual number of bytes read return numBytesRead; } // reads a BGZF block -bool BgzfStream::ReadBlock(void) { +void BgzfStream::ReadBlock(void) { BT_ASSERT_X( m_device, "BgzfStream::ReadBlock() - trying to read from null IO device"); @@ -372,80 +344,67 @@ bool BgzfStream::ReadBlock(void) { // read block header from file char header[Constants::BGZF_BLOCK_HEADER_LENGTH]; - int numBytesRead = m_device->Read(header, Constants::BGZF_BLOCK_HEADER_LENGTH); + size_t numBytesRead = m_device->Read(header, Constants::BGZF_BLOCK_HEADER_LENGTH); // if block header empty if ( numBytesRead == 0 ) { m_blockLength = 0; - return true; + return; } // if block header invalid size - if ( numBytesRead != Constants::BGZF_BLOCK_HEADER_LENGTH ) { - fprintf(stderr, "BgzfStream ERROR: read block failed - could not read block header\n"); - return false; - } + if ( numBytesRead != Constants::BGZF_BLOCK_HEADER_LENGTH ) + throw BamException("BgzfStream::ReadBlock", "invalid block header size"); // validate block header contents - if ( !BgzfStream::CheckBlockHeader(header) ) { - fprintf(stderr, "BgzfStream ERROR: read block failed - invalid block header\n"); - return false; - } + if ( !BgzfStream::CheckBlockHeader(header) ) + throw BamException("BgzfStream::ReadBlock", "invalid block header contents"); // copy header contents to compressed buffer - int blockLength = BamTools::UnpackUnsignedShort(&header[16]) + 1; - char* compressedBlock = m_compressedBlock; - memcpy(compressedBlock, header, Constants::BGZF_BLOCK_HEADER_LENGTH); - int remaining = blockLength - Constants::BGZF_BLOCK_HEADER_LENGTH; + const size_t blockLength = BamTools::UnpackUnsignedShort(&header[16]) + 1; + memcpy(Resources.CompressedBlock, header, Constants::BGZF_BLOCK_HEADER_LENGTH); // read remainder of block - numBytesRead = m_device->Read(&compressedBlock[Constants::BGZF_BLOCK_HEADER_LENGTH], remaining); - if ( numBytesRead != remaining ) { - fprintf(stderr, "BgzfStream ERROR: read block failed - could not read data from block\n"); - return false; - } + const size_t remaining = blockLength - Constants::BGZF_BLOCK_HEADER_LENGTH; + numBytesRead = m_device->Read(&Resources.CompressedBlock[Constants::BGZF_BLOCK_HEADER_LENGTH], remaining); + if ( numBytesRead != remaining ) + throw BamException("BgzfStream::ReadBlock", "could not read data from block"); // decompress block data numBytesRead = InflateBlock(blockLength); - if ( numBytesRead < 0 ) { - fprintf(stderr, "BgzfStream ERROR: read block failed - could not decompress block data\n"); - return false; - } // update block data if ( m_blockLength != 0 ) m_blockOffset = 0; m_blockAddress = blockAddress; m_blockLength = numBytesRead; - - // return success - return true; } // seek to position in BGZF file -bool BgzfStream::Seek(const int64_t& position) { +void BgzfStream::Seek(const int64_t& position) { BT_ASSERT_X( m_device, "BgzfStream::Seek() - trying to seek on null IO device"); - // skip if not open or not seek-able - if ( !IsOpen() || !m_device->IsRandomAccess() ) - return false; + // skip if device is not open + if ( !IsOpen() ) return; // determine adjusted offset & address int blockOffset = (position & 0xFFFF); int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL; // attempt seek in file - if ( !m_device->Seek(blockAddress) ) { - fprintf(stderr, "BgzfStream ERROR: unable to seek in file\n"); - return false; - } + if ( m_device->IsRandomAccess() && m_device->Seek(blockAddress) ) { - // update block data & return success - m_blockLength = 0; - m_blockAddress = blockAddress; - m_blockOffset = blockOffset; - return true; + // update block data & return success + m_blockLength = 0; + m_blockAddress = blockAddress; + m_blockOffset = blockOffset; + } + else { + stringstream s(""); + s << "unable to seek to position: " << position; + throw BamException("BgzfStream::Seek", s.str()); + } } void BgzfStream::SetWriteCompressed(bool ok) { @@ -454,26 +413,31 @@ void BgzfStream::SetWriteCompressed(bool ok) { // get file position in BGZF file int64_t BgzfStream::Tell(void) const { - if ( !m_isOpen ) return 0; + if ( !IsOpen() ) + return 0; return ( (m_blockAddress << 16) | (m_blockOffset & 0xFFFF) ); } // writes the supplied data into the BGZF buffer -unsigned int BgzfStream::Write(const char* data, const unsigned int dataLength) { +size_t BgzfStream::Write(const char* data, const size_t dataLength) { BT_ASSERT_X( m_device, "BgzfStream::Write() - trying to write to null IO device"); BT_ASSERT_X( (m_device->Mode() == IBamIODevice::WriteOnly), "BgzfStream::Write() - trying to write to non-writable IO device"); + // skip if file not open for writing + if ( !IsOpen() ) + return 0; + // write blocks as needed til all data is written - unsigned int numBytesWritten = 0; + size_t numBytesWritten = 0; const char* input = data; - unsigned int blockLength = m_uncompressedBlockSize; + const size_t blockLength = Constants::BGZF_DEFAULT_BLOCK_SIZE; while ( numBytesWritten < dataLength ) { // copy data contents to uncompressed output buffer unsigned int copyLength = min(blockLength - m_blockOffset, dataLength - numBytesWritten); - char* buffer = m_uncompressedBlock; + char* buffer = Resources.UncompressedBlock; memcpy(buffer + m_blockOffset, input, copyLength); // update counter @@ -486,6 +450,6 @@ unsigned int BgzfStream::Write(const char* data, const unsigned int dataLength) FlushBlock(); } - // return result + // return actual number of bytes written return numBytesWritten; }