+++ /dev/null
-// ***************************************************************************
-// BgzfStream_p.cpp (c) 2011 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 11 October 2011(DB)
-// ---------------------------------------------------------------------------
-// Based on BGZF routines developed at the Broad Institute.
-// Provides the basic functionality for reading & writing BGZF files
-// Replaces the old BGZF.* files to avoid clashing with other toolkits
-// ***************************************************************************
-
-#include "api/BamAux.h"
-#include "api/BamConstants.h"
-#include "api/internal/BamDeviceFactory_p.h"
-#include "api/internal/BamException_p.h"
-#include "api/internal/BgzfStream_p.h"
-using namespace BamTools;
-using namespace BamTools::Internal;
-
-#include "zlib.h"
-
-#include <cstring>
-#include <algorithm>
-#include <iostream>
-#include <sstream>
-using namespace std;
-
-// ----------------------------
-// RaiiWrapper implementation
-// ----------------------------
-
-BgzfStream::RaiiWrapper::RaiiWrapper(void) {
- CompressedBlock = new char[Constants::BGZF_MAX_BLOCK_SIZE];
- UncompressedBlock = new char[Constants::BGZF_DEFAULT_BLOCK_SIZE];
-}
-
-BgzfStream::RaiiWrapper::~RaiiWrapper(void) {
-
- // clean up buffers
- delete[] CompressedBlock;
- delete[] UncompressedBlock;
- CompressedBlock = 0;
- UncompressedBlock = 0;
-}
-
-// ---------------------------
-// BgzfStream implementation
-// ---------------------------
-
-// constructor
-BgzfStream::BgzfStream(void)
- : m_blockLength(0)
- , m_blockOffset(0)
- , m_blockAddress(0)
- , m_isWriteCompressed(true)
- , m_device(0)
-{ }
-
-// destructor
-BgzfStream::~BgzfStream(void) {
- Close();
-}
-
-// checks BGZF block header
-bool BgzfStream::CheckBlockHeader(char* header) {
- return (header[0] == Constants::GZIP_ID1 &&
- header[1] == Constants::GZIP_ID2 &&
- header[2] == Z_DEFLATED &&
- (header[3] & Constants::FLG_FEXTRA) != 0 &&
- BamTools::UnpackUnsignedShort(&header[10]) == Constants::BGZF_XLEN &&
- header[12] == Constants::BGZF_ID1 &&
- header[13] == Constants::BGZF_ID2 &&
- BamTools::UnpackUnsignedShort(&header[14]) == Constants::BGZF_LEN );
-}
-
-// closes BGZF file
-void BgzfStream::Close(void) {
-
- // skip if no device open
- if ( m_device == 0 ) return;
-
- // if writing to file, flush the current BGZF block,
- // then write an empty block (as EOF marker)
- if ( m_device->IsOpen() && (m_device->Mode() == IBamIODevice::WriteOnly) ) {
- FlushBlock();
- const size_t blockLength = DeflateBlock();
- m_device->Write(Resources.CompressedBlock, blockLength);
- }
-
- // close device
- m_device->Close();
- delete m_device;
- m_device = 0;
-
- // reset state
- m_blockLength = 0;
- m_blockOffset = 0;
- m_blockAddress = 0;
- m_isWriteCompressed = true;
-}
-
-// compresses the current block
-size_t BgzfStream::DeflateBlock(void) {
-
- // initialize the gzip header
- char* buffer = Resources.CompressedBlock;
- memset(buffer, 0, 18);
- buffer[0] = Constants::GZIP_ID1;
- buffer[1] = Constants::GZIP_ID2;
- buffer[2] = Constants::CM_DEFLATE;
- buffer[3] = Constants::FLG_FEXTRA;
- buffer[9] = Constants::OS_UNKNOWN;
- buffer[10] = Constants::BGZF_XLEN;
- buffer[12] = Constants::BGZF_ID1;
- buffer[13] = Constants::BGZF_ID2;
- buffer[14] = Constants::BGZF_LEN;
-
- // set compression level
- const int compressionLevel = ( m_isWriteCompressed ? Z_DEFAULT_COMPRESSION : 0 );
-
- // loop to retry for blocks that do not compress enough
- int inputLength = m_blockOffset;
- size_t compressedLength = 0;
- const unsigned int bufferSize = Constants::BGZF_MAX_BLOCK_SIZE;
-
- while ( true ) {
-
- // initialize zstream values
- z_stream zs;
- zs.zalloc = NULL;
- zs.zfree = NULL;
- zs.next_in = (Bytef*)Resources.UncompressedBlock;
- zs.avail_in = inputLength;
- zs.next_out = (Bytef*)&buffer[Constants::BGZF_BLOCK_HEADER_LENGTH];
- zs.avail_out = bufferSize -
- Constants::BGZF_BLOCK_HEADER_LENGTH -
- Constants::BGZF_BLOCK_FOOTER_LENGTH;
-
- // initialize the zlib compression algorithm
- int status = deflateInit2(&zs,
- compressionLevel,
- Z_DEFLATED,
- Constants::GZIP_WINDOW_BITS,
- Constants::Z_DEFAULT_MEM_LEVEL,
- Z_DEFAULT_STRATEGY);
- if ( status != Z_OK )
- throw BamException("BgzfStream::DeflateBlock", "zlib deflateInit2 failed");
-
- // compress the data
- status = deflate(&zs, Z_FINISH);
-
- // if not at stream end
- if ( status != Z_STREAM_END ) {
-
- deflateEnd(&zs);
-
- // there was not enough space available in buffer
- // try to reduce the input length & re-start loop
- if ( status == Z_OK ) {
- inputLength -= 1024;
- if ( inputLength < 0 )
- throw BamException("BgzfStream::DeflateBlock", "input reduction failed");
- continue;
- }
-
- throw BamException("BgzfStream::DeflateBlock", "zlib deflate failed");
- }
-
- // finalize the compression routine
- status = deflateEnd(&zs);
- if ( status != Z_OK )
- throw BamException("BgzfStream::DeflateBlock", "zlib deflateEnd failed");
-
- // update compressedLength
- compressedLength = zs.total_out +
- Constants::BGZF_BLOCK_HEADER_LENGTH +
- Constants::BGZF_BLOCK_FOOTER_LENGTH;
- if ( compressedLength > Constants::BGZF_MAX_BLOCK_SIZE )
- throw BamException("BgzfStream::DeflateBlock", "deflate overflow");
-
- // quit while loop
- break;
- }
-
- // store the compressed length
- BamTools::PackUnsignedShort(&buffer[16], static_cast<uint16_t>(compressedLength - 1));
-
- // store the CRC32 checksum
- uint32_t crc = crc32(0, NULL, 0);
- crc = crc32(crc, (Bytef*)Resources.UncompressedBlock, inputLength);
- BamTools::PackUnsignedInt(&buffer[compressedLength - 8], crc);
- BamTools::PackUnsignedInt(&buffer[compressedLength - 4], inputLength);
-
- // ensure that we have less than a block of data left
- int remaining = m_blockOffset - inputLength;
- if ( remaining > 0 ) {
- if ( remaining > inputLength )
- throw BamException("BgzfStream::DeflateBlock", "after deflate, remainder too large");
- memcpy(Resources.UncompressedBlock, Resources.UncompressedBlock + inputLength, remaining);
- }
-
- // update block data
- m_blockOffset = remaining;
-
- // return result
- return compressedLength;
-}
-
-// flushes the data in the BGZF block
-void BgzfStream::FlushBlock(void) {
-
- BT_ASSERT_X( m_device, "BgzfStream::FlushBlock() - attempting to flush to null device" );
-
- // flush all of the remaining blocks
- while ( m_blockOffset > 0 ) {
-
- // compress the data block
- const size_t blockLength = DeflateBlock();
-
- // flush the data to our output device
- const size_t numBytesWritten = m_device->Write(Resources.CompressedBlock, blockLength);
- if ( numBytesWritten != blockLength ) {
- stringstream s("");
- s << "expected to write " << blockLength
- << " bytes during flushing, but wrote " << numBytesWritten;
- throw BamException("BgzfStream::FlushBlock", s.str());
- }
-
- // update block data
- m_blockAddress += blockLength;
- }
-}
-
-// decompresses the current block
-size_t BgzfStream::InflateBlock(const size_t& blockLength) {
-
- // setup zlib stream object
- z_stream zs;
- zs.zalloc = NULL;
- zs.zfree = NULL;
- zs.next_in = (Bytef*)Resources.CompressedBlock + 18;
- zs.avail_in = blockLength - 16;
- zs.next_out = (Bytef*)Resources.UncompressedBlock;
- zs.avail_out = Constants::BGZF_DEFAULT_BLOCK_SIZE;
-
- // initialize
- int status = inflateInit2(&zs, Constants::GZIP_WINDOW_BITS);
- if ( status != Z_OK )
- throw BamException("BgzfStream::InflateBlock", "zlib inflateInit failed");
-
- // decompress
- status = inflate(&zs, Z_FINISH);
- if ( status != Z_STREAM_END ) {
- inflateEnd(&zs);
- throw BamException("BgzfStream::InflateBlock", "zlib inflate failed");
- }
-
- // finalize
- status = inflateEnd(&zs);
- if ( status != Z_OK ) {
- inflateEnd(&zs);
- throw BamException("BgzfStream::InflateBlock", "zlib inflateEnd failed");
- }
-
- // return result
- return zs.total_out;
-}
-
-bool BgzfStream::IsOpen(void) const {
- if ( m_device == 0 )
- return false;
- return m_device->IsOpen();
-}
-
-void BgzfStream::Open(const string& filename, const IBamIODevice::OpenMode mode) {
-
- // close current device if necessary
- Close();
- BT_ASSERT_X( (m_device == 0), "BgzfStream::Open() - unable to properly close previous IO device" );
-
- // retrieve new IO device depending on filename
- m_device = BamDeviceFactory::CreateDevice(filename);
- BT_ASSERT_X( m_device, "BgzfStream::Open() - unable to create IO device from filename" );
-
- // if device fails to open
- if ( !m_device->Open(mode) ) {
- const string deviceError = m_device->GetErrorString();
- const string message = string("could not open BGZF stream: \n\t") + deviceError;
- throw BamException("BgzfStream::Open", message);
- }
-}
-
-// reads BGZF data into a byte buffer
-size_t BgzfStream::Read(char* data, const size_t dataLength) {
-
- if ( dataLength == 0 )
- return 0;
-
- // if stream not open for reading
- BT_ASSERT_X( m_device, "BgzfStream::Read() - trying to read from null device");
- if ( !m_device->IsOpen() || (m_device->Mode() != IBamIODevice::ReadOnly) )
- return 0;
-
- // read blocks as needed until desired data length is retrieved
- char* output = data;
- size_t numBytesRead = 0;
- while ( numBytesRead < dataLength ) {
-
- // determine bytes available in current block
- int bytesAvailable = m_blockLength - m_blockOffset;
-
- // read (and decompress) next block if needed
- if ( bytesAvailable <= 0 ) {
- ReadBlock();
- bytesAvailable = m_blockLength - m_blockOffset;
- if ( bytesAvailable <= 0 )
- break;
- }
-
- // copy data from uncompressed source buffer into data destination buffer
- const size_t copyLength = min( (dataLength-numBytesRead), (size_t)bytesAvailable );
- memcpy(output, Resources.UncompressedBlock + m_blockOffset, copyLength);
-
- // update counters
- m_blockOffset += copyLength;
- output += copyLength;
- numBytesRead += copyLength;
- }
-
- // update block data
- if ( m_blockOffset == m_blockLength ) {
- m_blockAddress = m_device->Tell();
- m_blockOffset = 0;
- m_blockLength = 0;
-
- }
-
- // return actual number of bytes read
- return numBytesRead;
-}
-
-// reads a BGZF block
-void BgzfStream::ReadBlock(void) {
-
- BT_ASSERT_X( m_device, "BgzfStream::ReadBlock() - trying to read from null IO device");
-
- // store block's starting address
- int64_t blockAddress = m_device->Tell();
-
- // read block header from file
- char header[Constants::BGZF_BLOCK_HEADER_LENGTH];
- size_t numBytesRead = m_device->Read(header, Constants::BGZF_BLOCK_HEADER_LENGTH);
-
- // if block header empty
- if ( numBytesRead == 0 ) {
- m_blockLength = 0;
- return;
- }
-
- // if block header invalid size
- if ( numBytesRead != Constants::BGZF_BLOCK_HEADER_LENGTH )
- throw BamException("BgzfStream::ReadBlock", "invalid block header size");
-
- // validate block header contents
- if ( !BgzfStream::CheckBlockHeader(header) )
- throw BamException("BgzfStream::ReadBlock", "invalid block header contents");
-
- // copy header contents to compressed buffer
- const size_t blockLength = BamTools::UnpackUnsignedShort(&header[16]) + 1;
- memcpy(Resources.CompressedBlock, header, Constants::BGZF_BLOCK_HEADER_LENGTH);
-
- // read remainder of block
- const size_t remaining = blockLength - Constants::BGZF_BLOCK_HEADER_LENGTH;
- numBytesRead = m_device->Read(&Resources.CompressedBlock[Constants::BGZF_BLOCK_HEADER_LENGTH], remaining);
- if ( numBytesRead != remaining )
- throw BamException("BgzfStream::ReadBlock", "could not read data from block");
-
- // decompress block data
- numBytesRead = InflateBlock(blockLength);
-
- // update block data
- if ( m_blockLength != 0 )
- m_blockOffset = 0;
- m_blockAddress = blockAddress;
- m_blockLength = numBytesRead;
-}
-
-// seek to position in BGZF file
-void BgzfStream::Seek(const int64_t& position) {
-
- BT_ASSERT_X( m_device, "BgzfStream::Seek() - trying to seek on null IO device");
-
- // skip if device is not open
- if ( !IsOpen() ) return;
-
- // determine adjusted offset & address
- int blockOffset = (position & 0xFFFF);
- int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
-
- // attempt seek in file
- if ( m_device->IsRandomAccess() && m_device->Seek(blockAddress) ) {
-
- // update block data & return success
- m_blockLength = 0;
- m_blockAddress = blockAddress;
- m_blockOffset = blockOffset;
- }
- else {
- stringstream s("");
- s << "unable to seek to position: " << position;
- throw BamException("BgzfStream::Seek", s.str());
- }
-}
-
-void BgzfStream::SetWriteCompressed(bool ok) {
- m_isWriteCompressed = ok;
-}
-
-// get file position in BGZF file
-int64_t BgzfStream::Tell(void) const {
- if ( !IsOpen() )
- return 0;
- return ( (m_blockAddress << 16) | (m_blockOffset & 0xFFFF) );
-}
-
-// writes the supplied data into the BGZF buffer
-size_t BgzfStream::Write(const char* data, const size_t dataLength) {
-
- BT_ASSERT_X( m_device, "BgzfStream::Write() - trying to write to null IO device");
- BT_ASSERT_X( (m_device->Mode() == IBamIODevice::WriteOnly),
- "BgzfStream::Write() - trying to write to non-writable IO device");
-
- // skip if file not open for writing
- if ( !IsOpen() )
- return 0;
-
- // write blocks as needed til all data is written
- size_t numBytesWritten = 0;
- const char* input = data;
- const size_t blockLength = Constants::BGZF_DEFAULT_BLOCK_SIZE;
- while ( numBytesWritten < dataLength ) {
-
- // copy data contents to uncompressed output buffer
- unsigned int copyLength = min(blockLength - m_blockOffset, dataLength - numBytesWritten);
- char* buffer = Resources.UncompressedBlock;
- memcpy(buffer + m_blockOffset, input, copyLength);
-
- // update counter
- m_blockOffset += copyLength;
- input += copyLength;
- numBytesWritten += copyLength;
-
- // flush (& compress) output buffer when full
- if ( m_blockOffset == blockLength )
- FlushBlock();
- }
-
- // return actual number of bytes written
- return numBytesWritten;
-}