]> git.donarmstrong.com Git - bamtools.git/blobdiff - src/api/internal/BgzfStream_p.cpp
Major update to BamTools version 1.0
[bamtools.git] / src / api / internal / BgzfStream_p.cpp
diff --git a/src/api/internal/BgzfStream_p.cpp b/src/api/internal/BgzfStream_p.cpp
new file mode 100644 (file)
index 0000000..fb67799
--- /dev/null
@@ -0,0 +1,444 @@
+// ***************************************************************************
+// BgzfStream_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 21 March 2011(DB)
+// ---------------------------------------------------------------------------
+// Based on BGZF routines developed at the Broad Institute.
+// Provides the basic functionality for reading & writing BGZF files
+// Replaces the old BGZF.* files to avoid clashing with other toolkits
+// ***************************************************************************
+
+#include <api/internal/BgzfStream_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cstring>
+#include <algorithm>
+using namespace std;
+
+// constructor
+BgzfStream::BgzfStream(void)
+    : UncompressedBlockSize(Constants::BGZF_DEFAULT_BLOCK_SIZE)
+    , CompressedBlockSize(Constants::BGZF_MAX_BLOCK_SIZE)
+    , BlockLength(0)
+    , BlockOffset(0)
+    , BlockAddress(0)
+    , IsOpen(false)
+    , IsWriteOnly(false)
+    , IsWriteCompressed(true)
+    , Stream(NULL)
+    , UncompressedBlock(NULL)
+    , CompressedBlock(NULL)
+{
+    try {
+        CompressedBlock   = new char[CompressedBlockSize];
+        UncompressedBlock = new char[UncompressedBlockSize];
+    } catch( std::bad_alloc& ba ) {
+        fprintf(stderr, "BgzfStream ERROR: unable to allocate memory\n");
+        exit(1);
+    }
+}
+
+// destructor
+BgzfStream::~BgzfStream(void) {
+    if( CompressedBlock   ) delete[] CompressedBlock;
+    if( UncompressedBlock ) delete[] UncompressedBlock;
+}
+
+// closes BGZF file
+void BgzfStream::Close(void) {
+
+    // skip if file not open
+    if ( !IsOpen ) return;
+
+    // if writing to file, flush the current BGZF block,
+    // then write an empty block (as EOF marker)
+    if ( IsWriteOnly ) {
+        FlushBlock();
+        int blockLength = DeflateBlock();
+        fwrite(CompressedBlock, 1, blockLength, Stream);
+    }
+
+    // flush and close stream
+    fflush(Stream);
+    fclose(Stream);
+
+    // reset flags
+    IsWriteCompressed = true;
+    IsOpen = false;
+}
+
+// compresses the current block
+int BgzfStream::DeflateBlock(void) {
+
+    // initialize the gzip header
+    char* buffer = CompressedBlock;
+    memset(buffer, 0, 18);
+    buffer[0]  = Constants::GZIP_ID1;
+    buffer[1]  = (char)Constants::GZIP_ID2;
+    buffer[2]  = Constants::CM_DEFLATE;
+    buffer[3]  = Constants::FLG_FEXTRA;
+    buffer[9]  = (char)Constants::OS_UNKNOWN;
+    buffer[10] = Constants::BGZF_XLEN;
+    buffer[12] = Constants::BGZF_ID1;
+    buffer[13] = Constants::BGZF_ID2;
+    buffer[14] = Constants::BGZF_LEN;
+
+    // set compression level
+    const int compressionLevel = ( IsWriteCompressed ? Z_DEFAULT_COMPRESSION : 0 );
+
+    // loop to retry for blocks that do not compress enough
+    int inputLength = BlockOffset;
+    int compressedLength = 0;
+    unsigned int bufferSize = CompressedBlockSize;
+
+    while ( true ) {
+
+        // initialize zstream values
+        z_stream zs;
+        zs.zalloc    = NULL;
+        zs.zfree     = NULL;
+        zs.next_in   = (Bytef*)UncompressedBlock;
+        zs.avail_in  = inputLength;
+        zs.next_out  = (Bytef*)&buffer[Constants::BGZF_BLOCK_HEADER_LENGTH];
+        zs.avail_out = bufferSize - Constants::BGZF_BLOCK_HEADER_LENGTH - Constants::BGZF_BLOCK_FOOTER_LENGTH;
+
+        // initialize the zlib compression algorithm
+        if ( deflateInit2(&zs,
+                          compressionLevel,
+                          Z_DEFLATED,
+                          Constants::GZIP_WINDOW_BITS,
+                          Constants::Z_DEFAULT_MEM_LEVEL,
+                          Z_DEFAULT_STRATEGY) != Z_OK )
+        {
+            fprintf(stderr, "BgzfStream ERROR: zlib deflate initialization failed\n");
+            exit(1);
+        }
+
+        // compress the data
+        int status = deflate(&zs, Z_FINISH);
+        if ( status != Z_STREAM_END ) {
+
+            deflateEnd(&zs);
+
+            // reduce the input length and try again
+            if ( status == Z_OK ) {
+                inputLength -= 1024;
+                if ( inputLength < 0 ) {
+                    fprintf(stderr, "BgzfStream ERROR: input reduction failed\n");
+                    exit(1);
+                }
+                continue;
+            }
+
+            fprintf(stderr, "BgzfStream ERROR: zlib::deflateEnd() failed\n");
+            exit(1);
+        }
+
+        // finalize the compression routine
+        if ( deflateEnd(&zs) != Z_OK ) {
+            fprintf(stderr, "BgzfStream ERROR: zlib::deflateEnd() failed\n");
+            exit(1);
+        }
+
+        compressedLength = zs.total_out;
+        compressedLength += Constants::BGZF_BLOCK_HEADER_LENGTH + Constants::BGZF_BLOCK_FOOTER_LENGTH;
+        if ( compressedLength > Constants::BGZF_MAX_BLOCK_SIZE ) {
+            fprintf(stderr, "BgzfStream ERROR: deflate overflow\n");
+            exit(1);
+        }
+
+        break;
+    }
+
+    // store the compressed length
+    BamTools::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));
+
+    // store the CRC32 checksum
+    unsigned int crc = crc32(0, NULL, 0);
+    crc = crc32(crc, (Bytef*)UncompressedBlock, inputLength);
+    BamTools::PackUnsignedInt(&buffer[compressedLength - 8], crc);
+    BamTools::PackUnsignedInt(&buffer[compressedLength - 4], inputLength);
+
+    // ensure that we have less than a block of data left
+    int remaining = BlockOffset - inputLength;
+    if ( remaining > 0 ) {
+        if ( remaining > inputLength ) {
+            fprintf(stderr, "BgzfStream ERROR: after deflate, remainder too large\n");
+            exit(1);
+        }
+        memcpy(UncompressedBlock, UncompressedBlock + inputLength, remaining);
+    }
+
+    // update block data
+    BlockOffset = remaining;
+
+    // return result
+    return compressedLength;
+}
+
+// flushes the data in the BGZF block
+void BgzfStream::FlushBlock(void) {
+
+    // flush all of the remaining blocks
+    while ( BlockOffset > 0 ) {
+
+        // compress the data block
+        int blockLength = DeflateBlock();
+
+        // flush the data to our output stream
+        int numBytesWritten = fwrite(CompressedBlock, 1, blockLength, Stream);
+        if ( numBytesWritten != blockLength ) {
+            fprintf(stderr, "BgzfStream ERROR: expected to write %u bytes during flushing, but wrote %u bytes\n",
+                    blockLength, numBytesWritten);
+            exit(1);
+        }
+
+        // update block data
+        BlockAddress += blockLength;
+    }
+}
+
+// decompresses the current block
+int BgzfStream::InflateBlock(const int& blockLength) {
+
+    // inflate the data from compressed buffer into uncompressed buffer
+    z_stream zs;
+    zs.zalloc    = NULL;
+    zs.zfree     = NULL;
+    zs.next_in   = (Bytef*)CompressedBlock + 18;
+    zs.avail_in  = blockLength - 16;
+    zs.next_out  = (Bytef*)UncompressedBlock;
+    zs.avail_out = UncompressedBlockSize;
+
+    int status = inflateInit2(&zs, Constants::GZIP_WINDOW_BITS);
+    if ( status != Z_OK ) {
+        fprintf(stderr, "BgzfStream ERROR: could not decompress block - zlib::inflateInit() failed\n");
+        return -1;
+    }
+
+    status = inflate(&zs, Z_FINISH);
+    if ( status != Z_STREAM_END ) {
+        inflateEnd(&zs);
+        fprintf(stderr, "BgzfStream ERROR: could not decompress block - zlib::inflate() failed\n");
+        return -1;
+    }
+
+    status = inflateEnd(&zs);
+    if ( status != Z_OK ) {
+        fprintf(stderr, "BgzfStream ERROR: could not decompress block - zlib::inflateEnd() failed\n");
+        return -1;
+    }
+
+    // return result
+    return zs.total_out;
+}
+
+// opens the BGZF file for reading (mode is either "rb" for reading, or "wb" for writing)
+bool BgzfStream::Open(const string& filename, const char* mode) {
+
+    // close current stream, if necessary, before opening next
+    if ( IsOpen ) Close();
+
+    // determine open mode
+    if ( strcmp(mode, "rb") == 0 )
+        IsWriteOnly = false;
+    else if ( strcmp(mode, "wb") == 0)
+        IsWriteOnly = true;
+    else {
+        fprintf(stderr, "BgzfStream ERROR: unknown file mode: %s\n", mode);
+        return false;
+    }
+
+    // open BGZF stream on a file
+    if ( (filename != "stdin") && (filename != "stdout") )
+        Stream = fopen(filename.c_str(), mode);
+
+    // open BGZF stream on stdin
+    else if ( (filename == "stdin") && (strcmp(mode, "rb") == 0 ) )
+        Stream = freopen(NULL, mode, stdin);
+
+    // open BGZF stream on stdout
+    else if ( (filename == "stdout") && (strcmp(mode, "wb") == 0) )
+        Stream = freopen(NULL, mode, stdout);
+
+    if ( !Stream ) {
+        fprintf(stderr, "BgzfStream ERROR: unable to open file %s\n", filename.c_str() );
+        return false;
+    }
+
+    // set flag & return success
+    IsOpen = true;
+    return true;
+}
+
+// reads BGZF data into a byte buffer
+int BgzfStream::Read(char* data, const unsigned int dataLength) {
+
+    // if stream not open for reading (or empty request)
+    if ( !IsOpen || IsWriteOnly || dataLength == 0 )
+        return 0;
+
+    // read blocks as needed until desired data length is retrieved
+    char* output = data;
+    unsigned int numBytesRead = 0;
+    while ( numBytesRead < dataLength ) {
+
+        // determine bytes available in current block
+        int bytesAvailable = BlockLength - BlockOffset;
+
+        // read (and decompress) next block if needed
+        if ( bytesAvailable <= 0 ) {
+            if ( !ReadBlock() ) return -1;
+            bytesAvailable = BlockLength - BlockOffset;
+            if ( bytesAvailable <= 0 ) break;
+        }
+
+        // copy data from uncompressed source buffer into data destination buffer
+        char* buffer   = UncompressedBlock;
+        int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );
+        memcpy(output, buffer + BlockOffset, copyLength);
+
+        // update counters
+        BlockOffset  += copyLength;
+        output       += copyLength;
+        numBytesRead += copyLength;
+    }
+
+    // update block data
+    if ( BlockOffset == BlockLength ) {
+        BlockAddress = ftell64(Stream);
+        BlockOffset  = 0;
+        BlockLength  = 0;
+    }
+
+    return numBytesRead;
+}
+
+// reads a BGZF block
+bool BgzfStream::ReadBlock(void) {
+
+    char header[Constants::BGZF_BLOCK_HEADER_LENGTH];
+    int64_t blockAddress = ftell64(Stream);
+
+    // read block header from file
+    int count = fread(header, 1, sizeof(header), Stream);
+
+    // if block header empty
+    if ( count == 0 ) {
+        BlockLength = 0;
+        return true;
+    }
+
+    // if block header invalid size
+    if ( count != sizeof(header) ) {
+        fprintf(stderr, "BgzfStream ERROR: read block failed - could not read block header\n");
+        return false;
+    }
+
+    // validate block header contents
+    if ( !BgzfStream::CheckBlockHeader(header) ) {
+        fprintf(stderr, "BgzfStream ERROR: read block failed - invalid block header\n");
+        return false;
+    }
+
+    // copy header contents to compressed buffer
+    int blockLength = BamTools::UnpackUnsignedShort(&header[16]) + 1;
+    char* compressedBlock = CompressedBlock;
+    memcpy(compressedBlock, header, Constants::BGZF_BLOCK_HEADER_LENGTH);
+    int remaining = blockLength - Constants::BGZF_BLOCK_HEADER_LENGTH;
+
+    // read remainder of block
+    count = fread(&compressedBlock[Constants::BGZF_BLOCK_HEADER_LENGTH], 1, remaining, Stream);
+    if ( count != remaining ) {
+        fprintf(stderr, "BgzfStream ERROR: read block failed - could not read data from block\n");
+        return false;
+    }
+
+    // decompress block data
+    count = InflateBlock(blockLength);
+    if ( count < 0 ) {
+        fprintf(stderr, "BgzfStream ERROR: read block failed - could not decompress block data\n");
+        return false;
+    }
+
+    // update block data
+    if ( BlockLength != 0 )
+        BlockOffset = 0;
+    BlockAddress = blockAddress;
+    BlockLength  = count;
+
+    // return success
+    return true;
+}
+
+// seek to position in BGZF file
+bool BgzfStream::Seek(int64_t position) {
+
+    // skip if not open
+    if ( !IsOpen ) return false;
+
+    // determine adjusted offset & address
+    int     blockOffset  = (position & 0xFFFF);
+    int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
+
+    // attempt seek in file
+    if ( fseek64(Stream, blockAddress, SEEK_SET) != 0 ) {
+        fprintf(stderr, "BgzfStream ERROR: unable to seek in file\n");
+        return false;
+    }
+
+    // update block data
+    BlockLength  = 0;
+    BlockAddress = blockAddress;
+    BlockOffset  = blockOffset;
+
+    // return success
+    return true;
+}
+
+void BgzfStream::SetWriteCompressed(bool ok) {
+    IsWriteCompressed = ok;
+}
+
+// get file position in BGZF file
+int64_t BgzfStream::Tell(void) {
+
+    // skip if file not open
+    if ( !IsOpen ) return false;
+
+    // otherwise return file pointer position
+    return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) );
+}
+
+// writes the supplied data into the BGZF buffer
+unsigned int BgzfStream::Write(const char* data, const unsigned int dataLen) {
+
+    // skip if file not open for writing
+    if ( !IsOpen || !IsWriteOnly ) return false;
+
+    // write blocks as needed til all data is written
+    unsigned int numBytesWritten = 0;
+    const char* input = data;
+    unsigned int blockLength = UncompressedBlockSize;
+    while ( numBytesWritten < dataLen ) {
+
+        // copy data contents to uncompressed output buffer
+        unsigned int copyLength = min(blockLength - BlockOffset, dataLen - numBytesWritten);
+        char* buffer = UncompressedBlock;
+        memcpy(buffer + BlockOffset, input, copyLength);
+
+        // update counter
+        BlockOffset     += copyLength;
+        input           += copyLength;
+        numBytesWritten += copyLength;
+
+        // flush (& compress) output buffer when full
+        if ( BlockOffset == blockLength ) FlushBlock();
+    }
+
+    // return result
+    return numBytesWritten;
+}