]> git.donarmstrong.com Git - bamtools.git/blobdiff - src/api/BGZF.cpp
Reorganized source tree & build system
[bamtools.git] / src / api / BGZF.cpp
diff --git a/src/api/BGZF.cpp b/src/api/BGZF.cpp
new file mode 100644 (file)
index 0000000..92afb96
--- /dev/null
@@ -0,0 +1,398 @@
+// ***************************************************************************\r
+// BGZF.cpp (c) 2009 Derek Barnett, Michael Str�mberg\r
+// Marth Lab, Department of Biology, Boston College\r
+// All rights reserved.\r
+// ---------------------------------------------------------------------------\r
+// Last modified: 16 August 2010 (DB)\r
+// ---------------------------------------------------------------------------\r
+// BGZF routines were adapted from the bgzf.c code developed at the Broad\r
+// Institute.\r
+// ---------------------------------------------------------------------------\r
+// Provides the basic functionality for reading & writing BGZF files\r
+// ***************************************************************************\r
+\r
+#include <algorithm>\r
+#include "BGZF.h"\r
+using namespace BamTools;\r
+using std::string;\r
+using std::min;\r
+\r
+BgzfData::BgzfData(void)\r
+    : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)\r
+    , CompressedBlockSize(MAX_BLOCK_SIZE)\r
+    , BlockLength(0)\r
+    , BlockOffset(0)\r
+    , BlockAddress(0)\r
+    , IsOpen(false)\r
+    , IsWriteOnly(false)\r
+    , IsWriteUncompressed(false)\r
+    , Stream(NULL)\r
+    , UncompressedBlock(NULL)\r
+    , CompressedBlock(NULL)\r
+{\r
+    try {\r
+        CompressedBlock   = new char[CompressedBlockSize];\r
+        UncompressedBlock = new char[UncompressedBlockSize];\r
+    } catch( std::bad_alloc& ba ) {\r
+        printf("BGZF ERROR: unable to allocate memory for our BGZF object.\n");\r
+        exit(1);\r
+    }\r
+}\r
+\r
+// destructor\r
+BgzfData::~BgzfData(void) {\r
+    if( CompressedBlock   ) delete[] CompressedBlock;\r
+    if( UncompressedBlock ) delete[] UncompressedBlock;\r
+}\r
+\r
+// closes BGZF file\r
+void BgzfData::Close(void) {\r
+\r
+    // skip if file not open, otherwise set flag\r
+    if ( !IsOpen ) return;\r
+\r
+    // if writing to file, flush the current BGZF block,\r
+    // then write an empty block (as EOF marker)\r
+    if ( IsWriteOnly ) {\r
+        FlushBlock();\r
+        int blockLength = DeflateBlock();\r
+        fwrite(CompressedBlock, 1, blockLength, Stream);\r
+    }\r
+    \r
+    // flush and close\r
+    fflush(Stream);\r
+    fclose(Stream);\r
+    IsWriteUncompressed = false;\r
+    IsOpen = false;\r
+}\r
+\r
+// compresses the current block\r
+int BgzfData::DeflateBlock(void) {\r
+\r
+    // initialize the gzip header\r
+    char* buffer = CompressedBlock;\r
+    memset(buffer, 0, 18);\r
+    buffer[0]  = GZIP_ID1;\r
+    buffer[1]  = (char)GZIP_ID2;\r
+    buffer[2]  = CM_DEFLATE;\r
+    buffer[3]  = FLG_FEXTRA;\r
+    buffer[9]  = (char)OS_UNKNOWN;\r
+    buffer[10] = BGZF_XLEN;\r
+    buffer[12] = BGZF_ID1;\r
+    buffer[13] = BGZF_ID2;\r
+    buffer[14] = BGZF_LEN;\r
+\r
+    // set compression level\r
+    const int compressionLevel = ( IsWriteUncompressed ? 0 : Z_DEFAULT_COMPRESSION );\r
+    \r
+    // loop to retry for blocks that do not compress enough\r
+    int inputLength = BlockOffset;\r
+    int compressedLength = 0;\r
+    unsigned int bufferSize = CompressedBlockSize;\r
+\r
+    while ( true ) {\r
+        \r
+        // initialize zstream values\r
+        z_stream zs;\r
+        zs.zalloc    = NULL;\r
+        zs.zfree     = NULL;\r
+        zs.next_in   = (Bytef*)UncompressedBlock;\r
+        zs.avail_in  = inputLength;\r
+        zs.next_out  = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];\r
+        zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;\r
+\r
+        // initialize the zlib compression algorithm\r
+        if ( deflateInit2(&zs, compressionLevel, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK ) {\r
+            printf("BGZF ERROR: zlib deflate initialization failed.\n");\r
+            exit(1);\r
+        }\r
+\r
+        // compress the data\r
+        int status = deflate(&zs, Z_FINISH);\r
+        if ( status != Z_STREAM_END ) {\r
+\r
+            deflateEnd(&zs);\r
+\r
+            // reduce the input length and try again\r
+            if ( status == Z_OK ) {\r
+                inputLength -= 1024;\r
+                if( inputLength < 0 ) {\r
+                    printf("BGZF ERROR: input reduction failed.\n");\r
+                    exit(1);\r
+                }\r
+                continue;\r
+            }\r
+\r
+            printf("BGZF ERROR: zlib::deflateEnd() failed.\n");\r
+            exit(1);\r
+        }\r
+\r
+        // finalize the compression routine\r
+        if ( deflateEnd(&zs) != Z_OK ) {\r
+            printf("BGZF ERROR: zlib::deflateEnd() failed.\n");\r
+            exit(1);\r
+        }\r
+\r
+        compressedLength = zs.total_out;\r
+        compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;\r
+        if ( compressedLength > MAX_BLOCK_SIZE ) {\r
+            printf("BGZF ERROR: deflate overflow.\n");\r
+            exit(1);\r
+        }\r
+\r
+        break;\r
+    }\r
+\r
+    // store the compressed length\r
+    BgzfData::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));\r
+\r
+    // store the CRC32 checksum\r
+    unsigned int crc = crc32(0, NULL, 0);\r
+    crc = crc32(crc, (Bytef*)UncompressedBlock, inputLength);\r
+    BgzfData::PackUnsignedInt(&buffer[compressedLength - 8], crc);\r
+    BgzfData::PackUnsignedInt(&buffer[compressedLength - 4], inputLength);\r
+\r
+    // ensure that we have less than a block of data left\r
+    int remaining = BlockOffset - inputLength;\r
+    if ( remaining > 0 ) {\r
+        if ( remaining > inputLength ) {\r
+            printf("BGZF ERROR: after deflate, remainder too large.\n");\r
+            exit(1);\r
+        }\r
+        memcpy(UncompressedBlock, UncompressedBlock + inputLength, remaining);\r
+    }\r
+\r
+    BlockOffset = remaining;\r
+    return compressedLength;\r
+}\r
+\r
+// flushes the data in the BGZF block\r
+void BgzfData::FlushBlock(void) {\r
+\r
+    // flush all of the remaining blocks\r
+    while ( BlockOffset > 0 ) {\r
+\r
+        // compress the data block\r
+        int blockLength = DeflateBlock();\r
+\r
+        // flush the data to our output stream\r
+        int numBytesWritten = fwrite(CompressedBlock, 1, blockLength, Stream);\r
+\r
+        if ( numBytesWritten != blockLength ) {\r
+          printf("BGZF ERROR: expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten);\r
+          exit(1);\r
+        }\r
+              \r
+        BlockAddress += blockLength;\r
+    }\r
+}\r
+\r
+// de-compresses the current block\r
+int BgzfData::InflateBlock(const int& blockLength) {\r
+\r
+    // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock\r
+    z_stream zs;\r
+    zs.zalloc    = NULL;\r
+    zs.zfree     = NULL;\r
+    zs.next_in   = (Bytef*)CompressedBlock + 18;\r
+    zs.avail_in  = blockLength - 16;\r
+    zs.next_out  = (Bytef*)UncompressedBlock;\r
+    zs.avail_out = UncompressedBlockSize;\r
+\r
+    int status = inflateInit2(&zs, GZIP_WINDOW_BITS);\r
+    if ( status != Z_OK ) {\r
+        printf("BGZF ERROR: could not decompress block - zlib::inflateInit() failed\n");\r
+        return -1;\r
+    }\r
+\r
+    status = inflate(&zs, Z_FINISH);\r
+    if ( status != Z_STREAM_END ) {\r
+        inflateEnd(&zs);\r
+        printf("BGZF ERROR: could not decompress block - zlib::inflate() failed\n");\r
+        return -1;\r
+    }\r
+\r
+    status = inflateEnd(&zs);\r
+    if ( status != Z_OK ) {\r
+        printf("BGZF ERROR: could not decompress block - zlib::inflateEnd() failed\n");\r
+        return -1;\r
+    }\r
+\r
+    return zs.total_out;\r
+}\r
+\r
+// opens the BGZF file for reading (mode is either "rb" for reading, or "wb" for writing)\r
+bool BgzfData::Open(const string& filename, const char* mode, bool isWriteUncompressed ) {\r
+\r
+    // determine open mode\r
+    if ( strcmp(mode, "rb") == 0 )\r
+        IsWriteOnly = false;\r
+    else if ( strcmp(mode, "wb") == 0) \r
+        IsWriteOnly = true;\r
+    else {\r
+        printf("BGZF ERROR: unknown file mode: %s\n", mode);\r
+        return false; \r
+    }\r
+\r
+    // ----------------------------------------------------------------\r
+    // open Stream to read to/write from file, stdin, or stdout\r
+    // stdin/stdout option contributed by Aaron Quinlan (2010-Jan-03)\r
+    \r
+    // read/write BGZF data to/from a file\r
+    if ( (filename != "stdin") && (filename != "stdout") )\r
+        Stream = fopen(filename.c_str(), mode);\r
+    \r
+    // read BGZF data from stdin\r
+    else if ( (filename == "stdin") && (strcmp(mode, "rb") == 0 ) )\r
+        Stream = freopen(NULL, mode, stdin);\r
+    \r
+    // write BGZF data to stdout\r
+    else if ( (filename == "stdout") && (strcmp(mode, "wb") == 0) )\r
+        Stream = freopen(NULL, mode, stdout);\r
+\r
+    if ( !Stream ) {\r
+        printf("BGZF ERROR: unable to open file %s\n", filename.c_str() );\r
+        return false;\r
+    }\r
+    \r
+    // set flags, return success\r
+    IsOpen = true;\r
+    IsWriteUncompressed = isWriteUncompressed;\r
+    return true;\r
+}\r
+\r
+// reads BGZF data into a byte buffer\r
+int BgzfData::Read(char* data, const unsigned int dataLength) {\r
+\r
+   if ( !IsOpen || IsWriteOnly || dataLength == 0 ) return 0;\r
+\r
+   char* output = data;\r
+   unsigned int numBytesRead = 0;\r
+   while ( numBytesRead < dataLength ) {\r
+\r
+       int bytesAvailable = BlockLength - BlockOffset;\r
+       if ( bytesAvailable <= 0 ) {\r
+           if ( !ReadBlock() ) return -1; \r
+           bytesAvailable = BlockLength - BlockOffset;\r
+           if ( bytesAvailable <= 0 ) break;\r
+       }\r
+\r
+       char* buffer   = UncompressedBlock;\r
+       int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );\r
+       memcpy(output, buffer + BlockOffset, copyLength);\r
+\r
+       BlockOffset  += copyLength;\r
+       output       += copyLength;\r
+       numBytesRead += copyLength;\r
+   }\r
+\r
+   if ( BlockOffset == BlockLength ) {\r
+       BlockAddress = ftell64(Stream);\r
+       BlockOffset  = 0;\r
+       BlockLength  = 0;\r
+   }\r
+\r
+   return numBytesRead;\r
+}\r
+\r
+// reads a BGZF block\r
+bool BgzfData::ReadBlock(void) {\r
+\r
+    char    header[BLOCK_HEADER_LENGTH];\r
+    int64_t blockAddress = ftell64(Stream);\r
+    \r
+    int count = fread(header, 1, sizeof(header), Stream);\r
+    if ( count == 0 ) {\r
+        BlockLength = 0;\r
+        return true;\r
+    }\r
+\r
+    if ( count != sizeof(header) ) {\r
+        printf("BGZF ERROR: read block failed - could not read block header\n");\r
+        return false;\r
+    }\r
+\r
+    if ( !BgzfData::CheckBlockHeader(header) ) {\r
+        printf("BGZF ERROR: read block failed - invalid block header\n");\r
+        return false;\r
+    }\r
+\r
+    int blockLength = BgzfData::UnpackUnsignedShort(&header[16]) + 1;\r
+    char* compressedBlock = CompressedBlock;\r
+    memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH);\r
+    int remaining = blockLength - BLOCK_HEADER_LENGTH;\r
+\r
+    count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, Stream);\r
+    if ( count != remaining ) {\r
+        printf("BGZF ERROR: read block failed - could not read data from block\n");\r
+        return false;\r
+    }\r
+\r
+    count = InflateBlock(blockLength);\r
+    if ( count < 0 ) { \r
+      printf("BGZF ERROR: read block failed - could not decompress block data\n");\r
+      return false;\r
+    }\r
+\r
+    if ( BlockLength != 0 )\r
+        BlockOffset = 0;\r
+\r
+    BlockAddress = blockAddress;\r
+    BlockLength  = count;\r
+    return true;\r
+}\r
+\r
+// seek to position in BGZF file\r
+bool BgzfData::Seek(int64_t position) {\r
+\r
+    if ( !IsOpen ) return false;\r
+  \r
+    int     blockOffset  = (position & 0xFFFF);\r
+    int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;\r
+\r
+    if ( fseek64(Stream, blockAddress, SEEK_SET) != 0 ) {\r
+        printf("BGZF ERROR: unable to seek in file\n");\r
+        return false;\r
+    }\r
+\r
+    BlockLength  = 0;\r
+    BlockAddress = blockAddress;\r
+    BlockOffset  = blockOffset;\r
+    return true;\r
+}\r
+\r
+// get file position in BGZF file\r
+int64_t BgzfData::Tell(void) {\r
+    if ( !IsOpen ) \r
+        return false;\r
+    else \r
+        return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) );\r
+}\r
+\r
+// writes the supplied data into the BGZF buffer\r
+unsigned int BgzfData::Write(const char* data, const unsigned int dataLen) {\r
+\r
+    if ( !IsOpen || !IsWriteOnly ) return false;\r
+  \r
+    // initialize\r
+    unsigned int numBytesWritten = 0;\r
+    const char* input = data;\r
+    unsigned int blockLength = UncompressedBlockSize;\r
+\r
+    // copy the data to the buffer\r
+    while ( numBytesWritten < dataLen ) {\r
+      \r
+        unsigned int copyLength = min(blockLength - BlockOffset, dataLen - numBytesWritten);\r
+        char* buffer = UncompressedBlock;\r
+        memcpy(buffer + BlockOffset, input, copyLength);\r
+\r
+        BlockOffset     += copyLength;\r
+        input           += copyLength;\r
+        numBytesWritten += copyLength;\r
+\r
+        if ( BlockOffset == blockLength )\r
+            FlushBlock();\r
+    }\r
+\r
+    return numBytesWritten;\r
+}\r