--- /dev/null
+// ***************************************************************************\r
+// BGZF.cpp (c) 2009 Derek Barnett, Michael Strömberg\r
+// Marth Lab, Department of Biology, Boston College\r
+// All rights reserved.\r
+// ---------------------------------------------------------------------------\r
+// Last modified: 8 December 2009 (DB)\r
+// ---------------------------------------------------------------------------\r
+// BGZF routines were adapted from the bgzf.c code developed at the Broad\r
+// Institute.\r
+// ---------------------------------------------------------------------------\r
+// Provides the basic functionality for reading & writing BGZF files\r
+// ***************************************************************************\r
+\r
+#include <algorithm>\r
+#include "BGZF.h"\r
+using namespace BamTools;\r
+using std::string;\r
+using std::min;\r
+\r
+BgzfData::BgzfData(void)\r
+ : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)\r
+ , CompressedBlockSize(MAX_BLOCK_SIZE)\r
+ , BlockLength(0)\r
+ , BlockOffset(0)\r
+ , BlockAddress(0)\r
+ , IsOpen(false)\r
+ , IsWriteOnly(false)\r
+ , Stream(NULL)\r
+ , UncompressedBlock(NULL)\r
+ , CompressedBlock(NULL)\r
+{\r
+ try {\r
+ CompressedBlock = new char[CompressedBlockSize];\r
+ UncompressedBlock = new char[UncompressedBlockSize];\r
+ } catch( std::bad_alloc& ba ) {\r
+ printf("ERROR: Unable to allocate memory for our BGZF object.\n");\r
+ exit(1);\r
+ }\r
+}\r
+\r
+// destructor\r
+BgzfData::~BgzfData(void) {\r
+ if(CompressedBlock) delete [] CompressedBlock;\r
+ if(UncompressedBlock) delete [] UncompressedBlock;\r
+}\r
+\r
+// closes BGZF file\r
+void BgzfData::Close(void) {\r
+\r
+ if (!IsOpen) { return; }\r
+ IsOpen = false;\r
+\r
+ // flush the BGZF block\r
+ if ( IsWriteOnly ) { FlushBlock(); }\r
+\r
+ // flush and close\r
+ fflush(Stream);\r
+ fclose(Stream);\r
+}\r
+\r
+// compresses the current block\r
+int BgzfData::DeflateBlock(void) {\r
+\r
+ // initialize the gzip header\r
+ char* buffer = CompressedBlock;\r
+ unsigned int bufferSize = CompressedBlockSize;\r
+\r
+ memset(buffer, 0, 18);\r
+ buffer[0] = GZIP_ID1;\r
+ buffer[1] = (char)GZIP_ID2;\r
+ buffer[2] = CM_DEFLATE;\r
+ buffer[3] = FLG_FEXTRA;\r
+ buffer[9] = (char)OS_UNKNOWN;\r
+ buffer[10] = BGZF_XLEN;\r
+ buffer[12] = BGZF_ID1;\r
+ buffer[13] = BGZF_ID2;\r
+ buffer[14] = BGZF_LEN;\r
+\r
+ // loop to retry for blocks that do not compress enough\r
+ int inputLength = BlockOffset;\r
+ int compressedLength = 0;\r
+\r
+ while(true) {\r
+\r
+ z_stream zs;\r
+ zs.zalloc = NULL;\r
+ zs.zfree = NULL;\r
+ zs.next_in = (Bytef*)UncompressedBlock;\r
+ zs.avail_in = inputLength;\r
+ zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];\r
+ zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;\r
+\r
+ // initialize the zlib compression algorithm\r
+ if(deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK) {\r
+ printf("ERROR: zlib deflate initialization failed.\n");\r
+ exit(1);\r
+ }\r
+\r
+ // compress the data\r
+ int status = deflate(&zs, Z_FINISH);\r
+ if(status != Z_STREAM_END) {\r
+\r
+ deflateEnd(&zs);\r
+\r
+ // reduce the input length and try again\r
+ if(status == Z_OK) {\r
+ inputLength -= 1024;\r
+ if(inputLength < 0) {\r
+ printf("ERROR: input reduction failed.\n");\r
+ exit(1);\r
+ }\r
+ continue;\r
+ }\r
+\r
+ printf("ERROR: zlib deflate failed.\n");\r
+ exit(1);\r
+ }\r
+\r
+ // finalize the compression routine\r
+ if(deflateEnd(&zs) != Z_OK) {\r
+ printf("ERROR: deflate end failed.\n");\r
+ exit(1);\r
+ }\r
+\r
+ compressedLength = zs.total_out;\r
+ compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;\r
+\r
+ if(compressedLength > MAX_BLOCK_SIZE) {\r
+ printf("ERROR: deflate overflow.\n");\r
+ exit(1);\r
+ }\r
+\r
+ break;\r
+ }\r
+\r
+ // store the compressed length\r
+ BgzfData::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));\r
+\r
+ // store the CRC32 checksum\r
+ unsigned int crc = crc32(0, NULL, 0);\r
+ crc = crc32(crc, (Bytef*)UncompressedBlock, inputLength);\r
+ BgzfData::PackUnsignedInt(&buffer[compressedLength - 8], crc);\r
+ BgzfData::PackUnsignedInt(&buffer[compressedLength - 4], inputLength);\r
+\r
+ // ensure that we have less than a block of data left\r
+ int remaining = BlockOffset - inputLength;\r
+ if(remaining > 0) {\r
+ if(remaining > inputLength) {\r
+ printf("ERROR: remainder too large.\n");\r
+ exit(1);\r
+ }\r
+ memcpy(UncompressedBlock, UncompressedBlock + inputLength, remaining);\r
+ }\r
+\r
+ BlockOffset = remaining;\r
+ return compressedLength;\r
+}\r
+\r
+// flushes the data in the BGZF block\r
+void BgzfData::FlushBlock(void) {\r
+\r
+ // flush all of the remaining blocks\r
+ while(BlockOffset > 0) {\r
+\r
+ // compress the data block\r
+ int blockLength = DeflateBlock();\r
+\r
+ // flush the data to our output stream\r
+ int numBytesWritten = fwrite(CompressedBlock, 1, blockLength, Stream);\r
+\r
+ if(numBytesWritten != blockLength) {\r
+ printf("ERROR: Expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten);\r
+ exit(1);\r
+ }\r
+\r
+ BlockAddress += blockLength;\r
+ }\r
+}\r
+\r
+// de-compresses the current block\r
+int BgzfData::InflateBlock(const int& blockLength) {\r
+\r
+ // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock\r
+ z_stream zs;\r
+ zs.zalloc = NULL;\r
+ zs.zfree = NULL;\r
+ zs.next_in = (Bytef*)CompressedBlock + 18;\r
+ zs.avail_in = blockLength - 16;\r
+ zs.next_out = (Bytef*)UncompressedBlock;\r
+ zs.avail_out = UncompressedBlockSize;\r
+\r
+ int status = inflateInit2(&zs, GZIP_WINDOW_BITS);\r
+ if (status != Z_OK) {\r
+ printf("inflateInit failed\n");\r
+ exit(1);\r
+ }\r
+\r
+ status = inflate(&zs, Z_FINISH);\r
+ if (status != Z_STREAM_END) {\r
+ inflateEnd(&zs);\r
+ printf("inflate failed\n");\r
+ exit(1);\r
+ }\r
+\r
+ status = inflateEnd(&zs);\r
+ if (status != Z_OK) {\r
+ printf("inflateEnd failed\n");\r
+ exit(1);\r
+ }\r
+\r
+ return zs.total_out;\r
+}\r
+\r
+void BgzfData::Open(const string& filename, const char* mode) {\r
+\r
+ if ( strcmp(mode, "rb") == 0 ) {\r
+ IsWriteOnly = false;\r
+ } else if ( strcmp(mode, "wb") == 0) {\r
+ IsWriteOnly = true;\r
+ } else {\r
+ printf("ERROR: Unknown file mode: %s\n", mode);\r
+ exit(1);\r
+ }\r
+\r
+ Stream = fopen(filename.c_str(), mode);\r
+ if(!Stream) {\r
+ printf("ERROR: Unable to open the BAM file %s\n", filename.c_str() );\r
+ exit(1);\r
+ }\r
+ IsOpen = true;\r
+}\r
+\r
+int BgzfData::Read(char* data, const unsigned int dataLength) {\r
+\r
+ if (dataLength == 0) { return 0; }\r
+\r
+ char* output = data;\r
+ unsigned int numBytesRead = 0;\r
+ while (numBytesRead < dataLength) {\r
+\r
+ int bytesAvailable = BlockLength - BlockOffset;\r
+ if (bytesAvailable <= 0) {\r
+ if ( ReadBlock() != 0 ) { return -1; }\r
+ bytesAvailable = BlockLength - BlockOffset;\r
+ if ( bytesAvailable <= 0 ) { break; }\r
+ }\r
+\r
+ char* buffer = UncompressedBlock;\r
+ int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );\r
+ memcpy(output, buffer + BlockOffset, copyLength);\r
+\r
+ BlockOffset += copyLength;\r
+ output += copyLength;\r
+ numBytesRead += copyLength;\r
+ }\r
+\r
+ if ( BlockOffset == BlockLength ) {\r
+ BlockAddress = ftell(Stream);\r
+ BlockOffset = 0;\r
+ BlockLength = 0;\r
+ }\r
+\r
+ return numBytesRead;\r
+}\r
+\r
+int BgzfData::ReadBlock(void) {\r
+\r
+ char header[BLOCK_HEADER_LENGTH];\r
+ int64_t blockAddress = ftell(Stream);\r
+\r
+ int count = fread(header, 1, sizeof(header), Stream);\r
+ if (count == 0) {\r
+ BlockLength = 0;\r
+ return 0;\r
+ }\r
+\r
+ if (count != sizeof(header)) {\r
+ printf("read block failed - count != sizeof(header)\n");\r
+ return -1;\r
+ }\r
+\r
+ if (!BgzfData::CheckBlockHeader(header)) {\r
+ printf("read block failed - CheckBlockHeader() returned false\n");\r
+ return -1;\r
+ }\r
+\r
+ int blockLength = BgzfData::UnpackUnsignedShort(&header[16]) + 1;\r
+ char* compressedBlock = CompressedBlock;\r
+ memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH);\r
+ int remaining = blockLength - BLOCK_HEADER_LENGTH;\r
+\r
+ count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, Stream);\r
+ if (count != remaining) {\r
+ printf("read block failed - count != remaining\n");\r
+ return -1;\r
+ }\r
+\r
+ count = InflateBlock(blockLength);\r
+ if (count < 0) { return -1; }\r
+\r
+ if ( BlockLength != 0 ) {\r
+ BlockOffset = 0;\r
+ }\r
+\r
+ BlockAddress = blockAddress;\r
+ BlockLength = count;\r
+ return 0;\r
+}\r
+\r
+bool BgzfData::Seek(int64_t position) {\r
+\r
+ int blockOffset = (position & 0xFFFF);\r
+ int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;\r
+\r
+ if (fseek(Stream, blockAddress, SEEK_SET) != 0) {\r
+ printf("ERROR: Unable to seek in BAM file\n");\r
+ exit(1);\r
+ }\r
+\r
+ BlockLength = 0;\r
+ BlockAddress = blockAddress;\r
+ BlockOffset = blockOffset;\r
+ return true;\r
+}\r
+\r
+int64_t BgzfData::Tell(void) {\r
+ return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) );\r
+}\r
+\r
+// writes the supplied data into the BGZF buffer\r
+unsigned int BgzfData::Write(const char* data, const unsigned int dataLen) {\r
+\r
+ // initialize\r
+ unsigned int numBytesWritten = 0;\r
+ const char* input = data;\r
+ unsigned int blockLength = UncompressedBlockSize;\r
+\r
+ // copy the data to the buffer\r
+ while(numBytesWritten < dataLen) {\r
+ unsigned int copyLength = min(blockLength - BlockOffset, dataLen - numBytesWritten);\r
+ char* buffer = UncompressedBlock;\r
+ memcpy(buffer + BlockOffset, input, copyLength);\r
+\r
+ BlockOffset += copyLength;\r
+ input += copyLength;\r
+ numBytesWritten += copyLength;\r
+\r
+ if(BlockOffset == blockLength) {\r
+ FlushBlock();\r
+ }\r
+ }\r
+\r
+ return numBytesWritten;\r
+}\r
--- /dev/null
+// ***************************************************************************\r
+// BGZF.h (c) 2009 Derek Barnett, Michael Strömberg\r
+// Marth Lab, Department of Biology, Boston College\r
+// All rights reserved.\r
+// ---------------------------------------------------------------------------\r
+// Last modified: 8 December 2009 (DB)\r
+// ---------------------------------------------------------------------------\r
+// BGZF routines were adapted from the bgzf.c code developed at the Broad\r
+// Institute.\r
+// ---------------------------------------------------------------------------\r
+// Provides the basic functionality for reading & writing BGZF files\r
+// ***************************************************************************\r
+\r
+#ifndef BGZF_H\r
+#define BGZF_H\r
+\r
+// 'C' includes\r
+#include <cstdio>\r
+#include <cstdlib>\r
+#include <cstring>\r
+\r
+// C++ includes\r
+#include <string>\r
+\r
+// zlib includes\r
+#include "zlib.h"\r
+\r
+// Platform-specific type definitions\r
+#ifdef _MSC_VER\r
+ typedef char int8_t;\r
+ typedef unsigned char uint8_t;\r
+ typedef short int16_t;\r
+ typedef unsigned short uint16_t;\r
+ typedef int int32_t;\r
+ typedef unsigned int uint32_t;\r
+ typedef long long int64_t;\r
+ typedef unsigned long long uint64_t;\r
+#else\r
+ #include <stdint.h>\r
+#endif\r
+\r
+namespace BamTools {\r
+\r
+// zlib constants\r
+const int GZIP_ID1 = 31;\r
+const int GZIP_ID2 = 139;\r
+const int CM_DEFLATE = 8;\r
+const int FLG_FEXTRA = 4;\r
+const int OS_UNKNOWN = 255;\r
+const int BGZF_XLEN = 6;\r
+const int BGZF_ID1 = 66;\r
+const int BGZF_ID2 = 67;\r
+const int BGZF_LEN = 2;\r
+const int GZIP_WINDOW_BITS = -15;\r
+const int Z_DEFAULT_MEM_LEVEL = 8;\r
+\r
+// BZGF constants\r
+const int BLOCK_HEADER_LENGTH = 18;\r
+const int BLOCK_FOOTER_LENGTH = 8;\r
+const int MAX_BLOCK_SIZE = 65536;\r
+const int DEFAULT_BLOCK_SIZE = 65536;\r
+\r
+struct BgzfData {\r
+\r
+ // data members\r
+ unsigned int UncompressedBlockSize;\r
+ unsigned int CompressedBlockSize;\r
+ unsigned int BlockLength;\r
+ unsigned int BlockOffset;\r
+ uint64_t BlockAddress;\r
+ bool IsOpen;\r
+ bool IsWriteOnly;\r
+ FILE* Stream;\r
+ char* UncompressedBlock;\r
+ char* CompressedBlock;\r
+\r
+ // constructor & destructor\r
+ BgzfData(void);\r
+ ~BgzfData(void);\r
+\r
+ // closes BGZF file\r
+ void Close(void);\r
+ // compresses the current block\r
+ int DeflateBlock(void);\r
+ // flushes the data in the BGZF block\r
+ void FlushBlock(void);\r
+ // de-compresses the current block\r
+ int InflateBlock(const int& blockLength);\r
+ // opens the BGZF file for reading (mode is either "rb" for reading, or "wb" for writing\r
+ void Open(const std::string& filename, const char* mode);\r
+ // reads BGZF data into a byte buffer\r
+ int Read(char* data, const unsigned int dataLength);\r
+ // reads BGZF block\r
+ int ReadBlock(void);\r
+ // seek to position in BAM file\r
+ bool Seek(int64_t position);\r
+ // get file position in BAM file\r
+ int64_t Tell(void);\r
+ // writes the supplied data into the BGZF buffer\r
+ unsigned int Write(const char* data, const unsigned int dataLen);\r
+\r
+ // checks BGZF block header\r
+ static inline bool CheckBlockHeader(char* header);\r
+ // packs an unsigned integer into the specified buffer\r
+ static inline void PackUnsignedInt(char* buffer, unsigned int value);\r
+ // packs an unsigned short into the specified buffer\r
+ static inline void PackUnsignedShort(char* buffer, unsigned short value);\r
+ // unpacks a buffer into a signed int\r
+ static inline signed int UnpackSignedInt(char* buffer);\r
+ // unpacks a buffer into a unsigned int\r
+ static inline unsigned int UnpackUnsignedInt(char* buffer);\r
+ // unpacks a buffer into a unsigned short\r
+ static inline unsigned short UnpackUnsignedShort(char* buffer);\r
+};\r
+\r
+// -------------------------------------------------------------\r
+\r
+inline\r
+bool BgzfData::CheckBlockHeader(char* header) {\r
+\r
+ return (header[0] == GZIP_ID1 &&\r
+ header[1] == (char)GZIP_ID2 &&\r
+ header[2] == Z_DEFLATED &&\r
+ (header[3] & FLG_FEXTRA) != 0 &&\r
+ BgzfData::UnpackUnsignedShort(&header[10]) == BGZF_XLEN &&\r
+ header[12] == BGZF_ID1 &&\r
+ header[13] == BGZF_ID2 &&\r
+ BgzfData::UnpackUnsignedShort(&header[14]) == BGZF_LEN );\r
+}\r
+\r
+// packs an unsigned integer into the specified buffer\r
+inline\r
+void BgzfData::PackUnsignedInt(char* buffer, unsigned int value) {\r
+ buffer[0] = (char)value;\r
+ buffer[1] = (char)(value >> 8);\r
+ buffer[2] = (char)(value >> 16);\r
+ buffer[3] = (char)(value >> 24);\r
+}\r
+\r
+// packs an unsigned short into the specified buffer\r
+inline\r
+void BgzfData::PackUnsignedShort(char* buffer, unsigned short value) {\r
+ buffer[0] = (char)value;\r
+ buffer[1] = (char)(value >> 8);\r
+}\r
+\r
+// unpacks a buffer into a signed int\r
+inline\r
+signed int BgzfData::UnpackSignedInt(char* buffer) {\r
+ union { signed int value; unsigned char valueBuffer[sizeof(signed int)]; } un;\r
+ un.value = 0;\r
+ un.valueBuffer[0] = buffer[0];\r
+ un.valueBuffer[1] = buffer[1];\r
+ un.valueBuffer[2] = buffer[2];\r
+ un.valueBuffer[3] = buffer[3];\r
+ return un.value;\r
+}\r
+\r
+// unpacks a buffer into an unsigned int\r
+inline\r
+unsigned int BgzfData::UnpackUnsignedInt(char* buffer) {\r
+ union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un;\r
+ un.value = 0;\r
+ un.valueBuffer[0] = buffer[0];\r
+ un.valueBuffer[1] = buffer[1];\r
+ un.valueBuffer[2] = buffer[2];\r
+ un.valueBuffer[3] = buffer[3];\r
+ return un.value;\r
+}\r
+\r
+// unpacks a buffer into an unsigned short\r
+inline\r
+unsigned short BgzfData::UnpackUnsignedShort(char* buffer) {\r
+ union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)];} un;\r
+ un.value = 0;\r
+ un.valueBuffer[0] = buffer[0];\r
+ un.valueBuffer[1] = buffer[1];\r
+ return un.value;\r
+}\r
+\r
+} // namespace BamTools\r
+\r
+#endif // BGZF_H\r
// Marth Lab, Department of Biology, Boston College\r
// All rights reserved.\r
// ---------------------------------------------------------------------------\r
-// Last modified: 1 October 2009 (DB)\r
+// Last modified: 8 December 2009 (DB)\r
// ---------------------------------------------------------------------------\r
-// The BGZF routines were adapted from the bgzf.c code developed at the Broad\r
-// Institute.\r
-// ---------------------------------------------------------------------------\r
-// Defines common constants, typedefs, & data structures for BamTools.\r
+// Provides the basic constants, data structures, etc. for using BAM files\r
// ***************************************************************************\r
\r
-/*! \r
- \file BamAux.h\r
- \brief BamTools constants, typedefs, & data structures\r
-*/\r
+#ifndef BAMAUX_H\r
+#define BAMAUX_H\r
\r
-#pragma once\r
+// C inclues\r
+#include <cstdlib>\r
+#include <cstring>\r
\r
// C++ includes\r
#include <exception>\r
+#include <map>\r
#include <string>\r
#include <utility>\r
#include <vector>\r
\r
-// C includes\r
-#include <cstdio>\r
-#include <cstdlib>\r
-#include <cstring>\r
-\r
-// Platform-specific type definitions\r
-#ifdef WIN32\r
- typedef char int8_t;\r
- typedef unsigned char uint8_t;\r
- typedef short int16_t;\r
- typedef unsigned short uint16_t;\r
- typedef int int32_t;\r
- typedef unsigned int uint32_t;\r
- typedef long long int64_t;\r
- typedef unsigned long long uint64_t;\r
-#else\r
- #include <stdint.h>\r
-#endif\r
-\r
-//! \namespace BamTools\r
namespace BamTools {\r
\r
- //! \cond\r
- // --------------------------------------------------------------------------------------\r
- // This section is purely internal and can be excluded from main generated documentation.\r
- \r
- // zlib constants\r
- const int GZIP_ID1 = 31;\r
- const int GZIP_ID2 = 139;\r
- const int CM_DEFLATE = 8;\r
- const int FLG_FEXTRA = 4;\r
- const int OS_UNKNOWN = 255;\r
- const int BGZF_XLEN = 6;\r
- const int BGZF_ID1 = 66;\r
- const int BGZF_ID2 = 67;\r
- const int BGZF_LEN = 2;\r
- const int GZIP_WINDOW_BITS = -15;\r
- const int Z_DEFAULT_MEM_LEVEL = 8;\r
-\r
- // BZGF constants\r
- const int BLOCK_HEADER_LENGTH = 18;\r
- const int BLOCK_FOOTER_LENGTH = 8;\r
- const int MAX_BLOCK_SIZE = 65536;\r
- const int DEFAULT_BLOCK_SIZE = 65536;\r
-\r
- // BAM constants\r
- const unsigned int BAM_CORE_SIZE = 32;\r
- const int BAM_CMATCH = 0;\r
- const int BAM_CINS = 1;\r
- const int BAM_CDEL = 2;\r
- const int BAM_CREF_SKIP = 3;\r
- const int BAM_CSOFT_CLIP = 4;\r
- const int BAM_CHARD_CLIP = 5;\r
- const int BAM_CPAD = 6;\r
- const int BAM_CIGAR_SHIFT = 4;\r
- const int BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1);\r
-\r
- // BAM index constants\r
- const int MAX_BIN = 37450; // =(8^6-1)/7+1\r
- const int BAM_MIN_CHUNK_GAP = 32768;\r
- const int BAM_LIDX_SHIFT = 14;\r
-\r
- // Explicit variable sizes\r
- const int BT_SIZEOF_INT = 4;\r
- \r
- struct BgzfData {\r
- unsigned int UncompressedBlockSize;\r
- unsigned int CompressedBlockSize;\r
- unsigned int BlockLength;\r
- unsigned int BlockOffset;\r
- uint64_t BlockAddress;\r
- bool IsOpen;\r
- FILE* Stream;\r
- char* UncompressedBlock;\r
- char* CompressedBlock;\r
- \r
- // constructor\r
- BgzfData(void)\r
- : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)\r
- , CompressedBlockSize(MAX_BLOCK_SIZE)\r
- , BlockLength(0)\r
- , BlockOffset(0)\r
- , BlockAddress(0)\r
- , IsOpen(false)\r
- , Stream(NULL)\r
- , UncompressedBlock(NULL)\r
- , CompressedBlock(NULL)\r
- {\r
- try {\r
- CompressedBlock = new char[CompressedBlockSize];\r
- UncompressedBlock = new char[UncompressedBlockSize];\r
- } catch( std::bad_alloc& ba ) {\r
- printf("ERROR: Unable to allocate memory for our BGZF object.\n");\r
- exit(1);\r
- }\r
- }\r
- \r
- // destructor\r
- ~BgzfData(void) {\r
- if(CompressedBlock) delete [] CompressedBlock;\r
- if(UncompressedBlock) delete [] UncompressedBlock;\r
- }\r
- };\r
- //! \endcond\r
- \r
- // --------------------------------------------------------------------------------------\r
- // Data structures\r
- \r
- //! \brief Cigar operation data structure\r
- struct CigarOp {\r
- char Type; //!< Operation type (MIDNSHP)\r
- uint32_t Length; //!< Operation length (number of bases)\r
- \r
- };\r
-\r
- //! Reference sequence data structure\r
- struct RefData {\r
- std::string RefName; //!< Name of reference sequence\r
- unsigned int RefLength; //!< Length of reference sequence\r
- bool RefHasAlignments; //!< True if BAM file contains alignments mapped to reference sequence\r
- \r
- // constructor\r
- RefData(void)\r
- : RefLength(0)\r
- , RefHasAlignments(false)\r
- { }\r
- };\r
-\r
- //! BAM alignment data structure\r
- struct BamAlignment {\r
- \r
- // Queries against alignment flag\r
- public:\r
- //! Returns true if this read is a PCR duplicate (determined by external app)\r
- bool IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); }\r
- //! Returns true if this read failed quality control (determined by external app)\r
- bool IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); } \r
- //! Returns true if alignment is first mate on read\r
- bool IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); }\r
- //! Returns true if alignment is mapped \r
- bool IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); }\r
- //! Returns true if alignment's mate is mapped\r
- bool IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); } \r
- //! Returns true if alignment's mate mapped to reverse strand\r
- bool IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); }\r
- //! Returns true if alignment part of paired-end read\r
- bool IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); } \r
- //! Returns true if this position is primary alignment (determined by external app)\r
- bool IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); } \r
- //! Returns true if alignment is part of read that satisfied paired-end resolution (determined by external app)\r
- bool IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); } \r
- //! Returns true if alignment mapped to reverse strand\r
- bool IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); } \r
- //! Returns true if alignment is second mate on read\r
- bool IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); } \r
- \r
- public:\r
- /*! \r
- \brief Get alignment's read group text.\r
- \r
- Assigns read group text to readGroup.\r
- \r
- \return True if read group data successfully retrieved.\r
- */\r
- bool GetReadGroup(std::string& readGroup) const {\r
- \r
- if ( TagData.empty() ) { return false; }\r
- \r
- // localize the tag data\r
- char* pTagData = (char*)TagData.data();\r
- const unsigned int tagDataLen = TagData.size();\r
- unsigned int numBytesParsed = 0;\r
- \r
- bool foundReadGroupTag = false;\r
- while( numBytesParsed < tagDataLen ) {\r
- \r
- const char* pTagType = pTagData;\r
- const char* pTagStorageType = pTagData + 2;\r
- pTagData += 3;\r
- numBytesParsed += 3;\r
- \r
- // check the current tag\r
- if ( strncmp(pTagType, "RG", 2) == 0 ) {\r
- foundReadGroupTag = true;\r
- break;\r
- }\r
- \r
- // get the storage class and find the next tag\r
- SkipToNextTag( *pTagStorageType, pTagData, numBytesParsed );\r
- }\r
- \r
- // return if the read group tag was not present\r
- if ( !foundReadGroupTag ) { return false; }\r
- \r
- // assign the read group\r
- const unsigned int readGroupLen = strlen(pTagData);\r
- readGroup.resize(readGroupLen);\r
- memcpy( (char*)readGroup.data(), pTagData, readGroupLen );\r
- return true;\r
- }\r
- \r
- private:\r
- // skips to the next tag\r
- static void SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) {\r
- switch(storageType) {\r
- \r
- case 'A':\r
- case 'c':\r
- case 'C':\r
- ++numBytesParsed;\r
- ++pTagData;\r
- break;\r
- \r
- case 's':\r
- case 'S':\r
- case 'f':\r
- numBytesParsed += 2;\r
- pTagData += 2;\r
- break;\r
- \r
- case 'i':\r
- case 'I':\r
- numBytesParsed += 4;\r
- pTagData += 4;\r
- break;\r
- \r
- case 'Z':\r
- case 'H':\r
- while(*pTagData) {\r
- ++numBytesParsed;\r
- ++pTagData;\r
- }\r
- break;\r
- \r
- default:\r
- printf("ERROR: Unknown tag storage class encountered: [%c]\n", *pTagData);\r
- exit(1);\r
- }\r
- }\r
- \r
- // Data members\r
- public:\r
- std::string Name; //!< Read name\r
- unsigned int Length; //!< Query length\r
- std::string QueryBases; //!< 'Original' sequence (as reported from sequencing machine)\r
- std::string AlignedBases; //!< 'Aligned' sequence (includes any indels, padding, clipping) \r
- std::string Qualities; //!< FASTQ qualities (ASCII characters, not numeric values)\r
- std::string TagData; //!< Tag data (accessor methods will pull the requested information out)\r
- //unsigned int RefID; //!< ID number for reference sequence\r
- //unsigned int Position; //!< Position (0-based) where alignment starts \r
- signed int RefID; //!< ID number for reference sequence (-1)\r
- signed int Position; //!< Position (0-based) where alignment starts (-1) \r
- unsigned int Bin; //!< Bin in BAM file where this alignment resides\r
- unsigned int MapQuality; //!< Mapping quality score \r
- unsigned int AlignmentFlag; //!< Alignment bit-flag - see Is<something>() methods for available queries\r
- std::vector<CigarOp> CigarData; //!< CIGAR operations for this alignment\r
- //unsigned int MateRefID; //!< ID number for reference sequence where alignment's mate was aligned\r
- //unsigned int MatePosition; //!< Position (0-based) where alignment's mate starts\r
- //unsigned int InsertSize; //!< Mate-pair insert size\r
- signed int MateRefID; //!< ID number for reference sequence where alignment's mate was aligned (-1)\r
- signed int MatePosition; //!< Position (0-based) where alignment's mate starts (-1)\r
- signed int InsertSize; //!< Mate-pair insert size(0)\r
- \r
- // Alignment flag query constants\r
- private:\r
- enum { PAIRED = 1,\r
- PROPER_PAIR = 2,\r
- UNMAPPED = 4,\r
- MATE_UNMAPPED = 8,\r
- REVERSE = 16,\r
- MATE_REVERSE = 32,\r
- READ_1 = 64,\r
- READ_2 = 128,\r
- SECONDARY = 256,\r
- QC_FAILED = 512,\r
- DUPLICATE = 1024\r
- };\r
- };\r
-\r
- // ----------------------------------------------------------------\r
- // Typedefs\r
- \r
- /*!\r
- \typedef RefVector\r
- \brief Vector of RefData objects\r
- */\r
- typedef std::vector<RefData> RefVector;\r
- \r
- /*! \r
- \typedef BamAlignmentVector\r
- \brief Vector of BamAlignments\r
- */\r
- typedef std::vector< BamAlignment > BamAlignmentVector;\r
- \r
- //! \cond\r
- // ----------------------------------------------------------------\r
- // Typedefs (internal - can exclude from main documentation)\r
- \r
- //Offsets for linear indexing\r
- typedef std::vector<uint64_t> LinearOffsetVector;\r
-\r
- // Alignment 'chunk' boundaries\r
- typedef std::pair<uint64_t, uint64_t> ChunkPair;\r
- // Vector of alignment 'chunks'\r
- typedef std::vector<ChunkPair> ChunkVector;\r
-\r
- // BAM bin contains a bin ID & a vector of alignment 'chunks'\r
- typedef std::pair<uint32_t, ChunkVector*> BamBin;\r
- // Vector of BAM bins\r
- typedef std::vector<BamBin> BinVector;\r
-\r
- // Reference sequence index data\r
- typedef std::pair<BinVector*, LinearOffsetVector*> RefIndex;\r
-\r
- // Full BAM file index data structure \r
- typedef std::vector<RefIndex*> BamIndex;\r
- // ----------------------------------------------------------------\r
- //! \endcond\r
+// BAM constants\r
+const int BAM_CORE_SIZE = 32;\r
+const int BAM_CMATCH = 0;\r
+const int BAM_CINS = 1;\r
+const int BAM_CDEL = 2;\r
+const int BAM_CREF_SKIP = 3;\r
+const int BAM_CSOFT_CLIP = 4;\r
+const int BAM_CHARD_CLIP = 5;\r
+const int BAM_CPAD = 6;\r
+const int BAM_CIGAR_SHIFT = 4;\r
+const int BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1);\r
+\r
+// BAM index constants\r
+const int MAX_BIN = 37450; // =(8^6-1)/7+1\r
+const int BAM_MIN_CHUNK_GAP = 32768;\r
+const int BAM_LIDX_SHIFT = 14;\r
+\r
+// Explicit variable sizes\r
+const int BT_SIZEOF_INT = 4;\r
+\r
+struct CigarOp;\r
+\r
+struct BamAlignment {\r
+\r
+ // Queries against alignment flag\r
+ public:\r
+ // Returns true if this read is a PCR duplicate (determined by external app)\r
+ bool IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); }\r
+ // Returns true if this read failed quality control (determined by external app)\r
+ bool IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); }\r
+ // Returns true if alignment is first mate on read\r
+ bool IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); }\r
+ // Returns true if alignment is mapped\r
+ bool IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); }\r
+ // Returns true if alignment's mate is mapped\r
+ bool IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); }\r
+ // Returns true if alignment's mate mapped to reverse strand\r
+ bool IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); }\r
+ // Returns true if alignment part of paired-end read\r
+ bool IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); }\r
+ // Returns true if this position is primary alignment (determined by external app)\r
+ bool IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); }\r
+ // Returns true if alignment is part of read that satisfied paired-end resolution (determined by external app)\r
+ bool IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); }\r
+ // Returns true if alignment mapped to reverse strand\r
+ bool IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); }\r
+ // Returns true if alignment is second mate on read\r
+ bool IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); }\r
+\r
+ public:\r
+\r
+ // get "RG" tag data\r
+ bool GetReadGroup(std::string& readGroup) const {\r
+\r
+ if ( TagData.empty() ) { return false; }\r
+\r
+ // localize the tag data\r
+ char* pTagData = (char*)TagData.data();\r
+ const unsigned int tagDataLen = TagData.size();\r
+ unsigned int numBytesParsed = 0;\r
+\r
+ bool foundReadGroupTag = false;\r
+ while( numBytesParsed < tagDataLen ) {\r
+\r
+ const char* pTagType = pTagData;\r
+ const char* pTagStorageType = pTagData + 2;\r
+ pTagData += 3;\r
+ numBytesParsed += 3;\r
+\r
+ // check the current tag\r
+ if ( std::strncmp(pTagType, "RG", 2) == 0 ) {\r
+ foundReadGroupTag = true;\r
+ break;\r
+ }\r
+\r
+ // get the storage class and find the next tag\r
+ SkipToNextTag( *pTagStorageType, pTagData, numBytesParsed );\r
+ }\r
+\r
+ // return if the read group tag was not present\r
+ if ( !foundReadGroupTag ) { return false; }\r
+\r
+ // assign the read group\r
+ const unsigned int readGroupLen = std::strlen(pTagData);\r
+ readGroup.resize(readGroupLen);\r
+ std::memcpy( (char*)readGroup.data(), pTagData, readGroupLen );\r
+ return true;\r
+ }\r
+\r
+ // get "NM" tag data - contributed by Aaron Quinlan\r
+ bool GetEditDistance(uint8_t& editDistance) const {\r
+\r
+ if ( TagData.empty() ) { return false; }\r
+\r
+ // localize the tag data\r
+ char* pTagData = (char*)TagData.data();\r
+ const unsigned int tagDataLen = TagData.size();\r
+ unsigned int numBytesParsed = 0;\r
+\r
+ bool foundEditDistanceTag = false;\r
+ while( numBytesParsed < tagDataLen ) {\r
+\r
+ const char* pTagType = pTagData;\r
+ const char* pTagStorageType = pTagData + 2;\r
+ pTagData += 3;\r
+ numBytesParsed += 3;\r
+\r
+ // check the current tag\r
+ if ( strncmp(pTagType, "NM", 2) == 0 ) {\r
+ foundEditDistanceTag = true;\r
+ break;\r
+ }\r
+\r
+ // get the storage class and find the next tag\r
+ SkipToNextTag( *pTagStorageType, pTagData, numBytesParsed );\r
+ }\r
+ // return if the edit distance tag was not present\r
+ if ( !foundEditDistanceTag ) { return false; }\r
+\r
+ // assign the editDistance value\r
+ memcpy(&editDistance, pTagData, 1);\r
+ return true;\r
+ }\r
+\r
+ private:\r
+ static void SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) {\r
+ switch(storageType) {\r
+\r
+ case 'A':\r
+ case 'c':\r
+ case 'C':\r
+ ++numBytesParsed;\r
+ ++pTagData;\r
+ break;\r
+\r
+ case 's':\r
+ case 'S':\r
+ case 'f':\r
+ numBytesParsed += 2;\r
+ pTagData += 2;\r
+ break;\r
+\r
+ case 'i':\r
+ case 'I':\r
+ numBytesParsed += 4;\r
+ pTagData += 4;\r
+ break;\r
+\r
+ case 'Z':\r
+ case 'H':\r
+ while(*pTagData) {\r
+ ++numBytesParsed;\r
+ ++pTagData;\r
+ }\r
+ break;\r
+\r
+ default:\r
+ printf("ERROR: Unknown tag storage class encountered: [%c]\n", *pTagData);\r
+ exit(1);\r
+ }\r
+ }\r
+\r
+ // Data members\r
+ public:\r
+ std::string Name; // Read name\r
+ int32_t Length; // Query length\r
+ std::string QueryBases; // 'Original' sequence (as reported from sequencing machine)\r
+ std::string AlignedBases; // 'Aligned' sequence (includes any indels, padding, clipping)\r
+ std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values)\r
+ std::string TagData; // Tag data (accessor methods will pull the requested information out)\r
+ int32_t RefID; // ID number for reference sequence\r
+ int32_t Position; // Position (0-based) where alignment starts\r
+ uint16_t Bin; // Bin in BAM file where this alignment resides\r
+ uint16_t MapQuality; // Mapping quality score\r
+ uint32_t AlignmentFlag; // Alignment bit-flag - see Is<something>() methods for available queries\r
+ std::vector<CigarOp> CigarData; // CIGAR operations for this alignment\r
+ int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned\r
+ int32_t MatePosition; // Position (0-based) where alignment's mate starts\r
+ int32_t InsertSize; // Mate-pair insert size\r
+\r
+ // Alignment flag query constants\r
+ private:\r
+ enum { PAIRED = 1,\r
+ PROPER_PAIR = 2,\r
+ UNMAPPED = 4,\r
+ MATE_UNMAPPED = 8,\r
+ REVERSE = 16,\r
+ MATE_REVERSE = 32,\r
+ READ_1 = 64,\r
+ READ_2 = 128,\r
+ SECONDARY = 256,\r
+ QC_FAILED = 512,\r
+ DUPLICATE = 1024\r
+ };\r
+};\r
+\r
+// ----------------------------------------------------------------\r
+// Auxiliary data structs & typedefs\r
+\r
+struct CigarOp {\r
+ char Type; // Operation type (MIDNSHP)\r
+ uint32_t Length; // Operation length (number of bases)\r
+};\r
+\r
+struct RefData {\r
+ // data members\r
+ std::string RefName; // Name of reference sequence\r
+ unsigned int RefLength; // Length of reference sequence\r
+ bool RefHasAlignments; // True if BAM file contains alignments mapped to reference sequence\r
+ // constructor\r
+ RefData(void)\r
+ : RefLength(0)\r
+ , RefHasAlignments(false)\r
+ { }\r
+};\r
+\r
+typedef std::vector<RefData> RefVector;\r
+typedef std::vector<BamAlignment> BamAlignmentVector;\r
+\r
+// ----------------------------------------------------------------\r
+// Indexing structs & typedefs\r
+\r
+struct Chunk {\r
+ // data members\r
+ uint64_t Start;\r
+ uint64_t Stop;\r
+ // constructor\r
+ Chunk(const uint64_t& start = 0, const uint64_t& stop = 0)\r
+ : Start(start)\r
+ , Stop(stop)\r
+ { }\r
+};\r
+\r
+inline\r
+bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) {\r
+ return lhs.Start < rhs.Start;\r
}\r
+\r
+typedef std::vector<Chunk> ChunkVector;\r
+typedef std::map<uint32_t, ChunkVector> BamBinMap;\r
+typedef std::vector<uint64_t> LinearOffsetVector;\r
+\r
+struct ReferenceIndex {\r
+ // data members\r
+ BamBinMap Bins;\r
+ LinearOffsetVector Offsets;\r
+ // constructor\r
+ ReferenceIndex(const BamBinMap& binMap = BamBinMap(),\r
+ const LinearOffsetVector& offsets = LinearOffsetVector())\r
+ : Bins(binMap)\r
+ , Offsets(offsets)\r
+ { }\r
+};\r
+\r
+typedef std::vector<ReferenceIndex> BamIndex;\r
+\r
+} // namespace BamTools\r
+\r
+#endif // BAMAUX_H\r
-// ***************************************************************************
-// BamReader.cpp (c) 2009 Derek Barnett, Michael Strömberg
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 15 July 2009 (DB)
-// ---------------------------------------------------------------------------
-// The BGZF routines were adapted from the bgzf.c code developed at the Broad
-// Institute.
-// ---------------------------------------------------------------------------
-// Provides the basic functionality for reading BAM files
-// ***************************************************************************
-
-// BamTools includes
-#include "BamReader.h"
-using namespace BamTools;
-using namespace std;
-
-// static character constants
-const char* BamReader::DNA_LOOKUP = "=ACMGRSVTWYHKDBN";
-const char* BamReader::CIGAR_LOOKUP = "MIDNSHP";
-
-// constructor
-BamReader::BamReader(void)
- : m_BGZF(NULL)
- , m_index(NULL)
- , m_isIndexLoaded(false)
- , m_alignmentsBeginOffset(0)
- , m_isRegionSpecified(false)
- , m_currentRefID(0)
- , m_currentLeft(0)
-{ }
-
-// destructor
-BamReader::~BamReader(void) {
- Close();
-}
-
-// checks BGZF block header
-bool BamReader::BgzfCheckBlockHeader(char* header) {
-
- return (header[0] == GZIP_ID1 &&
- header[1] == (char)GZIP_ID2 &&
- header[2] == Z_DEFLATED &&
- (header[3] & FLG_FEXTRA) != 0 &&
- BgzfUnpackUnsignedShort(&header[10]) == BGZF_XLEN &&
- header[12] == BGZF_ID1 &&
- header[13] == BGZF_ID2 &&
- BgzfUnpackUnsignedShort(&header[14]) == BGZF_LEN
- );
-}
-
-// closes the BAM file
-void BamReader::BgzfClose(void) {
- fflush(m_BGZF->Stream);
- fclose(m_BGZF->Stream);
- m_BGZF->IsOpen = false;
-}
-
-// de-compresses the current block
-int BamReader::BgzfInflateBlock(int blockLength) {
-
- // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock
- z_stream zs;
- zs.zalloc = NULL;
- zs.zfree = NULL;
- zs.next_in = (Bytef*)m_BGZF->CompressedBlock + 18;
- zs.avail_in = blockLength - 16;
- zs.next_out = (Bytef*)m_BGZF->UncompressedBlock;
- zs.avail_out = m_BGZF->UncompressedBlockSize;
-
- int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
- if (status != Z_OK) {
- printf("inflateInit failed\n");
- exit(1);
- }
-
- status = inflate(&zs, Z_FINISH);
- if (status != Z_STREAM_END) {
- inflateEnd(&zs);
- printf("inflate failed\n");
- exit(1);
- }
-
- status = inflateEnd(&zs);
- if (status != Z_OK) {
- printf("inflateEnd failed\n");
- exit(1);
- }
-
- return zs.total_out;
-}
-
-// opens the BAM file for reading
-void BamReader::BgzfOpen(const string& filename) {
-
- m_BGZF->Stream = fopen(filename.c_str(), "rb");
- if(!m_BGZF->Stream) {
- printf("ERROR: Unable to open the BAM file %s for reading.\n", filename.c_str() );
- exit(1);
- }
-
- m_BGZF->IsOpen = true;
-}
-
-// reads BGZF data into buffer
-unsigned int BamReader::BgzfRead(char* data, const unsigned int dataLength) {
-
- if (dataLength == 0) { return 0; }
-
- char* output = data;
- unsigned int numBytesRead = 0;
- while (numBytesRead < dataLength) {
-
- int bytesAvailable = m_BGZF->BlockLength - m_BGZF->BlockOffset;
- if (bytesAvailable <= 0) {
- if ( BgzfReadBlock() != 0 ) { return -1; }
- bytesAvailable = m_BGZF->BlockLength - m_BGZF->BlockOffset;
- if ( bytesAvailable <= 0 ) { break; }
- }
-
- char* buffer = m_BGZF->UncompressedBlock;
- int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );
- memcpy(output, buffer + m_BGZF->BlockOffset, copyLength);
-
- m_BGZF->BlockOffset += copyLength;
- output += copyLength;
- numBytesRead += copyLength;
- }
-
- if ( m_BGZF->BlockOffset == m_BGZF->BlockLength ) {
- m_BGZF->BlockAddress = ftello(m_BGZF->Stream);
- m_BGZF->BlockOffset = 0;
- m_BGZF->BlockLength = 0;
- }
-
- return numBytesRead;
-}
-
-int BamReader::BgzfReadBlock(void) {
-
- char header[BLOCK_HEADER_LENGTH];
- int64_t blockAddress = ftello(m_BGZF->Stream);
-
- int count = fread(header, 1, sizeof(header), m_BGZF->Stream);
- if (count == 0) {
- m_BGZF->BlockLength = 0;
- return 0;
- }
-
- if (count != sizeof(header)) {
- printf("read block failed - count != sizeof(header)\n");
- return -1;
- }
-
- if (!BgzfCheckBlockHeader(header)) {
- printf("read block failed - CheckBgzfBlockHeader() returned false\n");
- return -1;
- }
-
- int blockLength = BgzfUnpackUnsignedShort(&header[16]) + 1;
- char* compressedBlock = m_BGZF->CompressedBlock;
- memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH);
- int remaining = blockLength - BLOCK_HEADER_LENGTH;
-
- count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, m_BGZF->Stream);
- if (count != remaining) {
- printf("read block failed - count != remaining\n");
- return -1;
- }
-
- count = BgzfInflateBlock(blockLength);
- if (count < 0) { return -1; }
-
- if (m_BGZF->BlockLength != 0) {
- m_BGZF->BlockOffset = 0;
- }
-
- m_BGZF->BlockAddress = blockAddress;
- m_BGZF->BlockLength = count;
- return 0;
-}
-
-// move file pointer to specified offset
-bool BamReader::BgzfSeek(int64_t position) {
-
- int blockOffset = (position & 0xFFFF);
- int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
- if (fseeko(m_BGZF->Stream, blockAddress, SEEK_SET) != 0) {
- printf("ERROR: Unable to seek in BAM file\n");
- exit(1);
- }
-
- m_BGZF->BlockLength = 0;
- m_BGZF->BlockAddress = blockAddress;
- m_BGZF->BlockOffset = blockOffset;
- return true;
-}
-
-// get file position in BAM file
-int64_t BamReader::BgzfTell(void) {
- return ( (m_BGZF->BlockAddress << 16) | (m_BGZF->BlockOffset & 0xFFFF) );
-}
-
-int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) {
-
- // get region boundaries
- uint32_t begin = left;
- uint32_t end = m_references.at(refID).RefLength - 1;
-
- // initialize list, bin '0' always a valid bin
- int i = 0;
- list[i++] = 0;
-
- // get rest of bins that contain this region
- unsigned int k;
- for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { list[i++] = k; }
- for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { list[i++] = k; }
- for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { list[i++] = k; }
- for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { list[i++] = k; }
- for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; }
-
- // return number of bins stored
- return i;
-}
-
-unsigned int BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData) {
-
- // initialize alignment end to starting position
- unsigned int alignEnd = position;
-
- // iterate over cigar operations
- vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
- vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
- for ( ; cigarIter != cigarEnd; ++cigarIter) {
- char cigarType = (*cigarIter).Type;
- if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) {
- alignEnd += (*cigarIter).Length;
- }
- }
- return alignEnd;
-}
-
-void BamReader::ClearIndex(void) {
-
- if ( m_index ) {
- // iterate over references
- vector<RefIndex*>::iterator refIter = m_index->begin();
- vector<RefIndex*>::iterator refEnd = m_index->end();
- for ( ; refIter != refEnd; ++refIter) {
- RefIndex* aRef = (*refIter);
- if ( aRef ) {
- // clear out BAM bins
- if ( aRef->first ) {
- BinVector::iterator binIter = (aRef->first)->begin();
- BinVector::iterator binEnd = (aRef->first)->end();
- for ( ; binIter != binEnd; ++binIter ) {
- ChunkVector* chunks = (*binIter).second;
- if ( chunks ) { delete chunks; chunks = NULL;}
- }
- delete aRef->first;
- aRef->first = NULL;
- }
- // clear BAM linear offsets
- if ( aRef->second ) { delete aRef->second; aRef->second = NULL; }
- delete aRef;
- aRef = NULL;
- }
- }
- delete m_index;
- m_index = NULL;
- }
-}
-
-// closes the BAM file
-void BamReader::Close(void) {
-
- if (m_BGZF!=NULL && m_BGZF->IsOpen) {
- BgzfClose();
- delete m_BGZF;
- m_BGZF = NULL;
- }
- ClearIndex();
- m_headerText.clear();
- m_isRegionSpecified = false;
-}
-
-const string BamReader::GetHeaderText(void) const {
- return m_headerText;
-}
-
-const int BamReader::GetReferenceCount(void) const {
- return m_references.size();
-}
-
-const RefVector BamReader::GetReferenceData(void) const {
- return m_references;
-}
-
-const int BamReader::GetReferenceID(const string& refName) const {
-
- // retrieve names from reference data
- vector<string> refNames;
- RefVector::const_iterator refIter = m_references.begin();
- RefVector::const_iterator refEnd = m_references.end();
- for ( ; refIter != refEnd; ++refIter) {
- refNames.push_back( (*refIter).RefName );
- }
-
- // return 'index-of' refName ( if not found, returns refNames.size() )
- return Index( refNames.begin(), refNames.end(), refName );
-}
-
-// get next alignment (from specified region, if given)
-bool BamReader::GetNextAlignment(BamAlignment& bAlignment) {
-
- // if valid alignment available
- if ( LoadNextAlignment(bAlignment) ) {
-
- // if region not specified, return success
- if ( !m_isRegionSpecified ) { return true; }
-
- // load next alignment until region overlap is found
- while ( !IsOverlap(bAlignment) ) {
- // if no valid alignment available (likely EOF) return failure
- if ( !LoadNextAlignment(bAlignment) ) { return false; }
- }
-
- // return success (alignment found that overlaps region)
- return true;
- }
-
- // no valid alignment
- else { return false; }
-}
-
-int64_t BamReader::GetOffset(int refID, unsigned int left) {
-
- // calculate which bins overlap this region
- uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
- int numBins = BinsFromRegion(refID, left, bins);
-
- // get bins for this reference
- RefIndex* refIndex = m_index->at(refID);
- BinVector* refBins = refIndex->first;
-
- // get minimum offset to consider
- LinearOffsetVector* linearOffsets = refIndex->second;
- uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT);
-
- // store offsets to beginning of alignment 'chunks'
- std::vector<int64_t> chunkStarts;
-
- // reference bin iterators
- BinVector::const_iterator binIter;
- BinVector::const_iterator binBegin = refBins->begin();
- BinVector::const_iterator binEnd = refBins->end();
-
- // store all alignment 'chunk' starts for bins in this region
- for (int i = 0; i < numBins; ++i ) {
- binIter = lower_bound(binBegin, binEnd, bins[i], LookupKeyCompare<uint32_t, ChunkVector*>() );
- if ( (binIter != binEnd) && ( (*binIter).first == bins[i]) ) {
- ChunkVector* binChunks = (*binIter).second;
- ChunkVector::const_iterator chunkIter = binChunks->begin();
- ChunkVector::const_iterator chunkEnd = binChunks->end();
- for ( ; chunkIter != chunkEnd; ++chunkIter) {
- if ( (*chunkIter).second > minOffset ) {
- chunkStarts.push_back( (*chunkIter).first );
- }
- }
- }
- }
-
- // clean up memory
- free(bins);
-
- // if no alignments found
- if ( chunkStarts.empty() ) { return -1; }
-
- // else return smallest offset for alignment starts
- else { return *min_element(chunkStarts.begin(), chunkStarts.end()); }
-}
-
-bool BamReader::IsOverlap(BamAlignment& bAlignment) {
-
- // if on different reference sequence, quit
- if ( bAlignment.RefID != m_currentRefID ) { return false; }
-
- // read starts after left boundary
- if ( bAlignment.Position >= (signed int) m_currentLeft) { return true; }
-
- // return whether alignment end overlaps left boundary
- return ( CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData) >= m_currentLeft );
-}
-
-bool BamReader::Jump(int refID, unsigned int position) {
-
- // if index available, and region is valid
- if ( (m_index->size() != 0) && m_references.at(refID).RefHasAlignments && (position <= m_references.at(refID).RefLength) ) {
- m_currentRefID = refID;
- m_currentLeft = position;
- m_isRegionSpecified = true;
-
- int64_t offset = GetOffset(m_currentRefID, m_currentLeft);
- if ( offset == -1 ) { return false; }
- else { return BgzfSeek(offset); }
- }
- return false;
-}
-
-void BamReader::LoadHeaderData(void) {
-
- // check to see if proper BAM header
- char buffer[4];
- if (BgzfRead(buffer, 4) != 4) {
- printf("Could not read header type\n");
- exit(1);
- }
-
- if (strncmp(buffer, "BAM\001", 4)) {
- printf("wrong header type!\n");
- exit(1);
- }
-
- // get BAM header text length
- BgzfRead(buffer, 4);
- const unsigned int headerTextLength = BgzfUnpackUnsignedInt(buffer);
-
- // get BAM header text
- char* headerText = (char*)calloc(headerTextLength + 1, 1);
- BgzfRead(headerText, headerTextLength);
- m_headerText = (string)((const char*)headerText);
-
- // clean up calloc-ed temp variable
- free(headerText);
-}
-
-void BamReader::LoadIndexData(FILE* indexStream) {
-
- // see if index is valid BAM index
- char magic[4];
- fread(magic, 1, 4, indexStream);
- if (strncmp(magic, "BAI\1", 4)) {
- printf("Problem with index file - invalid format.\n");
- fclose(indexStream);
- exit(1);
- }
-
- // get number of reference sequences
- uint32_t numRefSeqs;
- fread(&numRefSeqs, 4, 1, indexStream);
-
- // intialize BamIndex data structure
- m_index = new BamIndex;
- m_index->reserve(numRefSeqs);
-
- // iterate over reference sequences
- for (unsigned int i = 0; i < numRefSeqs; ++i) {
-
- // get number of bins for this reference sequence
- int32_t numBins;
- fread(&numBins, 4, 1, indexStream);
-
- if (numBins > 0) { m_references.at(i).RefHasAlignments = true; }
-
- // intialize BinVector
- BinVector* bins = new BinVector;
- bins->reserve(numBins);
-
- // iterate over bins for that reference sequence
- for (int j = 0; j < numBins; ++j) {
-
- // get binID
- uint32_t binID;
- fread(&binID, 4, 1, indexStream);
-
- // get number of regionChunks in this bin
- uint32_t numChunks;
- fread(&numChunks, 4, 1, indexStream);
-
- // intialize ChunkVector
- ChunkVector* regionChunks = new ChunkVector;
- regionChunks->reserve(numChunks);
-
- // iterate over regionChunks in this bin
- for (unsigned int k = 0; k < numChunks; ++k) {
-
- // get chunk boundaries (left, right)
- uint64_t left;
- uint64_t right;
- fread(&left, 8, 1, indexStream);
- fread(&right, 8, 1, indexStream);
-
- // save ChunkPair
- regionChunks->push_back( ChunkPair(left, right) );
- }
-
- // sort chunks for this bin
- sort( regionChunks->begin(), regionChunks->end(), LookupKeyCompare<uint64_t, uint64_t>() );
-
- // save binID, chunkVector for this bin
- bins->push_back( BamBin(binID, regionChunks) );
- }
-
- // sort bins by binID
- sort(bins->begin(), bins->end(), LookupKeyCompare<uint32_t, ChunkVector*>() );
-
- // load linear index for this reference sequence
-
- // get number of linear offsets
- int32_t numLinearOffsets;
- fread(&numLinearOffsets, 4, 1, indexStream);
-
- // intialize LinearOffsetVector
- LinearOffsetVector* linearOffsets = new LinearOffsetVector;
- linearOffsets->reserve(numLinearOffsets);
-
- // iterate over linear offsets for this reference sequeence
- for (int j = 0; j < numLinearOffsets; ++j) {
- // get a linear offset
- uint64_t linearOffset;
- fread(&linearOffset, 8, 1, indexStream);
- // store linear offset
- linearOffsets->push_back(linearOffset);
- }
-
- // sort linear offsets
- sort( linearOffsets->begin(), linearOffsets->end() );
-
- // store index data for that reference sequence
- m_index->push_back( new RefIndex(bins, linearOffsets) );
- }
-
- // close index file (.bai) and return
- fclose(indexStream);
-}
-
-bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) {
-
- // read in the 'block length' value, make sure it's not zero
- char buffer[4];
- BgzfRead(buffer, 4);
- const unsigned int blockLength = BgzfUnpackUnsignedInt(buffer);
- if ( blockLength == 0 ) { return false; }
-
- // keep track of bytes read as method progresses
- int bytesRead = 4;
-
- // read in core alignment data, make sure the right size of data was read
- char x[BAM_CORE_SIZE];
- if ( BgzfRead(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }
- bytesRead += BAM_CORE_SIZE;
-
- // set BamAlignment 'core' data and character data lengths
- unsigned int tempValue;
- unsigned int queryNameLength;
- unsigned int numCigarOperations;
- unsigned int querySequenceLength;
-
- //bAlignment.RefID = BgzfUnpackUnsignedInt(&x[0]);
- //bAlignment.Position = BgzfUnpackUnsignedInt(&x[4]);
- bAlignment.RefID = BgzfUnpackSignedInt(&x[0]);
- bAlignment.Position = BgzfUnpackSignedInt(&x[4]);
-
- tempValue = BgzfUnpackUnsignedInt(&x[8]);
- bAlignment.Bin = tempValue >> 16;
- bAlignment.MapQuality = tempValue >> 8 & 0xff;
- queryNameLength = tempValue & 0xff;
-
- tempValue = BgzfUnpackUnsignedInt(&x[12]);
- bAlignment.AlignmentFlag = tempValue >> 16;
- numCigarOperations = tempValue & 0xffff;
-
- querySequenceLength = BgzfUnpackUnsignedInt(&x[16]);
- //bAlignment.MateRefID = BgzfUnpackUnsignedInt(&x[20]);
- //bAlignment.MatePosition = BgzfUnpackUnsignedInt(&x[24]);
- //bAlignment.InsertSize = BgzfUnpackUnsignedInt(&x[28]);
- bAlignment.MateRefID = BgzfUnpackSignedInt(&x[20]);
- bAlignment.MatePosition = BgzfUnpackSignedInt(&x[24]);
- bAlignment.InsertSize = BgzfUnpackSignedInt(&x[28]);
-
- // calculate lengths/offsets
- const unsigned int dataLength = blockLength - BAM_CORE_SIZE;
- const unsigned int cigarDataOffset = queryNameLength;
- const unsigned int seqDataOffset = cigarDataOffset + (numCigarOperations * 4);
- const unsigned int qualDataOffset = seqDataOffset + (querySequenceLength+1)/2;
- const unsigned int tagDataOffset = qualDataOffset + querySequenceLength;
- const unsigned int tagDataLen = dataLength - tagDataOffset;
-
- // set up destination buffers for character data
- char* allCharData = (char*)calloc(sizeof(char), dataLength);
- uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);
- char* seqData = ((char*)allCharData) + seqDataOffset;
- char* qualData = ((char*)allCharData) + qualDataOffset;
- char* tagData = ((char*)allCharData) + tagDataOffset;
-
- // get character data - make sure proper data size was read
- if ( BgzfRead(allCharData, dataLength) != dataLength) { return false; }
- else {
-
- bytesRead += dataLength;
-
- // clear out any previous string data
- bAlignment.Name.clear();
- bAlignment.QueryBases.clear();
- bAlignment.Qualities.clear();
- bAlignment.AlignedBases.clear();
- bAlignment.CigarData.clear();
- bAlignment.TagData.clear();
-
- // save name
- bAlignment.Name = (string)((const char*)(allCharData));
-
- // save query sequence
- for (unsigned int i = 0; i < querySequenceLength; ++i) {
- char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];
- bAlignment.QueryBases.append( 1, singleBase );
- }
-
- // save sequence length
- bAlignment.Length = bAlignment.QueryBases.length();
-
- // save qualities
- for (unsigned int i = 0; i < querySequenceLength; ++i) {
- char singleQuality = (char)(qualData[i]+33); // conversion from QV to FASTQ character
- bAlignment.Qualities.append( 1, singleQuality );
- }
-
- // save CIGAR-related data;
- int k = 0;
- for (unsigned int i = 0; i < numCigarOperations; ++i) {
-
- // build CigarOp struct
- CigarOp op;
- op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);
- op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];
-
- // save CigarOp
- bAlignment.CigarData.push_back(op);
-
- // build AlignedBases string
- switch (op.Type) {
-
- case ('M') :
- case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases
- case ('S') : k += op.Length; // for 'S' - skip over query bases
- break;
-
- case ('D') : bAlignment.AlignedBases.append( op.Length, '-' ); // for 'D' - write gap character
- break;
-
- case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character;
- break;
-
- case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence
- k += op.Length;
- break;
-
- case ('H') : break; // for 'H' - do nothing, move to next op
-
- default : printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here
- exit(1);
- }
- }
-
- // read in the tag data
- bAlignment.TagData.resize(tagDataLen);
- memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLen);
- }
-
- free(allCharData);
- return true;
-}
-
-void BamReader::LoadReferenceData(void) {
-
- // get number of reference sequences
- char buffer[4];
- BgzfRead(buffer, 4);
- const unsigned int numberRefSeqs = BgzfUnpackUnsignedInt(buffer);
- if (numberRefSeqs == 0) { return; }
- m_references.reserve((int)numberRefSeqs);
-
- // iterate over all references in header
- for (unsigned int i = 0; i != numberRefSeqs; ++i) {
-
- // get length of reference name
- BgzfRead(buffer, 4);
- const unsigned int refNameLength = BgzfUnpackUnsignedInt(buffer);
- char* refName = (char*)calloc(refNameLength, 1);
-
- // get reference name and reference sequence length
- BgzfRead(refName, refNameLength);
- BgzfRead(buffer, 4);
- const unsigned int refLength = BgzfUnpackUnsignedInt(buffer);
-
- // store data for reference
- RefData aReference;
- aReference.RefName = (string)((const char*)refName);
- aReference.RefLength = refLength;
- m_references.push_back(aReference);
-
- // clean up calloc-ed temp variable
- free(refName);
- }
-}
-
-// opens BAM file (and index)
-void BamReader::Open(const string& filename, const string& indexFilename) {
-
- // open the BGZF file for reading, retrieve header text & reference data
- m_BGZF = new BgzfData;
- BgzfOpen(filename);
- LoadHeaderData();
- LoadReferenceData();
-
- // store file offset of first alignment
- m_alignmentsBeginOffset = BgzfTell();
-
- // open index file & load index data (if exists)
- OpenIndex(indexFilename);
-}
-
-void BamReader::OpenIndex(const string& indexFilename) {
-
- // if index file exists
- if (!indexFilename.empty()) {
-
- // open index
- FILE* indexStream = fopen(indexFilename.c_str(), "rb");
-
- // abort on error
- if(!indexStream) {
- printf("ERROR: Unable to open the BAM index file %s for reading.\n", indexFilename.c_str() );
- exit(1);
- }
-
- // build up index data structure
- LoadIndexData(indexStream);
- }
-}
-
-bool BamReader::Rewind(void) {
-
- // find first reference that has alignments in the BAM file
- int refID = 0;
- int refCount = m_references.size();
- for ( ; refID < refCount; ++refID ) {
- if ( m_references.at(refID).RefHasAlignments ) { break; }
- }
-
- // store default bounds for first alignment
- m_currentRefID = refID;
- m_currentLeft = 0;
- m_isRegionSpecified = false;
-
- // return success/failure of seek
- return BgzfSeek(m_alignmentsBeginOffset);
-}
+// ***************************************************************************\r
+// BamReader.cpp (c) 2009 Derek Barnett, Michael Strömberg\r
+// Marth Lab, Department of Biology, Boston College\r
+// All rights reserved.\r
+// ---------------------------------------------------------------------------\r
+// Last modified: 8 December 2009 (DB)\r
+// ---------------------------------------------------------------------------\r
+// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
+// Institute.\r
+// ---------------------------------------------------------------------------\r
+// Provides the basic functionality for reading BAM files\r
+// ***************************************************************************\r
+\r
+// C++ includes\r
+#include <algorithm>\r
+#include <iterator>\r
+#include <string>\r
+#include <vector>\r
+\r
+// BamTools includes\r
+#include "BGZF.h"\r
+#include "BamReader.h"\r
+using namespace BamTools;\r
+using namespace std;\r
+\r
+struct BamReader::BamReaderPrivate {\r
+\r
+ // -------------------------------\r
+ // data members\r
+ // -------------------------------\r
+\r
+ // general data\r
+ BgzfData mBGZF;\r
+ string HeaderText;\r
+ BamIndex Index;\r
+ RefVector References;\r
+ bool IsIndexLoaded;\r
+ int64_t AlignmentsBeginOffset;\r
+ string Filename;\r
+ string IndexFilename;\r
+\r
+ // user-specified region values\r
+ bool IsRegionSpecified;\r
+ int CurrentRefID;\r
+ unsigned int CurrentLeft;\r
+\r
+ // BAM character constants\r
+ const char* DNA_LOOKUP;\r
+ const char* CIGAR_LOOKUP;\r
+\r
+ // -------------------------------\r
+ // constructor & destructor\r
+ // -------------------------------\r
+ BamReaderPrivate(void);\r
+ ~BamReaderPrivate(void);\r
+\r
+ // -------------------------------\r
+ // "public" interface\r
+ // -------------------------------\r
+\r
+ // flie operations\r
+ void Close(void);\r
+ bool Jump(int refID, unsigned int position = 0);\r
+ void Open(const string& filename, const string& indexFilename = "");\r
+ bool Rewind(void);\r
+\r
+ // access alignment data\r
+ bool GetNextAlignment(BamAlignment& bAlignment);\r
+\r
+ // access auxiliary data\r
+ const string GetHeaderText(void) const;\r
+ const int GetReferenceCount(void) const;\r
+ const RefVector GetReferenceData(void) const;\r
+ const int GetReferenceID(const string& refName) const;\r
+\r
+ // index operations\r
+ bool CreateIndex(void);\r
+\r
+ // -------------------------------\r
+ // internal methods\r
+ // -------------------------------\r
+\r
+ // *** reading alignments and auxiliary data *** //\r
+\r
+ // calculate bins that overlap region ( left to reference end for now )\r
+ int BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]);\r
+ // calculates alignment end position based on starting position and provided CIGAR operations\r
+ unsigned int CalculateAlignmentEnd(const unsigned int& position, const std::vector<CigarOp>& cigarData);\r
+ // calculate file offset for first alignment chunk overlapping 'left'\r
+ int64_t GetOffset(int refID, unsigned int left);\r
+ // checks to see if alignment overlaps current region\r
+ bool IsOverlap(BamAlignment& bAlignment);\r
+ // retrieves header text from BAM file\r
+ void LoadHeaderData(void);\r
+ // retrieves BAM alignment under file pointer\r
+ bool LoadNextAlignment(BamAlignment& bAlignment);\r
+ // builds reference data structure from BAM file\r
+ void LoadReferenceData(void);\r
+\r
+ // *** index file handling *** //\r
+\r
+ // calculates index for BAM file\r
+ bool BuildIndex(void);\r
+ // clear out inernal index data structure\r
+ void ClearIndex(void);\r
+ // saves BAM bin entry for index\r
+ void InsertBinEntry(BamBinMap& binMap, const uint32_t& saveBin, const uint64_t& saveOffset, const uint64_t& lastOffset);\r
+ // saves linear offset entry for index\r
+ void InsertLinearOffset(LinearOffsetVector& offsets, const BamAlignment& bAlignment, const uint64_t& lastOffset);\r
+ // loads index from BAM index file\r
+ bool LoadIndex(void);\r
+ // simplifies index by merging 'chunks'\r
+ void MergeChunks(void);\r
+ // round-up 32-bit integer to next power-of-2\r
+ void Roundup32(int& value);\r
+ // saves index to BAM index file\r
+ bool WriteIndex(void);\r
+};\r
+\r
+// -----------------------------------------------------\r
+// BamReader implementation (wrapper around BRPrivate)\r
+// -----------------------------------------------------\r
+\r
+// constructor\r
+BamReader::BamReader(void) {\r
+ d = new BamReaderPrivate;\r
+}\r
+\r
+// destructor\r
+BamReader::~BamReader(void) {\r
+ delete d;\r
+ d = 0;\r
+}\r
+\r
+// file operations\r
+void BamReader::Close(void) { d->Close(); }\r
+bool BamReader::Jump(int refID, unsigned int position) { return d->Jump(refID, position); }\r
+void BamReader::Open(const string& filename, const string& indexFilename) { d->Open(filename, indexFilename); }\r
+bool BamReader::Rewind(void) { return d->Rewind(); }\r
+\r
+// access alignment data\r
+bool BamReader::GetNextAlignment(BamAlignment& bAlignment) { return d->GetNextAlignment(bAlignment); }\r
+\r
+// access auxiliary data\r
+const string BamReader::GetHeaderText(void) const { return d->HeaderText; }\r
+const int BamReader::GetReferenceCount(void) const { return d->References.size(); }\r
+const RefVector BamReader::GetReferenceData(void) const { return d->References; }\r
+const int BamReader::GetReferenceID(const string& refName) const { return d->GetReferenceID(refName); }\r
+\r
+// index operations\r
+bool BamReader::CreateIndex(void) { return d->CreateIndex(); }\r
+\r
+// -----------------------------------------------------\r
+// BamReaderPrivate implementation\r
+// -----------------------------------------------------\r
+\r
+// constructor\r
+BamReader::BamReaderPrivate::BamReaderPrivate(void)\r
+ : IsIndexLoaded(false)\r
+ , AlignmentsBeginOffset(0)\r
+ , IsRegionSpecified(false)\r
+ , CurrentRefID(0)\r
+ , CurrentLeft(0)\r
+ , DNA_LOOKUP("=ACMGRSVTWYHKDBN")\r
+ , CIGAR_LOOKUP("MIDNSHP")\r
+{ }\r
+\r
+// destructor\r
+BamReader::BamReaderPrivate::~BamReaderPrivate(void) {\r
+ Close();\r
+}\r
+\r
+// calculate bins that overlap region ( left to reference end for now )\r
+int BamReader::BamReaderPrivate::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) {\r
+\r
+ // get region boundaries\r
+ uint32_t begin = left;\r
+ uint32_t end = References.at(refID).RefLength - 1;\r
+\r
+ // initialize list, bin '0' always a valid bin\r
+ int i = 0;\r
+ list[i++] = 0;\r
+\r
+ // get rest of bins that contain this region\r
+ unsigned int k;\r
+ for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { list[i++] = k; }\r
+ for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { list[i++] = k; }\r
+ for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { list[i++] = k; }\r
+ for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { list[i++] = k; }\r
+ for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; }\r
+\r
+ // return number of bins stored\r
+ return i;\r
+}\r
+\r
+// populates BAM index data structure from BAM file data\r
+bool BamReader::BamReaderPrivate::BuildIndex(void) {\r
+\r
+ // check to be sure file is open\r
+ if (!mBGZF.IsOpen) { return false; }\r
+\r
+ // move file pointer to beginning of alignments\r
+ Rewind();\r
+\r
+ // get reference count, reserve index space\r
+ int numReferences = References.size();\r
+ for ( int i = 0; i < numReferences; ++i ) {\r
+ Index.push_back(ReferenceIndex());\r
+ }\r
+\r
+ // sets default constant for bin, ID, offset, coordinate variables\r
+ const uint32_t defaultValue = 0xffffffffu;\r
+\r
+ // bin data\r
+ uint32_t saveBin(defaultValue);\r
+ uint32_t lastBin(defaultValue);\r
+\r
+ // reference ID data\r
+ int32_t saveRefID(defaultValue);\r
+ int32_t lastRefID(defaultValue);\r
+\r
+ // offset data\r
+ uint64_t saveOffset = mBGZF.Tell();\r
+ uint64_t lastOffset = saveOffset;\r
+\r
+ // coordinate data\r
+ int32_t lastCoordinate = defaultValue;\r
+\r
+ BamAlignment bAlignment;\r
+ while( GetNextAlignment(bAlignment) ) {\r
+\r
+ // change of chromosome, save ID, reset bin\r
+ if ( lastRefID != bAlignment.RefID ) {\r
+ lastRefID = bAlignment.RefID;\r
+ lastBin = defaultValue;\r
+ }\r
+\r
+ // if lastCoordinate greater than BAM position - file not sorted properly\r
+ else if ( lastCoordinate > bAlignment.Position ) {\r
+ printf("BAM file not properly sorted:\n");\r
+ printf("Alignment %s : %u > %u on reference (id = %d)", bAlignment.Name.c_str(), lastCoordinate, bAlignment.Position, bAlignment.RefID);\r
+ exit(1);\r
+ }\r
+\r
+ // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions)\r
+ if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) {\r
+\r
+ // save linear offset entry (matched to BAM entry refID)\r
+ ReferenceIndex& refIndex = Index.at(bAlignment.RefID);\r
+ LinearOffsetVector& offsets = refIndex.Offsets;\r
+ InsertLinearOffset(offsets, bAlignment, lastOffset);\r
+ }\r
+\r
+ // if current BamAlignment bin != lastBin, "then possibly write the binning index"\r
+ if ( bAlignment.Bin != lastBin ) {\r
+\r
+ // if not first time through\r
+ if ( saveBin != defaultValue ) {\r
+\r
+ // save Bam bin entry\r
+ ReferenceIndex& refIndex = Index.at(saveRefID);\r
+ BamBinMap& binMap = refIndex.Bins;\r
+ InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);\r
+ }\r
+\r
+ // update saveOffset\r
+ saveOffset = lastOffset;\r
+\r
+ // update bin values\r
+ saveBin = bAlignment.Bin;\r
+ lastBin = bAlignment.Bin;\r
+\r
+ // update saveRefID\r
+ saveRefID = bAlignment.RefID;\r
+\r
+ // if invalid RefID, break out (why?)\r
+ if ( saveRefID < 0 ) { break; }\r
+ }\r
+\r
+ // make sure that current file pointer is beyond lastOffset\r
+ if ( mBGZF.Tell() <= (int64_t)lastOffset ) {\r
+ printf("Error in BGZF offsets.\n");\r
+ exit(1);\r
+ }\r
+\r
+ // update lastOffset\r
+ lastOffset = mBGZF.Tell();\r
+\r
+ // update lastCoordinate\r
+ lastCoordinate = bAlignment.Position;\r
+ }\r
+\r
+ // save any leftover BAM data (as long as refID is valid)\r
+ if ( saveRefID >= 0 ) {\r
+ // save Bam bin entry\r
+ ReferenceIndex& refIndex = Index.at(saveRefID);\r
+ BamBinMap& binMap = refIndex.Bins;\r
+ InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);\r
+ }\r
+\r
+ // simplify index by merging chunks\r
+ MergeChunks();\r
+\r
+ // iterate over references\r
+ BamIndex::iterator indexIter = Index.begin();\r
+ BamIndex::iterator indexEnd = Index.end();\r
+ for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) {\r
+\r
+ // get reference index data\r
+ ReferenceIndex& refIndex = (*indexIter);\r
+ BamBinMap& binMap = refIndex.Bins;\r
+ LinearOffsetVector& offsets = refIndex.Offsets;\r
+\r
+ // store whether reference has alignments or no\r
+ References[i].RefHasAlignments = ( binMap.size() > 0 );\r
+\r
+ // sort linear offsets\r
+ sort(offsets.begin(), offsets.end());\r
+ }\r
+\r
+\r
+ // rewind file pointer to beginning of alignments, return success/fail\r
+ return Rewind();\r
+}\r
+\r
+// calculates alignment end position based on starting position and provided CIGAR operations\r
+unsigned int BamReader::BamReaderPrivate::CalculateAlignmentEnd(const unsigned int& position, const vector<CigarOp>& cigarData) {\r
+\r
+ // initialize alignment end to starting position\r
+ unsigned int alignEnd = position;\r
+\r
+ // iterate over cigar operations\r
+ vector<CigarOp>::const_iterator cigarIter = cigarData.begin();\r
+ vector<CigarOp>::const_iterator cigarEnd = cigarData.end();\r
+ for ( ; cigarIter != cigarEnd; ++cigarIter) {\r
+ char cigarType = (*cigarIter).Type;\r
+ if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) {\r
+ alignEnd += (*cigarIter).Length;\r
+ }\r
+ }\r
+ return alignEnd;\r
+}\r
+\r
+\r
+// clear index data structure\r
+void BamReader::BamReaderPrivate::ClearIndex(void) {\r
+ Index.clear(); // sufficient ??\r
+}\r
+\r
+// closes the BAM file\r
+void BamReader::BamReaderPrivate::Close(void) {\r
+ mBGZF.Close();\r
+ ClearIndex();\r
+ HeaderText.clear();\r
+ IsRegionSpecified = false;\r
+}\r
+\r
+// create BAM index from BAM file (keep structure in memory) and write to default index output file\r
+bool BamReader::BamReaderPrivate::CreateIndex(void) {\r
+\r
+ // clear out index\r
+ ClearIndex();\r
+\r
+ bool ok = true;\r
+ ok &= BuildIndex();\r
+ ok &= WriteIndex();\r
+ return ok;\r
+}\r
+\r
+// returns RefID for given RefName (returns References.size() if not found)\r
+const int BamReader::BamReaderPrivate::GetReferenceID(const string& refName) const {\r
+\r
+ // retrieve names from reference data\r
+ vector<string> refNames;\r
+ RefVector::const_iterator refIter = References.begin();\r
+ RefVector::const_iterator refEnd = References.end();\r
+ for ( ; refIter != refEnd; ++refIter) {\r
+ refNames.push_back( (*refIter).RefName );\r
+ }\r
+\r
+ // return 'index-of' refName ( if not found, returns refNames.size() )\r
+ return distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName));\r
+}\r
+\r
+// get next alignment (from specified region, if given)\r
+bool BamReader::BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) {\r
+\r
+ // if valid alignment available\r
+ if ( LoadNextAlignment(bAlignment) ) {\r
+\r
+ // if region not specified, return success\r
+ if ( !IsRegionSpecified ) { return true; }\r
+\r
+ // load next alignment until region overlap is found\r
+ while ( !IsOverlap(bAlignment) ) {\r
+ // if no valid alignment available (likely EOF) return failure\r
+ if ( !LoadNextAlignment(bAlignment) ) { return false; }\r
+ }\r
+\r
+ // return success (alignment found that overlaps region)\r
+ return true;\r
+ }\r
+\r
+ // no valid alignment\r
+ else { return false; }\r
+}\r
+\r
+// calculate closest indexed file offset for region specified\r
+int64_t BamReader::BamReaderPrivate::GetOffset(int refID, unsigned int left) {\r
+\r
+ // calculate which bins overlap this region\r
+ uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);\r
+ int numBins = BinsFromRegion(refID, left, bins);\r
+\r
+ // get bins for this reference\r
+ const ReferenceIndex& refIndex = Index.at(refID);\r
+ const BamBinMap& binMap = refIndex.Bins;\r
+\r
+ // get minimum offset to consider\r
+ const LinearOffsetVector& offsets = refIndex.Offsets;\r
+ uint64_t minOffset = ( (left>>BAM_LIDX_SHIFT) >= offsets.size() ) ? 0 : offsets.at(left>>BAM_LIDX_SHIFT);\r
+\r
+ // store offsets to beginning of alignment 'chunks'\r
+ std::vector<int64_t> chunkStarts;\r
+\r
+ // store all alignment 'chunk' starts for bins in this region\r
+ for (int i = 0; i < numBins; ++i ) {\r
+ uint16_t binKey = bins[i];\r
+\r
+ map<uint32_t, ChunkVector>::const_iterator binIter = binMap.find(binKey);\r
+ if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) {\r
+\r
+ const ChunkVector& chunks = (*binIter).second;\r
+ std::vector<Chunk>::const_iterator chunksIter = chunks.begin();\r
+ std::vector<Chunk>::const_iterator chunksEnd = chunks.end();\r
+ for ( ; chunksIter != chunksEnd; ++chunksIter) {\r
+ const Chunk& chunk = (*chunksIter);\r
+ if ( chunk.Stop > minOffset ) {\r
+ chunkStarts.push_back( chunk.Start );\r
+ }\r
+ }\r
+ }\r
+ }\r
+\r
+ // clean up memory\r
+ free(bins);\r
+\r
+ // if no alignments found, else return smallest offset for alignment starts\r
+ if ( chunkStarts.size() == 0 ) { return -1; }\r
+ else { return *min_element(chunkStarts.begin(), chunkStarts.end()); }\r
+}\r
+\r
+// saves BAM bin entry for index\r
+void BamReader::BamReaderPrivate::InsertBinEntry(BamBinMap& binMap,\r
+ const uint32_t& saveBin,\r
+ const uint64_t& saveOffset,\r
+ const uint64_t& lastOffset)\r
+{\r
+ // look up saveBin\r
+ BamBinMap::iterator binIter = binMap.find(saveBin);\r
+\r
+ // create new chunk\r
+ Chunk newChunk(saveOffset, lastOffset);\r
+\r
+ // if entry doesn't exist\r
+ if ( binIter == binMap.end() ) {\r
+ ChunkVector newChunks;\r
+ newChunks.push_back(newChunk);\r
+ binMap.insert( pair<uint32_t, ChunkVector>(saveBin, newChunks));\r
+ }\r
+\r
+ // otherwise\r
+ else {\r
+ ChunkVector& binChunks = (*binIter).second;\r
+ binChunks.push_back( newChunk );\r
+ }\r
+}\r
+\r
+// saves linear offset entry for index\r
+void BamReader::BamReaderPrivate::InsertLinearOffset(LinearOffsetVector& offsets,\r
+ const BamAlignment& bAlignment,\r
+ const uint64_t& lastOffset)\r
+{\r
+ // get converted offsets\r
+ int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT;\r
+ int endOffset = ( CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData) - 1) >> BAM_LIDX_SHIFT;\r
+\r
+ // resize vector if necessary\r
+ int oldSize = offsets.size();\r
+ int newSize = endOffset + 1;\r
+ if ( oldSize < newSize ) { \r
+ Roundup32(newSize);\r
+ offsets.resize(newSize, 0);\r
+ }\r
+\r
+ // store offset\r
+ for(int i = beginOffset + 1; i <= endOffset ; ++i) {\r
+ if ( offsets[i] == 0) {\r
+ offsets[i] = lastOffset;\r
+ }\r
+ }\r
+}\r
+\r
+// returns whether alignment overlaps currently specified region (refID, leftBound)\r
+bool BamReader::BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) {\r
+\r
+ // if on different reference sequence, quit\r
+ if ( bAlignment.RefID != CurrentRefID ) { return false; }\r
+\r
+ // read starts after left boundary\r
+ if ( bAlignment.Position >= (int32_t)CurrentLeft) { return true; }\r
+\r
+ // return whether alignment end overlaps left boundary\r
+ return ( CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData) >= CurrentLeft );\r
+}\r
+\r
+// jumps to specified region(refID, leftBound) in BAM file, returns success/fail\r
+bool BamReader::BamReaderPrivate::Jump(int refID, unsigned int position) {\r
+\r
+ // if data exists for this reference and position is valid \r
+ if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) {\r
+\r
+ // set current region\r
+ CurrentRefID = refID;\r
+ CurrentLeft = position;\r
+ IsRegionSpecified = true;\r
+\r
+ // calculate offset\r
+ int64_t offset = GetOffset(CurrentRefID, CurrentLeft);\r
+\r
+ // if in valid offset, return failure\r
+ if ( offset == -1 ) { return false; }\r
+\r
+ // otherwise return success of seek operation\r
+ else { return mBGZF.Seek(offset); }\r
+ }\r
+\r
+ // invalid jump request parameters, return failure\r
+ return false;\r
+}\r
+\r
+// load BAM header data\r
+void BamReader::BamReaderPrivate::LoadHeaderData(void) {\r
+\r
+ // check to see if proper BAM header\r
+ char buffer[4];\r
+ if (mBGZF.Read(buffer, 4) != 4) {\r
+ printf("Could not read header type\n");\r
+ exit(1);\r
+ }\r
+\r
+ if (strncmp(buffer, "BAM\001", 4)) {\r
+ printf("wrong header type!\n");\r
+ exit(1);\r
+ }\r
+\r
+ // get BAM header text length\r
+ mBGZF.Read(buffer, 4);\r
+ const unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer);\r
+\r
+ // get BAM header text\r
+ char* headerText = (char*)calloc(headerTextLength + 1, 1);\r
+ mBGZF.Read(headerText, headerTextLength);\r
+ HeaderText = (string)((const char*)headerText);\r
+\r
+ // clean up calloc-ed temp variable\r
+ free(headerText);\r
+}\r
+\r
+// load existing index data from BAM index file (".bai"), return success/fail\r
+bool BamReader::BamReaderPrivate::LoadIndex(void) {\r
+\r
+ // clear out index data\r
+ ClearIndex();\r
+\r
+ // skip if index file empty\r
+ if ( IndexFilename.empty() ) { return false; }\r
+\r
+ // open index file, abort on error\r
+ FILE* indexStream = fopen(IndexFilename.c_str(), "rb");\r
+ if(!indexStream) {\r
+ printf("ERROR: Unable to open the BAM index file %s for reading.\n", IndexFilename.c_str() );\r
+ return false;\r
+ }\r
+\r
+ // see if index is valid BAM index\r
+ char magic[4];\r
+ fread(magic, 1, 4, indexStream);\r
+ if (strncmp(magic, "BAI\1", 4)) {\r
+ printf("Problem with index file - invalid format.\n");\r
+ fclose(indexStream);\r
+ return false;\r
+ }\r
+\r
+ // get number of reference sequences\r
+ uint32_t numRefSeqs;\r
+ fread(&numRefSeqs, 4, 1, indexStream);\r
+\r
+ // intialize space for BamIndex data structure\r
+ Index.reserve(numRefSeqs);\r
+\r
+ // iterate over reference sequences\r
+ for (unsigned int i = 0; i < numRefSeqs; ++i) {\r
+\r
+ // get number of bins for this reference sequence\r
+ int32_t numBins;\r
+ fread(&numBins, 4, 1, indexStream);\r
+\r
+ if (numBins > 0) {\r
+ RefData& refEntry = References[i];\r
+ refEntry.RefHasAlignments = true;\r
+ }\r
+\r
+ // intialize BinVector\r
+ BamBinMap binMap;\r
+\r
+ // iterate over bins for that reference sequence\r
+ for (int j = 0; j < numBins; ++j) {\r
+\r
+ // get binID\r
+ uint32_t binID;\r
+ fread(&binID, 4, 1, indexStream);\r
+\r
+ // get number of regionChunks in this bin\r
+ uint32_t numChunks;\r
+ fread(&numChunks, 4, 1, indexStream);\r
+\r
+ // intialize ChunkVector\r
+ ChunkVector regionChunks;\r
+ regionChunks.reserve(numChunks);\r
+\r
+ // iterate over regionChunks in this bin\r
+ for (unsigned int k = 0; k < numChunks; ++k) {\r
+\r
+ // get chunk boundaries (left, right)\r
+ uint64_t left;\r
+ uint64_t right;\r
+ fread(&left, 8, 1, indexStream);\r
+ fread(&right, 8, 1, indexStream);\r
+\r
+ // save ChunkPair\r
+ regionChunks.push_back( Chunk(left, right) );\r
+ }\r
+\r
+ // sort chunks for this bin\r
+ sort( regionChunks.begin(), regionChunks.end(), ChunkLessThan );\r
+\r
+ // save binID, chunkVector for this bin\r
+ binMap.insert( pair<uint32_t, ChunkVector>(binID, regionChunks) );\r
+ }\r
+\r
+ // load linear index for this reference sequence\r
+\r
+ // get number of linear offsets\r
+ int32_t numLinearOffsets;\r
+ fread(&numLinearOffsets, 4, 1, indexStream);\r
+\r
+ // intialize LinearOffsetVector\r
+ LinearOffsetVector offsets;\r
+ offsets.reserve(numLinearOffsets);\r
+\r
+ // iterate over linear offsets for this reference sequeence\r
+ uint64_t linearOffset;\r
+ for (int j = 0; j < numLinearOffsets; ++j) {\r
+ // read a linear offset & store\r
+ fread(&linearOffset, 8, 1, indexStream);\r
+ offsets.push_back(linearOffset);\r
+ }\r
+\r
+ // sort linear offsets\r
+ sort( offsets.begin(), offsets.end() );\r
+\r
+ // store index data for that reference sequence\r
+ Index.push_back( ReferenceIndex(binMap, offsets) );\r
+ }\r
+\r
+ // close index file (.bai) and return\r
+ fclose(indexStream);\r
+ return true;\r
+}\r
+\r
+// populates BamAlignment with alignment data under file pointer, returns success/fail\r
+bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) {\r
+\r
+ // read in the 'block length' value, make sure it's not zero\r
+ char buffer[4];\r
+ mBGZF.Read(buffer, 4);\r
+ const unsigned int blockLength = BgzfData::UnpackUnsignedInt(buffer);\r
+ if ( blockLength == 0 ) { return false; }\r
+\r
+ // keep track of bytes read as method progresses\r
+ int bytesRead = 4;\r
+\r
+ // read in core alignment data, make sure the right size of data was read\r
+ char x[BAM_CORE_SIZE];\r
+ if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }\r
+ bytesRead += BAM_CORE_SIZE;\r
+\r
+ // set BamAlignment 'core' data and character data lengths\r
+ unsigned int tempValue;\r
+ unsigned int queryNameLength;\r
+ unsigned int numCigarOperations;\r
+ unsigned int querySequenceLength;\r
+\r
+ bAlignment.RefID = BgzfData::UnpackSignedInt(&x[0]);\r
+ bAlignment.Position = BgzfData::UnpackSignedInt(&x[4]);\r
+\r
+ tempValue = BgzfData::UnpackUnsignedInt(&x[8]);\r
+ bAlignment.Bin = tempValue >> 16;\r
+ bAlignment.MapQuality = tempValue >> 8 & 0xff;\r
+ queryNameLength = tempValue & 0xff;\r
+\r
+ tempValue = BgzfData::UnpackUnsignedInt(&x[12]);\r
+ bAlignment.AlignmentFlag = tempValue >> 16;\r
+ numCigarOperations = tempValue & 0xffff;\r
+\r
+ querySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]);\r
+ bAlignment.MateRefID = BgzfData::UnpackSignedInt(&x[20]);\r
+ bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]);\r
+ bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[28]);\r
+\r
+ // calculate lengths/offsets\r
+ const unsigned int dataLength = blockLength - BAM_CORE_SIZE;\r
+ const unsigned int cigarDataOffset = queryNameLength;\r
+ const unsigned int seqDataOffset = cigarDataOffset + (numCigarOperations * 4);\r
+ const unsigned int qualDataOffset = seqDataOffset + (querySequenceLength+1)/2;\r
+ const unsigned int tagDataOffset = qualDataOffset + querySequenceLength;\r
+ const unsigned int tagDataLen = dataLength - tagDataOffset;\r
+\r
+ // set up destination buffers for character data\r
+ char* allCharData = (char*)calloc(sizeof(char), dataLength);\r
+ uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);\r
+ char* seqData = ((char*)allCharData) + seqDataOffset;\r
+ char* qualData = ((char*)allCharData) + qualDataOffset;\r
+ char* tagData = ((char*)allCharData) + tagDataOffset;\r
+\r
+ // get character data - make sure proper data size was read\r
+ if ( mBGZF.Read(allCharData, dataLength) != (signed int)dataLength) { return false; }\r
+ else {\r
+\r
+ bytesRead += dataLength;\r
+\r
+ // clear out any previous string data\r
+ bAlignment.Name.clear();\r
+ bAlignment.QueryBases.clear();\r
+ bAlignment.Qualities.clear();\r
+ bAlignment.AlignedBases.clear();\r
+ bAlignment.CigarData.clear();\r
+ bAlignment.TagData.clear();\r
+\r
+ // save name\r
+ bAlignment.Name = (string)((const char*)(allCharData));\r
+\r
+ // save query sequence\r
+ for (unsigned int i = 0; i < querySequenceLength; ++i) {\r
+ char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];\r
+ bAlignment.QueryBases.append( 1, singleBase );\r
+ }\r
+\r
+ // save sequence length\r
+ bAlignment.Length = bAlignment.QueryBases.length();\r
+\r
+ // save qualities, convert from numeric QV to FASTQ character\r
+ for (unsigned int i = 0; i < querySequenceLength; ++i) {\r
+ char singleQuality = (char)(qualData[i]+33);\r
+ bAlignment.Qualities.append( 1, singleQuality );\r
+ }\r
+\r
+ // save CIGAR-related data;\r
+ int k = 0;\r
+ for (unsigned int i = 0; i < numCigarOperations; ++i) {\r
+\r
+ // build CigarOp struct\r
+ CigarOp op;\r
+ op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);\r
+ op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];\r
+\r
+ // save CigarOp\r
+ bAlignment.CigarData.push_back(op);\r
+\r
+ // build AlignedBases string\r
+ switch (op.Type) {\r
+\r
+ case ('M') :\r
+ case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases\r
+ case ('S') : k += op.Length; // for 'S' - skip over query bases\r
+ break;\r
+\r
+ case ('D') : bAlignment.AlignedBases.append( op.Length, '-' ); // for 'D' - write gap character\r
+ break;\r
+\r
+ case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character;\r
+ break;\r
+\r
+ case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence\r
+ k += op.Length;\r
+ break;\r
+\r
+ case ('H') : break; // for 'H' - do nothing, move to next op\r
+\r
+ default : printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here\r
+ exit(1);\r
+ }\r
+ }\r
+\r
+ // read in the tag data\r
+ bAlignment.TagData.resize(tagDataLen);\r
+ memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLen);\r
+ }\r
+\r
+ free(allCharData);\r
+ return true;\r
+}\r
+\r
+// loads reference data from BAM file\r
+void BamReader::BamReaderPrivate::LoadReferenceData(void) {\r
+\r
+ // get number of reference sequences\r
+ char buffer[4];\r
+ mBGZF.Read(buffer, 4);\r
+ const unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer);\r
+ if (numberRefSeqs == 0) { return; }\r
+ References.reserve((int)numberRefSeqs);\r
+\r
+ // iterate over all references in header\r
+ for (unsigned int i = 0; i != numberRefSeqs; ++i) {\r
+\r
+ // get length of reference name\r
+ mBGZF.Read(buffer, 4);\r
+ const unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer);\r
+ char* refName = (char*)calloc(refNameLength, 1);\r
+\r
+ // get reference name and reference sequence length\r
+ mBGZF.Read(refName, refNameLength);\r
+ mBGZF.Read(buffer, 4);\r
+ const unsigned int refLength = BgzfData::UnpackUnsignedInt(buffer);\r
+\r
+ // store data for reference\r
+ RefData aReference;\r
+ aReference.RefName = (string)((const char*)refName);\r
+ aReference.RefLength = refLength;\r
+ References.push_back(aReference);\r
+\r
+ // clean up calloc-ed temp variable\r
+ free(refName);\r
+ }\r
+}\r
+\r
+// merges 'alignment chunks' in BAM bin (used for index building)\r
+void BamReader::BamReaderPrivate::MergeChunks(void) {\r
+\r
+ // iterate over reference enties\r
+ BamIndex::iterator indexIter = Index.begin();\r
+ BamIndex::iterator indexEnd = Index.end();\r
+ for ( ; indexIter != indexEnd; ++indexIter ) {\r
+\r
+ // get BAM bin map for this reference\r
+ ReferenceIndex& refIndex = (*indexIter);\r
+ BamBinMap& bamBinMap = refIndex.Bins;\r
+\r
+ // iterate over BAM bins\r
+ BamBinMap::iterator binIter = bamBinMap.begin();\r
+ BamBinMap::iterator binEnd = bamBinMap.end();\r
+ for ( ; binIter != binEnd; ++binIter ) {\r
+\r
+ // get chunk vector for this bin\r
+ ChunkVector& binChunks = (*binIter).second;\r
+ if ( binChunks.size() == 0 ) { continue; }\r
+\r
+ ChunkVector mergedChunks;\r
+ mergedChunks.push_back( binChunks[0] );\r
+\r
+ // iterate over chunks\r
+ int i = 0;\r
+ ChunkVector::iterator chunkIter = binChunks.begin();\r
+ ChunkVector::iterator chunkEnd = binChunks.end();\r
+ for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) {\r
+\r
+ // get 'currentChunk' based on numeric index\r
+ Chunk& currentChunk = mergedChunks[i];\r
+\r
+ // get iteratorChunk based on vector iterator\r
+ Chunk& iteratorChunk = (*chunkIter);\r
+\r
+ // if currentChunk.Stop(shifted) == iterator Chunk.Start(shifted)\r
+ if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 ) {\r
+\r
+ // set currentChunk.Stop to iteratorChunk.Stop\r
+ currentChunk.Stop = iteratorChunk.Stop;\r
+ }\r
+\r
+ // otherwise\r
+ else {\r
+ // set currentChunk + 1 to iteratorChunk\r
+ mergedChunks.push_back(iteratorChunk);\r
+ ++i;\r
+ }\r
+ }\r
+\r
+ // saved merged chunk vector\r
+ (*binIter).second = mergedChunks;\r
+ }\r
+ }\r
+}\r
+\r
+// opens BAM file (and index)\r
+void BamReader::BamReaderPrivate::Open(const string& filename, const string& indexFilename) {\r
+\r
+ Filename = filename;\r
+ IndexFilename = indexFilename;\r
+\r
+ // open the BGZF file for reading, retrieve header text & reference data\r
+ mBGZF.Open(filename, "rb");\r
+ LoadHeaderData();\r
+ LoadReferenceData();\r
+\r
+ // store file offset of first alignment\r
+ AlignmentsBeginOffset = mBGZF.Tell();\r
+\r
+ // open index file & load index data (if exists)\r
+ if ( !IndexFilename.empty() ) {\r
+ LoadIndex();\r
+ }\r
+}\r
+\r
+// returns BAM file pointer to beginning of alignment data\r
+bool BamReader::BamReaderPrivate::Rewind(void) {\r
+\r
+ // find first reference that has alignments in the BAM file\r
+ int refID = 0;\r
+ int refCount = References.size();\r
+ for ( ; refID < refCount; ++refID ) {\r
+ if ( References.at(refID).RefHasAlignments ) { break; }\r
+ }\r
+\r
+ // store default bounds for first alignment\r
+ CurrentRefID = refID;\r
+ CurrentLeft = 0;\r
+ IsRegionSpecified = false;\r
+\r
+ // return success/failure of seek\r
+ return mBGZF.Seek(AlignmentsBeginOffset);\r
+}\r
+\r
+// rounds value up to next power-of-2 (used in index building)\r
+void BamReader::BamReaderPrivate::Roundup32(int& value) { \r
+ --value;\r
+ value |= value >> 1;\r
+ value |= value >> 2;\r
+ value |= value >> 4;\r
+ value |= value >> 8;\r
+ value |= value >> 16;\r
+ ++value;\r
+}\r
+\r
+// saves index data to BAM index file (".bai"), returns success/fail\r
+bool BamReader::BamReaderPrivate::WriteIndex(void) {\r
+\r
+ IndexFilename = Filename + ".bai";\r
+ FILE* indexStream = fopen(IndexFilename.c_str(), "wb");\r
+ if ( indexStream == 0 ) {\r
+ printf("ERROR: Could not open file to save index\n");\r
+ return false;\r
+ }\r
+\r
+ // write BAM index header\r
+ fwrite("BAI\1", 1, 4, indexStream);\r
+\r
+ // write number of reference sequences\r
+ int32_t numReferenceSeqs = Index.size();\r
+ fwrite(&numReferenceSeqs, 4, 1, indexStream);\r
+\r
+ // iterate over reference sequences\r
+ BamIndex::const_iterator indexIter = Index.begin();\r
+ BamIndex::const_iterator indexEnd = Index.end();\r
+ for ( ; indexIter != indexEnd; ++ indexIter ) {\r
+\r
+ // get reference index data\r
+ const ReferenceIndex& refIndex = (*indexIter);\r
+ const BamBinMap& binMap = refIndex.Bins;\r
+ const LinearOffsetVector& offsets = refIndex.Offsets;\r
+\r
+ // write number of bins\r
+ int32_t binCount = binMap.size();\r
+ fwrite(&binCount, 4, 1, indexStream);\r
+\r
+ // iterate over bins\r
+ BamBinMap::const_iterator binIter = binMap.begin();\r
+ BamBinMap::const_iterator binEnd = binMap.end();\r
+ for ( ; binIter != binEnd; ++binIter ) {\r
+\r
+ // get bin data (key and chunk vector)\r
+ const uint32_t& binKey = (*binIter).first;\r
+ const ChunkVector& binChunks = (*binIter).second;\r
+\r
+ // save BAM bin key\r
+ fwrite(&binKey, 4, 1, indexStream);\r
+\r
+ // save chunk count\r
+ int32_t chunkCount = binChunks.size();\r
+ fwrite(&chunkCount, 4, 1, indexStream);\r
+\r
+ // iterate over chunks\r
+ ChunkVector::const_iterator chunkIter = binChunks.begin();\r
+ ChunkVector::const_iterator chunkEnd = binChunks.end();\r
+ for ( ; chunkIter != chunkEnd; ++chunkIter ) {\r
+\r
+ // get current chunk data\r
+ const Chunk& chunk = (*chunkIter);\r
+ const uint64_t& start = chunk.Start;\r
+ const uint64_t& stop = chunk.Stop;\r
+\r
+ // save chunk offsets\r
+ fwrite(&start, 8, 1, indexStream);\r
+ fwrite(&stop, 8, 1, indexStream);\r
+ }\r
+ }\r
+\r
+ // write linear offsets size\r
+ int32_t offsetSize = offsets.size();\r
+ fwrite(&offsetSize, 4, 1, indexStream);\r
+\r
+ // iterate over linear offsets\r
+ LinearOffsetVector::const_iterator offsetIter = offsets.begin();\r
+ LinearOffsetVector::const_iterator offsetEnd = offsets.end();\r
+ for ( ; offsetIter != offsetEnd; ++offsetIter ) {\r
+\r
+ // write linear offset value\r
+ const uint64_t& linearOffset = (*offsetIter);\r
+ fwrite(&linearOffset, 8, 1, indexStream);\r
+ }\r
+ }\r
+\r
+ // flush buffer, close file, and return success\r
+ fflush(indexStream);\r
+ fclose(indexStream);\r
+ return true;\r
+}\r
-// ***************************************************************************
-// BamReader.h (c) 2009 Derek Barnett, Michael Strömberg
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 24 June 2009 (DB)
-// ---------------------------------------------------------------------------
-// The BGZF routines were adapted from the bgzf.c code developed at the Broad
-// Institute.
-// ---------------------------------------------------------------------------
-// Provides the basic functionality for reading BAM files
-// ***************************************************************************
-
-/*!
- \file BamReader.h
- \brief API for reading BAM files.
-*/
-
-#pragma once
-
-// C++ includes
-#include <algorithm>
-#include <iterator>
-#include <string>
-#include <utility>
-#include <vector>
-
-// zlib includes
-#include <zlib.h>
-
-// BamTools includes
-#include "BamAux.h"
-
-namespace BamTools {
-
- //! API for reading BAM files.
- class BamReader {
-
- public:
-
- //! Constructor
- BamReader(void);
-
- //! Destructor
- ~BamReader(void);
-
- public:
-
- /*!
- \brief Closes the BAM file.
-
- Also closes index file and clears index data, if provided.
-
- \sa Open()
- */
- void Close(void);
-
- /*!
- \brief Access SAM format header data.
-
- See SAM format documentation for detailed header description.
-
- \return Full header text (no parsing of tags)
- */
- const std::string GetHeaderText(void) const;
-
- /*!
- \brief Retrieve next alignment.
-
- Stores result in bAlignment.
-
- If reference and position are specified by a prior call to Jump(), this method stores the next aligmment that either:
- a) overlaps, or
- b) begins on/after that specified position.
-
- Otherwise this simply stores next alignment, if one exists.
-
- Note that this method does not specifiy a 'right bound' position.
- If a range is desired, you should supply some stopping criteria. For example:
-
- \code
- BamReader bReader;
- bReader.Open(bamFile, bamIndexfile);
- if ( bReader.Jump( someID, somePosition ) {
- BamAlignment bAlignment;
- while ( bReader.GetNextAlignment(bAlignment) && (bAlignment.Position <= someUpperBound) ) {
- // do something
- }
- }
- \endcode
-
- \param bAlignment destination for alignment data
- \return success/failure
- \sa Jump(), Rewind()
- */
- bool GetNextAlignment(BamAlignment& bAlignment);
-
- /*!
- \brief Get number of reference sequences in BAM file.
- \return Number of references
- \sa GetReferenceData(), GetReferenceID()
- */
- const int GetReferenceCount(void) const;
-
- /*!
- \brief Access reference data.
- \return Vector of RefData entry
- \sa GetReferenceCount(), GetReferenceID()
- */
- const RefVector GetReferenceData(void) const;
-
- /*!
- \brief Get reference ID from name.
- \param refName name of reference sequence
- \return reference ID number
- \sa GetReferenceCount(), GetReferenceData()
- */
- const int GetReferenceID(const std::string& refName) const;
-
- /*!
- \brief Random access in BAM file.
-
- Jump to a specified position on reference sequence.
- Position is optional - defaults to beginning of specified reference.
-
- Reference and position are stored for use by subsequent calls to GetNextAlignment().
-
- \param refID ID number of desired reference
- \param position left-bound position
- \return success/failure
- \sa GetNextAlignment(), Rewind()
- */
- bool Jump(int refID, unsigned int position = 0);
-
- /*!
- \brief Opens a BAM file.
-
- Index file is optional - sequential reading through a BAM file does not require an index.
-
- However, the index is required to perform random-access alignment retrival (using the Jump() method ).
-
- See SAMtools documentation for BAM index generation.
-
- \param filename BAM file
- \param indexFilename BAM index file
- \sa Jump(), Close()
- */
- void Open(const std::string& filename, const std::string& indexFilename = "");
-
- /*!
- \brief Moves file pointer to beginning of alignment data.
-
- A subsequent call to GetNextAlignment() would retrieve the first alignment in the BAM file.
- Clears any reference and position set by a prior call to Jump()
-
- \return success/failure
- \sa GetNextAlignment(), Jump()
- */
- bool Rewind(void);
-
- // --------------------------------------------------------------------------------------
- // internal methods
- private:
- // checks BGZF block header
- bool BgzfCheckBlockHeader(char* header);
- // closes the BAM file
- void BgzfClose(void);
- // de-compresses the current block
- int BgzfInflateBlock(int blockLength);
- // opens the BAM file for reading
- void BgzfOpen(const std::string& filename);
- // reads BGZF data into a byte buffer
- unsigned int BgzfRead(char* data, const unsigned int dataLen);
- // reads BGZF block
- int BgzfReadBlock(void);
- // seek to position in BAM file
- bool BgzfSeek(int64_t position);
- // get file position in BAM file
- int64_t BgzfTell(void);
- // unpacks a buffer into an unsigned int
- static inline unsigned int BgzfUnpackUnsignedInt(char* buffer);
- // unpacks a buffer into an unsigned short
- static inline unsigned short BgzfUnpackUnsignedShort(char* buffer);
- // unpacks a buffer into a signed int
- static inline signed int BgzfUnpackSignedInt(char* buffer);
- // calculate bins that overlap region ( left to reference end for now )
- int BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]);
- // calculates alignment end position based on starting position and provided CIGAR operations
- unsigned int CalculateAlignmentEnd(const unsigned int& position, const std::vector<CigarOp>& cigarData);
- // clear out (delete pointers in) index data structure
- void ClearIndex(void);
- // calculate file offset for first alignment chunk overlapping 'left'
- int64_t GetOffset(int refID, unsigned int left);
- // checks to see if alignment overlaps current region
- bool IsOverlap(BamAlignment& bAlignment);
- // retrieves header text from BAM file
- void LoadHeaderData(void);
- // builds BamIndex data structure from BAM index file
- void LoadIndexData(FILE* indexStream);
- // retrieves BAM alignment under file pointer
- bool LoadNextAlignment(BamAlignment& bAlignment);
- // builds reference data structure from BAM file
- void LoadReferenceData(void);
- // open BAM index file (if successful, loads index)
- void OpenIndex(const std::string& indexFilename);
-
- // aligment file & index file data
- private:
- BgzfData* m_BGZF;
- std::string m_headerText;
- BamIndex* m_index;
- RefVector m_references;
- bool m_isIndexLoaded;
- int64_t m_alignmentsBeginOffset;
-
- // user-specified region values
- private:
- bool m_isRegionSpecified;
- int m_currentRefID;
- unsigned int m_currentLeft;
-
- // BAM character constants
- private:
- static const char* DNA_LOOKUP;
- static const char* CIGAR_LOOKUP;
- };
-
- //! \cond
- // --------------------------------------------------------------------------------------
- // static inline methods (internal - can exclude from main documentation)
-
- // unpacks a buffer into an unsigned int
- inline unsigned int BamReader::BgzfUnpackUnsignedInt(char* buffer) {
- union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un;
- un.valueBuffer[0] = buffer[0];
- un.valueBuffer[1] = buffer[1];
- un.valueBuffer[2] = buffer[2];
- un.valueBuffer[3] = buffer[3];
- return un.value;
- }
-
- // unpacks a buffer into an unsigned short
- inline unsigned short BamReader::BgzfUnpackUnsignedShort(char* buffer) {
- union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)];} un;
- un.valueBuffer[0] = buffer[0];
- un.valueBuffer[1] = buffer[1];
- return un.value;
- }
-
- // unpacks a buffer into a signed int
- inline signed int BamReader::BgzfUnpackSignedInt(char* buffer) {
- union { signed int value; unsigned char valueBuffer[sizeof(signed int)]; } un;
- un.valueBuffer[0] = buffer[0];
- un.valueBuffer[1] = buffer[1];
- un.valueBuffer[2] = buffer[2];
- un.valueBuffer[3] = buffer[3];
- return un.value;
- }
-
- // --------------------------------------------------------------------------------------
- // template classes/methods (internal - can exclude from main documentation)
-
- // allows sorting/searching of a vector of pairs (instead of using maps)
- template <typename Key, typename Value>
- class LookupKeyCompare {
-
- typedef std::pair< Key, Value > LookupData;
- typedef typename LookupData::first_type Key_t;
-
- public:
- bool operator() (const LookupData& lhs, const LookupData& rhs) const { return keyLess(lhs.first, rhs.first); }
- bool operator() (const LookupData& lhs, const Key_t& k) const { return keyLess(lhs.first, k); }
- bool operator() (const Key_t& k, const LookupData& rhs) const { return keyLess(k, rhs.first); }
- private:
- bool keyLess(const Key_t& k1, const Key_t& k2) const { return k1 < k2; }
- };
-
- // return index of item if found, else return container.size()
- template < typename InputIterator, typename EqualityComparable >
- typename std::iterator_traits<InputIterator>::difference_type
- Index(const InputIterator& begin, const InputIterator& end, const EqualityComparable& item) {
- return std::distance(begin, std::find(begin, end, item));
- }
- //! \endcond
-
-}
+// ***************************************************************************\r
+// BamReader.h (c) 2009 Derek Barnett, Michael Strömberg\r
+// Marth Lab, Department of Biology, Boston College\r
+// All rights reserved.\r
+// ---------------------------------------------------------------------------\r
+// Last modified: 8 December 2009 (DB)\r
+// ---------------------------------------------------------------------------\r
+// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
+// Institute.\r
+// ---------------------------------------------------------------------------\r
+// Provides the basic functionality for reading BAM files\r
+// ***************************************************************************\r
+\r
+#ifndef BAMREADER_H\r
+#define BAMREADER_H\r
+\r
+// C++ includes\r
+#include <string>\r
+\r
+// BamTools includes\r
+#include "BamAux.h"\r
+\r
+namespace BamTools {\r
+\r
+class BamReader {\r
+\r
+ // constructor / destructor\r
+ public:\r
+ BamReader(void);\r
+ ~BamReader(void);\r
+\r
+ // public interface\r
+ public:\r
+\r
+ // ----------------------\r
+ // BAM file operations\r
+ // ----------------------\r
+\r
+ // close BAM file\r
+ void Close(void);\r
+ // performs random-access jump to reference, position\r
+ bool Jump(int refID, unsigned int position = 0);\r
+ // opens BAM file (and optional BAM index file, if provided)\r
+ void Open(const std::string& filename, const std::string& indexFilename = "");\r
+ // returns file pointer to beginning of alignments\r
+ bool Rewind(void);\r
+\r
+ // ----------------------\r
+ // access alignment data\r
+ // ----------------------\r
+\r
+ // retrieves next available alignment (returns success/fail)\r
+ bool GetNextAlignment(BamAlignment& bAlignment);\r
+\r
+ // ----------------------\r
+ // access auxiliary data\r
+ // ----------------------\r
+\r
+ // returns SAM header text\r
+ const std::string GetHeaderText(void) const;\r
+ // returns number of reference sequences\r
+ const int GetReferenceCount(void) const;\r
+ // returns vector of reference objects\r
+ const BamTools::RefVector GetReferenceData(void) const;\r
+ // returns reference id (used for BamReader::Jump()) for the given reference name\r
+ const int GetReferenceID(const std::string& refName) const;\r
+\r
+ // ----------------------\r
+ // BAM index operations\r
+ // ----------------------\r
+\r
+ // creates index for BAM file, saves to file (default = bamFilename + ".bai")\r
+ bool CreateIndex(void);\r
+\r
+ // private implementation\r
+ private:\r
+ struct BamReaderPrivate;\r
+ BamReaderPrivate* d;\r
+};\r
+\r
+} // namespace BamTools\r
+\r
+#endif // BAMREADER_H\r
-// ***************************************************************************
-// BamWriter.cpp (c) 2009 Michael Strömberg, Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 24 June 2009 (DB)
-// ---------------------------------------------------------------------------
-// The BGZF routines were adapted from the bgzf.c code developed at the Broad
-// Institute.
-// ---------------------------------------------------------------------------
-// Provides the basic functionality for producing BAM files
-// ***************************************************************************
-
-#include "BamWriter.h"
-using namespace BamTools;
-using namespace std;
-
-// constructor
-BamWriter::BamWriter(void)
-{}
-
-// destructor
-BamWriter::~BamWriter(void) {
- if(mBGZF.IsOpen) BgzfClose();
-}
-
-// closes the BAM file
-void BamWriter::BgzfClose(void) {
-
- mBGZF.IsOpen = false;
-
- // flush the BGZF block
- BgzfFlushBlock();
-
- // flush and close
- fflush(mBGZF.Stream);
- fclose(mBGZF.Stream);
-}
-
-// compresses the current block
-int BamWriter::BgzfDeflateBlock(void) {
-
- // initialize the gzip header
- char* buffer = mBGZF.CompressedBlock;
- unsigned int bufferSize = mBGZF.CompressedBlockSize;
-
- memset(buffer, 0, 18);
- buffer[0] = GZIP_ID1;
- buffer[1] = (char)GZIP_ID2;
- buffer[2] = CM_DEFLATE;
- buffer[3] = FLG_FEXTRA;
- buffer[9] = (char)OS_UNKNOWN;
- buffer[10] = BGZF_XLEN;
- buffer[12] = BGZF_ID1;
- buffer[13] = BGZF_ID2;
- buffer[14] = BGZF_LEN;
-
- // loop to retry for blocks that do not compress enough
- int inputLength = mBGZF.BlockOffset;
- int compressedLength = 0;
-
- while(true) {
-
- z_stream zs;
- zs.zalloc = NULL;
- zs.zfree = NULL;
- zs.next_in = (Bytef*)mBGZF.UncompressedBlock;
- zs.avail_in = inputLength;
- zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];
- zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
-
- // initialize the zlib compression algorithm
- if(deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK) {
- printf("ERROR: zlib deflate initialization failed.\n");
- exit(1);
- }
-
- // compress the data
- int status = deflate(&zs, Z_FINISH);
- if(status != Z_STREAM_END) {
- deflateEnd(&zs);
-
- // reduce the input length and try again
- if(status == Z_OK) {
- inputLength -= 1024;
- if(inputLength < 0) {
- printf("ERROR: input reduction failed.\n");
- exit(1);
- }
- continue;
- }
-
- printf("ERROR: zlib deflate failed.\n");
- exit(1);
- }
-
- // finalize the compression routine
- if(deflateEnd(&zs) != Z_OK) {
- printf("ERROR: deflate end failed.\n");
- exit(1);
- }
-
- compressedLength = zs.total_out;
- compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
-
- if(compressedLength > MAX_BLOCK_SIZE) {
- printf("ERROR: deflate overflow.\n");
- exit(1);
- }
-
- break;
- }
-
- // store the compressed length
- BgzfPackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));
-
- // store the CRC32 checksum
- unsigned int crc = crc32(0, NULL, 0);
- crc = crc32(crc, (Bytef*)mBGZF.UncompressedBlock, inputLength);
- BgzfPackUnsignedInt(&buffer[compressedLength - 8], crc);
- BgzfPackUnsignedInt(&buffer[compressedLength - 4], inputLength);
-
- // ensure that we have less than a block of data left
- int remaining = mBGZF.BlockOffset - inputLength;
- if(remaining > 0) {
- if(remaining > inputLength) {
- printf("ERROR: remainder too large.\n");
- exit(1);
- }
-
- memcpy(mBGZF.UncompressedBlock, mBGZF.UncompressedBlock + inputLength, remaining);
- }
-
- mBGZF.BlockOffset = remaining;
- return compressedLength;
-}
-
-// flushes the data in the BGZF block
-void BamWriter::BgzfFlushBlock(void) {
-
- // flush all of the remaining blocks
- while(mBGZF.BlockOffset > 0) {
-
- // compress the data block
- int blockLength = BgzfDeflateBlock();
-
- // flush the data to our output stream
- int numBytesWritten = fwrite(mBGZF.CompressedBlock, 1, blockLength, mBGZF.Stream);
-
- if(numBytesWritten != blockLength) {
- printf("ERROR: Expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten);
- exit(1);
- }
-
- mBGZF.BlockAddress += blockLength;
- }
-}
-
-// opens the BAM file for writing
-void BamWriter::BgzfOpen(const string& filename) {
-
- mBGZF.Stream = fopen(filename.c_str(), "wb");
-
- if(!mBGZF.Stream) {
- printf("ERROR: Unable to open the BAM file (%s) for writing.\n", filename.c_str());
- exit(1);
- }
-
- mBGZF.IsOpen = true;
-}
-
-// writes the supplied data into the BGZF buffer
-unsigned int BamWriter::BgzfWrite(const char* data, const unsigned int dataLen) {
-
- // initialize
- unsigned int numBytesWritten = 0;
- const char* input = data;
- unsigned int blockLength = mBGZF.UncompressedBlockSize;
-
- // copy the data to the buffer
- while(numBytesWritten < dataLen) {
- unsigned int copyLength = min(blockLength - mBGZF.BlockOffset, dataLen - numBytesWritten);
- char* buffer = mBGZF.UncompressedBlock;
- memcpy(buffer + mBGZF.BlockOffset, input, copyLength);
-
- mBGZF.BlockOffset += copyLength;
- input += copyLength;
- numBytesWritten += copyLength;
-
- if(mBGZF.BlockOffset == blockLength) BgzfFlushBlock();
- }
-
- return numBytesWritten;
-}
-
-// closes the alignment archive
-void BamWriter::Close(void) {
- if(mBGZF.IsOpen) BgzfClose();
-}
-
-// creates a cigar string from the supplied alignment
-void BamWriter::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
-
- // initialize
- const unsigned int numCigarOperations = cigarOperations.size();
- packedCigar.resize(numCigarOperations * BT_SIZEOF_INT);
-
- // pack the cigar data into the string
- unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
-
- unsigned int cigarOp;
- vector<CigarOp>::const_iterator coIter;
- for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); coIter++) {
-
- switch(coIter->Type) {
- case 'M':
- cigarOp = BAM_CMATCH;
- break;
- case 'I':
- cigarOp = BAM_CINS;
- break;
- case 'D':
- cigarOp = BAM_CDEL;
- break;
- case 'N':
- cigarOp = BAM_CREF_SKIP;
- break;
- case 'S':
- cigarOp = BAM_CSOFT_CLIP;
- break;
- case 'H':
- cigarOp = BAM_CHARD_CLIP;
- break;
- case 'P':
- cigarOp = BAM_CPAD;
- break;
- default:
- printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type);
- exit(1);
- }
-
- *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;
- pPackedCigar++;
- }
-}
-
-// encodes the supplied query sequence into 4-bit notation
-void BamWriter::EncodeQuerySequence(const string& query, string& encodedQuery) {
-
- // prepare the encoded query string
- const unsigned int queryLen = query.size();
- const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
- encodedQuery.resize(encodedQueryLen);
- char* pEncodedQuery = (char*)encodedQuery.data();
- const char* pQuery = (const char*)query.data();
-
- unsigned char nucleotideCode;
- bool useHighWord = true;
-
- while(*pQuery) {
-
- switch(*pQuery) {
- case '=':
- nucleotideCode = 0;
- break;
- case 'A':
- nucleotideCode = 1;
- break;
- case 'C':
- nucleotideCode = 2;
- break;
- case 'G':
- nucleotideCode = 4;
- break;
- case 'T':
- nucleotideCode = 8;
- break;
- case 'N':
- nucleotideCode = 15;
- break;
- default:
- printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
- exit(1);
- }
-
- // pack the nucleotide code
- if(useHighWord) {
- *pEncodedQuery = nucleotideCode << 4;
- useHighWord = false;
- } else {
- *pEncodedQuery |= nucleotideCode;
- pEncodedQuery++;
- useHighWord = true;
- }
-
- // increment the query position
- pQuery++;
- }
-}
-
-// opens the alignment archive
-void BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) {
-
- // open the BGZF file for writing
- BgzfOpen(filename);
-
- // ================
- // write the header
- // ================
-
- // write the BAM signature
- const unsigned char SIGNATURE_LENGTH = 4;
- const char* BAM_SIGNATURE = "BAM\1";
- BgzfWrite(BAM_SIGNATURE, SIGNATURE_LENGTH);
-
- // write the SAM header text length
- const unsigned int samHeaderLen = samHeader.size();
- BgzfWrite((char*)&samHeaderLen, BT_SIZEOF_INT);
-
- // write the SAM header text
- if(samHeaderLen > 0) BgzfWrite(samHeader.data(), samHeaderLen);
-
- // write the number of reference sequences
- const unsigned int numReferenceSequences = referenceSequences.size();
- BgzfWrite((char*)&numReferenceSequences, BT_SIZEOF_INT);
-
- // =============================
- // write the sequence dictionary
- // =============================
-
- RefVector::const_iterator rsIter;
- for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {
-
- // write the reference sequence name length
- const unsigned int referenceSequenceNameLen = rsIter->RefName.size() + 1;
- BgzfWrite((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);
-
- // write the reference sequence name
- BgzfWrite(rsIter->RefName.c_str(), referenceSequenceNameLen);
-
- // write the reference sequence length
- BgzfWrite((char*)&rsIter->RefLength, BT_SIZEOF_INT);
- }
-}
-
-// saves the alignment to the alignment archive
-void BamWriter::SaveAlignment(const BamAlignment& al) {
-
- // initialize
- const unsigned int nameLen = al.Name.size() + 1;
- const unsigned int queryLen = al.QueryBases.size();
- const unsigned int numCigarOperations = al.CigarData.size();
-
- // create our packed cigar string
- string packedCigar;
- CreatePackedCigar(al.CigarData, packedCigar);
- const unsigned int packedCigarLen = packedCigar.size();
-
- // encode the query
- string encodedQuery;
- EncodeQuerySequence(al.QueryBases, encodedQuery);
- const unsigned int encodedQueryLen = encodedQuery.size();
-
- // store the tag data length
- const unsigned int tagDataLength = al.TagData.size() + 1;
-
- // assign the BAM core data
- unsigned int buffer[8];
- buffer[0] = al.RefID;
- buffer[1] = al.Position;
- buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | nameLen;
- buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
- buffer[4] = queryLen;
- buffer[5] = al.MateRefID;
- buffer[6] = al.MatePosition;
- buffer[7] = al.InsertSize;
-
- // write the block size
- const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen + tagDataLength;
- const unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
-
- BgzfWrite((char*)&blockSize, BT_SIZEOF_INT);
-
- // write the BAM core
- BgzfWrite((char*)&buffer, BAM_CORE_SIZE);
-
- // write the query name
- BgzfWrite(al.Name.c_str(), nameLen);
-
- // write the packed cigar
- BgzfWrite(packedCigar.data(), packedCigarLen);
-
- // write the encoded query sequence
- BgzfWrite(encodedQuery.data(), encodedQueryLen);
-
- // write the base qualities
- string baseQualities = al.Qualities;
- char* pBaseQualities = (char*)al.Qualities.data();
- for(unsigned int i = 0; i < queryLen; i++) pBaseQualities[i] -= 33;
- BgzfWrite(pBaseQualities, queryLen);
-
- // write the read group tag
- BgzfWrite(al.TagData.data(), tagDataLength);
-}
+// ***************************************************************************\r
+// BamWriter.cpp (c) 2009 Michael Strömberg, Derek Barnett\r
+// Marth Lab, Department of Biology, Boston College\r
+// All rights reserved.\r
+// ---------------------------------------------------------------------------\r
+// Last modified: 8 December 2009 (DB)\r
+// ---------------------------------------------------------------------------\r
+// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
+// Institute.\r
+// ---------------------------------------------------------------------------\r
+// Provides the basic functionality for producing BAM files\r
+// ***************************************************************************\r
+\r
+// BGZF includes\r
+#include "BGZF.h"\r
+#include "BamWriter.h"\r
+using namespace BamTools;\r
+using namespace std;\r
+\r
+struct BamWriter::BamWriterPrivate {\r
+\r
+ // data members\r
+ BgzfData mBGZF;\r
+\r
+ // constructor / destructor\r
+ BamWriterPrivate(void) { }\r
+ ~BamWriterPrivate(void) {\r
+ mBGZF.Close();\r
+ }\r
+\r
+ // "public" interface\r
+ void Close(void);\r
+ void Open(const std::string& filename, const std::string& samHeader, const BamTools::RefVector& referenceSequences);\r
+ void SaveAlignment(const BamTools::BamAlignment& al);\r
+\r
+ // internal methods\r
+ void CreatePackedCigar(const std::vector<CigarOp>& cigarOperations, std::string& packedCigar);\r
+ void EncodeQuerySequence(const std::string& query, std::string& encodedQuery);\r
+};\r
+\r
+// -----------------------------------------------------\r
+// BamWriter implementation\r
+// -----------------------------------------------------\r
+\r
+// constructor\r
+BamWriter::BamWriter(void) {\r
+ d = new BamWriterPrivate;\r
+}\r
+\r
+// destructor\r
+BamWriter::~BamWriter(void) {\r
+ delete d;\r
+ d = 0;\r
+}\r
+\r
+// closes the alignment archive\r
+void BamWriter::Close(void) {\r
+ d->Close();\r
+}\r
+\r
+// opens the alignment archive\r
+void BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) {\r
+ d->Open(filename, samHeader, referenceSequences);\r
+}\r
+\r
+// saves the alignment to the alignment archive\r
+void BamWriter::SaveAlignment(const BamAlignment& al) {\r
+ d->SaveAlignment(al);\r
+}\r
+\r
+// -----------------------------------------------------\r
+// BamWriterPrivate implementation\r
+// -----------------------------------------------------\r
+\r
+// closes the alignment archive\r
+void BamWriter::BamWriterPrivate::Close(void) {\r
+ mBGZF.Close();\r
+}\r
+\r
+// creates a cigar string from the supplied alignment\r
+void BamWriter::BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {\r
+\r
+ // initialize\r
+ const unsigned int numCigarOperations = cigarOperations.size();\r
+ packedCigar.resize(numCigarOperations * BT_SIZEOF_INT);\r
+\r
+ // pack the cigar data into the string\r
+ unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();\r
+\r
+ unsigned int cigarOp;\r
+ vector<CigarOp>::const_iterator coIter;\r
+ for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); coIter++) {\r
+\r
+ switch(coIter->Type) {\r
+ case 'M':\r
+ cigarOp = BAM_CMATCH;\r
+ break;\r
+ case 'I':\r
+ cigarOp = BAM_CINS;\r
+ break;\r
+ case 'D':\r
+ cigarOp = BAM_CDEL;\r
+ break;\r
+ case 'N':\r
+ cigarOp = BAM_CREF_SKIP;\r
+ break;\r
+ case 'S':\r
+ cigarOp = BAM_CSOFT_CLIP;\r
+ break;\r
+ case 'H':\r
+ cigarOp = BAM_CHARD_CLIP;\r
+ break;\r
+ case 'P':\r
+ cigarOp = BAM_CPAD;\r
+ break;\r
+ default:\r
+ printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type);\r
+ exit(1);\r
+ }\r
+\r
+ *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;\r
+ pPackedCigar++;\r
+ }\r
+}\r
+\r
+// encodes the supplied query sequence into 4-bit notation\r
+void BamWriter::BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {\r
+\r
+ // prepare the encoded query string\r
+ const unsigned int queryLen = query.size();\r
+ const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);\r
+ encodedQuery.resize(encodedQueryLen);\r
+ char* pEncodedQuery = (char*)encodedQuery.data();\r
+ const char* pQuery = (const char*)query.data();\r
+\r
+ unsigned char nucleotideCode;\r
+ bool useHighWord = true;\r
+\r
+ while(*pQuery) {\r
+\r
+ switch(*pQuery) {\r
+ case '=':\r
+ nucleotideCode = 0;\r
+ break;\r
+ case 'A':\r
+ nucleotideCode = 1;\r
+ break;\r
+ case 'C':\r
+ nucleotideCode = 2;\r
+ break;\r
+ case 'G':\r
+ nucleotideCode = 4;\r
+ break;\r
+ case 'T':\r
+ nucleotideCode = 8;\r
+ break;\r
+ case 'N':\r
+ nucleotideCode = 15;\r
+ break;\r
+ default:\r
+ printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);\r
+ exit(1);\r
+ }\r
+\r
+ // pack the nucleotide code\r
+ if(useHighWord) {\r
+ *pEncodedQuery = nucleotideCode << 4;\r
+ useHighWord = false;\r
+ } else {\r
+ *pEncodedQuery |= nucleotideCode;\r
+ pEncodedQuery++;\r
+ useHighWord = true;\r
+ }\r
+\r
+ // increment the query position\r
+ pQuery++;\r
+ }\r
+}\r
+\r
+// opens the alignment archive\r
+void BamWriter::BamWriterPrivate::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) {\r
+\r
+ // open the BGZF file for writing\r
+ mBGZF.Open(filename, "wb");\r
+\r
+ // ================\r
+ // write the header\r
+ // ================\r
+\r
+ // write the BAM signature\r
+ const unsigned char SIGNATURE_LENGTH = 4;\r
+ const char* BAM_SIGNATURE = "BAM\1";\r
+ mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH);\r
+\r
+ // write the SAM header text length\r
+ const unsigned int samHeaderLen = samHeader.size();\r
+ mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT);\r
+\r
+ // write the SAM header text\r
+ if(samHeaderLen > 0) {\r
+ mBGZF.Write(samHeader.data(), samHeaderLen);\r
+ }\r
+\r
+ // write the number of reference sequences\r
+ const unsigned int numReferenceSequences = referenceSequences.size();\r
+ mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);\r
+\r
+ // =============================\r
+ // write the sequence dictionary\r
+ // =============================\r
+\r
+ RefVector::const_iterator rsIter;\r
+ for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {\r
+\r
+ // write the reference sequence name length\r
+ const unsigned int referenceSequenceNameLen = rsIter->RefName.size() + 1;\r
+ mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);\r
+\r
+ // write the reference sequence name\r
+ mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);\r
+\r
+ // write the reference sequence length\r
+ mBGZF.Write((char*)&rsIter->RefLength, BT_SIZEOF_INT);\r
+ }\r
+}\r
+\r
+// saves the alignment to the alignment archive\r
+void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) {\r
+\r
+ // initialize\r
+ const unsigned int nameLen = al.Name.size() + 1;\r
+ const unsigned int queryLen = al.QueryBases.size();\r
+ const unsigned int numCigarOperations = al.CigarData.size();\r
+\r
+ // create our packed cigar string\r
+ string packedCigar;\r
+ CreatePackedCigar(al.CigarData, packedCigar);\r
+ const unsigned int packedCigarLen = packedCigar.size();\r
+\r
+ // encode the query\r
+ string encodedQuery;\r
+ EncodeQuerySequence(al.QueryBases, encodedQuery);\r
+ const unsigned int encodedQueryLen = encodedQuery.size();\r
+\r
+ // store the tag data length\r
+ const unsigned int tagDataLength = al.TagData.size() + 1;\r
+\r
+ // assign the BAM core data\r
+ unsigned int buffer[8];\r
+ buffer[0] = al.RefID;\r
+ buffer[1] = al.Position;\r
+ buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | nameLen;\r
+ buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;\r
+ buffer[4] = queryLen;\r
+ buffer[5] = al.MateRefID;\r
+ buffer[6] = al.MatePosition;\r
+ buffer[7] = al.InsertSize;\r
+\r
+ // write the block size\r
+ const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen + tagDataLength;\r
+ const unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;\r
+ mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
+\r
+ // write the BAM core\r
+ mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
+\r
+ // write the query name\r
+ mBGZF.Write(al.Name.c_str(), nameLen);\r
+\r
+ // write the packed cigar\r
+ mBGZF.Write(packedCigar.data(), packedCigarLen);\r
+\r
+ // write the encoded query sequence\r
+ mBGZF.Write(encodedQuery.data(), encodedQueryLen);\r
+\r
+ // write the base qualities\r
+ string baseQualities = al.Qualities;\r
+ char* pBaseQualities = (char*)al.Qualities.data();\r
+ for(unsigned int i = 0; i < queryLen; i++) { pBaseQualities[i] -= 33; }\r
+ mBGZF.Write(pBaseQualities, queryLen);\r
+\r
+ // write the read group tag\r
+ mBGZF.Write(al.TagData.data(), tagDataLength);\r
+}\r
-// ***************************************************************************
-// BamWriter.h (c) 2009 Michael Strömberg, Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 24 June 2009 (DB)
-// ---------------------------------------------------------------------------
-// The BGZF routines were adapted from the bgzf.c code developed at the Broad
-// Institute.
-// ---------------------------------------------------------------------------
-// Provides the basic functionality for producing BAM files
-// ***************************************************************************
-
-/*!
- \file BamWriter.h
- \brief API for writing BAM files.
-*/
-
-#pragma once
-
-// C++ includes
-#include <string>
-#include <vector>
-
-// zlib includes
-#include <zlib.h>
-
-// BamTools includes
-#include "BamAux.h"
-
-namespace BamTools {
-
- //! API for writing BAM files.
- class BamWriter {
-
- public:
-
- //! Constructor.
- BamWriter(void);
-
- //! Destructor.
- ~BamWriter(void);
-
- public:
-
- /*!
- \brief Closes the alignment archive
- \sa Open()
- */
- void Close(void);
-
- /*!
- \brief Opens the alignment archive
- \param filename output BAM file
- \param samHeader SAM-format header text
- \param referenceSequences Reference sequence data
- \sa Close()
- */
- void Open(const std::string& filename, const std::string& samHeader, const RefVector& referenceSequences);
-
- /*!
- \brief Saves an alignment to the archive
- \param al The BamAlignment to be saved
- */
- void SaveAlignment(const BamAlignment& al);
-
- // --------------------------------------------------------------------------------------
- // internal methods
- private:
- // closes the BAM file
- void BgzfClose(void);
- // compresses the current block
- int BgzfDeflateBlock(void);
- // flushes the data in the BGZF block
- void BgzfFlushBlock(void);
- // opens the BAM file for writing
- void BgzfOpen(const std::string& filename);
- // packs an unsigned integer into the specified buffer
- static inline void BgzfPackUnsignedInt(char* buffer, unsigned int value);
- // packs an unsigned short into the specified buffer
- static inline void BgzfPackUnsignedShort(char* buffer, unsigned short value);
- // writes the supplied data into the BGZF buffer
- unsigned int BgzfWrite(const char* data, const unsigned int dataLen);
- // calculates the minimum bin that contains a region [begin, end)
- static inline unsigned int CalculateMinimumBin(unsigned int begin, unsigned int end);
- // creates a packed cigar string from the supplied alignment
- static void CreatePackedCigar(const std::vector<CigarOp>& cigarOperations, std::string& packedCigar);
- // encodes the supplied query sequence into 4-bit notation
- static void EncodeQuerySequence(const std::string& query, std::string& encodedQuery);
- // our BGZF output object
- BgzfData mBGZF;
- };
-
- //! \cond
- // --------------------------------------------------------------------------------------
- // static inline methods (internal - can exclude from main documentation)
-
- // packs an unsigned integer into the specified buffer
- inline void BamWriter::BgzfPackUnsignedInt(char* buffer, unsigned int value) {
- buffer[0] = (char)value;
- buffer[1] = (char)(value >> 8);
- buffer[2] = (char)(value >> 16);
- buffer[3] = (char)(value >> 24);
- }
-
- // packs an unsigned short into the specified buffer
- inline void BamWriter::BgzfPackUnsignedShort(char* buffer, unsigned short value) {
- buffer[0] = (char)value;
- buffer[1] = (char)(value >> 8);
- }
- // --------------------------------------------------------------------------------------
- //! \endcond
-
-} // end BamTools namespace
\ No newline at end of file
+// ***************************************************************************\r
+// BamWriter.h (c) 2009 Michael Strömberg, Derek Barnett\r
+// Marth Lab, Department of Biology, Boston College\r
+// All rights reserved.\r
+// ---------------------------------------------------------------------------\r
+// Last modified: 8 December 2009 (DB)\r
+// ---------------------------------------------------------------------------\r
+// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
+// Institute.\r
+// ---------------------------------------------------------------------------\r
+// Provides the basic functionality for producing BAM files\r
+// ***************************************************************************\r
+\r
+#ifndef BAMWRITER_H\r
+#define BAMWRITER_H\r
+\r
+// C++ includes\r
+#include <string>\r
+\r
+// BamTools includes\r
+#include "BamAux.h"\r
+\r
+namespace BamTools {\r
+\r
+class BamWriter {\r
+\r
+ // constructor/destructor\r
+ public:\r
+ BamWriter(void);\r
+ ~BamWriter(void);\r
+\r
+ // public interface\r
+ public:\r
+ // closes the alignment archive\r
+ void Close(void);\r
+ // opens the alignment archive\r
+ void Open(const std::string& filename, const std::string& samHeader, const BamTools::RefVector& referenceSequences);\r
+ // saves the alignment to the alignment archive\r
+ void SaveAlignment(const BamTools::BamAlignment& al);\r
+\r
+ // private implementation\r
+ private:\r
+ struct BamWriterPrivate;\r
+ BamWriterPrivate* d;\r
+};\r
+\r
+} // namespace BamTools\r
+\r
+#endif // BAMWRITER_H\r