From a434db60a2ba1c77a0084704290c0e2ba6dcf1dc Mon Sep 17 00:00:00 2001 From: barnett Date: Tue, 8 Dec 2009 18:37:03 +0000 Subject: [PATCH] Major overhaul to BamTools Separated out Bgzf routines to BGZF.h Simplified main BamReader.h/BamWriter.h headers, by adding pimpl git-svn-id: svn+ssh://gene.bc.edu/home/subversion/Derek/BamTools/trunk@30 9efb377e-2e27-44b9-b91a-ec4abb80ed8b --- BGZF.cpp | 354 ++++++++++ BGZF.h | 183 +++++ BamAux.h | 592 ++++++++-------- BamReader.cpp | 1797 ++++++++++++++++++++++++++++--------------------- BamReader.h | 369 +++------- BamWriter.cpp | 688 ++++++++----------- BamWriter.h | 163 ++--- 7 files changed, 2257 insertions(+), 1889 deletions(-) create mode 100644 BGZF.cpp create mode 100644 BGZF.h diff --git a/BGZF.cpp b/BGZF.cpp new file mode 100644 index 0000000..651fe81 --- /dev/null +++ b/BGZF.cpp @@ -0,0 +1,354 @@ +// *************************************************************************** +// BGZF.cpp (c) 2009 Derek Barnett, Michael Strömberg +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 8 December 2009 (DB) +// --------------------------------------------------------------------------- +// BGZF routines were adapted from the bgzf.c code developed at the Broad +// Institute. +// --------------------------------------------------------------------------- +// Provides the basic functionality for reading & writing BGZF files +// *************************************************************************** + +#include +#include "BGZF.h" +using namespace BamTools; +using std::string; +using std::min; + +BgzfData::BgzfData(void) + : UncompressedBlockSize(DEFAULT_BLOCK_SIZE) + , CompressedBlockSize(MAX_BLOCK_SIZE) + , BlockLength(0) + , BlockOffset(0) + , BlockAddress(0) + , IsOpen(false) + , IsWriteOnly(false) + , Stream(NULL) + , UncompressedBlock(NULL) + , CompressedBlock(NULL) +{ + try { + CompressedBlock = new char[CompressedBlockSize]; + UncompressedBlock = new char[UncompressedBlockSize]; + } catch( std::bad_alloc& ba ) { + printf("ERROR: Unable to allocate memory for our BGZF object.\n"); + exit(1); + } +} + +// destructor +BgzfData::~BgzfData(void) { + if(CompressedBlock) delete [] CompressedBlock; + if(UncompressedBlock) delete [] UncompressedBlock; +} + +// closes BGZF file +void BgzfData::Close(void) { + + if (!IsOpen) { return; } + IsOpen = false; + + // flush the BGZF block + if ( IsWriteOnly ) { FlushBlock(); } + + // flush and close + fflush(Stream); + fclose(Stream); +} + +// compresses the current block +int BgzfData::DeflateBlock(void) { + + // initialize the gzip header + char* buffer = CompressedBlock; + unsigned int bufferSize = CompressedBlockSize; + + memset(buffer, 0, 18); + buffer[0] = GZIP_ID1; + buffer[1] = (char)GZIP_ID2; + buffer[2] = CM_DEFLATE; + buffer[3] = FLG_FEXTRA; + buffer[9] = (char)OS_UNKNOWN; + buffer[10] = BGZF_XLEN; + buffer[12] = BGZF_ID1; + buffer[13] = BGZF_ID2; + buffer[14] = BGZF_LEN; + + // loop to retry for blocks that do not compress enough + int inputLength = BlockOffset; + int compressedLength = 0; + + while(true) { + + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = (Bytef*)UncompressedBlock; + zs.avail_in = inputLength; + zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH]; + zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; + + // initialize the zlib compression algorithm + if(deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK) { + printf("ERROR: zlib deflate initialization failed.\n"); + exit(1); + } + + // compress the data + int status = deflate(&zs, Z_FINISH); + if(status != Z_STREAM_END) { + + deflateEnd(&zs); + + // reduce the input length and try again + if(status == Z_OK) { + inputLength -= 1024; + if(inputLength < 0) { + printf("ERROR: input reduction failed.\n"); + exit(1); + } + continue; + } + + printf("ERROR: zlib deflate failed.\n"); + exit(1); + } + + // finalize the compression routine + if(deflateEnd(&zs) != Z_OK) { + printf("ERROR: deflate end failed.\n"); + exit(1); + } + + compressedLength = zs.total_out; + compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; + + if(compressedLength > MAX_BLOCK_SIZE) { + printf("ERROR: deflate overflow.\n"); + exit(1); + } + + break; + } + + // store the compressed length + BgzfData::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1)); + + // store the CRC32 checksum + unsigned int crc = crc32(0, NULL, 0); + crc = crc32(crc, (Bytef*)UncompressedBlock, inputLength); + BgzfData::PackUnsignedInt(&buffer[compressedLength - 8], crc); + BgzfData::PackUnsignedInt(&buffer[compressedLength - 4], inputLength); + + // ensure that we have less than a block of data left + int remaining = BlockOffset - inputLength; + if(remaining > 0) { + if(remaining > inputLength) { + printf("ERROR: remainder too large.\n"); + exit(1); + } + memcpy(UncompressedBlock, UncompressedBlock + inputLength, remaining); + } + + BlockOffset = remaining; + return compressedLength; +} + +// flushes the data in the BGZF block +void BgzfData::FlushBlock(void) { + + // flush all of the remaining blocks + while(BlockOffset > 0) { + + // compress the data block + int blockLength = DeflateBlock(); + + // flush the data to our output stream + int numBytesWritten = fwrite(CompressedBlock, 1, blockLength, Stream); + + if(numBytesWritten != blockLength) { + printf("ERROR: Expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten); + exit(1); + } + + BlockAddress += blockLength; + } +} + +// de-compresses the current block +int BgzfData::InflateBlock(const int& blockLength) { + + // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = (Bytef*)CompressedBlock + 18; + zs.avail_in = blockLength - 16; + zs.next_out = (Bytef*)UncompressedBlock; + zs.avail_out = UncompressedBlockSize; + + int status = inflateInit2(&zs, GZIP_WINDOW_BITS); + if (status != Z_OK) { + printf("inflateInit failed\n"); + exit(1); + } + + status = inflate(&zs, Z_FINISH); + if (status != Z_STREAM_END) { + inflateEnd(&zs); + printf("inflate failed\n"); + exit(1); + } + + status = inflateEnd(&zs); + if (status != Z_OK) { + printf("inflateEnd failed\n"); + exit(1); + } + + return zs.total_out; +} + +void BgzfData::Open(const string& filename, const char* mode) { + + if ( strcmp(mode, "rb") == 0 ) { + IsWriteOnly = false; + } else if ( strcmp(mode, "wb") == 0) { + IsWriteOnly = true; + } else { + printf("ERROR: Unknown file mode: %s\n", mode); + exit(1); + } + + Stream = fopen(filename.c_str(), mode); + if(!Stream) { + printf("ERROR: Unable to open the BAM file %s\n", filename.c_str() ); + exit(1); + } + IsOpen = true; +} + +int BgzfData::Read(char* data, const unsigned int dataLength) { + + if (dataLength == 0) { return 0; } + + char* output = data; + unsigned int numBytesRead = 0; + while (numBytesRead < dataLength) { + + int bytesAvailable = BlockLength - BlockOffset; + if (bytesAvailable <= 0) { + if ( ReadBlock() != 0 ) { return -1; } + bytesAvailable = BlockLength - BlockOffset; + if ( bytesAvailable <= 0 ) { break; } + } + + char* buffer = UncompressedBlock; + int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable ); + memcpy(output, buffer + BlockOffset, copyLength); + + BlockOffset += copyLength; + output += copyLength; + numBytesRead += copyLength; + } + + if ( BlockOffset == BlockLength ) { + BlockAddress = ftell(Stream); + BlockOffset = 0; + BlockLength = 0; + } + + return numBytesRead; +} + +int BgzfData::ReadBlock(void) { + + char header[BLOCK_HEADER_LENGTH]; + int64_t blockAddress = ftell(Stream); + + int count = fread(header, 1, sizeof(header), Stream); + if (count == 0) { + BlockLength = 0; + return 0; + } + + if (count != sizeof(header)) { + printf("read block failed - count != sizeof(header)\n"); + return -1; + } + + if (!BgzfData::CheckBlockHeader(header)) { + printf("read block failed - CheckBlockHeader() returned false\n"); + return -1; + } + + int blockLength = BgzfData::UnpackUnsignedShort(&header[16]) + 1; + char* compressedBlock = CompressedBlock; + memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH); + int remaining = blockLength - BLOCK_HEADER_LENGTH; + + count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, Stream); + if (count != remaining) { + printf("read block failed - count != remaining\n"); + return -1; + } + + count = InflateBlock(blockLength); + if (count < 0) { return -1; } + + if ( BlockLength != 0 ) { + BlockOffset = 0; + } + + BlockAddress = blockAddress; + BlockLength = count; + return 0; +} + +bool BgzfData::Seek(int64_t position) { + + int blockOffset = (position & 0xFFFF); + int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL; + + if (fseek(Stream, blockAddress, SEEK_SET) != 0) { + printf("ERROR: Unable to seek in BAM file\n"); + exit(1); + } + + BlockLength = 0; + BlockAddress = blockAddress; + BlockOffset = blockOffset; + return true; +} + +int64_t BgzfData::Tell(void) { + return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) ); +} + +// writes the supplied data into the BGZF buffer +unsigned int BgzfData::Write(const char* data, const unsigned int dataLen) { + + // initialize + unsigned int numBytesWritten = 0; + const char* input = data; + unsigned int blockLength = UncompressedBlockSize; + + // copy the data to the buffer + while(numBytesWritten < dataLen) { + unsigned int copyLength = min(blockLength - BlockOffset, dataLen - numBytesWritten); + char* buffer = UncompressedBlock; + memcpy(buffer + BlockOffset, input, copyLength); + + BlockOffset += copyLength; + input += copyLength; + numBytesWritten += copyLength; + + if(BlockOffset == blockLength) { + FlushBlock(); + } + } + + return numBytesWritten; +} diff --git a/BGZF.h b/BGZF.h new file mode 100644 index 0000000..2896542 --- /dev/null +++ b/BGZF.h @@ -0,0 +1,183 @@ +// *************************************************************************** +// BGZF.h (c) 2009 Derek Barnett, Michael Strömberg +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 8 December 2009 (DB) +// --------------------------------------------------------------------------- +// BGZF routines were adapted from the bgzf.c code developed at the Broad +// Institute. +// --------------------------------------------------------------------------- +// Provides the basic functionality for reading & writing BGZF files +// *************************************************************************** + +#ifndef BGZF_H +#define BGZF_H + +// 'C' includes +#include +#include +#include + +// C++ includes +#include + +// zlib includes +#include "zlib.h" + +// Platform-specific type definitions +#ifdef _MSC_VER + typedef char int8_t; + typedef unsigned char uint8_t; + typedef short int16_t; + typedef unsigned short uint16_t; + typedef int int32_t; + typedef unsigned int uint32_t; + typedef long long int64_t; + typedef unsigned long long uint64_t; +#else + #include +#endif + +namespace BamTools { + +// zlib constants +const int GZIP_ID1 = 31; +const int GZIP_ID2 = 139; +const int CM_DEFLATE = 8; +const int FLG_FEXTRA = 4; +const int OS_UNKNOWN = 255; +const int BGZF_XLEN = 6; +const int BGZF_ID1 = 66; +const int BGZF_ID2 = 67; +const int BGZF_LEN = 2; +const int GZIP_WINDOW_BITS = -15; +const int Z_DEFAULT_MEM_LEVEL = 8; + +// BZGF constants +const int BLOCK_HEADER_LENGTH = 18; +const int BLOCK_FOOTER_LENGTH = 8; +const int MAX_BLOCK_SIZE = 65536; +const int DEFAULT_BLOCK_SIZE = 65536; + +struct BgzfData { + + // data members + unsigned int UncompressedBlockSize; + unsigned int CompressedBlockSize; + unsigned int BlockLength; + unsigned int BlockOffset; + uint64_t BlockAddress; + bool IsOpen; + bool IsWriteOnly; + FILE* Stream; + char* UncompressedBlock; + char* CompressedBlock; + + // constructor & destructor + BgzfData(void); + ~BgzfData(void); + + // closes BGZF file + void Close(void); + // compresses the current block + int DeflateBlock(void); + // flushes the data in the BGZF block + void FlushBlock(void); + // de-compresses the current block + int InflateBlock(const int& blockLength); + // opens the BGZF file for reading (mode is either "rb" for reading, or "wb" for writing + void Open(const std::string& filename, const char* mode); + // reads BGZF data into a byte buffer + int Read(char* data, const unsigned int dataLength); + // reads BGZF block + int ReadBlock(void); + // seek to position in BAM file + bool Seek(int64_t position); + // get file position in BAM file + int64_t Tell(void); + // writes the supplied data into the BGZF buffer + unsigned int Write(const char* data, const unsigned int dataLen); + + // checks BGZF block header + static inline bool CheckBlockHeader(char* header); + // packs an unsigned integer into the specified buffer + static inline void PackUnsignedInt(char* buffer, unsigned int value); + // packs an unsigned short into the specified buffer + static inline void PackUnsignedShort(char* buffer, unsigned short value); + // unpacks a buffer into a signed int + static inline signed int UnpackSignedInt(char* buffer); + // unpacks a buffer into a unsigned int + static inline unsigned int UnpackUnsignedInt(char* buffer); + // unpacks a buffer into a unsigned short + static inline unsigned short UnpackUnsignedShort(char* buffer); +}; + +// ------------------------------------------------------------- + +inline +bool BgzfData::CheckBlockHeader(char* header) { + + return (header[0] == GZIP_ID1 && + header[1] == (char)GZIP_ID2 && + header[2] == Z_DEFLATED && + (header[3] & FLG_FEXTRA) != 0 && + BgzfData::UnpackUnsignedShort(&header[10]) == BGZF_XLEN && + header[12] == BGZF_ID1 && + header[13] == BGZF_ID2 && + BgzfData::UnpackUnsignedShort(&header[14]) == BGZF_LEN ); +} + +// packs an unsigned integer into the specified buffer +inline +void BgzfData::PackUnsignedInt(char* buffer, unsigned int value) { + buffer[0] = (char)value; + buffer[1] = (char)(value >> 8); + buffer[2] = (char)(value >> 16); + buffer[3] = (char)(value >> 24); +} + +// packs an unsigned short into the specified buffer +inline +void BgzfData::PackUnsignedShort(char* buffer, unsigned short value) { + buffer[0] = (char)value; + buffer[1] = (char)(value >> 8); +} + +// unpacks a buffer into a signed int +inline +signed int BgzfData::UnpackSignedInt(char* buffer) { + union { signed int value; unsigned char valueBuffer[sizeof(signed int)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + return un.value; +} + +// unpacks a buffer into an unsigned int +inline +unsigned int BgzfData::UnpackUnsignedInt(char* buffer) { + union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + return un.value; +} + +// unpacks a buffer into an unsigned short +inline +unsigned short BgzfData::UnpackUnsignedShort(char* buffer) { + union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)];} un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + return un.value; +} + +} // namespace BamTools + +#endif // BGZF_H diff --git a/BamAux.h b/BamAux.h index 5784198..d5510d1 100644 --- a/BamAux.h +++ b/BamAux.h @@ -3,343 +3,283 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 1 October 2009 (DB) +// Last modified: 8 December 2009 (DB) // --------------------------------------------------------------------------- -// The BGZF routines were adapted from the bgzf.c code developed at the Broad -// Institute. -// --------------------------------------------------------------------------- -// Defines common constants, typedefs, & data structures for BamTools. +// Provides the basic constants, data structures, etc. for using BAM files // *************************************************************************** -/*! - \file BamAux.h - \brief BamTools constants, typedefs, & data structures -*/ +#ifndef BAMAUX_H +#define BAMAUX_H -#pragma once +// C inclues +#include +#include // C++ includes #include +#include #include #include #include -// C includes -#include -#include -#include - -// Platform-specific type definitions -#ifdef WIN32 - typedef char int8_t; - typedef unsigned char uint8_t; - typedef short int16_t; - typedef unsigned short uint16_t; - typedef int int32_t; - typedef unsigned int uint32_t; - typedef long long int64_t; - typedef unsigned long long uint64_t; -#else - #include -#endif - -//! \namespace BamTools namespace BamTools { - //! \cond - // -------------------------------------------------------------------------------------- - // This section is purely internal and can be excluded from main generated documentation. - - // zlib constants - const int GZIP_ID1 = 31; - const int GZIP_ID2 = 139; - const int CM_DEFLATE = 8; - const int FLG_FEXTRA = 4; - const int OS_UNKNOWN = 255; - const int BGZF_XLEN = 6; - const int BGZF_ID1 = 66; - const int BGZF_ID2 = 67; - const int BGZF_LEN = 2; - const int GZIP_WINDOW_BITS = -15; - const int Z_DEFAULT_MEM_LEVEL = 8; - - // BZGF constants - const int BLOCK_HEADER_LENGTH = 18; - const int BLOCK_FOOTER_LENGTH = 8; - const int MAX_BLOCK_SIZE = 65536; - const int DEFAULT_BLOCK_SIZE = 65536; - - // BAM constants - const unsigned int BAM_CORE_SIZE = 32; - const int BAM_CMATCH = 0; - const int BAM_CINS = 1; - const int BAM_CDEL = 2; - const int BAM_CREF_SKIP = 3; - const int BAM_CSOFT_CLIP = 4; - const int BAM_CHARD_CLIP = 5; - const int BAM_CPAD = 6; - const int BAM_CIGAR_SHIFT = 4; - const int BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1); - - // BAM index constants - const int MAX_BIN = 37450; // =(8^6-1)/7+1 - const int BAM_MIN_CHUNK_GAP = 32768; - const int BAM_LIDX_SHIFT = 14; - - // Explicit variable sizes - const int BT_SIZEOF_INT = 4; - - struct BgzfData { - unsigned int UncompressedBlockSize; - unsigned int CompressedBlockSize; - unsigned int BlockLength; - unsigned int BlockOffset; - uint64_t BlockAddress; - bool IsOpen; - FILE* Stream; - char* UncompressedBlock; - char* CompressedBlock; - - // constructor - BgzfData(void) - : UncompressedBlockSize(DEFAULT_BLOCK_SIZE) - , CompressedBlockSize(MAX_BLOCK_SIZE) - , BlockLength(0) - , BlockOffset(0) - , BlockAddress(0) - , IsOpen(false) - , Stream(NULL) - , UncompressedBlock(NULL) - , CompressedBlock(NULL) - { - try { - CompressedBlock = new char[CompressedBlockSize]; - UncompressedBlock = new char[UncompressedBlockSize]; - } catch( std::bad_alloc& ba ) { - printf("ERROR: Unable to allocate memory for our BGZF object.\n"); - exit(1); - } - } - - // destructor - ~BgzfData(void) { - if(CompressedBlock) delete [] CompressedBlock; - if(UncompressedBlock) delete [] UncompressedBlock; - } - }; - //! \endcond - - // -------------------------------------------------------------------------------------- - // Data structures - - //! \brief Cigar operation data structure - struct CigarOp { - char Type; //!< Operation type (MIDNSHP) - uint32_t Length; //!< Operation length (number of bases) - - }; - - //! Reference sequence data structure - struct RefData { - std::string RefName; //!< Name of reference sequence - unsigned int RefLength; //!< Length of reference sequence - bool RefHasAlignments; //!< True if BAM file contains alignments mapped to reference sequence - - // constructor - RefData(void) - : RefLength(0) - , RefHasAlignments(false) - { } - }; - - //! BAM alignment data structure - struct BamAlignment { - - // Queries against alignment flag - public: - //! Returns true if this read is a PCR duplicate (determined by external app) - bool IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); } - //! Returns true if this read failed quality control (determined by external app) - bool IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); } - //! Returns true if alignment is first mate on read - bool IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); } - //! Returns true if alignment is mapped - bool IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); } - //! Returns true if alignment's mate is mapped - bool IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); } - //! Returns true if alignment's mate mapped to reverse strand - bool IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); } - //! Returns true if alignment part of paired-end read - bool IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); } - //! Returns true if this position is primary alignment (determined by external app) - bool IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); } - //! Returns true if alignment is part of read that satisfied paired-end resolution (determined by external app) - bool IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); } - //! Returns true if alignment mapped to reverse strand - bool IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); } - //! Returns true if alignment is second mate on read - bool IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); } - - public: - /*! - \brief Get alignment's read group text. - - Assigns read group text to readGroup. - - \return True if read group data successfully retrieved. - */ - bool GetReadGroup(std::string& readGroup) const { - - if ( TagData.empty() ) { return false; } - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLen = TagData.size(); - unsigned int numBytesParsed = 0; - - bool foundReadGroupTag = false; - while( numBytesParsed < tagDataLen ) { - - const char* pTagType = pTagData; - const char* pTagStorageType = pTagData + 2; - pTagData += 3; - numBytesParsed += 3; - - // check the current tag - if ( strncmp(pTagType, "RG", 2) == 0 ) { - foundReadGroupTag = true; - break; - } - - // get the storage class and find the next tag - SkipToNextTag( *pTagStorageType, pTagData, numBytesParsed ); - } - - // return if the read group tag was not present - if ( !foundReadGroupTag ) { return false; } - - // assign the read group - const unsigned int readGroupLen = strlen(pTagData); - readGroup.resize(readGroupLen); - memcpy( (char*)readGroup.data(), pTagData, readGroupLen ); - return true; - } - - private: - // skips to the next tag - static void SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) { - switch(storageType) { - - case 'A': - case 'c': - case 'C': - ++numBytesParsed; - ++pTagData; - break; - - case 's': - case 'S': - case 'f': - numBytesParsed += 2; - pTagData += 2; - break; - - case 'i': - case 'I': - numBytesParsed += 4; - pTagData += 4; - break; - - case 'Z': - case 'H': - while(*pTagData) { - ++numBytesParsed; - ++pTagData; - } - break; - - default: - printf("ERROR: Unknown tag storage class encountered: [%c]\n", *pTagData); - exit(1); - } - } - - // Data members - public: - std::string Name; //!< Read name - unsigned int Length; //!< Query length - std::string QueryBases; //!< 'Original' sequence (as reported from sequencing machine) - std::string AlignedBases; //!< 'Aligned' sequence (includes any indels, padding, clipping) - std::string Qualities; //!< FASTQ qualities (ASCII characters, not numeric values) - std::string TagData; //!< Tag data (accessor methods will pull the requested information out) - //unsigned int RefID; //!< ID number for reference sequence - //unsigned int Position; //!< Position (0-based) where alignment starts - signed int RefID; //!< ID number for reference sequence (-1) - signed int Position; //!< Position (0-based) where alignment starts (-1) - unsigned int Bin; //!< Bin in BAM file where this alignment resides - unsigned int MapQuality; //!< Mapping quality score - unsigned int AlignmentFlag; //!< Alignment bit-flag - see Is() methods for available queries - std::vector CigarData; //!< CIGAR operations for this alignment - //unsigned int MateRefID; //!< ID number for reference sequence where alignment's mate was aligned - //unsigned int MatePosition; //!< Position (0-based) where alignment's mate starts - //unsigned int InsertSize; //!< Mate-pair insert size - signed int MateRefID; //!< ID number for reference sequence where alignment's mate was aligned (-1) - signed int MatePosition; //!< Position (0-based) where alignment's mate starts (-1) - signed int InsertSize; //!< Mate-pair insert size(0) - - // Alignment flag query constants - private: - enum { PAIRED = 1, - PROPER_PAIR = 2, - UNMAPPED = 4, - MATE_UNMAPPED = 8, - REVERSE = 16, - MATE_REVERSE = 32, - READ_1 = 64, - READ_2 = 128, - SECONDARY = 256, - QC_FAILED = 512, - DUPLICATE = 1024 - }; - }; - - // ---------------------------------------------------------------- - // Typedefs - - /*! - \typedef RefVector - \brief Vector of RefData objects - */ - typedef std::vector RefVector; - - /*! - \typedef BamAlignmentVector - \brief Vector of BamAlignments - */ - typedef std::vector< BamAlignment > BamAlignmentVector; - - //! \cond - // ---------------------------------------------------------------- - // Typedefs (internal - can exclude from main documentation) - - //Offsets for linear indexing - typedef std::vector LinearOffsetVector; - - // Alignment 'chunk' boundaries - typedef std::pair ChunkPair; - // Vector of alignment 'chunks' - typedef std::vector ChunkVector; - - // BAM bin contains a bin ID & a vector of alignment 'chunks' - typedef std::pair BamBin; - // Vector of BAM bins - typedef std::vector BinVector; - - // Reference sequence index data - typedef std::pair RefIndex; - - // Full BAM file index data structure - typedef std::vector BamIndex; - // ---------------------------------------------------------------- - //! \endcond +// BAM constants +const int BAM_CORE_SIZE = 32; +const int BAM_CMATCH = 0; +const int BAM_CINS = 1; +const int BAM_CDEL = 2; +const int BAM_CREF_SKIP = 3; +const int BAM_CSOFT_CLIP = 4; +const int BAM_CHARD_CLIP = 5; +const int BAM_CPAD = 6; +const int BAM_CIGAR_SHIFT = 4; +const int BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1); + +// BAM index constants +const int MAX_BIN = 37450; // =(8^6-1)/7+1 +const int BAM_MIN_CHUNK_GAP = 32768; +const int BAM_LIDX_SHIFT = 14; + +// Explicit variable sizes +const int BT_SIZEOF_INT = 4; + +struct CigarOp; + +struct BamAlignment { + + // Queries against alignment flag + public: + // Returns true if this read is a PCR duplicate (determined by external app) + bool IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); } + // Returns true if this read failed quality control (determined by external app) + bool IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); } + // Returns true if alignment is first mate on read + bool IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); } + // Returns true if alignment is mapped + bool IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); } + // Returns true if alignment's mate is mapped + bool IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); } + // Returns true if alignment's mate mapped to reverse strand + bool IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); } + // Returns true if alignment part of paired-end read + bool IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); } + // Returns true if this position is primary alignment (determined by external app) + bool IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); } + // Returns true if alignment is part of read that satisfied paired-end resolution (determined by external app) + bool IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); } + // Returns true if alignment mapped to reverse strand + bool IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); } + // Returns true if alignment is second mate on read + bool IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); } + + public: + + // get "RG" tag data + bool GetReadGroup(std::string& readGroup) const { + + if ( TagData.empty() ) { return false; } + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLen = TagData.size(); + unsigned int numBytesParsed = 0; + + bool foundReadGroupTag = false; + while( numBytesParsed < tagDataLen ) { + + const char* pTagType = pTagData; + const char* pTagStorageType = pTagData + 2; + pTagData += 3; + numBytesParsed += 3; + + // check the current tag + if ( std::strncmp(pTagType, "RG", 2) == 0 ) { + foundReadGroupTag = true; + break; + } + + // get the storage class and find the next tag + SkipToNextTag( *pTagStorageType, pTagData, numBytesParsed ); + } + + // return if the read group tag was not present + if ( !foundReadGroupTag ) { return false; } + + // assign the read group + const unsigned int readGroupLen = std::strlen(pTagData); + readGroup.resize(readGroupLen); + std::memcpy( (char*)readGroup.data(), pTagData, readGroupLen ); + return true; + } + + // get "NM" tag data - contributed by Aaron Quinlan + bool GetEditDistance(uint8_t& editDistance) const { + + if ( TagData.empty() ) { return false; } + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLen = TagData.size(); + unsigned int numBytesParsed = 0; + + bool foundEditDistanceTag = false; + while( numBytesParsed < tagDataLen ) { + + const char* pTagType = pTagData; + const char* pTagStorageType = pTagData + 2; + pTagData += 3; + numBytesParsed += 3; + + // check the current tag + if ( strncmp(pTagType, "NM", 2) == 0 ) { + foundEditDistanceTag = true; + break; + } + + // get the storage class and find the next tag + SkipToNextTag( *pTagStorageType, pTagData, numBytesParsed ); + } + // return if the edit distance tag was not present + if ( !foundEditDistanceTag ) { return false; } + + // assign the editDistance value + memcpy(&editDistance, pTagData, 1); + return true; + } + + private: + static void SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) { + switch(storageType) { + + case 'A': + case 'c': + case 'C': + ++numBytesParsed; + ++pTagData; + break; + + case 's': + case 'S': + case 'f': + numBytesParsed += 2; + pTagData += 2; + break; + + case 'i': + case 'I': + numBytesParsed += 4; + pTagData += 4; + break; + + case 'Z': + case 'H': + while(*pTagData) { + ++numBytesParsed; + ++pTagData; + } + break; + + default: + printf("ERROR: Unknown tag storage class encountered: [%c]\n", *pTagData); + exit(1); + } + } + + // Data members + public: + std::string Name; // Read name + int32_t Length; // Query length + std::string QueryBases; // 'Original' sequence (as reported from sequencing machine) + std::string AlignedBases; // 'Aligned' sequence (includes any indels, padding, clipping) + std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values) + std::string TagData; // Tag data (accessor methods will pull the requested information out) + int32_t RefID; // ID number for reference sequence + int32_t Position; // Position (0-based) where alignment starts + uint16_t Bin; // Bin in BAM file where this alignment resides + uint16_t MapQuality; // Mapping quality score + uint32_t AlignmentFlag; // Alignment bit-flag - see Is() methods for available queries + std::vector CigarData; // CIGAR operations for this alignment + int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned + int32_t MatePosition; // Position (0-based) where alignment's mate starts + int32_t InsertSize; // Mate-pair insert size + + // Alignment flag query constants + private: + enum { PAIRED = 1, + PROPER_PAIR = 2, + UNMAPPED = 4, + MATE_UNMAPPED = 8, + REVERSE = 16, + MATE_REVERSE = 32, + READ_1 = 64, + READ_2 = 128, + SECONDARY = 256, + QC_FAILED = 512, + DUPLICATE = 1024 + }; +}; + +// ---------------------------------------------------------------- +// Auxiliary data structs & typedefs + +struct CigarOp { + char Type; // Operation type (MIDNSHP) + uint32_t Length; // Operation length (number of bases) +}; + +struct RefData { + // data members + std::string RefName; // Name of reference sequence + unsigned int RefLength; // Length of reference sequence + bool RefHasAlignments; // True if BAM file contains alignments mapped to reference sequence + // constructor + RefData(void) + : RefLength(0) + , RefHasAlignments(false) + { } +}; + +typedef std::vector RefVector; +typedef std::vector BamAlignmentVector; + +// ---------------------------------------------------------------- +// Indexing structs & typedefs + +struct Chunk { + // data members + uint64_t Start; + uint64_t Stop; + // constructor + Chunk(const uint64_t& start = 0, const uint64_t& stop = 0) + : Start(start) + , Stop(stop) + { } +}; + +inline +bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) { + return lhs.Start < rhs.Start; } + +typedef std::vector ChunkVector; +typedef std::map BamBinMap; +typedef std::vector LinearOffsetVector; + +struct ReferenceIndex { + // data members + BamBinMap Bins; + LinearOffsetVector Offsets; + // constructor + ReferenceIndex(const BamBinMap& binMap = BamBinMap(), + const LinearOffsetVector& offsets = LinearOffsetVector()) + : Bins(binMap) + , Offsets(offsets) + { } +}; + +typedef std::vector BamIndex; + +} // namespace BamTools + +#endif // BAMAUX_H diff --git a/BamReader.cpp b/BamReader.cpp index f2a1e11..386a485 100644 --- a/BamReader.cpp +++ b/BamReader.cpp @@ -1,759 +1,1038 @@ -// *************************************************************************** -// BamReader.cpp (c) 2009 Derek Barnett, Michael Strömberg -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 15 July 2009 (DB) -// --------------------------------------------------------------------------- -// The BGZF routines were adapted from the bgzf.c code developed at the Broad -// Institute. -// --------------------------------------------------------------------------- -// Provides the basic functionality for reading BAM files -// *************************************************************************** - -// BamTools includes -#include "BamReader.h" -using namespace BamTools; -using namespace std; - -// static character constants -const char* BamReader::DNA_LOOKUP = "=ACMGRSVTWYHKDBN"; -const char* BamReader::CIGAR_LOOKUP = "MIDNSHP"; - -// constructor -BamReader::BamReader(void) - : m_BGZF(NULL) - , m_index(NULL) - , m_isIndexLoaded(false) - , m_alignmentsBeginOffset(0) - , m_isRegionSpecified(false) - , m_currentRefID(0) - , m_currentLeft(0) -{ } - -// destructor -BamReader::~BamReader(void) { - Close(); -} - -// checks BGZF block header -bool BamReader::BgzfCheckBlockHeader(char* header) { - - return (header[0] == GZIP_ID1 && - header[1] == (char)GZIP_ID2 && - header[2] == Z_DEFLATED && - (header[3] & FLG_FEXTRA) != 0 && - BgzfUnpackUnsignedShort(&header[10]) == BGZF_XLEN && - header[12] == BGZF_ID1 && - header[13] == BGZF_ID2 && - BgzfUnpackUnsignedShort(&header[14]) == BGZF_LEN - ); -} - -// closes the BAM file -void BamReader::BgzfClose(void) { - fflush(m_BGZF->Stream); - fclose(m_BGZF->Stream); - m_BGZF->IsOpen = false; -} - -// de-compresses the current block -int BamReader::BgzfInflateBlock(int blockLength) { - - // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock - z_stream zs; - zs.zalloc = NULL; - zs.zfree = NULL; - zs.next_in = (Bytef*)m_BGZF->CompressedBlock + 18; - zs.avail_in = blockLength - 16; - zs.next_out = (Bytef*)m_BGZF->UncompressedBlock; - zs.avail_out = m_BGZF->UncompressedBlockSize; - - int status = inflateInit2(&zs, GZIP_WINDOW_BITS); - if (status != Z_OK) { - printf("inflateInit failed\n"); - exit(1); - } - - status = inflate(&zs, Z_FINISH); - if (status != Z_STREAM_END) { - inflateEnd(&zs); - printf("inflate failed\n"); - exit(1); - } - - status = inflateEnd(&zs); - if (status != Z_OK) { - printf("inflateEnd failed\n"); - exit(1); - } - - return zs.total_out; -} - -// opens the BAM file for reading -void BamReader::BgzfOpen(const string& filename) { - - m_BGZF->Stream = fopen(filename.c_str(), "rb"); - if(!m_BGZF->Stream) { - printf("ERROR: Unable to open the BAM file %s for reading.\n", filename.c_str() ); - exit(1); - } - - m_BGZF->IsOpen = true; -} - -// reads BGZF data into buffer -unsigned int BamReader::BgzfRead(char* data, const unsigned int dataLength) { - - if (dataLength == 0) { return 0; } - - char* output = data; - unsigned int numBytesRead = 0; - while (numBytesRead < dataLength) { - - int bytesAvailable = m_BGZF->BlockLength - m_BGZF->BlockOffset; - if (bytesAvailable <= 0) { - if ( BgzfReadBlock() != 0 ) { return -1; } - bytesAvailable = m_BGZF->BlockLength - m_BGZF->BlockOffset; - if ( bytesAvailable <= 0 ) { break; } - } - - char* buffer = m_BGZF->UncompressedBlock; - int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable ); - memcpy(output, buffer + m_BGZF->BlockOffset, copyLength); - - m_BGZF->BlockOffset += copyLength; - output += copyLength; - numBytesRead += copyLength; - } - - if ( m_BGZF->BlockOffset == m_BGZF->BlockLength ) { - m_BGZF->BlockAddress = ftello(m_BGZF->Stream); - m_BGZF->BlockOffset = 0; - m_BGZF->BlockLength = 0; - } - - return numBytesRead; -} - -int BamReader::BgzfReadBlock(void) { - - char header[BLOCK_HEADER_LENGTH]; - int64_t blockAddress = ftello(m_BGZF->Stream); - - int count = fread(header, 1, sizeof(header), m_BGZF->Stream); - if (count == 0) { - m_BGZF->BlockLength = 0; - return 0; - } - - if (count != sizeof(header)) { - printf("read block failed - count != sizeof(header)\n"); - return -1; - } - - if (!BgzfCheckBlockHeader(header)) { - printf("read block failed - CheckBgzfBlockHeader() returned false\n"); - return -1; - } - - int blockLength = BgzfUnpackUnsignedShort(&header[16]) + 1; - char* compressedBlock = m_BGZF->CompressedBlock; - memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH); - int remaining = blockLength - BLOCK_HEADER_LENGTH; - - count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, m_BGZF->Stream); - if (count != remaining) { - printf("read block failed - count != remaining\n"); - return -1; - } - - count = BgzfInflateBlock(blockLength); - if (count < 0) { return -1; } - - if (m_BGZF->BlockLength != 0) { - m_BGZF->BlockOffset = 0; - } - - m_BGZF->BlockAddress = blockAddress; - m_BGZF->BlockLength = count; - return 0; -} - -// move file pointer to specified offset -bool BamReader::BgzfSeek(int64_t position) { - - int blockOffset = (position & 0xFFFF); - int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL; - if (fseeko(m_BGZF->Stream, blockAddress, SEEK_SET) != 0) { - printf("ERROR: Unable to seek in BAM file\n"); - exit(1); - } - - m_BGZF->BlockLength = 0; - m_BGZF->BlockAddress = blockAddress; - m_BGZF->BlockOffset = blockOffset; - return true; -} - -// get file position in BAM file -int64_t BamReader::BgzfTell(void) { - return ( (m_BGZF->BlockAddress << 16) | (m_BGZF->BlockOffset & 0xFFFF) ); -} - -int BamReader::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) { - - // get region boundaries - uint32_t begin = left; - uint32_t end = m_references.at(refID).RefLength - 1; - - // initialize list, bin '0' always a valid bin - int i = 0; - list[i++] = 0; - - // get rest of bins that contain this region - unsigned int k; - for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { list[i++] = k; } - for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { list[i++] = k; } - for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { list[i++] = k; } - for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { list[i++] = k; } - for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; } - - // return number of bins stored - return i; -} - -unsigned int BamReader::CalculateAlignmentEnd(const unsigned int& position, const vector& cigarData) { - - // initialize alignment end to starting position - unsigned int alignEnd = position; - - // iterate over cigar operations - vector::const_iterator cigarIter = cigarData.begin(); - vector::const_iterator cigarEnd = cigarData.end(); - for ( ; cigarIter != cigarEnd; ++cigarIter) { - char cigarType = (*cigarIter).Type; - if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) { - alignEnd += (*cigarIter).Length; - } - } - return alignEnd; -} - -void BamReader::ClearIndex(void) { - - if ( m_index ) { - // iterate over references - vector::iterator refIter = m_index->begin(); - vector::iterator refEnd = m_index->end(); - for ( ; refIter != refEnd; ++refIter) { - RefIndex* aRef = (*refIter); - if ( aRef ) { - // clear out BAM bins - if ( aRef->first ) { - BinVector::iterator binIter = (aRef->first)->begin(); - BinVector::iterator binEnd = (aRef->first)->end(); - for ( ; binIter != binEnd; ++binIter ) { - ChunkVector* chunks = (*binIter).second; - if ( chunks ) { delete chunks; chunks = NULL;} - } - delete aRef->first; - aRef->first = NULL; - } - // clear BAM linear offsets - if ( aRef->second ) { delete aRef->second; aRef->second = NULL; } - delete aRef; - aRef = NULL; - } - } - delete m_index; - m_index = NULL; - } -} - -// closes the BAM file -void BamReader::Close(void) { - - if (m_BGZF!=NULL && m_BGZF->IsOpen) { - BgzfClose(); - delete m_BGZF; - m_BGZF = NULL; - } - ClearIndex(); - m_headerText.clear(); - m_isRegionSpecified = false; -} - -const string BamReader::GetHeaderText(void) const { - return m_headerText; -} - -const int BamReader::GetReferenceCount(void) const { - return m_references.size(); -} - -const RefVector BamReader::GetReferenceData(void) const { - return m_references; -} - -const int BamReader::GetReferenceID(const string& refName) const { - - // retrieve names from reference data - vector refNames; - RefVector::const_iterator refIter = m_references.begin(); - RefVector::const_iterator refEnd = m_references.end(); - for ( ; refIter != refEnd; ++refIter) { - refNames.push_back( (*refIter).RefName ); - } - - // return 'index-of' refName ( if not found, returns refNames.size() ) - return Index( refNames.begin(), refNames.end(), refName ); -} - -// get next alignment (from specified region, if given) -bool BamReader::GetNextAlignment(BamAlignment& bAlignment) { - - // if valid alignment available - if ( LoadNextAlignment(bAlignment) ) { - - // if region not specified, return success - if ( !m_isRegionSpecified ) { return true; } - - // load next alignment until region overlap is found - while ( !IsOverlap(bAlignment) ) { - // if no valid alignment available (likely EOF) return failure - if ( !LoadNextAlignment(bAlignment) ) { return false; } - } - - // return success (alignment found that overlaps region) - return true; - } - - // no valid alignment - else { return false; } -} - -int64_t BamReader::GetOffset(int refID, unsigned int left) { - - // calculate which bins overlap this region - uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2); - int numBins = BinsFromRegion(refID, left, bins); - - // get bins for this reference - RefIndex* refIndex = m_index->at(refID); - BinVector* refBins = refIndex->first; - - // get minimum offset to consider - LinearOffsetVector* linearOffsets = refIndex->second; - uint64_t minOffset = ((left>>BAM_LIDX_SHIFT) >= linearOffsets->size()) ? 0 : linearOffsets->at(left>>BAM_LIDX_SHIFT); - - // store offsets to beginning of alignment 'chunks' - std::vector chunkStarts; - - // reference bin iterators - BinVector::const_iterator binIter; - BinVector::const_iterator binBegin = refBins->begin(); - BinVector::const_iterator binEnd = refBins->end(); - - // store all alignment 'chunk' starts for bins in this region - for (int i = 0; i < numBins; ++i ) { - binIter = lower_bound(binBegin, binEnd, bins[i], LookupKeyCompare() ); - if ( (binIter != binEnd) && ( (*binIter).first == bins[i]) ) { - ChunkVector* binChunks = (*binIter).second; - ChunkVector::const_iterator chunkIter = binChunks->begin(); - ChunkVector::const_iterator chunkEnd = binChunks->end(); - for ( ; chunkIter != chunkEnd; ++chunkIter) { - if ( (*chunkIter).second > minOffset ) { - chunkStarts.push_back( (*chunkIter).first ); - } - } - } - } - - // clean up memory - free(bins); - - // if no alignments found - if ( chunkStarts.empty() ) { return -1; } - - // else return smallest offset for alignment starts - else { return *min_element(chunkStarts.begin(), chunkStarts.end()); } -} - -bool BamReader::IsOverlap(BamAlignment& bAlignment) { - - // if on different reference sequence, quit - if ( bAlignment.RefID != m_currentRefID ) { return false; } - - // read starts after left boundary - if ( bAlignment.Position >= (signed int) m_currentLeft) { return true; } - - // return whether alignment end overlaps left boundary - return ( CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData) >= m_currentLeft ); -} - -bool BamReader::Jump(int refID, unsigned int position) { - - // if index available, and region is valid - if ( (m_index->size() != 0) && m_references.at(refID).RefHasAlignments && (position <= m_references.at(refID).RefLength) ) { - m_currentRefID = refID; - m_currentLeft = position; - m_isRegionSpecified = true; - - int64_t offset = GetOffset(m_currentRefID, m_currentLeft); - if ( offset == -1 ) { return false; } - else { return BgzfSeek(offset); } - } - return false; -} - -void BamReader::LoadHeaderData(void) { - - // check to see if proper BAM header - char buffer[4]; - if (BgzfRead(buffer, 4) != 4) { - printf("Could not read header type\n"); - exit(1); - } - - if (strncmp(buffer, "BAM\001", 4)) { - printf("wrong header type!\n"); - exit(1); - } - - // get BAM header text length - BgzfRead(buffer, 4); - const unsigned int headerTextLength = BgzfUnpackUnsignedInt(buffer); - - // get BAM header text - char* headerText = (char*)calloc(headerTextLength + 1, 1); - BgzfRead(headerText, headerTextLength); - m_headerText = (string)((const char*)headerText); - - // clean up calloc-ed temp variable - free(headerText); -} - -void BamReader::LoadIndexData(FILE* indexStream) { - - // see if index is valid BAM index - char magic[4]; - fread(magic, 1, 4, indexStream); - if (strncmp(magic, "BAI\1", 4)) { - printf("Problem with index file - invalid format.\n"); - fclose(indexStream); - exit(1); - } - - // get number of reference sequences - uint32_t numRefSeqs; - fread(&numRefSeqs, 4, 1, indexStream); - - // intialize BamIndex data structure - m_index = new BamIndex; - m_index->reserve(numRefSeqs); - - // iterate over reference sequences - for (unsigned int i = 0; i < numRefSeqs; ++i) { - - // get number of bins for this reference sequence - int32_t numBins; - fread(&numBins, 4, 1, indexStream); - - if (numBins > 0) { m_references.at(i).RefHasAlignments = true; } - - // intialize BinVector - BinVector* bins = new BinVector; - bins->reserve(numBins); - - // iterate over bins for that reference sequence - for (int j = 0; j < numBins; ++j) { - - // get binID - uint32_t binID; - fread(&binID, 4, 1, indexStream); - - // get number of regionChunks in this bin - uint32_t numChunks; - fread(&numChunks, 4, 1, indexStream); - - // intialize ChunkVector - ChunkVector* regionChunks = new ChunkVector; - regionChunks->reserve(numChunks); - - // iterate over regionChunks in this bin - for (unsigned int k = 0; k < numChunks; ++k) { - - // get chunk boundaries (left, right) - uint64_t left; - uint64_t right; - fread(&left, 8, 1, indexStream); - fread(&right, 8, 1, indexStream); - - // save ChunkPair - regionChunks->push_back( ChunkPair(left, right) ); - } - - // sort chunks for this bin - sort( regionChunks->begin(), regionChunks->end(), LookupKeyCompare() ); - - // save binID, chunkVector for this bin - bins->push_back( BamBin(binID, regionChunks) ); - } - - // sort bins by binID - sort(bins->begin(), bins->end(), LookupKeyCompare() ); - - // load linear index for this reference sequence - - // get number of linear offsets - int32_t numLinearOffsets; - fread(&numLinearOffsets, 4, 1, indexStream); - - // intialize LinearOffsetVector - LinearOffsetVector* linearOffsets = new LinearOffsetVector; - linearOffsets->reserve(numLinearOffsets); - - // iterate over linear offsets for this reference sequeence - for (int j = 0; j < numLinearOffsets; ++j) { - // get a linear offset - uint64_t linearOffset; - fread(&linearOffset, 8, 1, indexStream); - // store linear offset - linearOffsets->push_back(linearOffset); - } - - // sort linear offsets - sort( linearOffsets->begin(), linearOffsets->end() ); - - // store index data for that reference sequence - m_index->push_back( new RefIndex(bins, linearOffsets) ); - } - - // close index file (.bai) and return - fclose(indexStream); -} - -bool BamReader::LoadNextAlignment(BamAlignment& bAlignment) { - - // read in the 'block length' value, make sure it's not zero - char buffer[4]; - BgzfRead(buffer, 4); - const unsigned int blockLength = BgzfUnpackUnsignedInt(buffer); - if ( blockLength == 0 ) { return false; } - - // keep track of bytes read as method progresses - int bytesRead = 4; - - // read in core alignment data, make sure the right size of data was read - char x[BAM_CORE_SIZE]; - if ( BgzfRead(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; } - bytesRead += BAM_CORE_SIZE; - - // set BamAlignment 'core' data and character data lengths - unsigned int tempValue; - unsigned int queryNameLength; - unsigned int numCigarOperations; - unsigned int querySequenceLength; - - //bAlignment.RefID = BgzfUnpackUnsignedInt(&x[0]); - //bAlignment.Position = BgzfUnpackUnsignedInt(&x[4]); - bAlignment.RefID = BgzfUnpackSignedInt(&x[0]); - bAlignment.Position = BgzfUnpackSignedInt(&x[4]); - - tempValue = BgzfUnpackUnsignedInt(&x[8]); - bAlignment.Bin = tempValue >> 16; - bAlignment.MapQuality = tempValue >> 8 & 0xff; - queryNameLength = tempValue & 0xff; - - tempValue = BgzfUnpackUnsignedInt(&x[12]); - bAlignment.AlignmentFlag = tempValue >> 16; - numCigarOperations = tempValue & 0xffff; - - querySequenceLength = BgzfUnpackUnsignedInt(&x[16]); - //bAlignment.MateRefID = BgzfUnpackUnsignedInt(&x[20]); - //bAlignment.MatePosition = BgzfUnpackUnsignedInt(&x[24]); - //bAlignment.InsertSize = BgzfUnpackUnsignedInt(&x[28]); - bAlignment.MateRefID = BgzfUnpackSignedInt(&x[20]); - bAlignment.MatePosition = BgzfUnpackSignedInt(&x[24]); - bAlignment.InsertSize = BgzfUnpackSignedInt(&x[28]); - - // calculate lengths/offsets - const unsigned int dataLength = blockLength - BAM_CORE_SIZE; - const unsigned int cigarDataOffset = queryNameLength; - const unsigned int seqDataOffset = cigarDataOffset + (numCigarOperations * 4); - const unsigned int qualDataOffset = seqDataOffset + (querySequenceLength+1)/2; - const unsigned int tagDataOffset = qualDataOffset + querySequenceLength; - const unsigned int tagDataLen = dataLength - tagDataOffset; - - // set up destination buffers for character data - char* allCharData = (char*)calloc(sizeof(char), dataLength); - uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset); - char* seqData = ((char*)allCharData) + seqDataOffset; - char* qualData = ((char*)allCharData) + qualDataOffset; - char* tagData = ((char*)allCharData) + tagDataOffset; - - // get character data - make sure proper data size was read - if ( BgzfRead(allCharData, dataLength) != dataLength) { return false; } - else { - - bytesRead += dataLength; - - // clear out any previous string data - bAlignment.Name.clear(); - bAlignment.QueryBases.clear(); - bAlignment.Qualities.clear(); - bAlignment.AlignedBases.clear(); - bAlignment.CigarData.clear(); - bAlignment.TagData.clear(); - - // save name - bAlignment.Name = (string)((const char*)(allCharData)); - - // save query sequence - for (unsigned int i = 0; i < querySequenceLength; ++i) { - char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ]; - bAlignment.QueryBases.append( 1, singleBase ); - } - - // save sequence length - bAlignment.Length = bAlignment.QueryBases.length(); - - // save qualities - for (unsigned int i = 0; i < querySequenceLength; ++i) { - char singleQuality = (char)(qualData[i]+33); // conversion from QV to FASTQ character - bAlignment.Qualities.append( 1, singleQuality ); - } - - // save CIGAR-related data; - int k = 0; - for (unsigned int i = 0; i < numCigarOperations; ++i) { - - // build CigarOp struct - CigarOp op; - op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT); - op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ]; - - // save CigarOp - bAlignment.CigarData.push_back(op); - - // build AlignedBases string - switch (op.Type) { - - case ('M') : - case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases - case ('S') : k += op.Length; // for 'S' - skip over query bases - break; - - case ('D') : bAlignment.AlignedBases.append( op.Length, '-' ); // for 'D' - write gap character - break; - - case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character; - break; - - case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence - k += op.Length; - break; - - case ('H') : break; // for 'H' - do nothing, move to next op - - default : printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here - exit(1); - } - } - - // read in the tag data - bAlignment.TagData.resize(tagDataLen); - memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLen); - } - - free(allCharData); - return true; -} - -void BamReader::LoadReferenceData(void) { - - // get number of reference sequences - char buffer[4]; - BgzfRead(buffer, 4); - const unsigned int numberRefSeqs = BgzfUnpackUnsignedInt(buffer); - if (numberRefSeqs == 0) { return; } - m_references.reserve((int)numberRefSeqs); - - // iterate over all references in header - for (unsigned int i = 0; i != numberRefSeqs; ++i) { - - // get length of reference name - BgzfRead(buffer, 4); - const unsigned int refNameLength = BgzfUnpackUnsignedInt(buffer); - char* refName = (char*)calloc(refNameLength, 1); - - // get reference name and reference sequence length - BgzfRead(refName, refNameLength); - BgzfRead(buffer, 4); - const unsigned int refLength = BgzfUnpackUnsignedInt(buffer); - - // store data for reference - RefData aReference; - aReference.RefName = (string)((const char*)refName); - aReference.RefLength = refLength; - m_references.push_back(aReference); - - // clean up calloc-ed temp variable - free(refName); - } -} - -// opens BAM file (and index) -void BamReader::Open(const string& filename, const string& indexFilename) { - - // open the BGZF file for reading, retrieve header text & reference data - m_BGZF = new BgzfData; - BgzfOpen(filename); - LoadHeaderData(); - LoadReferenceData(); - - // store file offset of first alignment - m_alignmentsBeginOffset = BgzfTell(); - - // open index file & load index data (if exists) - OpenIndex(indexFilename); -} - -void BamReader::OpenIndex(const string& indexFilename) { - - // if index file exists - if (!indexFilename.empty()) { - - // open index - FILE* indexStream = fopen(indexFilename.c_str(), "rb"); - - // abort on error - if(!indexStream) { - printf("ERROR: Unable to open the BAM index file %s for reading.\n", indexFilename.c_str() ); - exit(1); - } - - // build up index data structure - LoadIndexData(indexStream); - } -} - -bool BamReader::Rewind(void) { - - // find first reference that has alignments in the BAM file - int refID = 0; - int refCount = m_references.size(); - for ( ; refID < refCount; ++refID ) { - if ( m_references.at(refID).RefHasAlignments ) { break; } - } - - // store default bounds for first alignment - m_currentRefID = refID; - m_currentLeft = 0; - m_isRegionSpecified = false; - - // return success/failure of seek - return BgzfSeek(m_alignmentsBeginOffset); -} +// *************************************************************************** +// BamReader.cpp (c) 2009 Derek Barnett, Michael Strömberg +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 8 December 2009 (DB) +// --------------------------------------------------------------------------- +// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad +// Institute. +// --------------------------------------------------------------------------- +// Provides the basic functionality for reading BAM files +// *************************************************************************** + +// C++ includes +#include +#include +#include +#include + +// BamTools includes +#include "BGZF.h" +#include "BamReader.h" +using namespace BamTools; +using namespace std; + +struct BamReader::BamReaderPrivate { + + // ------------------------------- + // data members + // ------------------------------- + + // general data + BgzfData mBGZF; + string HeaderText; + BamIndex Index; + RefVector References; + bool IsIndexLoaded; + int64_t AlignmentsBeginOffset; + string Filename; + string IndexFilename; + + // user-specified region values + bool IsRegionSpecified; + int CurrentRefID; + unsigned int CurrentLeft; + + // BAM character constants + const char* DNA_LOOKUP; + const char* CIGAR_LOOKUP; + + // ------------------------------- + // constructor & destructor + // ------------------------------- + BamReaderPrivate(void); + ~BamReaderPrivate(void); + + // ------------------------------- + // "public" interface + // ------------------------------- + + // flie operations + void Close(void); + bool Jump(int refID, unsigned int position = 0); + void Open(const string& filename, const string& indexFilename = ""); + bool Rewind(void); + + // access alignment data + bool GetNextAlignment(BamAlignment& bAlignment); + + // access auxiliary data + const string GetHeaderText(void) const; + const int GetReferenceCount(void) const; + const RefVector GetReferenceData(void) const; + const int GetReferenceID(const string& refName) const; + + // index operations + bool CreateIndex(void); + + // ------------------------------- + // internal methods + // ------------------------------- + + // *** reading alignments and auxiliary data *** // + + // calculate bins that overlap region ( left to reference end for now ) + int BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]); + // calculates alignment end position based on starting position and provided CIGAR operations + unsigned int CalculateAlignmentEnd(const unsigned int& position, const std::vector& cigarData); + // calculate file offset for first alignment chunk overlapping 'left' + int64_t GetOffset(int refID, unsigned int left); + // checks to see if alignment overlaps current region + bool IsOverlap(BamAlignment& bAlignment); + // retrieves header text from BAM file + void LoadHeaderData(void); + // retrieves BAM alignment under file pointer + bool LoadNextAlignment(BamAlignment& bAlignment); + // builds reference data structure from BAM file + void LoadReferenceData(void); + + // *** index file handling *** // + + // calculates index for BAM file + bool BuildIndex(void); + // clear out inernal index data structure + void ClearIndex(void); + // saves BAM bin entry for index + void InsertBinEntry(BamBinMap& binMap, const uint32_t& saveBin, const uint64_t& saveOffset, const uint64_t& lastOffset); + // saves linear offset entry for index + void InsertLinearOffset(LinearOffsetVector& offsets, const BamAlignment& bAlignment, const uint64_t& lastOffset); + // loads index from BAM index file + bool LoadIndex(void); + // simplifies index by merging 'chunks' + void MergeChunks(void); + // round-up 32-bit integer to next power-of-2 + void Roundup32(int& value); + // saves index to BAM index file + bool WriteIndex(void); +}; + +// ----------------------------------------------------- +// BamReader implementation (wrapper around BRPrivate) +// ----------------------------------------------------- + +// constructor +BamReader::BamReader(void) { + d = new BamReaderPrivate; +} + +// destructor +BamReader::~BamReader(void) { + delete d; + d = 0; +} + +// file operations +void BamReader::Close(void) { d->Close(); } +bool BamReader::Jump(int refID, unsigned int position) { return d->Jump(refID, position); } +void BamReader::Open(const string& filename, const string& indexFilename) { d->Open(filename, indexFilename); } +bool BamReader::Rewind(void) { return d->Rewind(); } + +// access alignment data +bool BamReader::GetNextAlignment(BamAlignment& bAlignment) { return d->GetNextAlignment(bAlignment); } + +// access auxiliary data +const string BamReader::GetHeaderText(void) const { return d->HeaderText; } +const int BamReader::GetReferenceCount(void) const { return d->References.size(); } +const RefVector BamReader::GetReferenceData(void) const { return d->References; } +const int BamReader::GetReferenceID(const string& refName) const { return d->GetReferenceID(refName); } + +// index operations +bool BamReader::CreateIndex(void) { return d->CreateIndex(); } + +// ----------------------------------------------------- +// BamReaderPrivate implementation +// ----------------------------------------------------- + +// constructor +BamReader::BamReaderPrivate::BamReaderPrivate(void) + : IsIndexLoaded(false) + , AlignmentsBeginOffset(0) + , IsRegionSpecified(false) + , CurrentRefID(0) + , CurrentLeft(0) + , DNA_LOOKUP("=ACMGRSVTWYHKDBN") + , CIGAR_LOOKUP("MIDNSHP") +{ } + +// destructor +BamReader::BamReaderPrivate::~BamReaderPrivate(void) { + Close(); +} + +// calculate bins that overlap region ( left to reference end for now ) +int BamReader::BamReaderPrivate::BinsFromRegion(int refID, unsigned int left, uint16_t list[MAX_BIN]) { + + // get region boundaries + uint32_t begin = left; + uint32_t end = References.at(refID).RefLength - 1; + + // initialize list, bin '0' always a valid bin + int i = 0; + list[i++] = 0; + + // get rest of bins that contain this region + unsigned int k; + for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { list[i++] = k; } + for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { list[i++] = k; } + for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { list[i++] = k; } + for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { list[i++] = k; } + for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; } + + // return number of bins stored + return i; +} + +// populates BAM index data structure from BAM file data +bool BamReader::BamReaderPrivate::BuildIndex(void) { + + // check to be sure file is open + if (!mBGZF.IsOpen) { return false; } + + // move file pointer to beginning of alignments + Rewind(); + + // get reference count, reserve index space + int numReferences = References.size(); + for ( int i = 0; i < numReferences; ++i ) { + Index.push_back(ReferenceIndex()); + } + + // sets default constant for bin, ID, offset, coordinate variables + const uint32_t defaultValue = 0xffffffffu; + + // bin data + uint32_t saveBin(defaultValue); + uint32_t lastBin(defaultValue); + + // reference ID data + int32_t saveRefID(defaultValue); + int32_t lastRefID(defaultValue); + + // offset data + uint64_t saveOffset = mBGZF.Tell(); + uint64_t lastOffset = saveOffset; + + // coordinate data + int32_t lastCoordinate = defaultValue; + + BamAlignment bAlignment; + while( GetNextAlignment(bAlignment) ) { + + // change of chromosome, save ID, reset bin + if ( lastRefID != bAlignment.RefID ) { + lastRefID = bAlignment.RefID; + lastBin = defaultValue; + } + + // if lastCoordinate greater than BAM position - file not sorted properly + else if ( lastCoordinate > bAlignment.Position ) { + printf("BAM file not properly sorted:\n"); + printf("Alignment %s : %u > %u on reference (id = %d)", bAlignment.Name.c_str(), lastCoordinate, bAlignment.Position, bAlignment.RefID); + exit(1); + } + + // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions) + if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) { + + // save linear offset entry (matched to BAM entry refID) + ReferenceIndex& refIndex = Index.at(bAlignment.RefID); + LinearOffsetVector& offsets = refIndex.Offsets; + InsertLinearOffset(offsets, bAlignment, lastOffset); + } + + // if current BamAlignment bin != lastBin, "then possibly write the binning index" + if ( bAlignment.Bin != lastBin ) { + + // if not first time through + if ( saveBin != defaultValue ) { + + // save Bam bin entry + ReferenceIndex& refIndex = Index.at(saveRefID); + BamBinMap& binMap = refIndex.Bins; + InsertBinEntry(binMap, saveBin, saveOffset, lastOffset); + } + + // update saveOffset + saveOffset = lastOffset; + + // update bin values + saveBin = bAlignment.Bin; + lastBin = bAlignment.Bin; + + // update saveRefID + saveRefID = bAlignment.RefID; + + // if invalid RefID, break out (why?) + if ( saveRefID < 0 ) { break; } + } + + // make sure that current file pointer is beyond lastOffset + if ( mBGZF.Tell() <= (int64_t)lastOffset ) { + printf("Error in BGZF offsets.\n"); + exit(1); + } + + // update lastOffset + lastOffset = mBGZF.Tell(); + + // update lastCoordinate + lastCoordinate = bAlignment.Position; + } + + // save any leftover BAM data (as long as refID is valid) + if ( saveRefID >= 0 ) { + // save Bam bin entry + ReferenceIndex& refIndex = Index.at(saveRefID); + BamBinMap& binMap = refIndex.Bins; + InsertBinEntry(binMap, saveBin, saveOffset, lastOffset); + } + + // simplify index by merging chunks + MergeChunks(); + + // iterate over references + BamIndex::iterator indexIter = Index.begin(); + BamIndex::iterator indexEnd = Index.end(); + for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) { + + // get reference index data + ReferenceIndex& refIndex = (*indexIter); + BamBinMap& binMap = refIndex.Bins; + LinearOffsetVector& offsets = refIndex.Offsets; + + // store whether reference has alignments or no + References[i].RefHasAlignments = ( binMap.size() > 0 ); + + // sort linear offsets + sort(offsets.begin(), offsets.end()); + } + + + // rewind file pointer to beginning of alignments, return success/fail + return Rewind(); +} + +// calculates alignment end position based on starting position and provided CIGAR operations +unsigned int BamReader::BamReaderPrivate::CalculateAlignmentEnd(const unsigned int& position, const vector& cigarData) { + + // initialize alignment end to starting position + unsigned int alignEnd = position; + + // iterate over cigar operations + vector::const_iterator cigarIter = cigarData.begin(); + vector::const_iterator cigarEnd = cigarData.end(); + for ( ; cigarIter != cigarEnd; ++cigarIter) { + char cigarType = (*cigarIter).Type; + if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) { + alignEnd += (*cigarIter).Length; + } + } + return alignEnd; +} + + +// clear index data structure +void BamReader::BamReaderPrivate::ClearIndex(void) { + Index.clear(); // sufficient ?? +} + +// closes the BAM file +void BamReader::BamReaderPrivate::Close(void) { + mBGZF.Close(); + ClearIndex(); + HeaderText.clear(); + IsRegionSpecified = false; +} + +// create BAM index from BAM file (keep structure in memory) and write to default index output file +bool BamReader::BamReaderPrivate::CreateIndex(void) { + + // clear out index + ClearIndex(); + + bool ok = true; + ok &= BuildIndex(); + ok &= WriteIndex(); + return ok; +} + +// returns RefID for given RefName (returns References.size() if not found) +const int BamReader::BamReaderPrivate::GetReferenceID(const string& refName) const { + + // retrieve names from reference data + vector refNames; + RefVector::const_iterator refIter = References.begin(); + RefVector::const_iterator refEnd = References.end(); + for ( ; refIter != refEnd; ++refIter) { + refNames.push_back( (*refIter).RefName ); + } + + // return 'index-of' refName ( if not found, returns refNames.size() ) + return distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName)); +} + +// get next alignment (from specified region, if given) +bool BamReader::BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) { + + // if valid alignment available + if ( LoadNextAlignment(bAlignment) ) { + + // if region not specified, return success + if ( !IsRegionSpecified ) { return true; } + + // load next alignment until region overlap is found + while ( !IsOverlap(bAlignment) ) { + // if no valid alignment available (likely EOF) return failure + if ( !LoadNextAlignment(bAlignment) ) { return false; } + } + + // return success (alignment found that overlaps region) + return true; + } + + // no valid alignment + else { return false; } +} + +// calculate closest indexed file offset for region specified +int64_t BamReader::BamReaderPrivate::GetOffset(int refID, unsigned int left) { + + // calculate which bins overlap this region + uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2); + int numBins = BinsFromRegion(refID, left, bins); + + // get bins for this reference + const ReferenceIndex& refIndex = Index.at(refID); + const BamBinMap& binMap = refIndex.Bins; + + // get minimum offset to consider + const LinearOffsetVector& offsets = refIndex.Offsets; + uint64_t minOffset = ( (left>>BAM_LIDX_SHIFT) >= offsets.size() ) ? 0 : offsets.at(left>>BAM_LIDX_SHIFT); + + // store offsets to beginning of alignment 'chunks' + std::vector chunkStarts; + + // store all alignment 'chunk' starts for bins in this region + for (int i = 0; i < numBins; ++i ) { + uint16_t binKey = bins[i]; + + map::const_iterator binIter = binMap.find(binKey); + if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) { + + const ChunkVector& chunks = (*binIter).second; + std::vector::const_iterator chunksIter = chunks.begin(); + std::vector::const_iterator chunksEnd = chunks.end(); + for ( ; chunksIter != chunksEnd; ++chunksIter) { + const Chunk& chunk = (*chunksIter); + if ( chunk.Stop > minOffset ) { + chunkStarts.push_back( chunk.Start ); + } + } + } + } + + // clean up memory + free(bins); + + // if no alignments found, else return smallest offset for alignment starts + if ( chunkStarts.size() == 0 ) { return -1; } + else { return *min_element(chunkStarts.begin(), chunkStarts.end()); } +} + +// saves BAM bin entry for index +void BamReader::BamReaderPrivate::InsertBinEntry(BamBinMap& binMap, + const uint32_t& saveBin, + const uint64_t& saveOffset, + const uint64_t& lastOffset) +{ + // look up saveBin + BamBinMap::iterator binIter = binMap.find(saveBin); + + // create new chunk + Chunk newChunk(saveOffset, lastOffset); + + // if entry doesn't exist + if ( binIter == binMap.end() ) { + ChunkVector newChunks; + newChunks.push_back(newChunk); + binMap.insert( pair(saveBin, newChunks)); + } + + // otherwise + else { + ChunkVector& binChunks = (*binIter).second; + binChunks.push_back( newChunk ); + } +} + +// saves linear offset entry for index +void BamReader::BamReaderPrivate::InsertLinearOffset(LinearOffsetVector& offsets, + const BamAlignment& bAlignment, + const uint64_t& lastOffset) +{ + // get converted offsets + int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT; + int endOffset = ( CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData) - 1) >> BAM_LIDX_SHIFT; + + // resize vector if necessary + int oldSize = offsets.size(); + int newSize = endOffset + 1; + if ( oldSize < newSize ) { + Roundup32(newSize); + offsets.resize(newSize, 0); + } + + // store offset + for(int i = beginOffset + 1; i <= endOffset ; ++i) { + if ( offsets[i] == 0) { + offsets[i] = lastOffset; + } + } +} + +// returns whether alignment overlaps currently specified region (refID, leftBound) +bool BamReader::BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) { + + // if on different reference sequence, quit + if ( bAlignment.RefID != CurrentRefID ) { return false; } + + // read starts after left boundary + if ( bAlignment.Position >= (int32_t)CurrentLeft) { return true; } + + // return whether alignment end overlaps left boundary + return ( CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData) >= CurrentLeft ); +} + +// jumps to specified region(refID, leftBound) in BAM file, returns success/fail +bool BamReader::BamReaderPrivate::Jump(int refID, unsigned int position) { + + // if data exists for this reference and position is valid + if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) { + + // set current region + CurrentRefID = refID; + CurrentLeft = position; + IsRegionSpecified = true; + + // calculate offset + int64_t offset = GetOffset(CurrentRefID, CurrentLeft); + + // if in valid offset, return failure + if ( offset == -1 ) { return false; } + + // otherwise return success of seek operation + else { return mBGZF.Seek(offset); } + } + + // invalid jump request parameters, return failure + return false; +} + +// load BAM header data +void BamReader::BamReaderPrivate::LoadHeaderData(void) { + + // check to see if proper BAM header + char buffer[4]; + if (mBGZF.Read(buffer, 4) != 4) { + printf("Could not read header type\n"); + exit(1); + } + + if (strncmp(buffer, "BAM\001", 4)) { + printf("wrong header type!\n"); + exit(1); + } + + // get BAM header text length + mBGZF.Read(buffer, 4); + const unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer); + + // get BAM header text + char* headerText = (char*)calloc(headerTextLength + 1, 1); + mBGZF.Read(headerText, headerTextLength); + HeaderText = (string)((const char*)headerText); + + // clean up calloc-ed temp variable + free(headerText); +} + +// load existing index data from BAM index file (".bai"), return success/fail +bool BamReader::BamReaderPrivate::LoadIndex(void) { + + // clear out index data + ClearIndex(); + + // skip if index file empty + if ( IndexFilename.empty() ) { return false; } + + // open index file, abort on error + FILE* indexStream = fopen(IndexFilename.c_str(), "rb"); + if(!indexStream) { + printf("ERROR: Unable to open the BAM index file %s for reading.\n", IndexFilename.c_str() ); + return false; + } + + // see if index is valid BAM index + char magic[4]; + fread(magic, 1, 4, indexStream); + if (strncmp(magic, "BAI\1", 4)) { + printf("Problem with index file - invalid format.\n"); + fclose(indexStream); + return false; + } + + // get number of reference sequences + uint32_t numRefSeqs; + fread(&numRefSeqs, 4, 1, indexStream); + + // intialize space for BamIndex data structure + Index.reserve(numRefSeqs); + + // iterate over reference sequences + for (unsigned int i = 0; i < numRefSeqs; ++i) { + + // get number of bins for this reference sequence + int32_t numBins; + fread(&numBins, 4, 1, indexStream); + + if (numBins > 0) { + RefData& refEntry = References[i]; + refEntry.RefHasAlignments = true; + } + + // intialize BinVector + BamBinMap binMap; + + // iterate over bins for that reference sequence + for (int j = 0; j < numBins; ++j) { + + // get binID + uint32_t binID; + fread(&binID, 4, 1, indexStream); + + // get number of regionChunks in this bin + uint32_t numChunks; + fread(&numChunks, 4, 1, indexStream); + + // intialize ChunkVector + ChunkVector regionChunks; + regionChunks.reserve(numChunks); + + // iterate over regionChunks in this bin + for (unsigned int k = 0; k < numChunks; ++k) { + + // get chunk boundaries (left, right) + uint64_t left; + uint64_t right; + fread(&left, 8, 1, indexStream); + fread(&right, 8, 1, indexStream); + + // save ChunkPair + regionChunks.push_back( Chunk(left, right) ); + } + + // sort chunks for this bin + sort( regionChunks.begin(), regionChunks.end(), ChunkLessThan ); + + // save binID, chunkVector for this bin + binMap.insert( pair(binID, regionChunks) ); + } + + // load linear index for this reference sequence + + // get number of linear offsets + int32_t numLinearOffsets; + fread(&numLinearOffsets, 4, 1, indexStream); + + // intialize LinearOffsetVector + LinearOffsetVector offsets; + offsets.reserve(numLinearOffsets); + + // iterate over linear offsets for this reference sequeence + uint64_t linearOffset; + for (int j = 0; j < numLinearOffsets; ++j) { + // read a linear offset & store + fread(&linearOffset, 8, 1, indexStream); + offsets.push_back(linearOffset); + } + + // sort linear offsets + sort( offsets.begin(), offsets.end() ); + + // store index data for that reference sequence + Index.push_back( ReferenceIndex(binMap, offsets) ); + } + + // close index file (.bai) and return + fclose(indexStream); + return true; +} + +// populates BamAlignment with alignment data under file pointer, returns success/fail +bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) { + + // read in the 'block length' value, make sure it's not zero + char buffer[4]; + mBGZF.Read(buffer, 4); + const unsigned int blockLength = BgzfData::UnpackUnsignedInt(buffer); + if ( blockLength == 0 ) { return false; } + + // keep track of bytes read as method progresses + int bytesRead = 4; + + // read in core alignment data, make sure the right size of data was read + char x[BAM_CORE_SIZE]; + if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; } + bytesRead += BAM_CORE_SIZE; + + // set BamAlignment 'core' data and character data lengths + unsigned int tempValue; + unsigned int queryNameLength; + unsigned int numCigarOperations; + unsigned int querySequenceLength; + + bAlignment.RefID = BgzfData::UnpackSignedInt(&x[0]); + bAlignment.Position = BgzfData::UnpackSignedInt(&x[4]); + + tempValue = BgzfData::UnpackUnsignedInt(&x[8]); + bAlignment.Bin = tempValue >> 16; + bAlignment.MapQuality = tempValue >> 8 & 0xff; + queryNameLength = tempValue & 0xff; + + tempValue = BgzfData::UnpackUnsignedInt(&x[12]); + bAlignment.AlignmentFlag = tempValue >> 16; + numCigarOperations = tempValue & 0xffff; + + querySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]); + bAlignment.MateRefID = BgzfData::UnpackSignedInt(&x[20]); + bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]); + bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[28]); + + // calculate lengths/offsets + const unsigned int dataLength = blockLength - BAM_CORE_SIZE; + const unsigned int cigarDataOffset = queryNameLength; + const unsigned int seqDataOffset = cigarDataOffset + (numCigarOperations * 4); + const unsigned int qualDataOffset = seqDataOffset + (querySequenceLength+1)/2; + const unsigned int tagDataOffset = qualDataOffset + querySequenceLength; + const unsigned int tagDataLen = dataLength - tagDataOffset; + + // set up destination buffers for character data + char* allCharData = (char*)calloc(sizeof(char), dataLength); + uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset); + char* seqData = ((char*)allCharData) + seqDataOffset; + char* qualData = ((char*)allCharData) + qualDataOffset; + char* tagData = ((char*)allCharData) + tagDataOffset; + + // get character data - make sure proper data size was read + if ( mBGZF.Read(allCharData, dataLength) != (signed int)dataLength) { return false; } + else { + + bytesRead += dataLength; + + // clear out any previous string data + bAlignment.Name.clear(); + bAlignment.QueryBases.clear(); + bAlignment.Qualities.clear(); + bAlignment.AlignedBases.clear(); + bAlignment.CigarData.clear(); + bAlignment.TagData.clear(); + + // save name + bAlignment.Name = (string)((const char*)(allCharData)); + + // save query sequence + for (unsigned int i = 0; i < querySequenceLength; ++i) { + char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ]; + bAlignment.QueryBases.append( 1, singleBase ); + } + + // save sequence length + bAlignment.Length = bAlignment.QueryBases.length(); + + // save qualities, convert from numeric QV to FASTQ character + for (unsigned int i = 0; i < querySequenceLength; ++i) { + char singleQuality = (char)(qualData[i]+33); + bAlignment.Qualities.append( 1, singleQuality ); + } + + // save CIGAR-related data; + int k = 0; + for (unsigned int i = 0; i < numCigarOperations; ++i) { + + // build CigarOp struct + CigarOp op; + op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT); + op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ]; + + // save CigarOp + bAlignment.CigarData.push_back(op); + + // build AlignedBases string + switch (op.Type) { + + case ('M') : + case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases + case ('S') : k += op.Length; // for 'S' - skip over query bases + break; + + case ('D') : bAlignment.AlignedBases.append( op.Length, '-' ); // for 'D' - write gap character + break; + + case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character; + break; + + case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence + k += op.Length; + break; + + case ('H') : break; // for 'H' - do nothing, move to next op + + default : printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here + exit(1); + } + } + + // read in the tag data + bAlignment.TagData.resize(tagDataLen); + memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLen); + } + + free(allCharData); + return true; +} + +// loads reference data from BAM file +void BamReader::BamReaderPrivate::LoadReferenceData(void) { + + // get number of reference sequences + char buffer[4]; + mBGZF.Read(buffer, 4); + const unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer); + if (numberRefSeqs == 0) { return; } + References.reserve((int)numberRefSeqs); + + // iterate over all references in header + for (unsigned int i = 0; i != numberRefSeqs; ++i) { + + // get length of reference name + mBGZF.Read(buffer, 4); + const unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer); + char* refName = (char*)calloc(refNameLength, 1); + + // get reference name and reference sequence length + mBGZF.Read(refName, refNameLength); + mBGZF.Read(buffer, 4); + const unsigned int refLength = BgzfData::UnpackUnsignedInt(buffer); + + // store data for reference + RefData aReference; + aReference.RefName = (string)((const char*)refName); + aReference.RefLength = refLength; + References.push_back(aReference); + + // clean up calloc-ed temp variable + free(refName); + } +} + +// merges 'alignment chunks' in BAM bin (used for index building) +void BamReader::BamReaderPrivate::MergeChunks(void) { + + // iterate over reference enties + BamIndex::iterator indexIter = Index.begin(); + BamIndex::iterator indexEnd = Index.end(); + for ( ; indexIter != indexEnd; ++indexIter ) { + + // get BAM bin map for this reference + ReferenceIndex& refIndex = (*indexIter); + BamBinMap& bamBinMap = refIndex.Bins; + + // iterate over BAM bins + BamBinMap::iterator binIter = bamBinMap.begin(); + BamBinMap::iterator binEnd = bamBinMap.end(); + for ( ; binIter != binEnd; ++binIter ) { + + // get chunk vector for this bin + ChunkVector& binChunks = (*binIter).second; + if ( binChunks.size() == 0 ) { continue; } + + ChunkVector mergedChunks; + mergedChunks.push_back( binChunks[0] ); + + // iterate over chunks + int i = 0; + ChunkVector::iterator chunkIter = binChunks.begin(); + ChunkVector::iterator chunkEnd = binChunks.end(); + for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) { + + // get 'currentChunk' based on numeric index + Chunk& currentChunk = mergedChunks[i]; + + // get iteratorChunk based on vector iterator + Chunk& iteratorChunk = (*chunkIter); + + // if currentChunk.Stop(shifted) == iterator Chunk.Start(shifted) + if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 ) { + + // set currentChunk.Stop to iteratorChunk.Stop + currentChunk.Stop = iteratorChunk.Stop; + } + + // otherwise + else { + // set currentChunk + 1 to iteratorChunk + mergedChunks.push_back(iteratorChunk); + ++i; + } + } + + // saved merged chunk vector + (*binIter).second = mergedChunks; + } + } +} + +// opens BAM file (and index) +void BamReader::BamReaderPrivate::Open(const string& filename, const string& indexFilename) { + + Filename = filename; + IndexFilename = indexFilename; + + // open the BGZF file for reading, retrieve header text & reference data + mBGZF.Open(filename, "rb"); + LoadHeaderData(); + LoadReferenceData(); + + // store file offset of first alignment + AlignmentsBeginOffset = mBGZF.Tell(); + + // open index file & load index data (if exists) + if ( !IndexFilename.empty() ) { + LoadIndex(); + } +} + +// returns BAM file pointer to beginning of alignment data +bool BamReader::BamReaderPrivate::Rewind(void) { + + // find first reference that has alignments in the BAM file + int refID = 0; + int refCount = References.size(); + for ( ; refID < refCount; ++refID ) { + if ( References.at(refID).RefHasAlignments ) { break; } + } + + // store default bounds for first alignment + CurrentRefID = refID; + CurrentLeft = 0; + IsRegionSpecified = false; + + // return success/failure of seek + return mBGZF.Seek(AlignmentsBeginOffset); +} + +// rounds value up to next power-of-2 (used in index building) +void BamReader::BamReaderPrivate::Roundup32(int& value) { + --value; + value |= value >> 1; + value |= value >> 2; + value |= value >> 4; + value |= value >> 8; + value |= value >> 16; + ++value; +} + +// saves index data to BAM index file (".bai"), returns success/fail +bool BamReader::BamReaderPrivate::WriteIndex(void) { + + IndexFilename = Filename + ".bai"; + FILE* indexStream = fopen(IndexFilename.c_str(), "wb"); + if ( indexStream == 0 ) { + printf("ERROR: Could not open file to save index\n"); + return false; + } + + // write BAM index header + fwrite("BAI\1", 1, 4, indexStream); + + // write number of reference sequences + int32_t numReferenceSeqs = Index.size(); + fwrite(&numReferenceSeqs, 4, 1, indexStream); + + // iterate over reference sequences + BamIndex::const_iterator indexIter = Index.begin(); + BamIndex::const_iterator indexEnd = Index.end(); + for ( ; indexIter != indexEnd; ++ indexIter ) { + + // get reference index data + const ReferenceIndex& refIndex = (*indexIter); + const BamBinMap& binMap = refIndex.Bins; + const LinearOffsetVector& offsets = refIndex.Offsets; + + // write number of bins + int32_t binCount = binMap.size(); + fwrite(&binCount, 4, 1, indexStream); + + // iterate over bins + BamBinMap::const_iterator binIter = binMap.begin(); + BamBinMap::const_iterator binEnd = binMap.end(); + for ( ; binIter != binEnd; ++binIter ) { + + // get bin data (key and chunk vector) + const uint32_t& binKey = (*binIter).first; + const ChunkVector& binChunks = (*binIter).second; + + // save BAM bin key + fwrite(&binKey, 4, 1, indexStream); + + // save chunk count + int32_t chunkCount = binChunks.size(); + fwrite(&chunkCount, 4, 1, indexStream); + + // iterate over chunks + ChunkVector::const_iterator chunkIter = binChunks.begin(); + ChunkVector::const_iterator chunkEnd = binChunks.end(); + for ( ; chunkIter != chunkEnd; ++chunkIter ) { + + // get current chunk data + const Chunk& chunk = (*chunkIter); + const uint64_t& start = chunk.Start; + const uint64_t& stop = chunk.Stop; + + // save chunk offsets + fwrite(&start, 8, 1, indexStream); + fwrite(&stop, 8, 1, indexStream); + } + } + + // write linear offsets size + int32_t offsetSize = offsets.size(); + fwrite(&offsetSize, 4, 1, indexStream); + + // iterate over linear offsets + LinearOffsetVector::const_iterator offsetIter = offsets.begin(); + LinearOffsetVector::const_iterator offsetEnd = offsets.end(); + for ( ; offsetIter != offsetEnd; ++offsetIter ) { + + // write linear offset value + const uint64_t& linearOffset = (*offsetIter); + fwrite(&linearOffset, 8, 1, indexStream); + } + } + + // flush buffer, close file, and return success + fflush(indexStream); + fclose(indexStream); + return true; +} diff --git a/BamReader.h b/BamReader.h index ed8397d..2587b00 100644 --- a/BamReader.h +++ b/BamReader.h @@ -1,286 +1,83 @@ -// *************************************************************************** -// BamReader.h (c) 2009 Derek Barnett, Michael Strömberg -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 24 June 2009 (DB) -// --------------------------------------------------------------------------- -// The BGZF routines were adapted from the bgzf.c code developed at the Broad -// Institute. -// --------------------------------------------------------------------------- -// Provides the basic functionality for reading BAM files -// *************************************************************************** - -/*! - \file BamReader.h - \brief API for reading BAM files. -*/ - -#pragma once - -// C++ includes -#include -#include -#include -#include -#include - -// zlib includes -#include - -// BamTools includes -#include "BamAux.h" - -namespace BamTools { - - //! API for reading BAM files. - class BamReader { - - public: - - //! Constructor - BamReader(void); - - //! Destructor - ~BamReader(void); - - public: - - /*! - \brief Closes the BAM file. - - Also closes index file and clears index data, if provided. - - \sa Open() - */ - void Close(void); - - /*! - \brief Access SAM format header data. - - See SAM format documentation for detailed header description. - - \return Full header text (no parsing of tags) - */ - const std::string GetHeaderText(void) const; - - /*! - \brief Retrieve next alignment. - - Stores result in bAlignment. - - If reference and position are specified by a prior call to Jump(), this method stores the next aligmment that either: - a) overlaps, or - b) begins on/after that specified position. - - Otherwise this simply stores next alignment, if one exists. - - Note that this method does not specifiy a 'right bound' position. - If a range is desired, you should supply some stopping criteria. For example: - - \code - BamReader bReader; - bReader.Open(bamFile, bamIndexfile); - if ( bReader.Jump( someID, somePosition ) { - BamAlignment bAlignment; - while ( bReader.GetNextAlignment(bAlignment) && (bAlignment.Position <= someUpperBound) ) { - // do something - } - } - \endcode - - \param bAlignment destination for alignment data - \return success/failure - \sa Jump(), Rewind() - */ - bool GetNextAlignment(BamAlignment& bAlignment); - - /*! - \brief Get number of reference sequences in BAM file. - \return Number of references - \sa GetReferenceData(), GetReferenceID() - */ - const int GetReferenceCount(void) const; - - /*! - \brief Access reference data. - \return Vector of RefData entry - \sa GetReferenceCount(), GetReferenceID() - */ - const RefVector GetReferenceData(void) const; - - /*! - \brief Get reference ID from name. - \param refName name of reference sequence - \return reference ID number - \sa GetReferenceCount(), GetReferenceData() - */ - const int GetReferenceID(const std::string& refName) const; - - /*! - \brief Random access in BAM file. - - Jump to a specified position on reference sequence. - Position is optional - defaults to beginning of specified reference. - - Reference and position are stored for use by subsequent calls to GetNextAlignment(). - - \param refID ID number of desired reference - \param position left-bound position - \return success/failure - \sa GetNextAlignment(), Rewind() - */ - bool Jump(int refID, unsigned int position = 0); - - /*! - \brief Opens a BAM file. - - Index file is optional - sequential reading through a BAM file does not require an index. - - However, the index is required to perform random-access alignment retrival (using the Jump() method ). - - See SAMtools documentation for BAM index generation. - - \param filename BAM file - \param indexFilename BAM index file - \sa Jump(), Close() - */ - void Open(const std::string& filename, const std::string& indexFilename = ""); - - /*! - \brief Moves file pointer to beginning of alignment data. - - A subsequent call to GetNextAlignment() would retrieve the first alignment in the BAM file. - Clears any reference and position set by a prior call to Jump() - - \return success/failure - \sa GetNextAlignment(), Jump() - */ - bool Rewind(void); - - // -------------------------------------------------------------------------------------- - // internal methods - private: - // checks BGZF block header - bool BgzfCheckBlockHeader(char* header); - // closes the BAM file - void BgzfClose(void); - // de-compresses the current block - int BgzfInflateBlock(int blockLength); - // opens the BAM file for reading - void BgzfOpen(const std::string& filename); - // reads BGZF data into a byte buffer - unsigned int BgzfRead(char* data, const unsigned int dataLen); - // reads BGZF block - int BgzfReadBlock(void); - // seek to position in BAM file - bool BgzfSeek(int64_t position); - // get file position in BAM file - int64_t BgzfTell(void); - // unpacks a buffer into an unsigned int - static inline unsigned int BgzfUnpackUnsignedInt(char* buffer); - // unpacks a buffer into an unsigned short - static inline unsigned short BgzfUnpackUnsignedShort(char* buffer); - // unpacks a buffer into a signed int - static inline signed int BgzfUnpackSignedInt(char* buffer); - // calculate bins that overlap region ( left to reference end for now ) - int BinsFromRegion(int, unsigned int, uint16_t[MAX_BIN]); - // calculates alignment end position based on starting position and provided CIGAR operations - unsigned int CalculateAlignmentEnd(const unsigned int& position, const std::vector& cigarData); - // clear out (delete pointers in) index data structure - void ClearIndex(void); - // calculate file offset for first alignment chunk overlapping 'left' - int64_t GetOffset(int refID, unsigned int left); - // checks to see if alignment overlaps current region - bool IsOverlap(BamAlignment& bAlignment); - // retrieves header text from BAM file - void LoadHeaderData(void); - // builds BamIndex data structure from BAM index file - void LoadIndexData(FILE* indexStream); - // retrieves BAM alignment under file pointer - bool LoadNextAlignment(BamAlignment& bAlignment); - // builds reference data structure from BAM file - void LoadReferenceData(void); - // open BAM index file (if successful, loads index) - void OpenIndex(const std::string& indexFilename); - - // aligment file & index file data - private: - BgzfData* m_BGZF; - std::string m_headerText; - BamIndex* m_index; - RefVector m_references; - bool m_isIndexLoaded; - int64_t m_alignmentsBeginOffset; - - // user-specified region values - private: - bool m_isRegionSpecified; - int m_currentRefID; - unsigned int m_currentLeft; - - // BAM character constants - private: - static const char* DNA_LOOKUP; - static const char* CIGAR_LOOKUP; - }; - - //! \cond - // -------------------------------------------------------------------------------------- - // static inline methods (internal - can exclude from main documentation) - - // unpacks a buffer into an unsigned int - inline unsigned int BamReader::BgzfUnpackUnsignedInt(char* buffer) { - union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - return un.value; - } - - // unpacks a buffer into an unsigned short - inline unsigned short BamReader::BgzfUnpackUnsignedShort(char* buffer) { - union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)];} un; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - return un.value; - } - - // unpacks a buffer into a signed int - inline signed int BamReader::BgzfUnpackSignedInt(char* buffer) { - union { signed int value; unsigned char valueBuffer[sizeof(signed int)]; } un; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - return un.value; - } - - // -------------------------------------------------------------------------------------- - // template classes/methods (internal - can exclude from main documentation) - - // allows sorting/searching of a vector of pairs (instead of using maps) - template - class LookupKeyCompare { - - typedef std::pair< Key, Value > LookupData; - typedef typename LookupData::first_type Key_t; - - public: - bool operator() (const LookupData& lhs, const LookupData& rhs) const { return keyLess(lhs.first, rhs.first); } - bool operator() (const LookupData& lhs, const Key_t& k) const { return keyLess(lhs.first, k); } - bool operator() (const Key_t& k, const LookupData& rhs) const { return keyLess(k, rhs.first); } - private: - bool keyLess(const Key_t& k1, const Key_t& k2) const { return k1 < k2; } - }; - - // return index of item if found, else return container.size() - template < typename InputIterator, typename EqualityComparable > - typename std::iterator_traits::difference_type - Index(const InputIterator& begin, const InputIterator& end, const EqualityComparable& item) { - return std::distance(begin, std::find(begin, end, item)); - } - //! \endcond - -} +// *************************************************************************** +// BamReader.h (c) 2009 Derek Barnett, Michael Strömberg +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 8 December 2009 (DB) +// --------------------------------------------------------------------------- +// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad +// Institute. +// --------------------------------------------------------------------------- +// Provides the basic functionality for reading BAM files +// *************************************************************************** + +#ifndef BAMREADER_H +#define BAMREADER_H + +// C++ includes +#include + +// BamTools includes +#include "BamAux.h" + +namespace BamTools { + +class BamReader { + + // constructor / destructor + public: + BamReader(void); + ~BamReader(void); + + // public interface + public: + + // ---------------------- + // BAM file operations + // ---------------------- + + // close BAM file + void Close(void); + // performs random-access jump to reference, position + bool Jump(int refID, unsigned int position = 0); + // opens BAM file (and optional BAM index file, if provided) + void Open(const std::string& filename, const std::string& indexFilename = ""); + // returns file pointer to beginning of alignments + bool Rewind(void); + + // ---------------------- + // access alignment data + // ---------------------- + + // retrieves next available alignment (returns success/fail) + bool GetNextAlignment(BamAlignment& bAlignment); + + // ---------------------- + // access auxiliary data + // ---------------------- + + // returns SAM header text + const std::string GetHeaderText(void) const; + // returns number of reference sequences + const int GetReferenceCount(void) const; + // returns vector of reference objects + const BamTools::RefVector GetReferenceData(void) const; + // returns reference id (used for BamReader::Jump()) for the given reference name + const int GetReferenceID(const std::string& refName) const; + + // ---------------------- + // BAM index operations + // ---------------------- + + // creates index for BAM file, saves to file (default = bamFilename + ".bai") + bool CreateIndex(void); + + // private implementation + private: + struct BamReaderPrivate; + BamReaderPrivate* d; +}; + +} // namespace BamTools + +#endif // BAMREADER_H diff --git a/BamWriter.cpp b/BamWriter.cpp index 597bae6..c834d45 100644 --- a/BamWriter.cpp +++ b/BamWriter.cpp @@ -1,404 +1,284 @@ -// *************************************************************************** -// BamWriter.cpp (c) 2009 Michael Strömberg, Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 24 June 2009 (DB) -// --------------------------------------------------------------------------- -// The BGZF routines were adapted from the bgzf.c code developed at the Broad -// Institute. -// --------------------------------------------------------------------------- -// Provides the basic functionality for producing BAM files -// *************************************************************************** - -#include "BamWriter.h" -using namespace BamTools; -using namespace std; - -// constructor -BamWriter::BamWriter(void) -{} - -// destructor -BamWriter::~BamWriter(void) { - if(mBGZF.IsOpen) BgzfClose(); -} - -// closes the BAM file -void BamWriter::BgzfClose(void) { - - mBGZF.IsOpen = false; - - // flush the BGZF block - BgzfFlushBlock(); - - // flush and close - fflush(mBGZF.Stream); - fclose(mBGZF.Stream); -} - -// compresses the current block -int BamWriter::BgzfDeflateBlock(void) { - - // initialize the gzip header - char* buffer = mBGZF.CompressedBlock; - unsigned int bufferSize = mBGZF.CompressedBlockSize; - - memset(buffer, 0, 18); - buffer[0] = GZIP_ID1; - buffer[1] = (char)GZIP_ID2; - buffer[2] = CM_DEFLATE; - buffer[3] = FLG_FEXTRA; - buffer[9] = (char)OS_UNKNOWN; - buffer[10] = BGZF_XLEN; - buffer[12] = BGZF_ID1; - buffer[13] = BGZF_ID2; - buffer[14] = BGZF_LEN; - - // loop to retry for blocks that do not compress enough - int inputLength = mBGZF.BlockOffset; - int compressedLength = 0; - - while(true) { - - z_stream zs; - zs.zalloc = NULL; - zs.zfree = NULL; - zs.next_in = (Bytef*)mBGZF.UncompressedBlock; - zs.avail_in = inputLength; - zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH]; - zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; - - // initialize the zlib compression algorithm - if(deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK) { - printf("ERROR: zlib deflate initialization failed.\n"); - exit(1); - } - - // compress the data - int status = deflate(&zs, Z_FINISH); - if(status != Z_STREAM_END) { - deflateEnd(&zs); - - // reduce the input length and try again - if(status == Z_OK) { - inputLength -= 1024; - if(inputLength < 0) { - printf("ERROR: input reduction failed.\n"); - exit(1); - } - continue; - } - - printf("ERROR: zlib deflate failed.\n"); - exit(1); - } - - // finalize the compression routine - if(deflateEnd(&zs) != Z_OK) { - printf("ERROR: deflate end failed.\n"); - exit(1); - } - - compressedLength = zs.total_out; - compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; - - if(compressedLength > MAX_BLOCK_SIZE) { - printf("ERROR: deflate overflow.\n"); - exit(1); - } - - break; - } - - // store the compressed length - BgzfPackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1)); - - // store the CRC32 checksum - unsigned int crc = crc32(0, NULL, 0); - crc = crc32(crc, (Bytef*)mBGZF.UncompressedBlock, inputLength); - BgzfPackUnsignedInt(&buffer[compressedLength - 8], crc); - BgzfPackUnsignedInt(&buffer[compressedLength - 4], inputLength); - - // ensure that we have less than a block of data left - int remaining = mBGZF.BlockOffset - inputLength; - if(remaining > 0) { - if(remaining > inputLength) { - printf("ERROR: remainder too large.\n"); - exit(1); - } - - memcpy(mBGZF.UncompressedBlock, mBGZF.UncompressedBlock + inputLength, remaining); - } - - mBGZF.BlockOffset = remaining; - return compressedLength; -} - -// flushes the data in the BGZF block -void BamWriter::BgzfFlushBlock(void) { - - // flush all of the remaining blocks - while(mBGZF.BlockOffset > 0) { - - // compress the data block - int blockLength = BgzfDeflateBlock(); - - // flush the data to our output stream - int numBytesWritten = fwrite(mBGZF.CompressedBlock, 1, blockLength, mBGZF.Stream); - - if(numBytesWritten != blockLength) { - printf("ERROR: Expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten); - exit(1); - } - - mBGZF.BlockAddress += blockLength; - } -} - -// opens the BAM file for writing -void BamWriter::BgzfOpen(const string& filename) { - - mBGZF.Stream = fopen(filename.c_str(), "wb"); - - if(!mBGZF.Stream) { - printf("ERROR: Unable to open the BAM file (%s) for writing.\n", filename.c_str()); - exit(1); - } - - mBGZF.IsOpen = true; -} - -// writes the supplied data into the BGZF buffer -unsigned int BamWriter::BgzfWrite(const char* data, const unsigned int dataLen) { - - // initialize - unsigned int numBytesWritten = 0; - const char* input = data; - unsigned int blockLength = mBGZF.UncompressedBlockSize; - - // copy the data to the buffer - while(numBytesWritten < dataLen) { - unsigned int copyLength = min(blockLength - mBGZF.BlockOffset, dataLen - numBytesWritten); - char* buffer = mBGZF.UncompressedBlock; - memcpy(buffer + mBGZF.BlockOffset, input, copyLength); - - mBGZF.BlockOffset += copyLength; - input += copyLength; - numBytesWritten += copyLength; - - if(mBGZF.BlockOffset == blockLength) BgzfFlushBlock(); - } - - return numBytesWritten; -} - -// closes the alignment archive -void BamWriter::Close(void) { - if(mBGZF.IsOpen) BgzfClose(); -} - -// creates a cigar string from the supplied alignment -void BamWriter::CreatePackedCigar(const vector& cigarOperations, string& packedCigar) { - - // initialize - const unsigned int numCigarOperations = cigarOperations.size(); - packedCigar.resize(numCigarOperations * BT_SIZEOF_INT); - - // pack the cigar data into the string - unsigned int* pPackedCigar = (unsigned int*)packedCigar.data(); - - unsigned int cigarOp; - vector::const_iterator coIter; - for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); coIter++) { - - switch(coIter->Type) { - case 'M': - cigarOp = BAM_CMATCH; - break; - case 'I': - cigarOp = BAM_CINS; - break; - case 'D': - cigarOp = BAM_CDEL; - break; - case 'N': - cigarOp = BAM_CREF_SKIP; - break; - case 'S': - cigarOp = BAM_CSOFT_CLIP; - break; - case 'H': - cigarOp = BAM_CHARD_CLIP; - break; - case 'P': - cigarOp = BAM_CPAD; - break; - default: - printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type); - exit(1); - } - - *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp; - pPackedCigar++; - } -} - -// encodes the supplied query sequence into 4-bit notation -void BamWriter::EncodeQuerySequence(const string& query, string& encodedQuery) { - - // prepare the encoded query string - const unsigned int queryLen = query.size(); - const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5); - encodedQuery.resize(encodedQueryLen); - char* pEncodedQuery = (char*)encodedQuery.data(); - const char* pQuery = (const char*)query.data(); - - unsigned char nucleotideCode; - bool useHighWord = true; - - while(*pQuery) { - - switch(*pQuery) { - case '=': - nucleotideCode = 0; - break; - case 'A': - nucleotideCode = 1; - break; - case 'C': - nucleotideCode = 2; - break; - case 'G': - nucleotideCode = 4; - break; - case 'T': - nucleotideCode = 8; - break; - case 'N': - nucleotideCode = 15; - break; - default: - printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery); - exit(1); - } - - // pack the nucleotide code - if(useHighWord) { - *pEncodedQuery = nucleotideCode << 4; - useHighWord = false; - } else { - *pEncodedQuery |= nucleotideCode; - pEncodedQuery++; - useHighWord = true; - } - - // increment the query position - pQuery++; - } -} - -// opens the alignment archive -void BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) { - - // open the BGZF file for writing - BgzfOpen(filename); - - // ================ - // write the header - // ================ - - // write the BAM signature - const unsigned char SIGNATURE_LENGTH = 4; - const char* BAM_SIGNATURE = "BAM\1"; - BgzfWrite(BAM_SIGNATURE, SIGNATURE_LENGTH); - - // write the SAM header text length - const unsigned int samHeaderLen = samHeader.size(); - BgzfWrite((char*)&samHeaderLen, BT_SIZEOF_INT); - - // write the SAM header text - if(samHeaderLen > 0) BgzfWrite(samHeader.data(), samHeaderLen); - - // write the number of reference sequences - const unsigned int numReferenceSequences = referenceSequences.size(); - BgzfWrite((char*)&numReferenceSequences, BT_SIZEOF_INT); - - // ============================= - // write the sequence dictionary - // ============================= - - RefVector::const_iterator rsIter; - for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) { - - // write the reference sequence name length - const unsigned int referenceSequenceNameLen = rsIter->RefName.size() + 1; - BgzfWrite((char*)&referenceSequenceNameLen, BT_SIZEOF_INT); - - // write the reference sequence name - BgzfWrite(rsIter->RefName.c_str(), referenceSequenceNameLen); - - // write the reference sequence length - BgzfWrite((char*)&rsIter->RefLength, BT_SIZEOF_INT); - } -} - -// saves the alignment to the alignment archive -void BamWriter::SaveAlignment(const BamAlignment& al) { - - // initialize - const unsigned int nameLen = al.Name.size() + 1; - const unsigned int queryLen = al.QueryBases.size(); - const unsigned int numCigarOperations = al.CigarData.size(); - - // create our packed cigar string - string packedCigar; - CreatePackedCigar(al.CigarData, packedCigar); - const unsigned int packedCigarLen = packedCigar.size(); - - // encode the query - string encodedQuery; - EncodeQuerySequence(al.QueryBases, encodedQuery); - const unsigned int encodedQueryLen = encodedQuery.size(); - - // store the tag data length - const unsigned int tagDataLength = al.TagData.size() + 1; - - // assign the BAM core data - unsigned int buffer[8]; - buffer[0] = al.RefID; - buffer[1] = al.Position; - buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | nameLen; - buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations; - buffer[4] = queryLen; - buffer[5] = al.MateRefID; - buffer[6] = al.MatePosition; - buffer[7] = al.InsertSize; - - // write the block size - const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen + tagDataLength; - const unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize; - - BgzfWrite((char*)&blockSize, BT_SIZEOF_INT); - - // write the BAM core - BgzfWrite((char*)&buffer, BAM_CORE_SIZE); - - // write the query name - BgzfWrite(al.Name.c_str(), nameLen); - - // write the packed cigar - BgzfWrite(packedCigar.data(), packedCigarLen); - - // write the encoded query sequence - BgzfWrite(encodedQuery.data(), encodedQueryLen); - - // write the base qualities - string baseQualities = al.Qualities; - char* pBaseQualities = (char*)al.Qualities.data(); - for(unsigned int i = 0; i < queryLen; i++) pBaseQualities[i] -= 33; - BgzfWrite(pBaseQualities, queryLen); - - // write the read group tag - BgzfWrite(al.TagData.data(), tagDataLength); -} +// *************************************************************************** +// BamWriter.cpp (c) 2009 Michael Strömberg, Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 8 December 2009 (DB) +// --------------------------------------------------------------------------- +// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad +// Institute. +// --------------------------------------------------------------------------- +// Provides the basic functionality for producing BAM files +// *************************************************************************** + +// BGZF includes +#include "BGZF.h" +#include "BamWriter.h" +using namespace BamTools; +using namespace std; + +struct BamWriter::BamWriterPrivate { + + // data members + BgzfData mBGZF; + + // constructor / destructor + BamWriterPrivate(void) { } + ~BamWriterPrivate(void) { + mBGZF.Close(); + } + + // "public" interface + void Close(void); + void Open(const std::string& filename, const std::string& samHeader, const BamTools::RefVector& referenceSequences); + void SaveAlignment(const BamTools::BamAlignment& al); + + // internal methods + void CreatePackedCigar(const std::vector& cigarOperations, std::string& packedCigar); + void EncodeQuerySequence(const std::string& query, std::string& encodedQuery); +}; + +// ----------------------------------------------------- +// BamWriter implementation +// ----------------------------------------------------- + +// constructor +BamWriter::BamWriter(void) { + d = new BamWriterPrivate; +} + +// destructor +BamWriter::~BamWriter(void) { + delete d; + d = 0; +} + +// closes the alignment archive +void BamWriter::Close(void) { + d->Close(); +} + +// opens the alignment archive +void BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) { + d->Open(filename, samHeader, referenceSequences); +} + +// saves the alignment to the alignment archive +void BamWriter::SaveAlignment(const BamAlignment& al) { + d->SaveAlignment(al); +} + +// ----------------------------------------------------- +// BamWriterPrivate implementation +// ----------------------------------------------------- + +// closes the alignment archive +void BamWriter::BamWriterPrivate::Close(void) { + mBGZF.Close(); +} + +// creates a cigar string from the supplied alignment +void BamWriter::BamWriterPrivate::CreatePackedCigar(const vector& cigarOperations, string& packedCigar) { + + // initialize + const unsigned int numCigarOperations = cigarOperations.size(); + packedCigar.resize(numCigarOperations * BT_SIZEOF_INT); + + // pack the cigar data into the string + unsigned int* pPackedCigar = (unsigned int*)packedCigar.data(); + + unsigned int cigarOp; + vector::const_iterator coIter; + for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); coIter++) { + + switch(coIter->Type) { + case 'M': + cigarOp = BAM_CMATCH; + break; + case 'I': + cigarOp = BAM_CINS; + break; + case 'D': + cigarOp = BAM_CDEL; + break; + case 'N': + cigarOp = BAM_CREF_SKIP; + break; + case 'S': + cigarOp = BAM_CSOFT_CLIP; + break; + case 'H': + cigarOp = BAM_CHARD_CLIP; + break; + case 'P': + cigarOp = BAM_CPAD; + break; + default: + printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type); + exit(1); + } + + *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp; + pPackedCigar++; + } +} + +// encodes the supplied query sequence into 4-bit notation +void BamWriter::BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) { + + // prepare the encoded query string + const unsigned int queryLen = query.size(); + const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5); + encodedQuery.resize(encodedQueryLen); + char* pEncodedQuery = (char*)encodedQuery.data(); + const char* pQuery = (const char*)query.data(); + + unsigned char nucleotideCode; + bool useHighWord = true; + + while(*pQuery) { + + switch(*pQuery) { + case '=': + nucleotideCode = 0; + break; + case 'A': + nucleotideCode = 1; + break; + case 'C': + nucleotideCode = 2; + break; + case 'G': + nucleotideCode = 4; + break; + case 'T': + nucleotideCode = 8; + break; + case 'N': + nucleotideCode = 15; + break; + default: + printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery); + exit(1); + } + + // pack the nucleotide code + if(useHighWord) { + *pEncodedQuery = nucleotideCode << 4; + useHighWord = false; + } else { + *pEncodedQuery |= nucleotideCode; + pEncodedQuery++; + useHighWord = true; + } + + // increment the query position + pQuery++; + } +} + +// opens the alignment archive +void BamWriter::BamWriterPrivate::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) { + + // open the BGZF file for writing + mBGZF.Open(filename, "wb"); + + // ================ + // write the header + // ================ + + // write the BAM signature + const unsigned char SIGNATURE_LENGTH = 4; + const char* BAM_SIGNATURE = "BAM\1"; + mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH); + + // write the SAM header text length + const unsigned int samHeaderLen = samHeader.size(); + mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT); + + // write the SAM header text + if(samHeaderLen > 0) { + mBGZF.Write(samHeader.data(), samHeaderLen); + } + + // write the number of reference sequences + const unsigned int numReferenceSequences = referenceSequences.size(); + mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT); + + // ============================= + // write the sequence dictionary + // ============================= + + RefVector::const_iterator rsIter; + for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) { + + // write the reference sequence name length + const unsigned int referenceSequenceNameLen = rsIter->RefName.size() + 1; + mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT); + + // write the reference sequence name + mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen); + + // write the reference sequence length + mBGZF.Write((char*)&rsIter->RefLength, BT_SIZEOF_INT); + } +} + +// saves the alignment to the alignment archive +void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) { + + // initialize + const unsigned int nameLen = al.Name.size() + 1; + const unsigned int queryLen = al.QueryBases.size(); + const unsigned int numCigarOperations = al.CigarData.size(); + + // create our packed cigar string + string packedCigar; + CreatePackedCigar(al.CigarData, packedCigar); + const unsigned int packedCigarLen = packedCigar.size(); + + // encode the query + string encodedQuery; + EncodeQuerySequence(al.QueryBases, encodedQuery); + const unsigned int encodedQueryLen = encodedQuery.size(); + + // store the tag data length + const unsigned int tagDataLength = al.TagData.size() + 1; + + // assign the BAM core data + unsigned int buffer[8]; + buffer[0] = al.RefID; + buffer[1] = al.Position; + buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | nameLen; + buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations; + buffer[4] = queryLen; + buffer[5] = al.MateRefID; + buffer[6] = al.MatePosition; + buffer[7] = al.InsertSize; + + // write the block size + const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen + tagDataLength; + const unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize; + mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); + + // write the BAM core + mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); + + // write the query name + mBGZF.Write(al.Name.c_str(), nameLen); + + // write the packed cigar + mBGZF.Write(packedCigar.data(), packedCigarLen); + + // write the encoded query sequence + mBGZF.Write(encodedQuery.data(), encodedQueryLen); + + // write the base qualities + string baseQualities = al.Qualities; + char* pBaseQualities = (char*)al.Qualities.data(); + for(unsigned int i = 0; i < queryLen; i++) { pBaseQualities[i] -= 33; } + mBGZF.Write(pBaseQualities, queryLen); + + // write the read group tag + mBGZF.Write(al.TagData.data(), tagDataLength); +} diff --git a/BamWriter.h b/BamWriter.h index 5294dc9..14de8b5 100644 --- a/BamWriter.h +++ b/BamWriter.h @@ -1,114 +1,49 @@ -// *************************************************************************** -// BamWriter.h (c) 2009 Michael Strömberg, Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 24 June 2009 (DB) -// --------------------------------------------------------------------------- -// The BGZF routines were adapted from the bgzf.c code developed at the Broad -// Institute. -// --------------------------------------------------------------------------- -// Provides the basic functionality for producing BAM files -// *************************************************************************** - -/*! - \file BamWriter.h - \brief API for writing BAM files. -*/ - -#pragma once - -// C++ includes -#include -#include - -// zlib includes -#include - -// BamTools includes -#include "BamAux.h" - -namespace BamTools { - - //! API for writing BAM files. - class BamWriter { - - public: - - //! Constructor. - BamWriter(void); - - //! Destructor. - ~BamWriter(void); - - public: - - /*! - \brief Closes the alignment archive - \sa Open() - */ - void Close(void); - - /*! - \brief Opens the alignment archive - \param filename output BAM file - \param samHeader SAM-format header text - \param referenceSequences Reference sequence data - \sa Close() - */ - void Open(const std::string& filename, const std::string& samHeader, const RefVector& referenceSequences); - - /*! - \brief Saves an alignment to the archive - \param al The BamAlignment to be saved - */ - void SaveAlignment(const BamAlignment& al); - - // -------------------------------------------------------------------------------------- - // internal methods - private: - // closes the BAM file - void BgzfClose(void); - // compresses the current block - int BgzfDeflateBlock(void); - // flushes the data in the BGZF block - void BgzfFlushBlock(void); - // opens the BAM file for writing - void BgzfOpen(const std::string& filename); - // packs an unsigned integer into the specified buffer - static inline void BgzfPackUnsignedInt(char* buffer, unsigned int value); - // packs an unsigned short into the specified buffer - static inline void BgzfPackUnsignedShort(char* buffer, unsigned short value); - // writes the supplied data into the BGZF buffer - unsigned int BgzfWrite(const char* data, const unsigned int dataLen); - // calculates the minimum bin that contains a region [begin, end) - static inline unsigned int CalculateMinimumBin(unsigned int begin, unsigned int end); - // creates a packed cigar string from the supplied alignment - static void CreatePackedCigar(const std::vector& cigarOperations, std::string& packedCigar); - // encodes the supplied query sequence into 4-bit notation - static void EncodeQuerySequence(const std::string& query, std::string& encodedQuery); - // our BGZF output object - BgzfData mBGZF; - }; - - //! \cond - // -------------------------------------------------------------------------------------- - // static inline methods (internal - can exclude from main documentation) - - // packs an unsigned integer into the specified buffer - inline void BamWriter::BgzfPackUnsignedInt(char* buffer, unsigned int value) { - buffer[0] = (char)value; - buffer[1] = (char)(value >> 8); - buffer[2] = (char)(value >> 16); - buffer[3] = (char)(value >> 24); - } - - // packs an unsigned short into the specified buffer - inline void BamWriter::BgzfPackUnsignedShort(char* buffer, unsigned short value) { - buffer[0] = (char)value; - buffer[1] = (char)(value >> 8); - } - // -------------------------------------------------------------------------------------- - //! \endcond - -} // end BamTools namespace \ No newline at end of file +// *************************************************************************** +// BamWriter.h (c) 2009 Michael Strömberg, Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 8 December 2009 (DB) +// --------------------------------------------------------------------------- +// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad +// Institute. +// --------------------------------------------------------------------------- +// Provides the basic functionality for producing BAM files +// *************************************************************************** + +#ifndef BAMWRITER_H +#define BAMWRITER_H + +// C++ includes +#include + +// BamTools includes +#include "BamAux.h" + +namespace BamTools { + +class BamWriter { + + // constructor/destructor + public: + BamWriter(void); + ~BamWriter(void); + + // public interface + public: + // closes the alignment archive + void Close(void); + // opens the alignment archive + void Open(const std::string& filename, const std::string& samHeader, const BamTools::RefVector& referenceSequences); + // saves the alignment to the alignment archive + void SaveAlignment(const BamTools::BamAlignment& al); + + // private implementation + private: + struct BamWriterPrivate; + BamWriterPrivate* d; +}; + +} // namespace BamTools + +#endif // BAMWRITER_H -- 2.39.5