From: Derek Date: Wed, 18 Aug 2010 19:19:08 +0000 (-0400) Subject: Reorganized source tree & build system X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=0180d7256f5c781364b04559b39ed2db41585787;p=bamtools.git Reorganized source tree & build system --- diff --git a/BGZF.cpp b/BGZF.cpp deleted file mode 100644 index 92afb96..0000000 --- a/BGZF.cpp +++ /dev/null @@ -1,398 +0,0 @@ -// *************************************************************************** -// BGZF.cpp (c) 2009 Derek Barnett, Michael Str�mberg -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 16 August 2010 (DB) -// --------------------------------------------------------------------------- -// BGZF routines were adapted from the bgzf.c code developed at the Broad -// Institute. -// --------------------------------------------------------------------------- -// Provides the basic functionality for reading & writing BGZF files -// *************************************************************************** - -#include -#include "BGZF.h" -using namespace BamTools; -using std::string; -using std::min; - -BgzfData::BgzfData(void) - : UncompressedBlockSize(DEFAULT_BLOCK_SIZE) - , CompressedBlockSize(MAX_BLOCK_SIZE) - , BlockLength(0) - , BlockOffset(0) - , BlockAddress(0) - , IsOpen(false) - , IsWriteOnly(false) - , IsWriteUncompressed(false) - , Stream(NULL) - , UncompressedBlock(NULL) - , CompressedBlock(NULL) -{ - try { - CompressedBlock = new char[CompressedBlockSize]; - UncompressedBlock = new char[UncompressedBlockSize]; - } catch( std::bad_alloc& ba ) { - printf("BGZF ERROR: unable to allocate memory for our BGZF object.\n"); - exit(1); - } -} - -// destructor -BgzfData::~BgzfData(void) { - if( CompressedBlock ) delete[] CompressedBlock; - if( UncompressedBlock ) delete[] UncompressedBlock; -} - -// closes BGZF file -void BgzfData::Close(void) { - - // skip if file not open, otherwise set flag - if ( !IsOpen ) return; - - // if writing to file, flush the current BGZF block, - // then write an empty block (as EOF marker) - if ( IsWriteOnly ) { - FlushBlock(); - int blockLength = DeflateBlock(); - fwrite(CompressedBlock, 1, blockLength, Stream); - } - - // flush and close - fflush(Stream); - fclose(Stream); - IsWriteUncompressed = false; - IsOpen = false; -} - -// compresses the current block -int BgzfData::DeflateBlock(void) { - - // initialize the gzip header - char* buffer = CompressedBlock; - memset(buffer, 0, 18); - buffer[0] = GZIP_ID1; - buffer[1] = (char)GZIP_ID2; - buffer[2] = CM_DEFLATE; - buffer[3] = FLG_FEXTRA; - buffer[9] = (char)OS_UNKNOWN; - buffer[10] = BGZF_XLEN; - buffer[12] = BGZF_ID1; - buffer[13] = BGZF_ID2; - buffer[14] = BGZF_LEN; - - // set compression level - const int compressionLevel = ( IsWriteUncompressed ? 0 : Z_DEFAULT_COMPRESSION ); - - // loop to retry for blocks that do not compress enough - int inputLength = BlockOffset; - int compressedLength = 0; - unsigned int bufferSize = CompressedBlockSize; - - while ( true ) { - - // initialize zstream values - z_stream zs; - zs.zalloc = NULL; - zs.zfree = NULL; - zs.next_in = (Bytef*)UncompressedBlock; - zs.avail_in = inputLength; - zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH]; - zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; - - // initialize the zlib compression algorithm - if ( deflateInit2(&zs, compressionLevel, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK ) { - printf("BGZF ERROR: zlib deflate initialization failed.\n"); - exit(1); - } - - // compress the data - int status = deflate(&zs, Z_FINISH); - if ( status != Z_STREAM_END ) { - - deflateEnd(&zs); - - // reduce the input length and try again - if ( status == Z_OK ) { - inputLength -= 1024; - if( inputLength < 0 ) { - printf("BGZF ERROR: input reduction failed.\n"); - exit(1); - } - continue; - } - - printf("BGZF ERROR: zlib::deflateEnd() failed.\n"); - exit(1); - } - - // finalize the compression routine - if ( deflateEnd(&zs) != Z_OK ) { - printf("BGZF ERROR: zlib::deflateEnd() failed.\n"); - exit(1); - } - - compressedLength = zs.total_out; - compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; - if ( compressedLength > MAX_BLOCK_SIZE ) { - printf("BGZF ERROR: deflate overflow.\n"); - exit(1); - } - - break; - } - - // store the compressed length - BgzfData::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1)); - - // store the CRC32 checksum - unsigned int crc = crc32(0, NULL, 0); - crc = crc32(crc, (Bytef*)UncompressedBlock, inputLength); - BgzfData::PackUnsignedInt(&buffer[compressedLength - 8], crc); - BgzfData::PackUnsignedInt(&buffer[compressedLength - 4], inputLength); - - // ensure that we have less than a block of data left - int remaining = BlockOffset - inputLength; - if ( remaining > 0 ) { - if ( remaining > inputLength ) { - printf("BGZF ERROR: after deflate, remainder too large.\n"); - exit(1); - } - memcpy(UncompressedBlock, UncompressedBlock + inputLength, remaining); - } - - BlockOffset = remaining; - return compressedLength; -} - -// flushes the data in the BGZF block -void BgzfData::FlushBlock(void) { - - // flush all of the remaining blocks - while ( BlockOffset > 0 ) { - - // compress the data block - int blockLength = DeflateBlock(); - - // flush the data to our output stream - int numBytesWritten = fwrite(CompressedBlock, 1, blockLength, Stream); - - if ( numBytesWritten != blockLength ) { - printf("BGZF ERROR: expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten); - exit(1); - } - - BlockAddress += blockLength; - } -} - -// de-compresses the current block -int BgzfData::InflateBlock(const int& blockLength) { - - // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock - z_stream zs; - zs.zalloc = NULL; - zs.zfree = NULL; - zs.next_in = (Bytef*)CompressedBlock + 18; - zs.avail_in = blockLength - 16; - zs.next_out = (Bytef*)UncompressedBlock; - zs.avail_out = UncompressedBlockSize; - - int status = inflateInit2(&zs, GZIP_WINDOW_BITS); - if ( status != Z_OK ) { - printf("BGZF ERROR: could not decompress block - zlib::inflateInit() failed\n"); - return -1; - } - - status = inflate(&zs, Z_FINISH); - if ( status != Z_STREAM_END ) { - inflateEnd(&zs); - printf("BGZF ERROR: could not decompress block - zlib::inflate() failed\n"); - return -1; - } - - status = inflateEnd(&zs); - if ( status != Z_OK ) { - printf("BGZF ERROR: could not decompress block - zlib::inflateEnd() failed\n"); - return -1; - } - - return zs.total_out; -} - -// opens the BGZF file for reading (mode is either "rb" for reading, or "wb" for writing) -bool BgzfData::Open(const string& filename, const char* mode, bool isWriteUncompressed ) { - - // determine open mode - if ( strcmp(mode, "rb") == 0 ) - IsWriteOnly = false; - else if ( strcmp(mode, "wb") == 0) - IsWriteOnly = true; - else { - printf("BGZF ERROR: unknown file mode: %s\n", mode); - return false; - } - - // ---------------------------------------------------------------- - // open Stream to read to/write from file, stdin, or stdout - // stdin/stdout option contributed by Aaron Quinlan (2010-Jan-03) - - // read/write BGZF data to/from a file - if ( (filename != "stdin") && (filename != "stdout") ) - Stream = fopen(filename.c_str(), mode); - - // read BGZF data from stdin - else if ( (filename == "stdin") && (strcmp(mode, "rb") == 0 ) ) - Stream = freopen(NULL, mode, stdin); - - // write BGZF data to stdout - else if ( (filename == "stdout") && (strcmp(mode, "wb") == 0) ) - Stream = freopen(NULL, mode, stdout); - - if ( !Stream ) { - printf("BGZF ERROR: unable to open file %s\n", filename.c_str() ); - return false; - } - - // set flags, return success - IsOpen = true; - IsWriteUncompressed = isWriteUncompressed; - return true; -} - -// reads BGZF data into a byte buffer -int BgzfData::Read(char* data, const unsigned int dataLength) { - - if ( !IsOpen || IsWriteOnly || dataLength == 0 ) return 0; - - char* output = data; - unsigned int numBytesRead = 0; - while ( numBytesRead < dataLength ) { - - int bytesAvailable = BlockLength - BlockOffset; - if ( bytesAvailable <= 0 ) { - if ( !ReadBlock() ) return -1; - bytesAvailable = BlockLength - BlockOffset; - if ( bytesAvailable <= 0 ) break; - } - - char* buffer = UncompressedBlock; - int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable ); - memcpy(output, buffer + BlockOffset, copyLength); - - BlockOffset += copyLength; - output += copyLength; - numBytesRead += copyLength; - } - - if ( BlockOffset == BlockLength ) { - BlockAddress = ftell64(Stream); - BlockOffset = 0; - BlockLength = 0; - } - - return numBytesRead; -} - -// reads a BGZF block -bool BgzfData::ReadBlock(void) { - - char header[BLOCK_HEADER_LENGTH]; - int64_t blockAddress = ftell64(Stream); - - int count = fread(header, 1, sizeof(header), Stream); - if ( count == 0 ) { - BlockLength = 0; - return true; - } - - if ( count != sizeof(header) ) { - printf("BGZF ERROR: read block failed - could not read block header\n"); - return false; - } - - if ( !BgzfData::CheckBlockHeader(header) ) { - printf("BGZF ERROR: read block failed - invalid block header\n"); - return false; - } - - int blockLength = BgzfData::UnpackUnsignedShort(&header[16]) + 1; - char* compressedBlock = CompressedBlock; - memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH); - int remaining = blockLength - BLOCK_HEADER_LENGTH; - - count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, Stream); - if ( count != remaining ) { - printf("BGZF ERROR: read block failed - could not read data from block\n"); - return false; - } - - count = InflateBlock(blockLength); - if ( count < 0 ) { - printf("BGZF ERROR: read block failed - could not decompress block data\n"); - return false; - } - - if ( BlockLength != 0 ) - BlockOffset = 0; - - BlockAddress = blockAddress; - BlockLength = count; - return true; -} - -// seek to position in BGZF file -bool BgzfData::Seek(int64_t position) { - - if ( !IsOpen ) return false; - - int blockOffset = (position & 0xFFFF); - int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL; - - if ( fseek64(Stream, blockAddress, SEEK_SET) != 0 ) { - printf("BGZF ERROR: unable to seek in file\n"); - return false; - } - - BlockLength = 0; - BlockAddress = blockAddress; - BlockOffset = blockOffset; - return true; -} - -// get file position in BGZF file -int64_t BgzfData::Tell(void) { - if ( !IsOpen ) - return false; - else - return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) ); -} - -// writes the supplied data into the BGZF buffer -unsigned int BgzfData::Write(const char* data, const unsigned int dataLen) { - - if ( !IsOpen || !IsWriteOnly ) return false; - - // initialize - unsigned int numBytesWritten = 0; - const char* input = data; - unsigned int blockLength = UncompressedBlockSize; - - // copy the data to the buffer - while ( numBytesWritten < dataLen ) { - - unsigned int copyLength = min(blockLength - BlockOffset, dataLen - numBytesWritten); - char* buffer = UncompressedBlock; - memcpy(buffer + BlockOffset, input, copyLength); - - BlockOffset += copyLength; - input += copyLength; - numBytesWritten += copyLength; - - if ( BlockOffset == blockLength ) - FlushBlock(); - } - - return numBytesWritten; -} diff --git a/BGZF.h b/BGZF.h deleted file mode 100644 index 37bcff7..0000000 --- a/BGZF.h +++ /dev/null @@ -1,325 +0,0 @@ -// *************************************************************************** -// BGZF.h (c) 2009 Derek Barnett, Michael Str�mberg -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 16 August 2010 (DB) -// --------------------------------------------------------------------------- -// BGZF routines were adapted from the bgzf.c code developed at the Broad -// Institute. -// --------------------------------------------------------------------------- -// Provides the basic functionality for reading & writing BGZF files -// *************************************************************************** - -#ifndef BGZF_H -#define BGZF_H - -// 'C' includes -#include -#include -#include - -// C++ includes -#include - -// zlib includes -#include "zlib.h" - -// Platform-specific large-file support -#ifndef BAMTOOLS_LFS -#define BAMTOOLS_LFS - #ifdef WIN32 - #define ftell64(a) _ftelli64(a) - #define fseek64(a,b,c) _fseeki64(a,b,c) - #else - #define ftell64(a) ftello(a) - #define fseek64(a,b,c) fseeko(a,b,c) - #endif -#endif // BAMTOOLS_LFS - -// Platform-specific type definitions -#ifndef BAMTOOLS_TYPES -#define BAMTOOLS_TYPES - #ifdef _MSC_VER - typedef char int8_t; - typedef unsigned char uint8_t; - typedef short int16_t; - typedef unsigned short uint16_t; - typedef int int32_t; - typedef unsigned int uint32_t; - typedef long long int64_t; - typedef unsigned long long uint64_t; - #else - #include - #endif -#endif // BAMTOOLS_TYPES - -namespace BamTools { - -// zlib constants -const int GZIP_ID1 = 31; -const int GZIP_ID2 = 139; -const int CM_DEFLATE = 8; -const int FLG_FEXTRA = 4; -const int OS_UNKNOWN = 255; -const int BGZF_XLEN = 6; -const int BGZF_ID1 = 66; -const int BGZF_ID2 = 67; -const int BGZF_LEN = 2; -const int GZIP_WINDOW_BITS = -15; -const int Z_DEFAULT_MEM_LEVEL = 8; - -// BZGF constants -const int BLOCK_HEADER_LENGTH = 18; -const int BLOCK_FOOTER_LENGTH = 8; -const int MAX_BLOCK_SIZE = 65536; -const int DEFAULT_BLOCK_SIZE = 65536; - -struct BgzfData { - - // data members - public: - unsigned int UncompressedBlockSize; - unsigned int CompressedBlockSize; - unsigned int BlockLength; - unsigned int BlockOffset; - uint64_t BlockAddress; - bool IsOpen; - bool IsWriteOnly; - bool IsWriteUncompressed; - FILE* Stream; - char* UncompressedBlock; - char* CompressedBlock; - - // constructor & destructor - public: - BgzfData(void); - ~BgzfData(void); - - // main interface methods - public: - // closes BGZF file - void Close(void); - // opens the BGZF file (mode is either "rb" for reading, or "wb" for writing) - bool Open(const std::string& filename, const char* mode, bool isWriteUncompressed = false); - // reads BGZF data into a byte buffer - int Read(char* data, const unsigned int dataLength); - // seek to position in BGZF file - bool Seek(int64_t position); - // get file position in BGZF file - int64_t Tell(void); - // writes the supplied data into the BGZF buffer - unsigned int Write(const char* data, const unsigned int dataLen); - - // internal methods - private: - // compresses the current block - int DeflateBlock(void); - // flushes the data in the BGZF block - void FlushBlock(void); - // de-compresses the current block - int InflateBlock(const int& blockLength); - // reads a BGZF block - bool ReadBlock(void); - - // static 'utility' methods - public: - // checks BGZF block header - static inline bool CheckBlockHeader(char* header); - // packs an unsigned integer into the specified buffer - static inline void PackUnsignedInt(char* buffer, unsigned int value); - // packs an unsigned short into the specified buffer - static inline void PackUnsignedShort(char* buffer, unsigned short value); - // unpacks a buffer into a double - static inline double UnpackDouble(char* buffer); - static inline double UnpackDouble(const char* buffer); - // unpacks a buffer into a float - static inline float UnpackFloat(char* buffer); - static inline float UnpackFloat(const char* buffer); - // unpacks a buffer into a signed int - static inline signed int UnpackSignedInt(char* buffer); - static inline signed int UnpackSignedInt(const char* buffer); - // unpacks a buffer into a signed short - static inline signed short UnpackSignedShort(char* buffer); - static inline signed short UnpackSignedShort(const char* buffer); - // unpacks a buffer into an unsigned int - static inline unsigned int UnpackUnsignedInt(char* buffer); - static inline unsigned int UnpackUnsignedInt(const char* buffer); - // unpacks a buffer into an unsigned short - static inline unsigned short UnpackUnsignedShort(char* buffer); - static inline unsigned short UnpackUnsignedShort(const char* buffer); -}; - -// ------------------------------------------------------------- -// static 'utility' method implementations - -// checks BGZF block header -inline -bool BgzfData::CheckBlockHeader(char* header) { - return (header[0] == GZIP_ID1 && - header[1] == (char)GZIP_ID2 && - header[2] == Z_DEFLATED && - (header[3] & FLG_FEXTRA) != 0 && - BgzfData::UnpackUnsignedShort(&header[10]) == BGZF_XLEN && - header[12] == BGZF_ID1 && - header[13] == BGZF_ID2 && - BgzfData::UnpackUnsignedShort(&header[14]) == BGZF_LEN ); -} - -// 'packs' an unsigned integer into the specified buffer -inline -void BgzfData::PackUnsignedInt(char* buffer, unsigned int value) { - buffer[0] = (char)value; - buffer[1] = (char)(value >> 8); - buffer[2] = (char)(value >> 16); - buffer[3] = (char)(value >> 24); -} - -// 'packs' an unsigned short into the specified buffer -inline -void BgzfData::PackUnsignedShort(char* buffer, unsigned short value) { - buffer[0] = (char)value; - buffer[1] = (char)(value >> 8); -} - -// 'unpacks' a buffer into a double (includes both non-const & const char* flavors) -inline -double BgzfData::UnpackDouble(char* buffer) { - union { double value; unsigned char valueBuffer[sizeof(double)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - un.valueBuffer[4] = buffer[4]; - un.valueBuffer[5] = buffer[5]; - un.valueBuffer[6] = buffer[6]; - un.valueBuffer[7] = buffer[7]; - return un.value; -} - -inline -double BgzfData::UnpackDouble(const char* buffer) { - union { double value; unsigned char valueBuffer[sizeof(double)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - un.valueBuffer[4] = buffer[4]; - un.valueBuffer[5] = buffer[5]; - un.valueBuffer[6] = buffer[6]; - un.valueBuffer[7] = buffer[7]; - return un.value; -} - -// 'unpacks' a buffer into a float (includes both non-const & const char* flavors) -inline -float BgzfData::UnpackFloat(char* buffer) { - union { float value; unsigned char valueBuffer[sizeof(float)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - return un.value; -} - -inline -float BgzfData::UnpackFloat(const char* buffer) { - union { float value; unsigned char valueBuffer[sizeof(float)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - return un.value; -} - -// 'unpacks' a buffer into a signed int (includes both non-const & const char* flavors) -inline -signed int BgzfData::UnpackSignedInt(char* buffer) { - union { signed int value; unsigned char valueBuffer[sizeof(signed int)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - return un.value; -} - -inline -signed int BgzfData::UnpackSignedInt(const char* buffer) { - union { signed int value; unsigned char valueBuffer[sizeof(signed int)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - return un.value; -} - -// 'unpacks' a buffer into a signed short (includes both non-const & const char* flavors) -inline -signed short BgzfData::UnpackSignedShort(char* buffer) { - union { signed short value; unsigned char valueBuffer[sizeof(signed short)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - return un.value; -} - -inline -signed short BgzfData::UnpackSignedShort(const char* buffer) { - union { signed short value; unsigned char valueBuffer[sizeof(signed short)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - return un.value; -} - -// 'unpacks' a buffer into an unsigned int (includes both non-const & const char* flavors) -inline -unsigned int BgzfData::UnpackUnsignedInt(char* buffer) { - union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - return un.value; -} - -inline -unsigned int BgzfData::UnpackUnsignedInt(const char* buffer) { - union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - return un.value; -} - -// 'unpacks' a buffer into an unsigned short (includes both non-const & const char* flavors) -inline -unsigned short BgzfData::UnpackUnsignedShort(char* buffer) { - union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - return un.value; -} - -inline -unsigned short BgzfData::UnpackUnsignedShort(const char* buffer) { - union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - return un.value; -} - -} // namespace BamTools - -#endif // BGZF_H diff --git a/BamAux.h b/BamAux.h deleted file mode 100644 index 73e8838..0000000 --- a/BamAux.h +++ /dev/null @@ -1,991 +0,0 @@ -// *************************************************************************** -// BamAux.h (c) 2009 Derek Barnett, Michael Str�mberg -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 27 July 2010 (DB) -// --------------------------------------------------------------------------- -// Provides the basic constants, data structures, etc. for using BAM files -// *************************************************************************** - -#ifndef BAMAUX_H -#define BAMAUX_H - -// C inclues -#include -#include -#include -#include - -// C++ includes -#include -#include -#include -#include -#include - -// Platform-specific type definitions -#ifndef BAMTOOLS_TYPES -#define BAMTOOLS_TYPES - #ifdef _MSC_VER - typedef char int8_t; - typedef unsigned char uint8_t; - typedef short int16_t; - typedef unsigned short uint16_t; - typedef int int32_t; - typedef unsigned int uint32_t; - typedef long long int64_t; - typedef unsigned long long uint64_t; - #else - #include - #endif -#endif // BAMTOOLS_TYPES - -namespace BamTools { - -// BAM constants -const int BAM_CORE_SIZE = 32; -const int BAM_CMATCH = 0; -const int BAM_CINS = 1; -const int BAM_CDEL = 2; -const int BAM_CREF_SKIP = 3; -const int BAM_CSOFT_CLIP = 4; -const int BAM_CHARD_CLIP = 5; -const int BAM_CPAD = 6; -const int BAM_CIGAR_SHIFT = 4; -const int BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1); - -// BAM index constants -const int MAX_BIN = 37450; // =(8^6-1)/7+1 -const int BAM_MIN_CHUNK_GAP = 32768; -const int BAM_LIDX_SHIFT = 14; - -// Explicit variable sizes -const int BT_SIZEOF_INT = 4; - -struct CigarOp; - -struct BamAlignment { - - // constructors & destructor - public: - BamAlignment(void); - BamAlignment(const BamAlignment& other); - ~BamAlignment(void); - - // Queries against alignment flags - public: - bool IsDuplicate(void) const; // Returns true if this read is a PCR duplicate - bool IsFailedQC(void) const; // Returns true if this read failed quality control - bool IsFirstMate(void) const; // Returns true if alignment is first mate on read - bool IsMapped(void) const; // Returns true if alignment is mapped - bool IsMateMapped(void) const; // Returns true if alignment's mate is mapped - bool IsMateReverseStrand(void) const; // Returns true if alignment's mate mapped to reverse strand - bool IsPaired(void) const; // Returns true if alignment part of paired-end read - bool IsPrimaryAlignment(void) const; // Returns true if reported position is primary alignment - bool IsProperPair(void) const; // Returns true if alignment is part of read that satisfied paired-end resolution - bool IsReverseStrand(void) const; // Returns true if alignment mapped to reverse strand - bool IsSecondMate(void) const; // Returns true if alignment is second mate on read - - // Manipulate alignment flags - public: - void SetIsDuplicate(bool ok); // Sets "PCR duplicate" flag - void SetIsFailedQC(bool ok); // Sets "failed quality control" flag - void SetIsFirstMate(bool ok); // Sets "alignment is first mate" flag - void SetIsMateUnmapped(bool ok); // Sets "alignment's mate is mapped" flag - void SetIsMateReverseStrand(bool ok); // Sets "alignment's mate mapped to reverse strand" flag - void SetIsPaired(bool ok); // Sets "alignment part of paired-end read" flag - void SetIsProperPair(bool ok); // Sets "alignment is part of read that satisfied paired-end resolution" flag - void SetIsReverseStrand(bool ok); // Sets "alignment mapped to reverse strand" flag - void SetIsSecondaryAlignment(bool ok); // Sets "position is primary alignment" flag - void SetIsSecondMate(bool ok); // Sets "alignment is second mate on read" flag - void SetIsUnmapped(bool ok); // Sets "alignment is mapped" flag - - // Tag data access methods - public: - // ------------------------------------------------------------------------------------- - // N.B. - The following tag-modifying methods may not be used on BamAlignments fetched - // using BamReader::GetNextAlignmentCore(). Attempting to use them will not result in - // error message (to keep output clean) but will ALWAYS return false. Only user- - // generated BamAlignments or those retrieved using BamReader::GetNextAlignment() are valid. - - // add tag data (create new TAG entry with TYPE and VALUE) - // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details - // returns true if new data added, false if error or TAG already exists - // N.B. - will NOT modify existing tag. Use EditTag() instead - bool AddTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H - bool AddTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i - bool AddTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i - bool AddTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f - - // edit tag data (sets existing TAG with TYPE to VALUE or adds new TAG if not already present) - // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details - // returns true if edit was successfaul, false if error - bool EditTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H - bool EditTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i - bool EditTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i - bool EditTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f - - // specific tag data access methods - these only remain for legacy support - bool GetEditDistance(uint32_t& editDistance) const; // get "NM" tag data (implemented as GetTag("NM", editDistance)) - bool GetReadGroup(std::string& readGroup) const; // get "RG" tag data (implemented as GetTag("RG", readGroup)) - - // generic tag data access methods - bool GetTag(const std::string& tag, std::string& destination) const; // access variable-length char or hex strings - bool GetTag(const std::string& tag, uint32_t& destination) const; // access unsigned integer data - bool GetTag(const std::string& tag, int32_t& destination) const; // access signed integer data - bool GetTag(const std::string& tag, float& destination) const; // access floating point data - - // remove tag data - // returns true if removal was successful, false if error - // N.B. - returns false if TAG does not exist (no removal can occur) - bool RemoveTag(const std::string& tag); - - // Additional data access methods - public: - int GetEndPosition(bool usePadded = false) const; // calculates alignment end position, based on starting position and CIGAR operations - - // 'internal' utility methods - private: - static bool FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed); - static bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed); - - // Data members - public: - std::string Name; // Read name - int32_t Length; // Query length - std::string QueryBases; // 'Original' sequence (as reported from sequencing machine) - std::string AlignedBases; // 'Aligned' sequence (includes any indels, padding, clipping) - std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values) - std::string TagData; // Tag data (accessor methods will pull the requested information out) - int32_t RefID; // ID number for reference sequence - int32_t Position; // Position (0-based) where alignment starts - uint16_t Bin; // Bin in BAM file where this alignment resides - uint16_t MapQuality; // Mapping quality score - uint32_t AlignmentFlag; // Alignment bit-flag - see Is() methods to query this value, SetIs() methods to manipulate - std::vector CigarData; // CIGAR operations for this alignment - int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned - int32_t MatePosition; // Position (0-based) where alignment's mate starts - int32_t InsertSize; // Mate-pair insert size - - // internal data - private: - struct BamAlignmentSupportData { - - // data members - std::string AllCharData; - uint32_t BlockLength; - uint32_t NumCigarOperations; - uint32_t QueryNameLength; - uint32_t QuerySequenceLength; - bool HasCoreOnly; - - // constructor - BamAlignmentSupportData(void) - : BlockLength(0) - , NumCigarOperations(0) - , QueryNameLength(0) - , QuerySequenceLength(0) - , HasCoreOnly(false) - { } - }; - - // contains raw character data & lengths - BamAlignmentSupportData SupportData; - - // allow these classes access to BamAlignment private members (SupportData) - // but client code should not need to touch this data - friend class BamReader; - friend class BamWriter; - - // Alignment flag query constants - // Use the get/set methods above instead - private: - enum { PAIRED = 1 - , PROPER_PAIR = 2 - , UNMAPPED = 4 - , MATE_UNMAPPED = 8 - , REVERSE = 16 - , MATE_REVERSE = 32 - , READ_1 = 64 - , READ_2 = 128 - , SECONDARY = 256 - , QC_FAILED = 512 - , DUPLICATE = 1024 - }; -}; - -// ---------------------------------------------------------------- -// Auxiliary data structs & typedefs - -struct CigarOp { - - // data members - char Type; // Operation type (MIDNSHP) - uint32_t Length; // Operation length (number of bases) - - // constructor - CigarOp(const char type = '\0', - const uint32_t length = 0) - : Type(type) - , Length(length) - { } -}; - -struct RefData { - - // data members - std::string RefName; // Name of reference sequence - int32_t RefLength; // Length of reference sequence - bool RefHasAlignments; // True if BAM file contains alignments mapped to reference sequence - - // constructor - RefData(const int32_t& length = 0, - bool ok = false) - : RefLength(length) - , RefHasAlignments(ok) - { } -}; - -typedef std::vector RefVector; -typedef std::vector BamAlignmentVector; - -struct BamRegion { - - // data members - int LeftRefID; - int LeftPosition; - int RightRefID; - int RightPosition; - - // constructor - BamRegion(const int& leftID = -1, - const int& leftPos = -1, - const int& rightID = -1, - const int& rightPos = -1) - : LeftRefID(leftID) - , LeftPosition(leftPos) - , RightRefID(rightID) - , RightPosition(rightPos) - { } -}; - -// ---------------------------------------------------------------- -// Added: 3-35-2010 DWB -// Fixed: Routines to provide endian-correctness -// ---------------------------------------------------------------- - -// returns true if system is big endian -inline bool SystemIsBigEndian(void) { - const uint16_t one = 0x0001; - return ((*(char*) &one) == 0 ); -} - -// swaps endianness of 16-bit value 'in place' -inline void SwapEndian_16(int16_t& x) { - x = ((x >> 8) | (x << 8)); -} - -inline void SwapEndian_16(uint16_t& x) { - x = ((x >> 8) | (x << 8)); -} - -// swaps endianness of 32-bit value 'in-place' -inline void SwapEndian_32(int32_t& x) { - x = ( (x >> 24) | - ((x << 8) & 0x00FF0000) | - ((x >> 8) & 0x0000FF00) | - (x << 24) - ); -} - -inline void SwapEndian_32(uint32_t& x) { - x = ( (x >> 24) | - ((x << 8) & 0x00FF0000) | - ((x >> 8) & 0x0000FF00) | - (x << 24) - ); -} - -// swaps endianness of 64-bit value 'in-place' -inline void SwapEndian_64(int64_t& x) { - x = ( (x >> 56) | - ((x << 40) & 0x00FF000000000000ll) | - ((x << 24) & 0x0000FF0000000000ll) | - ((x << 8) & 0x000000FF00000000ll) | - ((x >> 8) & 0x00000000FF000000ll) | - ((x >> 24) & 0x0000000000FF0000ll) | - ((x >> 40) & 0x000000000000FF00ll) | - (x << 56) - ); -} - -inline void SwapEndian_64(uint64_t& x) { - x = ( (x >> 56) | - ((x << 40) & 0x00FF000000000000ll) | - ((x << 24) & 0x0000FF0000000000ll) | - ((x << 8) & 0x000000FF00000000ll) | - ((x >> 8) & 0x00000000FF000000ll) | - ((x >> 24) & 0x0000000000FF0000ll) | - ((x >> 40) & 0x000000000000FF00ll) | - (x << 56) - ); -} - -// swaps endianness of 'next 2 bytes' in a char buffer (in-place) -inline void SwapEndian_16p(char* data) { - uint16_t& value = (uint16_t&)*data; - SwapEndian_16(value); -} - -// swaps endianness of 'next 4 bytes' in a char buffer (in-place) -inline void SwapEndian_32p(char* data) { - uint32_t& value = (uint32_t&)*data; - SwapEndian_32(value); -} - -// swaps endianness of 'next 8 bytes' in a char buffer (in-place) -inline void SwapEndian_64p(char* data) { - uint64_t& value = (uint64_t&)*data; - SwapEndian_64(value); -} - -// ---------------------------------------------------------------- -// BamAlignment member methods - -// constructors & destructor -inline BamAlignment::BamAlignment(void) { } - -inline BamAlignment::BamAlignment(const BamAlignment& other) - : Name(other.Name) - , Length(other.Length) - , QueryBases(other.QueryBases) - , AlignedBases(other.AlignedBases) - , Qualities(other.Qualities) - , TagData(other.TagData) - , RefID(other.RefID) - , Position(other.Position) - , Bin(other.Bin) - , MapQuality(other.MapQuality) - , AlignmentFlag(other.AlignmentFlag) - , CigarData(other.CigarData) - , MateRefID(other.MateRefID) - , MatePosition(other.MatePosition) - , InsertSize(other.InsertSize) - , SupportData(other.SupportData) -{ } - -inline BamAlignment::~BamAlignment(void) { } - -// Queries against alignment flags -inline bool BamAlignment::IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); } -inline bool BamAlignment::IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); } -inline bool BamAlignment::IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); } -inline bool BamAlignment::IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); } -inline bool BamAlignment::IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); } -inline bool BamAlignment::IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); } -inline bool BamAlignment::IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); } -inline bool BamAlignment::IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); } -inline bool BamAlignment::IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); } -inline bool BamAlignment::IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); } -inline bool BamAlignment::IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); } - -// Manipulate alignment flags -inline void BamAlignment::SetIsDuplicate(bool ok) { if (ok) AlignmentFlag |= DUPLICATE; else AlignmentFlag &= ~DUPLICATE; } -inline void BamAlignment::SetIsFailedQC(bool ok) { if (ok) AlignmentFlag |= QC_FAILED; else AlignmentFlag &= ~QC_FAILED; } -inline void BamAlignment::SetIsFirstMate(bool ok) { if (ok) AlignmentFlag |= READ_1; else AlignmentFlag &= ~READ_1; } -inline void BamAlignment::SetIsMateUnmapped(bool ok) { if (ok) AlignmentFlag |= MATE_UNMAPPED; else AlignmentFlag &= ~MATE_UNMAPPED; } -inline void BamAlignment::SetIsMateReverseStrand(bool ok) { if (ok) AlignmentFlag |= MATE_REVERSE; else AlignmentFlag &= ~MATE_REVERSE; } -inline void BamAlignment::SetIsPaired(bool ok) { if (ok) AlignmentFlag |= PAIRED; else AlignmentFlag &= ~PAIRED; } -inline void BamAlignment::SetIsProperPair(bool ok) { if (ok) AlignmentFlag |= PROPER_PAIR; else AlignmentFlag &= ~PROPER_PAIR; } -inline void BamAlignment::SetIsReverseStrand(bool ok) { if (ok) AlignmentFlag |= REVERSE; else AlignmentFlag &= ~REVERSE; } -inline void BamAlignment::SetIsSecondaryAlignment(bool ok) { if (ok) AlignmentFlag |= SECONDARY; else AlignmentFlag &= ~SECONDARY; } -inline void BamAlignment::SetIsSecondMate(bool ok) { if (ok) AlignmentFlag |= READ_2; else AlignmentFlag &= ~READ_2; } -inline void BamAlignment::SetIsUnmapped(bool ok) { if (ok) AlignmentFlag |= UNMAPPED; else AlignmentFlag &= ~UNMAPPED; } - -// calculates alignment end position, based on starting position and CIGAR operations -inline -int BamAlignment::GetEndPosition(bool usePadded) const { - - // initialize alignment end to starting position - int alignEnd = Position; - - // iterate over cigar operations - std::vector::const_iterator cigarIter = CigarData.begin(); - std::vector::const_iterator cigarEnd = CigarData.end(); - for ( ; cigarIter != cigarEnd; ++cigarIter) { - const char cigarType = (*cigarIter).Type; - if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) { - alignEnd += (*cigarIter).Length; - } - else if ( usePadded && cigarType == 'I' ) { - alignEnd += (*cigarIter).Length; - } - } - return alignEnd; -} - -inline -bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const std::string& value) { - - if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return false; - if ( type != "Z" && type != "H" ) return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag already exists, return false - // use EditTag explicitly instead - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; - - // otherwise, copy tag data to temp buffer - std::string newTag = tag + type + value; - const int newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term - char originalTagData[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term - - // append newTag - strcat(originalTagData + tagDataLength, newTag.data()); // removes original null-term, appends newTag + null-term - - // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; - TagData.assign(newTagData, newTagDataLength); - - // return success - return true; -} - -inline -bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const uint32_t& value) { - - if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return false; - if ( type == "f" || type == "Z" || type == "H" ) return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag already exists, return false - // use EditTag explicitly instead - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; - - // otherwise, convert value to string - union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un; - un.value = value; - - // copy original tag data to temp buffer - std::string newTag = tag + type; - const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new integer - char originalTagData[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term - - // append newTag - strcat(originalTagData + tagDataLength, newTag.data()); - memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(unsigned int)); - - // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; - TagData.assign(newTagData, newTagDataLength); - - // return success - return true; -} - -inline -bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const int32_t& value) { - return AddTag(tag, type, (const uint32_t&)value); -} - -inline -bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const float& value) { - - if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return false; - if ( type == "Z" || type == "H" ) return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag already exists, return false - // use EditTag explicitly instead - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; - - // otherwise, convert value to string - union { float value; char valueBuffer[sizeof(float)]; } un; - un.value = value; - - // copy original tag data to temp buffer - std::string newTag = tag + type; - const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new float - char originalTagData[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term - - // append newTag - strcat(originalTagData + tagDataLength, newTag.data()); - memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(float)); - - // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; - TagData.assign(newTagData, newTagDataLength); - - // return success - return true; -} - -inline -bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const std::string& value) { - - if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return false; - if ( type != "Z" && type != "H" ) return false; - - // localize the tag data - char* pOriginalTagData = (char*)TagData.data(); - char* pTagData = pOriginalTagData; - const unsigned int originalTagDataLength = TagData.size(); - - unsigned int newTagDataLength = 0; - unsigned int numBytesParsed = 0; - - // if tag found, store data in readGroup, return success - if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { - - // make sure array is more than big enough - char newTagData[originalTagDataLength + value.size()]; - - // copy original tag data up til desired tag - const unsigned int beginningTagDataLength = numBytesParsed; - newTagDataLength += beginningTagDataLength; - memcpy(newTagData, pOriginalTagData, numBytesParsed); - - // copy new VALUE in place of current tag data - const unsigned int dataLength = strlen(value.c_str()); - memcpy(newTagData + beginningTagDataLength, (char*)value.c_str(), dataLength+1 ); - - // skip to next tag (if tag for removal is last, return true) - const char* pTagStorageType = pTagData - 1; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; - - // copy everything from current tag (the next one after tag for removal) to end - const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); - const unsigned int endTagOffset = beginningTagDataLength + dataLength + 1; - const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; - memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); - - // ensure null-terminator - newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; - - // save new tag data - TagData.assign(newTagData, endTagOffset + endTagDataLength); - return true; - } - - // tag not found, attempt AddTag - else return AddTag(tag, type, value); -} - -inline -bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const uint32_t& value) { - - if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return false; - if ( type == "f" || type == "Z" || type == "H" ) return false; - - // localize the tag data - char* pOriginalTagData = (char*)TagData.data(); - char* pTagData = pOriginalTagData; - const unsigned int originalTagDataLength = TagData.size(); - - unsigned int newTagDataLength = 0; - unsigned int numBytesParsed = 0; - - // if tag found, store data in readGroup, return success - if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { - - // make sure array is more than big enough - char newTagData[originalTagDataLength + sizeof(value)]; - - // copy original tag data up til desired tag - const unsigned int beginningTagDataLength = numBytesParsed; - newTagDataLength += beginningTagDataLength; - memcpy(newTagData, pOriginalTagData, numBytesParsed); - - // copy new VALUE in place of current tag data - union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un; - un.value = value; - memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(unsigned int)); - - // skip to next tag (if tag for removal is last, return true) - const char* pTagStorageType = pTagData - 1; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; - - // copy everything from current tag (the next one after tag for removal) to end - const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); - const unsigned int endTagOffset = beginningTagDataLength + sizeof(unsigned int); - const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; - memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); - - // ensure null-terminator - newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; - - // save new tag data - TagData.assign(newTagData, endTagOffset + endTagDataLength); - return true; - } - - // tag not found, attempt AddTag - else return AddTag(tag, type, value); -} - -inline -bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const int32_t& value) { - return EditTag(tag, type, (const uint32_t&)value); -} - -inline -bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const float& value) { - - if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return false; - if ( type == "Z" || type == "H" ) return false; - - // localize the tag data - char* pOriginalTagData = (char*)TagData.data(); - char* pTagData = pOriginalTagData; - const unsigned int originalTagDataLength = TagData.size(); - - unsigned int newTagDataLength = 0; - unsigned int numBytesParsed = 0; - - // if tag found, store data in readGroup, return success - if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { - - // make sure array is more than big enough - char newTagData[originalTagDataLength + sizeof(value)]; - - // copy original tag data up til desired tag - const unsigned int beginningTagDataLength = numBytesParsed; - newTagDataLength += beginningTagDataLength; - memcpy(newTagData, pOriginalTagData, numBytesParsed); - - // copy new VALUE in place of current tag data - union { float value; char valueBuffer[sizeof(float)]; } un; - un.value = value; - memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(float)); - - // skip to next tag (if tag for removal is last, return true) - const char* pTagStorageType = pTagData - 1; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; - - // copy everything from current tag (the next one after tag for removal) to end - const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); - const unsigned int endTagOffset = beginningTagDataLength + sizeof(float); - const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; - memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); - - // ensure null-terminator - newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; - - // save new tag data - TagData.assign(newTagData, endTagOffset + endTagDataLength); - return true; - } - - // tag not found, attempt AddTag - else return AddTag(tag, type, value); -} - -// get "NM" tag data - originally contributed by Aaron Quinlan -// stores data in 'editDistance', returns success/fail -inline -bool BamAlignment::GetEditDistance(uint32_t& editDistance) const { - return GetTag("NM", (uint32_t&)editDistance); -} - -// get "RG" tag data -// stores data in 'readGroup', returns success/fail -inline -bool BamAlignment::GetReadGroup(std::string& readGroup) const { - return GetTag("RG", readGroup); -} - -inline -bool BamAlignment::GetTag(const std::string& tag, std::string& destination) const { - - // make sure tag data exists - if ( SupportData.HasCoreOnly || TagData.empty() ) - return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag found, store data in readGroup, return success - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { - const unsigned int dataLength = strlen(pTagData); - destination.clear(); - destination.resize(dataLength); - memcpy( (char*)destination.data(), pTagData, dataLength ); - return true; - } - - // tag not found, return failure - return false; -} - -inline -bool BamAlignment::GetTag(const std::string& tag, uint32_t& destination) const { - - // make sure tag data exists - if ( SupportData.HasCoreOnly || TagData.empty() ) - return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag found, determine data byte-length, store data in readGroup, return success - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { - - // determine data byte-length - const char type = *(pTagData - 1); - int destinationLength = 0; - switch (type) { - // 1 byte data - case 'A': - case 'c': - case 'C': - destinationLength = 1; - break; - - // 2 byte data - case 's': - case 'S': - destinationLength = 2; - break; - - // 4 byte data - case 'i': - case 'I': - destinationLength = 4; - break; - - // unsupported type for integer destination (float or var-length strings) - case 'f': - case 'Z': - case 'H': - printf("ERROR: Cannot store tag of type %c in integer destination\n", type); - return false; - - // unknown tag type - default: - printf("ERROR: Unknown tag storage class encountered: [%c]\n", type); - return false; - } - - // store in destination - destination = 0; - memcpy(&destination, pTagData, destinationLength); - return true; - } - - // tag not found, return failure - return false; -} - -inline -bool BamAlignment::GetTag(const std::string& tag, int32_t& destination) const { - return GetTag(tag, (uint32_t&)destination); -} - -inline -bool BamAlignment::GetTag(const std::string& tag, float& destination) const { - - // make sure tag data exists - if ( SupportData.HasCoreOnly || TagData.empty() ) - return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag found, determine data byte-length, store data in readGroup, return success - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { - //pTagData += numBytesParsed; - - // determine data byte-length - const char type = *(pTagData - 1); - int destinationLength = 0; - switch(type) { - - // 1 byte data - case 'A': - case 'c': - case 'C': - destinationLength = 1; - break; - - // 2 byte data - case 's': - case 'S': - destinationLength = 2; - break; - - // 4 byte data - case 'f': - case 'i': - case 'I': - destinationLength = 4; - break; - - // unsupported type (var-length strings) - case 'Z': - case 'H': - printf("ERROR: Cannot store tag of type %c in integer destination\n", type); - return false; - - // unknown tag type - default: - printf("ERROR: Unknown tag storage class encountered: [%c]\n", type); - return false; - } - - // store in destination - destination = 0.0; - memcpy(&destination, pTagData, destinationLength); - return true; - } - - // tag not found, return failure - return false; -} - -inline -bool BamAlignment::RemoveTag(const std::string& tag) { - - // BamAlignments fetched using BamReader::GetNextAlignmentCore() are not allowed - // also, return false if no data present to remove - if ( SupportData.HasCoreOnly || TagData.empty() ) return false; - - // localize the tag data - char* pOriginalTagData = (char*)TagData.data(); - char* pTagData = pOriginalTagData; - const unsigned int originalTagDataLength = TagData.size(); - unsigned int newTagDataLength = 0; - unsigned int numBytesParsed = 0; - - // if tag found, store data in readGroup, return success - if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { - - char newTagData[originalTagDataLength]; - - // copy original tag data up til desired tag - pTagData -= 3; - numBytesParsed -= 3; - const unsigned int beginningTagDataLength = numBytesParsed; - newTagDataLength += beginningTagDataLength; - memcpy(newTagData, pOriginalTagData, numBytesParsed); - - // skip to next tag (if tag for removal is last, return true) - const char* pTagStorageType = pTagData + 2; - pTagData += 3; - numBytesParsed += 3; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; - - // copy everything from current tag (the next one after tag for removal) to end - const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); - const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; - memcpy(newTagData + beginningTagDataLength, pTagData, endTagDataLength ); - - // save new tag data - TagData.assign(newTagData, beginningTagDataLength + endTagDataLength); - return true; - } - - // tag not found, no removal - return failure - return false; -} - -inline -bool BamAlignment::FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) { - - while ( numBytesParsed < tagDataLength ) { - - const char* pTagType = pTagData; - const char* pTagStorageType = pTagData + 2; - pTagData += 3; - numBytesParsed += 3; - - // check the current tag, return true on match - if ( std::strncmp(pTagType, tag.c_str(), 2) == 0 ) - return true; - - // get the storage class and find the next tag - if ( *pTagStorageType == '\0' ) return false; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false; - if ( *pTagData == '\0' ) return false; - } - - // checked all tags, none match - return false; -} - -inline -bool BamAlignment::SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) { - - switch(storageType) { - - case 'A': - case 'c': - case 'C': - ++numBytesParsed; - ++pTagData; - break; - - case 's': - case 'S': - numBytesParsed += 2; - pTagData += 2; - break; - - case 'f': - case 'i': - case 'I': - numBytesParsed += 4; - pTagData += 4; - break; - - case 'Z': - case 'H': - while(*pTagData) { - ++numBytesParsed; - ++pTagData; - } - // increment for null-terminator - ++numBytesParsed; - ++pTagData; - break; - - default: - // error case - printf("ERROR: Unknown tag storage class encountered: [%c]\n", storageType); - return false; - } - - // return success - return true; -} - -} // namespace BamTools - -#endif // BAMAUX_H diff --git a/BamIndex.cpp b/BamIndex.cpp deleted file mode 100644 index d74e751..0000000 --- a/BamIndex.cpp +++ /dev/null @@ -1,926 +0,0 @@ -// *************************************************************************** -// BamIndex.cpp (c) 2009 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 17 August 2010 (DB) -// --------------------------------------------------------------------------- -// Provides index functionality - both for the default (standardized) BAM -// index format (.bai) as well as a BamTools-specific (nonstandard) index -// format (.bti). -// *************************************************************************** - -#include -#include -#include -// #include -#include -#include "BamIndex.h" -#include "BamReader.h" -#include "BGZF.h" -using namespace std; -using namespace BamTools; - -// ------------------------------- -// BamIndex implementation - -BamIndex::BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader, bool isBigEndian) - : m_BGZF(bgzf) - , m_reader(reader) - , m_isBigEndian(isBigEndian) -{ - if ( m_reader && m_reader->IsOpen() ) - m_references = m_reader->GetReferenceData(); -} - -bool BamIndex::HasAlignments(const int& referenceID) { - - // return false if invalid ID - if ( (referenceID < 0) || (referenceID >= (int)m_references.size()) ) - return false; - - // else return status of reference (has alignments?) - else - return m_references.at(referenceID).RefHasAlignments; -} - -// ######################################################################################### -// ######################################################################################### - -// ------------------------------- -// BamDefaultIndex structs & typedefs - -namespace BamTools { - -// -------------------------------------------------- -// BamDefaultIndex data structures & typedefs -struct Chunk { - - // data members - uint64_t Start; - uint64_t Stop; - - // constructor - Chunk(const uint64_t& start = 0, - const uint64_t& stop = 0) - : Start(start) - , Stop(stop) - { } -}; - -bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) { - return lhs.Start < rhs.Start; -} - -typedef vector ChunkVector; -typedef map BamBinMap; -typedef vector LinearOffsetVector; - -struct ReferenceIndex { - - // data members - BamBinMap Bins; - LinearOffsetVector Offsets; - - // constructor - ReferenceIndex(const BamBinMap& binMap = BamBinMap(), - const LinearOffsetVector& offsets = LinearOffsetVector()) - : Bins(binMap) - , Offsets(offsets) - { } -}; - -typedef vector BamDefaultIndexData; - -} // namespace BamTools - -// ------------------------------- -// BamDefaultIndex implementation - -struct BamDefaultIndex::BamDefaultIndexPrivate { - - // ------------------------- - // data members - - BamDefaultIndexData m_indexData; - BamDefaultIndex* m_parent; - - // ------------------------- - // ctor & dtor - - BamDefaultIndexPrivate(BamDefaultIndex* parent) : m_parent(parent) { } - ~BamDefaultIndexPrivate(void) { } - - // ------------------------- - // internal methods - - // calculate bins that overlap region - int BinsFromRegion(const BamTools::BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[BamTools::MAX_BIN]); - // saves BAM bin entry for index - void InsertBinEntry(BamBinMap& binMap, const uint32_t& saveBin, const uint64_t& saveOffset, const uint64_t& lastOffset); - // saves linear offset entry for index - void InsertLinearOffset(LinearOffsetVector& offsets, const BamAlignment& bAlignment, const uint64_t& lastOffset); - // simplifies index by merging 'chunks' - void MergeChunks(void); - -}; - -BamDefaultIndex::BamDefaultIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian) - : BamIndex(bgzf, reader, isBigEndian) -{ - d = new BamDefaultIndexPrivate(this); -} - -BamDefaultIndex::~BamDefaultIndex(void) { - d->m_indexData.clear(); - delete d; - d = 0; -} - -// calculate bins that overlap region -int BamDefaultIndex::BamDefaultIndexPrivate::BinsFromRegion(const BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[MAX_BIN]) { - - // get region boundaries - uint32_t begin = (unsigned int)region.LeftPosition; - uint32_t end; - - // if right bound specified AND left&right bounds are on same reference - // OK to use right bound position - if ( isRightBoundSpecified && ( region.LeftRefID == region.RightRefID ) ) - end = (unsigned int)region.RightPosition; - - // otherwise, use end of left bound reference as cutoff - else - end = (unsigned int)m_parent->m_references.at(region.LeftRefID).RefLength - 1; - - // initialize list, bin '0' always a valid bin - int i = 0; - bins[i++] = 0; - - // get rest of bins that contain this region - unsigned int k; - for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { bins[i++] = k; } - for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { bins[i++] = k; } - for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { bins[i++] = k; } - for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { bins[i++] = k; } - for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { bins[i++] = k; } - - // return number of bins stored - return i; -} - -bool BamDefaultIndex::Build(void) { - - // be sure reader & BGZF file are valid & open for reading - if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) - return false; - - // move file pointer to beginning of alignments - m_reader->Rewind(); - - // get reference count, reserve index space - int numReferences = (int)m_references.size(); - for ( int i = 0; i < numReferences; ++i ) { - d->m_indexData.push_back(ReferenceIndex()); - } - - // sets default constant for bin, ID, offset, coordinate variables - const uint32_t defaultValue = 0xffffffffu; - - // bin data - uint32_t saveBin(defaultValue); - uint32_t lastBin(defaultValue); - - // reference ID data - int32_t saveRefID(defaultValue); - int32_t lastRefID(defaultValue); - - // offset data - uint64_t saveOffset = m_BGZF->Tell(); - uint64_t lastOffset = saveOffset; - - // coordinate data - int32_t lastCoordinate = defaultValue; - - BamAlignment bAlignment; - while ( m_reader->GetNextAlignmentCore(bAlignment) ) { - - // change of chromosome, save ID, reset bin - if ( lastRefID != bAlignment.RefID ) { - lastRefID = bAlignment.RefID; - lastBin = defaultValue; - } - - // if lastCoordinate greater than BAM position - file not sorted properly - else if ( lastCoordinate > bAlignment.Position ) { - printf("BAM file not properly sorted:\n"); - printf("Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(), lastCoordinate, bAlignment.Position, bAlignment.RefID); - exit(1); - } - - // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions) - if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) { - - // save linear offset entry (matched to BAM entry refID) - ReferenceIndex& refIndex = d->m_indexData.at(bAlignment.RefID); - LinearOffsetVector& offsets = refIndex.Offsets; - d->InsertLinearOffset(offsets, bAlignment, lastOffset); - } - - // if current BamAlignment bin != lastBin, "then possibly write the binning index" - if ( bAlignment.Bin != lastBin ) { - - // if not first time through - if ( saveBin != defaultValue ) { - - // save Bam bin entry - ReferenceIndex& refIndex = d->m_indexData.at(saveRefID); - BamBinMap& binMap = refIndex.Bins; - d->InsertBinEntry(binMap, saveBin, saveOffset, lastOffset); - } - - // update saveOffset - saveOffset = lastOffset; - - // update bin values - saveBin = bAlignment.Bin; - lastBin = bAlignment.Bin; - - // update saveRefID - saveRefID = bAlignment.RefID; - - // if invalid RefID, break out (why?) - if ( saveRefID < 0 ) { break; } - } - - // make sure that current file pointer is beyond lastOffset - if ( m_BGZF->Tell() <= (int64_t)lastOffset ) { - printf("Error in BGZF offsets.\n"); - exit(1); - } - - // update lastOffset - lastOffset = m_BGZF->Tell(); - - // update lastCoordinate - lastCoordinate = bAlignment.Position; - } - - // save any leftover BAM data (as long as refID is valid) - if ( saveRefID >= 0 ) { - // save Bam bin entry - ReferenceIndex& refIndex = d->m_indexData.at(saveRefID); - BamBinMap& binMap = refIndex.Bins; - d->InsertBinEntry(binMap, saveBin, saveOffset, lastOffset); - } - - // simplify index by merging chunks - d->MergeChunks(); - - // iterate through references in index - // store whether reference has data & - // sort offsets in linear offset vector - BamDefaultIndexData::iterator indexIter = d->m_indexData.begin(); - BamDefaultIndexData::iterator indexEnd = d->m_indexData.end(); - for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) { - - // get reference index data - ReferenceIndex& refIndex = (*indexIter); - BamBinMap& binMap = refIndex.Bins; - LinearOffsetVector& offsets = refIndex.Offsets; - - // store whether reference has alignments or no - m_references[i].RefHasAlignments = ( binMap.size() > 0 ); - - // sort linear offsets - sort(offsets.begin(), offsets.end()); - } - - // rewind file pointer to beginning of alignments, return success/fail - return m_reader->Rewind(); -} - -bool BamDefaultIndex::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector& offsets) { - - // calculate which bins overlap this region - uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2); - int numBins = d->BinsFromRegion(region, isRightBoundSpecified, bins); - - // get bins for this reference - const ReferenceIndex& refIndex = d->m_indexData.at(region.LeftRefID); - const BamBinMap& binMap = refIndex.Bins; - - // get minimum offset to consider - const LinearOffsetVector& linearOffsets = refIndex.Offsets; - uint64_t minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() ) ? 0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT); - - // store all alignment 'chunk' starts (file offsets) for bins in this region - for ( int i = 0; i < numBins; ++i ) { - - const uint16_t binKey = bins[i]; - map::const_iterator binIter = binMap.find(binKey); - if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) { - - const ChunkVector& chunks = (*binIter).second; - std::vector::const_iterator chunksIter = chunks.begin(); - std::vector::const_iterator chunksEnd = chunks.end(); - for ( ; chunksIter != chunksEnd; ++chunksIter) { - - // if valid chunk found, store its file offset - const Chunk& chunk = (*chunksIter); - if ( chunk.Stop > minOffset ) - offsets.push_back( chunk.Start ); - } - } - } - - // clean up memory - free(bins); - - // sort the offsets before returning - sort(offsets.begin(), offsets.end()); - - // return whether any offsets were found - return ( offsets.size() != 0 ); -} - -// saves BAM bin entry for index -void BamDefaultIndex::BamDefaultIndexPrivate::InsertBinEntry(BamBinMap& binMap, - const uint32_t& saveBin, - const uint64_t& saveOffset, - const uint64_t& lastOffset) -{ - // look up saveBin - BamBinMap::iterator binIter = binMap.find(saveBin); - - // create new chunk - Chunk newChunk(saveOffset, lastOffset); - - // if entry doesn't exist - if ( binIter == binMap.end() ) { - ChunkVector newChunks; - newChunks.push_back(newChunk); - binMap.insert( pair(saveBin, newChunks)); - } - - // otherwise - else { - ChunkVector& binChunks = (*binIter).second; - binChunks.push_back( newChunk ); - } -} - -// saves linear offset entry for index -void BamDefaultIndex::BamDefaultIndexPrivate::InsertLinearOffset(LinearOffsetVector& offsets, - const BamAlignment& bAlignment, - const uint64_t& lastOffset) -{ - // get converted offsets - int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT; - int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT; - - // resize vector if necessary - int oldSize = offsets.size(); - int newSize = endOffset + 1; - if ( oldSize < newSize ) - offsets.resize(newSize, 0); - - // store offset - for( int i = beginOffset + 1; i <= endOffset; ++i ) { - if ( offsets[i] == 0 ) - offsets[i] = lastOffset; - } -} - -bool BamDefaultIndex::Load(const string& filename) { - - // open index file, abort on error - FILE* indexStream = fopen(filename.c_str(), "rb"); - if( !indexStream ) { - printf("ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str()); - return false; - } - - // set placeholder to receive input byte count (suppresses compiler warnings) - size_t elementsRead = 0; - - // see if index is valid BAM index - char magic[4]; - elementsRead = fread(magic, 1, 4, indexStream); - if ( strncmp(magic, "BAI\1", 4) ) { - printf("Problem with index file - invalid format.\n"); - fclose(indexStream); - return false; - } - - // get number of reference sequences - uint32_t numRefSeqs; - elementsRead = fread(&numRefSeqs, 4, 1, indexStream); - if ( m_isBigEndian ) { SwapEndian_32(numRefSeqs); } - - // intialize space for BamDefaultIndexData data structure - d->m_indexData.reserve(numRefSeqs); - - // iterate over reference sequences - for ( unsigned int i = 0; i < numRefSeqs; ++i ) { - - // get number of bins for this reference sequence - int32_t numBins; - elementsRead = fread(&numBins, 4, 1, indexStream); - if ( m_isBigEndian ) { SwapEndian_32(numBins); } - - if ( numBins > 0 ) { - RefData& refEntry = m_references[i]; - refEntry.RefHasAlignments = true; - } - - // intialize BinVector - BamBinMap binMap; - - // iterate over bins for that reference sequence - for ( int j = 0; j < numBins; ++j ) { - - // get binID - uint32_t binID; - elementsRead = fread(&binID, 4, 1, indexStream); - - // get number of regionChunks in this bin - uint32_t numChunks; - elementsRead = fread(&numChunks, 4, 1, indexStream); - - if ( m_isBigEndian ) { - SwapEndian_32(binID); - SwapEndian_32(numChunks); - } - - // intialize ChunkVector - ChunkVector regionChunks; - regionChunks.reserve(numChunks); - - // iterate over regionChunks in this bin - for ( unsigned int k = 0; k < numChunks; ++k ) { - - // get chunk boundaries (left, right) - uint64_t left; - uint64_t right; - elementsRead = fread(&left, 8, 1, indexStream); - elementsRead = fread(&right, 8, 1, indexStream); - - if ( m_isBigEndian ) { - SwapEndian_64(left); - SwapEndian_64(right); - } - - // save ChunkPair - regionChunks.push_back( Chunk(left, right) ); - } - - // sort chunks for this bin - sort( regionChunks.begin(), regionChunks.end(), ChunkLessThan ); - - // save binID, chunkVector for this bin - binMap.insert( pair(binID, regionChunks) ); - } - - // load linear index for this reference sequence - - // get number of linear offsets - int32_t numLinearOffsets; - elementsRead = fread(&numLinearOffsets, 4, 1, indexStream); - if ( m_isBigEndian ) { SwapEndian_32(numLinearOffsets); } - - // intialize LinearOffsetVector - LinearOffsetVector offsets; - offsets.reserve(numLinearOffsets); - - // iterate over linear offsets for this reference sequeence - uint64_t linearOffset; - for ( int j = 0; j < numLinearOffsets; ++j ) { - // read a linear offset & store - elementsRead = fread(&linearOffset, 8, 1, indexStream); - if ( m_isBigEndian ) { SwapEndian_64(linearOffset); } - offsets.push_back(linearOffset); - } - - // sort linear offsets - sort( offsets.begin(), offsets.end() ); - - // store index data for that reference sequence - d->m_indexData.push_back( ReferenceIndex(binMap, offsets) ); - } - - // close index file (.bai) and return - fclose(indexStream); - return true; -} - -// merges 'alignment chunks' in BAM bin (used for index building) -void BamDefaultIndex::BamDefaultIndexPrivate::MergeChunks(void) { - - // iterate over reference enties - BamDefaultIndexData::iterator indexIter = m_indexData.begin(); - BamDefaultIndexData::iterator indexEnd = m_indexData.end(); - for ( ; indexIter != indexEnd; ++indexIter ) { - - // get BAM bin map for this reference - ReferenceIndex& refIndex = (*indexIter); - BamBinMap& bamBinMap = refIndex.Bins; - - // iterate over BAM bins - BamBinMap::iterator binIter = bamBinMap.begin(); - BamBinMap::iterator binEnd = bamBinMap.end(); - for ( ; binIter != binEnd; ++binIter ) { - - // get chunk vector for this bin - ChunkVector& binChunks = (*binIter).second; - if ( binChunks.size() == 0 ) { continue; } - - ChunkVector mergedChunks; - mergedChunks.push_back( binChunks[0] ); - - // iterate over chunks - int i = 0; - ChunkVector::iterator chunkIter = binChunks.begin(); - ChunkVector::iterator chunkEnd = binChunks.end(); - for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) { - - // get 'currentChunk' based on numeric index - Chunk& currentChunk = mergedChunks[i]; - - // get iteratorChunk based on vector iterator - Chunk& iteratorChunk = (*chunkIter); - - // if currentChunk.Stop(shifted) == iterator Chunk.Start(shifted) - if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 ) { - - // set currentChunk.Stop to iteratorChunk.Stop - currentChunk.Stop = iteratorChunk.Stop; - } - - // otherwise - else { - // set currentChunk + 1 to iteratorChunk - mergedChunks.push_back(iteratorChunk); - ++i; - } - } - - // saved merged chunk vector - (*binIter).second = mergedChunks; - } - } -} - -// writes in-memory index data out to file -// N.B. - (this is the original BAM filename, method will modify it to use applicable extension) -bool BamDefaultIndex::Write(const std::string& bamFilename) { - - string indexFilename = bamFilename + ".bai"; - FILE* indexStream = fopen(indexFilename.c_str(), "wb"); - if ( indexStream == 0 ) { - printf("ERROR: Could not open file to save index.\n"); - return false; - } - - // write BAM index header - fwrite("BAI\1", 1, 4, indexStream); - - // write number of reference sequences - int32_t numReferenceSeqs = d->m_indexData.size(); - if ( m_isBigEndian ) { SwapEndian_32(numReferenceSeqs); } - fwrite(&numReferenceSeqs, 4, 1, indexStream); - - // iterate over reference sequences - BamDefaultIndexData::const_iterator indexIter = d->m_indexData.begin(); - BamDefaultIndexData::const_iterator indexEnd = d->m_indexData.end(); - for ( ; indexIter != indexEnd; ++ indexIter ) { - - // get reference index data - const ReferenceIndex& refIndex = (*indexIter); - const BamBinMap& binMap = refIndex.Bins; - const LinearOffsetVector& offsets = refIndex.Offsets; - - // write number of bins - int32_t binCount = binMap.size(); - if ( m_isBigEndian ) { SwapEndian_32(binCount); } - fwrite(&binCount, 4, 1, indexStream); - - // iterate over bins - BamBinMap::const_iterator binIter = binMap.begin(); - BamBinMap::const_iterator binEnd = binMap.end(); - for ( ; binIter != binEnd; ++binIter ) { - - // get bin data (key and chunk vector) - uint32_t binKey = (*binIter).first; - const ChunkVector& binChunks = (*binIter).second; - - // save BAM bin key - if ( m_isBigEndian ) { SwapEndian_32(binKey); } - fwrite(&binKey, 4, 1, indexStream); - - // save chunk count - int32_t chunkCount = binChunks.size(); - if ( m_isBigEndian ) { SwapEndian_32(chunkCount); } - fwrite(&chunkCount, 4, 1, indexStream); - - // iterate over chunks - ChunkVector::const_iterator chunkIter = binChunks.begin(); - ChunkVector::const_iterator chunkEnd = binChunks.end(); - for ( ; chunkIter != chunkEnd; ++chunkIter ) { - - // get current chunk data - const Chunk& chunk = (*chunkIter); - uint64_t start = chunk.Start; - uint64_t stop = chunk.Stop; - - if ( m_isBigEndian ) { - SwapEndian_64(start); - SwapEndian_64(stop); - } - - // save chunk offsets - fwrite(&start, 8, 1, indexStream); - fwrite(&stop, 8, 1, indexStream); - } - } - - // write linear offsets size - int32_t offsetSize = offsets.size(); - if ( m_isBigEndian ) { SwapEndian_32(offsetSize); } - fwrite(&offsetSize, 4, 1, indexStream); - - // iterate over linear offsets - LinearOffsetVector::const_iterator offsetIter = offsets.begin(); - LinearOffsetVector::const_iterator offsetEnd = offsets.end(); - for ( ; offsetIter != offsetEnd; ++offsetIter ) { - - // write linear offset value - uint64_t linearOffset = (*offsetIter); - if ( m_isBigEndian ) { SwapEndian_64(linearOffset); } - fwrite(&linearOffset, 8, 1, indexStream); - } - } - - // flush buffer, close file, and return success - fflush(indexStream); - fclose(indexStream); - return true; -} - -// ######################################################################################### -// ######################################################################################### - -// ------------------------------------- -// BamToolsIndex implementation - -namespace BamTools { - -struct BamToolsIndexEntry { - - // data members - int64_t Offset; - int RefID; - int Position; - - // ctor - BamToolsIndexEntry(const uint64_t& offset = 0, - const int& id = -1, - const int& position = -1) - : Offset(offset) - , RefID(id) - , Position(position) - { } -}; - -typedef vector BamToolsIndexData; - -} // namespace BamTools - -struct BamToolsIndex::BamToolsIndexPrivate { - - // ------------------------- - // data members - BamToolsIndexData m_indexData; - BamToolsIndex* m_parent; - int32_t m_blockSize; - - // ------------------------- - // ctor & dtor - - BamToolsIndexPrivate(BamToolsIndex* parent) - : m_parent(parent) - , m_blockSize(1000) - { } - - ~BamToolsIndexPrivate(void) { } - - // ------------------------- - // internal methods -}; - -BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian) - : BamIndex(bgzf, reader, isBigEndian) -{ - d = new BamToolsIndexPrivate(this); -} - -BamToolsIndex::~BamToolsIndex(void) { - delete d; - d = 0; -} - -bool BamToolsIndex::Build(void) { - - // be sure reader & BGZF file are valid & open for reading - if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) - return false; - - // move file pointer to beginning of alignments - m_reader->Rewind(); - - // plow through alignments, store block offsets - int32_t currentBlockCount = 0; - int64_t blockStartOffset = m_BGZF->Tell(); - int blockStartId = -1; - int blockStartPosition = -1; - BamAlignment al; - while ( m_reader->GetNextAlignmentCore(al) ) { - - // set reference flag - m_references[al.RefID].RefHasAlignments = true; - - // if beginning of block, save first alignment's refID & position - if ( currentBlockCount == 0 ) { - blockStartId = al.RefID; - blockStartPosition = al.Position; - } - - // increment block counter - ++currentBlockCount; - - // if block is full, get offset for next block, reset currentBlockCount - if ( currentBlockCount == d->m_blockSize ) { - - d->m_indexData.push_back( BamToolsIndexEntry(blockStartOffset, blockStartId, blockStartPosition) ); - blockStartOffset = m_BGZF->Tell(); - currentBlockCount = 0; - } - } - - return m_reader->Rewind(); -} - -// N.B. - ignores isRightBoundSpecified -bool BamToolsIndex::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector& offsets) { - - // return false if no index data present - if ( d->m_indexData.empty() ) return false; - - // clear any prior data - offsets.clear(); - - // calculate nearest index to jump to - int64_t previousOffset = -1; - BamToolsIndexData::const_iterator indexIter = d->m_indexData.begin(); - BamToolsIndexData::const_iterator indexEnd = d->m_indexData.end(); - for ( ; indexIter != indexEnd; ++indexIter ) { - - const BamToolsIndexEntry& entry = (*indexIter); - - // check if we are 'past' beginning of desired region - // if so, we will break out & use previously stored offset - if ( entry.RefID > region.LeftRefID ) break; - if ( (entry.RefID == region.LeftRefID) && (entry.Position > region.LeftPosition) ) break; - - // not past desired region, so store current entry offset in previousOffset - previousOffset = entry.Offset; - } - - // no index was found - if ( previousOffset == -1 ) - return false; - - // store offset & return success - offsets.push_back(previousOffset); - return true; -} - -bool BamToolsIndex::Load(const string& filename) { - - // open index file, abort on error - FILE* indexStream = fopen(filename.c_str(), "rb"); - if( !indexStream ) { - printf("ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str()); - return false; - } - - // set placeholder to receive input byte count (suppresses compiler warnings) - size_t elementsRead = 0; - - // see if index is valid BAM index - char magic[4]; - elementsRead = fread(magic, 1, 4, indexStream); - if ( strncmp(magic, "BTI\1", 4) ) { - printf("Problem with index file - invalid format.\n"); - fclose(indexStream); - return false; - } - - // read in block size - elementsRead = fread(&d->m_blockSize, sizeof(d->m_blockSize), 1, indexStream); - if ( m_isBigEndian ) { SwapEndian_32(d->m_blockSize); } - - // read in number of offsets - uint32_t numOffsets; - elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, indexStream); - if ( m_isBigEndian ) { SwapEndian_32(numOffsets); } - - // reserve space for index data - d->m_indexData.reserve(numOffsets); - - // iterate over index entries - for ( unsigned int i = 0; i < numOffsets; ++i ) { - - uint64_t offset; - int id; - int position; - - // read in data - elementsRead = fread(&offset, sizeof(offset), 1, indexStream); - elementsRead = fread(&id, sizeof(id), 1, indexStream); - elementsRead = fread(&position, sizeof(position), 1, indexStream); - - // swap endian-ness if necessary - if ( m_isBigEndian ) { - SwapEndian_64(offset); - SwapEndian_32(id); - SwapEndian_32(position); - } - - // save reference index entry - d->m_indexData.push_back( BamToolsIndexEntry(offset, id, position) ); - - // set reference flag - m_references[id].RefHasAlignments = true; // what about sparse references? wont be able to set flag? - } - - // close index file and return - fclose(indexStream); - return true; -} - -// writes in-memory index data out to file -// N.B. - (this is the original BAM filename, method will modify it to use applicable extension) -bool BamToolsIndex::Write(const std::string& bamFilename) { - - string indexFilename = bamFilename + ".bti"; - FILE* indexStream = fopen(indexFilename.c_str(), "wb"); - if ( indexStream == 0 ) { - printf("ERROR: Could not open file to save index.\n"); - return false; - } - - // write BAM index header - fwrite("BTI\1", 1, 4, indexStream); - - // write block size - int32_t blockSize = d->m_blockSize; - if ( m_isBigEndian ) { SwapEndian_32(blockSize); } - fwrite(&blockSize, sizeof(blockSize), 1, indexStream); - - // write number of offset entries - uint32_t numOffsets = d->m_indexData.size(); - if ( m_isBigEndian ) { SwapEndian_32(numOffsets); } - fwrite(&numOffsets, sizeof(numOffsets), 1, indexStream); - - // iterate over offset entries - BamToolsIndexData::const_iterator indexIter = d->m_indexData.begin(); - BamToolsIndexData::const_iterator indexEnd = d->m_indexData.end(); - for ( ; indexIter != indexEnd; ++ indexIter ) { - - // get reference index data - const BamToolsIndexEntry& entry = (*indexIter); - - // copy entry data - uint64_t offset = entry.Offset; - int id = entry.RefID; - int position = entry.Position; - - // swap endian-ness if necessary - if ( m_isBigEndian ) { - SwapEndian_64(offset); - SwapEndian_32(id); - SwapEndian_32(position); - } - - // write the reference index entry - fwrite(&offset, sizeof(offset), 1, indexStream); - fwrite(&id, sizeof(id), 1, indexStream); - fwrite(&position, sizeof(position), 1, indexStream); - } - - // flush file buffer, close file, and return success - fflush(indexStream); - fclose(indexStream); - return true; -} diff --git a/BamIndex.h b/BamIndex.h deleted file mode 100644 index b9ce7d0..0000000 --- a/BamIndex.h +++ /dev/null @@ -1,120 +0,0 @@ -// *************************************************************************** -// BamIndex.h (c) 2009 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 17 August 2010 (DB) -// --------------------------------------------------------------------------- -// Provides index functionality - both for the default (standardized) BAM -// index format (.bai) as well as a BamTools-specific (nonstandard) index -// format (.bti). -// *************************************************************************** - -#ifndef BAM_INDEX_H -#define BAM_INDEX_H - -#include -#include -#include "BamAux.h" - -namespace BamTools { - -class BamReader; -class BgzfData; - -// -------------------------------------------------- -// BamIndex base class -class BamIndex { - - public: - BamIndex(BamTools::BgzfData* bgzf, - BamTools::BamReader* reader, - bool isBigEndian); - virtual ~BamIndex(void) { } - - public: - // creates index data (in-memory) from current reader data - virtual bool Build(void) =0; - // calculates offset(s) for a given region - virtual bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector& offsets) =0; - // loads existing data from file into memory - virtual bool Load(const std::string& filename) =0; - // returns whether reference has alignments or no - virtual bool HasAlignments(const int& referenceID); - // writes in-memory index data out to file - // N.B. - (this is the original BAM filename, method will modify it to use applicable extension) - virtual bool Write(const std::string& bamFilename) =0; - - protected: - BamTools::BgzfData* m_BGZF; - BamTools::BamReader* m_reader; - BamTools::RefVector m_references; - bool m_isBigEndian; -}; - -// -------------------------------------------------- -// BamDefaultIndex class -// -// implements default (per SAM/BAM spec) index file ops -class BamDefaultIndex : public BamIndex { - - - // ctor & dtor - public: - BamDefaultIndex(BamTools::BgzfData* bgzf, - BamTools::BamReader* reader, - bool isBigEndian); - ~BamDefaultIndex(void); - - // interface (implements BamIndex virtual methods) - public: - // creates index data (in-memory) from current reader data - bool Build(void); - // calculates offset(s) for a given region - bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector& offsets); - // loads existing data from file into memory - bool Load(const std::string& filename); - // writes in-memory index data out to file - // N.B. - (this is the original BAM filename, method will modify it to use applicable extension) - bool Write(const std::string& bamFilename); - - // internal implementation - private: - struct BamDefaultIndexPrivate; - BamDefaultIndexPrivate* d; -}; - -// -------------------------------------------------- -// BamToolsIndex class -// -// implements BamTools-specific index file ops -class BamToolsIndex : public BamIndex { - - // ctor & dtor - public: - BamToolsIndex(BamTools::BgzfData* bgzf, - BamTools::BamReader* reader, - bool isBigEndian); - ~BamToolsIndex(void); - - // interface (implements BamIndex virtual methods) - public: - // creates index data (in-memory) from current reader data - bool Build(void); - // calculates offset(s) for a given region - bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector& offsets); - // loads existing data from file into memory - bool Load(const std::string& filename); - // writes in-memory index data out to file - // N.B. - (this is the original BAM filename, method will modify it to use applicable extension) - bool Write(const std::string& bamFilename); - - // internal implementation - private: - struct BamToolsIndexPrivate; - BamToolsIndexPrivate* d; -}; - -} // namespace BamTools - -#endif // BAM_INDEX_H \ No newline at end of file diff --git a/BamMultiReader.cpp b/BamMultiReader.cpp deleted file mode 100644 index 005b0b0..0000000 --- a/BamMultiReader.cpp +++ /dev/null @@ -1,420 +0,0 @@ -// *************************************************************************** -// BamMultiReader.cpp (c) 2010 Erik Garrison -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 20 July 2010 (DB) -// --------------------------------------------------------------------------- -// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad -// Institute. -// --------------------------------------------------------------------------- -// Functionality for simultaneously reading multiple BAM files. -// -// This functionality allows applications to work on very large sets of files -// without requiring intermediate merge, sort, and index steps for each file -// subset. It also improves the performance of our merge system as it -// precludes the need to sort merged files. -// *************************************************************************** - -// C++ includes -#include -#include -#include -#include -#include -#include - -// BamTools includes -#include "BGZF.h" -#include "BamMultiReader.h" -using namespace BamTools; -using namespace std; - -// ----------------------------------------------------- -// BamMultiReader implementation -// ----------------------------------------------------- - -// constructor -BamMultiReader::BamMultiReader(void) - : CurrentRefID(0) - , CurrentLeft(0) -{ } - -// destructor -BamMultiReader::~BamMultiReader(void) { - Close(); // close the bam files - // clean up reader objects - for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { - delete it->first; - delete it->second; - } -} - -// close the BAM files -void BamMultiReader::Close(void) { - for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* reader = it->first; - reader->Close(); // close the reader - } -} - -// updates the reference id stored in the BamMultiReader -// to reflect the current state of the readers -void BamMultiReader::UpdateReferenceID(void) { - // the alignments are sorted by position, so the first alignment will always have the lowest reference ID - if (alignments.begin()->second.second->RefID != CurrentRefID) { - // get the next reference id - // while there aren't any readers at the next ref id - // increment the ref id - int nextRefID = CurrentRefID; - while (alignments.begin()->second.second->RefID != nextRefID) { - ++nextRefID; - } - //cerr << "updating reference id from " << CurrentRefID << " to " << nextRefID << endl; - CurrentRefID = nextRefID; - } -} - -// checks if any readers still have alignments -bool BamMultiReader::HasOpenReaders() { - return alignments.size() > 0; -} - -// get next alignment among all files -bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) { - - // bail out if we are at EOF in all files, means no more alignments to process - if (!HasOpenReaders()) - return false; - - // when all alignments have stepped into a new target sequence, update our - // current reference sequence id - UpdateReferenceID(); - - // our lowest alignment and reader will be at the front of our alignment index - BamAlignment* alignment = alignments.begin()->second.second; - BamReader* reader = alignments.begin()->second.first; - - // now that we have the lowest alignment in the set, save it by copy to our argument - nextAlignment = BamAlignment(*alignment); - - // remove this alignment index entry from our alignment index - alignments.erase(alignments.begin()); - - // and add another entry if we can get another alignment from the reader - if (reader->GetNextAlignment(*alignment)) { - alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position), - make_pair(reader, alignment))); - } else { // do nothing - //cerr << "reached end of file " << lowestReader->GetFilename() << endl; - } - - return true; - -} - -// get next alignment among all files without parsing character data from alignments -bool BamMultiReader::GetNextAlignmentCore(BamAlignment& nextAlignment) { - - // bail out if we are at EOF in all files, means no more alignments to process - if (!HasOpenReaders()) - return false; - - // when all alignments have stepped into a new target sequence, update our - // current reference sequence id - UpdateReferenceID(); - - // our lowest alignment and reader will be at the front of our alignment index - BamAlignment* alignment = alignments.begin()->second.second; - BamReader* reader = alignments.begin()->second.first; - - // now that we have the lowest alignment in the set, save it by copy to our argument - nextAlignment = BamAlignment(*alignment); - //memcpy(&nextAlignment, alignment, sizeof(BamAlignment)); - - // remove this alignment index entry from our alignment index - alignments.erase(alignments.begin()); - - // and add another entry if we can get another alignment from the reader - if (reader->GetNextAlignmentCore(*alignment)) { - alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position), - make_pair(reader, alignment))); - } else { // do nothing - //cerr << "reached end of file " << lowestReader->GetFilename() << endl; - } - - return true; - -} - -// jumps to specified region(refID, leftBound) in BAM files, returns success/fail -bool BamMultiReader::Jump(int refID, int position) { - - //if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) { - CurrentRefID = refID; - CurrentLeft = position; - - bool result = true; - for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* reader = it->first; - result &= reader->Jump(refID, position); - if (!result) { - cerr << "ERROR: could not jump " << reader->GetFilename() << " to " << refID << ":" << position << endl; - exit(1); - } - } - if (result) UpdateAlignments(); - return result; -} - -bool BamMultiReader::SetRegion(const int& leftRefID, const int& leftPosition, const int& rightRefID, const int& rightPosition) { - - BamRegion region(leftRefID, leftPosition, rightRefID, rightPosition); - - return SetRegion(region); - -} - -bool BamMultiReader::SetRegion(const BamRegion& region) { - - Region = region; - - // NB: While it may make sense to track readers in which we can - // successfully SetRegion, In practice a failure of SetRegion means "no - // alignments here." It makes sense to simply accept the failure, - // UpdateAlignments(), and continue. - - for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { - it->first->SetRegion(region); - } - - UpdateAlignments(); - - return true; - -} - -void BamMultiReader::UpdateAlignments(void) { - // Update Alignments - alignments.clear(); - for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* br = it->first; - BamAlignment* ba = it->second; - if (br->GetNextAlignment(*ba)) { - alignments.insert(make_pair(make_pair(ba->RefID, ba->Position), - make_pair(br, ba))); - } else { - // assume BamReader end of region / EOF - } - } -} - -// opens BAM files -bool BamMultiReader::Open(const vector filenames, bool openIndexes, bool coreMode, bool useDefaultIndex) { - - // for filename in filenames - fileNames = filenames; // save filenames in our multireader - for (vector::const_iterator it = filenames.begin(); it != filenames.end(); ++it) { - string filename = *it; - BamReader* reader = new BamReader; - - bool openedOK = true; - if (openIndexes) { - if (useDefaultIndex) - openedOK = reader->Open(filename, filename + ".bai"); - else - openedOK = reader->Open(filename, filename + ".bti"); - } else { - openedOK = reader->Open(filename); // for merging, jumping is disallowed - } - - // if file opened ok, check that it can be read - if ( openedOK ) { - - bool fileOK = true; - BamAlignment* alignment = new BamAlignment; - if (coreMode) { - fileOK &= reader->GetNextAlignmentCore(*alignment); - } else { - fileOK &= reader->GetNextAlignment(*alignment); - } - - if (fileOK) { - readers.push_back(make_pair(reader, alignment)); // store pointers to our readers for cleanup - alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position), - make_pair(reader, alignment))); - } else { - cerr << "WARNING: could not read first alignment in " << filename << ", ignoring file" << endl; - // if only file available & could not be read, return failure - if ( filenames.size() == 1 ) return false; - } - - } - - // TODO; any more error handling on openedOK ?? - else - return false; - } - - // files opened ok, at least one alignment could be read, - // now need to check that all files use same reference data - ValidateReaders(); - return true; -} - -void BamMultiReader::PrintFilenames(void) { - for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* reader = it->first; - cout << reader->GetFilename() << endl; - } -} - -// for debugging -void BamMultiReader::DumpAlignmentIndex(void) { - for (AlignmentIndex::const_iterator it = alignments.begin(); it != alignments.end(); ++it) { - cerr << it->first.first << ":" << it->first.second << " " << it->second.first->GetFilename() << endl; - } -} - -// returns BAM file pointers to beginning of alignment data -bool BamMultiReader::Rewind(void) { - bool result = true; - for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* reader = it->first; - result &= reader->Rewind(); - } - return result; -} - -// saves index data to BAM index files (".bai"/".bti") where necessary, returns success/fail -bool BamMultiReader::CreateIndexes(bool useDefaultIndex) { - bool result = true; - for (vector >::iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* reader = it->first; - result &= reader->CreateIndex(useDefaultIndex); - } - return result; -} - -// makes a virtual, unified header for all the bam files in the multireader -const string BamMultiReader::GetHeaderText(void) const { - - string mergedHeader = ""; - map readGroups; - - // foreach extraction entry (each BAM file) - for (vector >::const_iterator rs = readers.begin(); rs != readers.end(); ++rs) { - - map currentFileReadGroups; - - BamReader* reader = rs->first; - - stringstream header(reader->GetHeaderText()); - vector lines; - string item; - while (getline(header, item)) - lines.push_back(item); - - for (vector::const_iterator it = lines.begin(); it != lines.end(); ++it) { - - // get next line from header, skip if empty - string headerLine = *it; - if ( headerLine.empty() ) { continue; } - - // if first file, save HD & SQ entries - if ( rs == readers.begin() ) { - if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) { - mergedHeader.append(headerLine.c_str()); - mergedHeader.append(1, '\n'); - } - } - - // (for all files) append RG entries if they are unique - if ( headerLine.find("@RG") == 0 ) { - stringstream headerLineSs(headerLine); - string part, readGroupPart, readGroup; - while(std::getline(headerLineSs, part, '\t')) { - stringstream partSs(part); - string subtag; - std::getline(partSs, subtag, ':'); - if (subtag == "ID") { - std::getline(partSs, readGroup, ':'); - break; - } - } - if (readGroups.find(readGroup) == readGroups.end()) { // prevents duplicate @RG entries - mergedHeader.append(headerLine.c_str() ); - mergedHeader.append(1, '\n'); - readGroups[readGroup] = true; - currentFileReadGroups[readGroup] = true; - } else { - // warn iff we are reading one file and discover duplicated @RG tags in the header - // otherwise, we emit no warning, as we might be merging multiple BAM files with identical @RG tags - if (currentFileReadGroups.find(readGroup) != currentFileReadGroups.end()) { - cerr << "WARNING: duplicate @RG tag " << readGroup - << " entry in header of " << reader->GetFilename() << endl; - } - } - } - } - } - - // return merged header text - return mergedHeader; -} - -// ValidateReaders checks that all the readers point to BAM files representing -// alignments against the same set of reference sequences, and that the -// sequences are identically ordered. If these checks fail the operation of -// the multireader is undefined, so we force program exit. -void BamMultiReader::ValidateReaders(void) const { - int firstRefCount = readers.front().first->GetReferenceCount(); - BamTools::RefVector firstRefData = readers.front().first->GetReferenceData(); - for (vector >::const_iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* reader = it->first; - BamTools::RefVector currentRefData = reader->GetReferenceData(); - BamTools::RefVector::const_iterator f = firstRefData.begin(); - BamTools::RefVector::const_iterator c = currentRefData.begin(); - if (reader->GetReferenceCount() != firstRefCount || firstRefData.size() != currentRefData.size()) { - cerr << "ERROR: mismatched number of references in " << reader->GetFilename() - << " expected " << firstRefCount - << " reference sequences but only found " << reader->GetReferenceCount() << endl; - exit(1); - } - // this will be ok; we just checked above that we have identically-sized sets of references - // here we simply check if they are all, in fact, equal in content - while (f != firstRefData.end()) { - if (f->RefName != c->RefName || f->RefLength != c->RefLength) { - cerr << "ERROR: mismatched references found in " << reader->GetFilename() - << " expected: " << endl; - for (BamTools::RefVector::const_iterator a = firstRefData.begin(); a != firstRefData.end(); ++a) - cerr << a->RefName << " " << a->RefLength << endl; - cerr << "but found: " << endl; - for (BamTools::RefVector::const_iterator a = currentRefData.begin(); a != currentRefData.end(); ++a) - cerr << a->RefName << " " << a->RefLength << endl; - exit(1); - } - ++f; ++c; - } - } -} - -// NB: The following functions assume that we have identical references for all -// BAM files. We enforce this by invoking the above validation function -// (ValidateReaders) to verify that our reference data is the same across all -// files on Open, so we will not encounter a situation in which there is a -// mismatch and we are still live. - -// returns the number of reference sequences -const int BamMultiReader::GetReferenceCount(void) const { - return readers.front().first->GetReferenceCount(); -} - -// returns vector of reference objects -const BamTools::RefVector BamMultiReader::GetReferenceData(void) const { - return readers.front().first->GetReferenceData(); -} - -const int BamMultiReader::GetReferenceID(const string& refName) const { - return readers.front().first->GetReferenceID(refName); -} diff --git a/BamMultiReader.h b/BamMultiReader.h deleted file mode 100644 index bd36d71..0000000 --- a/BamMultiReader.h +++ /dev/null @@ -1,133 +0,0 @@ -// *************************************************************************** -// BamMultiReader.h (c) 2010 Erik Garrison -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 20 July 2010 (DB) -// --------------------------------------------------------------------------- -// Functionality for simultaneously reading multiple BAM files -// *************************************************************************** - -#ifndef BAMMULTIREADER_H -#define BAMMULTIREADER_H - -// C++ includes -#include -#include -#include // for pair -#include - -using namespace std; - -// BamTools includes -#include "BamAux.h" -#include "BamReader.h" - -namespace BamTools { - -// index mapping reference/position pairings to bamreaders and their alignments -typedef multimap, pair > AlignmentIndex; - - -class BamMultiReader { - - // constructor / destructor - public: - BamMultiReader(void); - ~BamMultiReader(void); - - // public interface - public: - - // positioning - int CurrentRefID; - int CurrentLeft; - - // region under analysis, specified using SetRegion - BamRegion Region; - - // ---------------------- - // BAM file operations - // ---------------------- - - // close BAM files - void Close(void); - - // opens BAM files (and optional BAM index files, if provided) - // @openIndexes - triggers index opening, useful for suppressing - // error messages during merging of files in which we may not have - // indexes. - // @coreMode - setup our first alignments using GetNextAlignmentCore(); - // also useful for merging - bool Open(const vector filenames, bool openIndexes = true, bool coreMode = false, bool useDefaultIndex = true); - - // performs random-access jump to reference, position - bool Jump(int refID, int position = 0); - - // sets the target region - bool SetRegion(const BamRegion& region); - bool SetRegion(const int&, const int&, const int&, const int&); // convenience function to above - - // returns file pointers to beginning of alignments - bool Rewind(void); - - // ---------------------- - // access alignment data - // ---------------------- - // updates the reference id marker to match the lower limit of our readers - void UpdateReferenceID(void); - - // retrieves next available alignment (returns success/fail) from all files - bool GetNextAlignment(BamAlignment&); - // retrieves next available alignment (returns success/fail) from all files - // and populates the support data with information about the alignment - // *** BUT DOES NOT PARSE CHARACTER DATA FROM THE ALIGNMENT - bool GetNextAlignmentCore(BamAlignment&); - // ... should this be private? - bool HasOpenReaders(void); - - // ---------------------- - // access auxiliary data - // ---------------------- - - // returns unified SAM header text for all files - const string GetHeaderText(void) const; - // returns number of reference sequences - const int GetReferenceCount(void) const; - // returns vector of reference objects - const BamTools::RefVector GetReferenceData(void) const; - // returns reference id (used for BamMultiReader::Jump()) for the given reference name - const int GetReferenceID(const std::string& refName) const; - // validates that we have a congruent set of BAM files that are aligned against the same reference sequences - void ValidateReaders() const; - - // ---------------------- - // BAM index operations - // ---------------------- - - // creates index for BAM files which lack them, saves to files (default = bamFilename + ".bai") - bool CreateIndexes(bool useDefaultIndex = true); - - //const int GetReferenceID(const string& refName) const; - - // utility - void PrintFilenames(void); - void DumpAlignmentIndex(void); - void UpdateAlignments(void); // updates our alignment cache - - // private implementation - private: - - // the set of readers and alignments which we operate on, maintained throughout the life of this class - vector > readers; - - // readers and alignments sorted by reference id and position, to keep track of the lowest (next) alignment - // when a reader reaches EOF, its entry is removed from this index - AlignmentIndex alignments; - - vector fileNames; -}; - -} // namespace BamTools - -#endif // BAMMULTIREADER_H diff --git a/BamReader.cpp b/BamReader.cpp deleted file mode 100644 index b1b1b5d..0000000 --- a/BamReader.cpp +++ /dev/null @@ -1,773 +0,0 @@ -// *************************************************************************** -// BamReader.cpp (c) 2009 Derek Barnett, Michael Str�mberg -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 15 July 2010 (DB) -// --------------------------------------------------------------------------- -// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad -// Institute. -// --------------------------------------------------------------------------- -// Provides the basic functionality for reading BAM files -// *************************************************************************** - -// C++ includes -#include -#include -#include -#include -#include - -// BamTools includes -#include "BGZF.h" -#include "BamReader.h" -#include "BamIndex.h" -using namespace BamTools; -using namespace std; - -struct BamReader::BamReaderPrivate { - - // ------------------------------- - // structs, enums, typedefs - // ------------------------------- - enum RegionState { BEFORE_REGION = 0 - , WITHIN_REGION - , AFTER_REGION - }; - - // ------------------------------- - // data members - // ------------------------------- - - // general file data - BgzfData mBGZF; - string HeaderText; - //BamIndex Index; - BamIndex* NewIndex; - RefVector References; - bool IsIndexLoaded; - int64_t AlignmentsBeginOffset; - string Filename; - string IndexFilename; - - // system data - bool IsBigEndian; - - // user-specified region values - BamRegion Region; - bool IsLeftBoundSpecified; - bool IsRightBoundSpecified; - - bool IsRegionSpecified; - int CurrentRefID; - int CurrentLeft; - - // parent BamReader - BamReader* Parent; - - // BAM character constants - const char* DNA_LOOKUP; - const char* CIGAR_LOOKUP; - - // ------------------------------- - // constructor & destructor - // ------------------------------- - BamReaderPrivate(BamReader* parent); - ~BamReaderPrivate(void); - - // ------------------------------- - // "public" interface - // ------------------------------- - - // file operations - void Close(void); - bool Jump(int refID, int position = 0); - bool Open(const string& filename, const string& indexFilename = ""); - bool Rewind(void); - bool SetRegion(const BamRegion& region); - - // access alignment data - bool GetNextAlignment(BamAlignment& bAlignment); - bool GetNextAlignmentCore(BamAlignment& bAlignment); - - // access auxiliary data - int GetReferenceID(const string& refName) const; - - // index operations - bool CreateIndex(bool useDefaultIndex); - - // ------------------------------- - // internal methods - // ------------------------------- - - // *** reading alignments and auxiliary data *** // - - // fills out character data for BamAlignment data - bool BuildCharData(BamAlignment& bAlignment); - // checks to see if alignment overlaps current region - RegionState IsOverlap(BamAlignment& bAlignment); - // retrieves header text from BAM file - void LoadHeaderData(void); - // retrieves BAM alignment under file pointer - bool LoadNextAlignment(BamAlignment& bAlignment); - // builds reference data structure from BAM file - void LoadReferenceData(void); - - // *** index file handling *** // - - // clear out inernal index data structure - void ClearIndex(void); - // loads index from BAM index file - bool LoadIndex(void); -}; - -// ----------------------------------------------------- -// BamReader implementation (wrapper around BRPrivate) -// ----------------------------------------------------- -// constructor -BamReader::BamReader(void) { - d = new BamReaderPrivate(this); -} - -// destructor -BamReader::~BamReader(void) { - delete d; - d = 0; -} - -// file operations -void BamReader::Close(void) { d->Close(); } -bool BamReader::IsOpen(void) const { return d->mBGZF.IsOpen; } -bool BamReader::Jump(int refID, int position) { - d->Region.LeftRefID = refID; - d->Region.LeftPosition = position; - d->IsLeftBoundSpecified = true; - d->IsRightBoundSpecified = false; - return d->Jump(refID, position); -} -bool BamReader::Open(const string& filename, const string& indexFilename) { return d->Open(filename, indexFilename); } -bool BamReader::Rewind(void) { return d->Rewind(); } -bool BamReader::SetRegion(const BamRegion& region) { return d->SetRegion(region); } -bool BamReader::SetRegion(const int& leftRefID, const int& leftBound, const int& rightRefID, const int& rightBound) { - return d->SetRegion( BamRegion(leftRefID, leftBound, rightRefID, rightBound) ); -} - -// access alignment data -bool BamReader::GetNextAlignment(BamAlignment& bAlignment) { return d->GetNextAlignment(bAlignment); } -bool BamReader::GetNextAlignmentCore(BamAlignment& bAlignment) { return d->GetNextAlignmentCore(bAlignment); } - -// access auxiliary data -const string BamReader::GetHeaderText(void) const { return d->HeaderText; } -int BamReader::GetReferenceCount(void) const { return d->References.size(); } -const RefVector& BamReader::GetReferenceData(void) const { return d->References; } -int BamReader::GetReferenceID(const string& refName) const { return d->GetReferenceID(refName); } -const std::string BamReader::GetFilename(void) const { return d->Filename; } - -// index operations -bool BamReader::CreateIndex(bool useDefaultIndex) { return d->CreateIndex(useDefaultIndex); } - -// ----------------------------------------------------- -// BamReaderPrivate implementation -// ----------------------------------------------------- - -// constructor -BamReader::BamReaderPrivate::BamReaderPrivate(BamReader* parent) - : NewIndex(0) - , IsIndexLoaded(false) - , AlignmentsBeginOffset(0) - , IsLeftBoundSpecified(false) - , IsRightBoundSpecified(false) - , IsRegionSpecified(false) - , CurrentRefID(0) - , CurrentLeft(0) - , Parent(parent) - , DNA_LOOKUP("=ACMGRSVTWYHKDBN") - , CIGAR_LOOKUP("MIDNSHP") -{ - IsBigEndian = SystemIsBigEndian(); -} - -// destructor -BamReader::BamReaderPrivate::~BamReaderPrivate(void) { - Close(); -} - -bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) { - - // calculate character lengths/offsets - const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE; - const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength; - const unsigned int seqDataOffset = bAlignment.SupportData.QueryNameLength + (bAlignment.SupportData.NumCigarOperations * 4); - const unsigned int qualDataOffset = seqDataOffset + (bAlignment.SupportData.QuerySequenceLength+1)/2; - const unsigned int tagDataOffset = qualDataOffset + bAlignment.SupportData.QuerySequenceLength; - const unsigned int tagDataLength = dataLength - tagDataOffset; - - // set up char buffers - const char* allCharData = bAlignment.SupportData.AllCharData.data(); - uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset); - const char* seqData = ((const char*)allCharData) + seqDataOffset; - const char* qualData = ((const char*)allCharData) + qualDataOffset; - char* tagData = ((char*)allCharData) + tagDataOffset; - - // store alignment name (depends on null char as terminator) - bAlignment.Name.assign((const char*)(allCharData)); - - // save CigarOps - CigarOp op; - bAlignment.CigarData.clear(); - bAlignment.CigarData.reserve(bAlignment.SupportData.NumCigarOperations); - for (unsigned int i = 0; i < bAlignment.SupportData.NumCigarOperations; ++i) { - - // swap if necessary - if ( IsBigEndian ) { SwapEndian_32(cigarData[i]); } - - // build CigarOp structure - op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT); - op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ]; - - // save CigarOp - bAlignment.CigarData.push_back(op); - } - - - // save query sequence - bAlignment.QueryBases.clear(); - bAlignment.QueryBases.reserve(bAlignment.SupportData.QuerySequenceLength); - for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) { - char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ]; - bAlignment.QueryBases.append(1, singleBase); - } - - // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character - bAlignment.Qualities.clear(); - bAlignment.Qualities.reserve(bAlignment.SupportData.QuerySequenceLength); - for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) { - char singleQuality = (char)(qualData[i]+33); - bAlignment.Qualities.append(1, singleQuality); - } - - // if QueryBases is empty (and this is a allowed case) - if ( bAlignment.QueryBases.empty() ) - bAlignment.AlignedBases = bAlignment.QueryBases; - - // if QueryBases contains data, then build AlignedBases using CIGAR data - else { - - // resize AlignedBases - bAlignment.AlignedBases.clear(); - bAlignment.AlignedBases.reserve(bAlignment.SupportData.QuerySequenceLength); - - // iterate over CigarOps - int k = 0; - vector::const_iterator cigarIter = bAlignment.CigarData.begin(); - vector::const_iterator cigarEnd = bAlignment.CigarData.end(); - for ( ; cigarIter != cigarEnd; ++cigarIter ) { - - const CigarOp& op = (*cigarIter); - switch(op.Type) { - - case ('M') : - case ('I') : - bAlignment.AlignedBases.append(bAlignment.QueryBases.substr(k, op.Length)); // for 'M', 'I' - write bases - // fall through - - case ('S') : - k += op.Length; // for 'S' - soft clip, skip over query bases - break; - - case ('D') : - bAlignment.AlignedBases.append(op.Length, '-'); // for 'D' - write gap character - break; - - case ('P') : - bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character - break; - - case ('N') : - bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in original query sequence - break; - - case ('H') : - break; // for 'H' - hard clip, do nothing to AlignedBases, move to next op - - default: - printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here - exit(1); - } - } - } - - // ----------------------- - // Added: 3-25-2010 DB - // Fixed: endian-correctness for tag data - // ----------------------- - if ( IsBigEndian ) { - int i = 0; - while ( (unsigned int)i < tagDataLength ) { - - i += 2; // skip tag type (e.g. "RG", "NM", etc) - uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning - ++i; // skip value type - - switch (type) { - - case('A') : - case('C') : - ++i; - break; - - case('S') : - SwapEndian_16p(&tagData[i]); - i += sizeof(uint16_t); - break; - - case('F') : - case('I') : - SwapEndian_32p(&tagData[i]); - i += sizeof(uint32_t); - break; - - case('D') : - SwapEndian_64p(&tagData[i]); - i += sizeof(uint64_t); - break; - - case('H') : - case('Z') : - while (tagData[i]) { ++i; } - ++i; // increment one more for null terminator - break; - - default : - printf("ERROR: Invalid tag value type\n"); // shouldn't get here - exit(1); - } - } - } - - // store TagData - bAlignment.TagData.clear(); - bAlignment.TagData.resize(tagDataLength); - memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLength); - - // clear the core-only flag - bAlignment.SupportData.HasCoreOnly = false; - - // return success - return true; -} - -// clear index data structure -void BamReader::BamReaderPrivate::ClearIndex(void) { - delete NewIndex; - NewIndex = 0; -} - -// closes the BAM file -void BamReader::BamReaderPrivate::Close(void) { - - // close BGZF file stream - mBGZF.Close(); - - // clear out index data - ClearIndex(); - - // clear out header data - HeaderText.clear(); - - // clear out region flags - IsLeftBoundSpecified = false; - IsRightBoundSpecified = false; - IsRegionSpecified = false; -} - -// create BAM index from BAM file (keep structure in memory) and write to default index output file -bool BamReader::BamReaderPrivate::CreateIndex(bool useDefaultIndex) { - - // clear out prior index data - ClearIndex(); - - // create default index - if ( useDefaultIndex ) - NewIndex = new BamDefaultIndex(&mBGZF, Parent, IsBigEndian); - // create BamTools 'custom' index - else - NewIndex = new BamToolsIndex(&mBGZF, Parent, IsBigEndian); - - bool ok = true; - ok &= NewIndex->Build(); - ok &= NewIndex->Write(Filename); - - // return success/fail - return ok; -} - -// get next alignment (from specified region, if given) -bool BamReader::BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) { - - // if valid alignment found, attempt to parse char data, and return success/failure - if ( GetNextAlignmentCore(bAlignment) ) - return BuildCharData(bAlignment); - - // no valid alignment found - else - return false; -} - -// retrieves next available alignment core data (returns success/fail) -// ** DOES NOT parse any character data (read name, bases, qualities, tag data) -// these can be accessed, if necessary, from the supportData -// useful for operations requiring ONLY positional or other alignment-related information -bool BamReader::BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) { - - // if valid alignment available - if ( LoadNextAlignment(bAlignment) ) { - - // set core-only flag - bAlignment.SupportData.HasCoreOnly = true; - - // if region not specified, return success - if ( !IsLeftBoundSpecified ) return true; - - // determine region state (before, within, after) - BamReader::BamReaderPrivate::RegionState state = IsOverlap(bAlignment); - - // if alignment lies after region, return false - if ( state == AFTER_REGION ) - return false; - - while ( state != WITHIN_REGION ) { - // if no valid alignment available (likely EOF) return failure - if ( !LoadNextAlignment(bAlignment) ) return false; - // if alignment lies after region, return false (no available read within region) - state = IsOverlap(bAlignment); - if ( state == AFTER_REGION) return false; - - } - - // return success (alignment found that overlaps region) - return true; - } - - // no valid alignment - else - return false; -} - -// returns RefID for given RefName (returns References.size() if not found) -int BamReader::BamReaderPrivate::GetReferenceID(const string& refName) const { - - // retrieve names from reference data - vector refNames; - RefVector::const_iterator refIter = References.begin(); - RefVector::const_iterator refEnd = References.end(); - for ( ; refIter != refEnd; ++refIter) { - refNames.push_back( (*refIter).RefName ); - } - - // return 'index-of' refName ( if not found, returns refNames.size() ) - return distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName)); -} - -// returns region state - whether alignment ends before, overlaps, or starts after currently specified region -// this *internal* method should ONLY called when (at least) IsLeftBoundSpecified == true -BamReader::BamReaderPrivate::RegionState BamReader::BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) { - - // -------------------------------------------------- - // check alignment start against right bound cutoff - - // if full region of interest was given - if ( IsRightBoundSpecified ) { - - // read starts on right bound reference, but AFTER right bound position - if ( bAlignment.RefID == Region.RightRefID && bAlignment.Position > Region.RightPosition ) - return AFTER_REGION; - - // if read starts on reference AFTER right bound, return false - if ( bAlignment.RefID > Region.RightRefID ) - return AFTER_REGION; - } - - // -------------------------------------------------------- - // no right bound given OR read starts before right bound - // so, check if it overlaps left bound - - // if read starts on left bound reference AND after left boundary, return success - if ( bAlignment.RefID == Region.LeftRefID && bAlignment.Position >= Region.LeftPosition) - return WITHIN_REGION; - - // if read is on any reference sequence before left bound, return false - if ( bAlignment.RefID < Region.LeftRefID ) - return BEFORE_REGION; - - // -------------------------------------------------------- - // read is on left bound reference, but starts before left bound position - - // if it overlaps, return WITHIN_REGION - if ( bAlignment.GetEndPosition() >= Region.LeftPosition ) - return WITHIN_REGION; - // else begins before left bound position - else - return BEFORE_REGION; -} - -// jumps to specified region(refID, leftBound) in BAM file, returns success/fail -bool BamReader::BamReaderPrivate::Jump(int refID, int position) { - - // ----------------------------------------------------------------------- - // check for existing index - if ( NewIndex == 0 ) return false; - // see if reference has alignments - if ( !NewIndex->HasAlignments(refID) ) return false; - // make sure position is valid - if ( position > References.at(refID).RefLength ) return false; - - // determine possible offsets - vector offsets; - if ( !NewIndex->GetOffsets(Region, IsRightBoundSpecified, offsets) ) { - printf("ERROR: Could not jump: unable to calculate offset for specified region.\n"); - return false; - } - - // iterate through offsets - BamAlignment bAlignment; - bool result = true; - for ( vector::const_iterator o = offsets.begin(); o != offsets.end(); ++o) { - - // attempt seek & load first available alignment - result &= mBGZF.Seek(*o); - LoadNextAlignment(bAlignment); - - // if this alignment corresponds to desired position - // return success of seeking back to 'current offset' - if ( (bAlignment.RefID == refID && bAlignment.Position + bAlignment.Length > position) || (bAlignment.RefID > refID) ) { - if ( o != offsets.begin() ) --o; - return mBGZF.Seek(*o); - } - } - - return result; -} - -// load BAM header data -void BamReader::BamReaderPrivate::LoadHeaderData(void) { - - // check to see if proper BAM header - char buffer[4]; - if (mBGZF.Read(buffer, 4) != 4) { - printf("Could not read header type\n"); - exit(1); - } - - if (strncmp(buffer, "BAM\001", 4)) { - printf("wrong header type!\n"); - exit(1); - } - - // get BAM header text length - mBGZF.Read(buffer, 4); - unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer); - if ( IsBigEndian ) { SwapEndian_32(headerTextLength); } - - // get BAM header text - char* headerText = (char*)calloc(headerTextLength + 1, 1); - mBGZF.Read(headerText, headerTextLength); - HeaderText = (string)((const char*)headerText); - - // clean up calloc-ed temp variable - free(headerText); -} - -// load existing index data from BAM index file (".bai"), return success/fail -bool BamReader::BamReaderPrivate::LoadIndex(void) { - - // clear out any existing index data - ClearIndex(); - - // skip if index file empty - if ( IndexFilename.empty() ) - return false; - - // check supplied filename for index type - size_t defaultExtensionFound = IndexFilename.find(".bai"); - size_t customExtensionFound = IndexFilename.find(".bti"); - - // if SAM/BAM default (".bai") - if ( defaultExtensionFound != string::npos ) - NewIndex = new BamDefaultIndex(&mBGZF, Parent, IsBigEndian); - - // if BamTools custom index (".bti") - else if ( customExtensionFound != string::npos ) - NewIndex = new BamToolsIndex(&mBGZF, Parent, IsBigEndian); - - // else unknown - else { - printf("ERROR: Unknown index file extension.\n"); - return false; - } - - // return success of loading index data - return NewIndex->Load(IndexFilename); -} - -// populates BamAlignment with alignment data under file pointer, returns success/fail -bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) { - - // read in the 'block length' value, make sure it's not zero - char buffer[4]; - mBGZF.Read(buffer, 4); - bAlignment.SupportData.BlockLength = BgzfData::UnpackUnsignedInt(buffer); - if ( IsBigEndian ) { SwapEndian_32(bAlignment.SupportData.BlockLength); } - if ( bAlignment.SupportData.BlockLength == 0 ) { return false; } - - // read in core alignment data, make sure the right size of data was read - char x[BAM_CORE_SIZE]; - if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; } - - if ( IsBigEndian ) { - for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) ) { - SwapEndian_32p(&x[i]); - } - } - - // set BamAlignment 'core' and 'support' data - bAlignment.RefID = BgzfData::UnpackSignedInt(&x[0]); - bAlignment.Position = BgzfData::UnpackSignedInt(&x[4]); - - unsigned int tempValue = BgzfData::UnpackUnsignedInt(&x[8]); - bAlignment.Bin = tempValue >> 16; - bAlignment.MapQuality = tempValue >> 8 & 0xff; - bAlignment.SupportData.QueryNameLength = tempValue & 0xff; - - tempValue = BgzfData::UnpackUnsignedInt(&x[12]); - bAlignment.AlignmentFlag = tempValue >> 16; - bAlignment.SupportData.NumCigarOperations = tempValue & 0xffff; - - bAlignment.SupportData.QuerySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]); - bAlignment.MateRefID = BgzfData::UnpackSignedInt(&x[20]); - bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]); - bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[28]); - - // set BamAlignment length - bAlignment.Length = bAlignment.SupportData.QuerySequenceLength; - - // read in character data - make sure proper data size was read - bool readCharDataOK = false; - const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE; - char* allCharData = (char*)calloc(sizeof(char), dataLength); - - if ( mBGZF.Read(allCharData, dataLength) == (signed int)dataLength) { - - // store 'allCharData' in supportData structure - bAlignment.SupportData.AllCharData.assign((const char*)allCharData, dataLength); - - // set success flag - readCharDataOK = true; - } - - free(allCharData); - return readCharDataOK; -} - -// loads reference data from BAM file -void BamReader::BamReaderPrivate::LoadReferenceData(void) { - - // get number of reference sequences - char buffer[4]; - mBGZF.Read(buffer, 4); - unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer); - if ( IsBigEndian ) { SwapEndian_32(numberRefSeqs); } - if (numberRefSeqs == 0) { return; } - References.reserve((int)numberRefSeqs); - - // iterate over all references in header - for (unsigned int i = 0; i != numberRefSeqs; ++i) { - - // get length of reference name - mBGZF.Read(buffer, 4); - unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer); - if ( IsBigEndian ) { SwapEndian_32(refNameLength); } - char* refName = (char*)calloc(refNameLength, 1); - - // get reference name and reference sequence length - mBGZF.Read(refName, refNameLength); - mBGZF.Read(buffer, 4); - int refLength = BgzfData::UnpackSignedInt(buffer); - if ( IsBigEndian ) { SwapEndian_32(refLength); } - - // store data for reference - RefData aReference; - aReference.RefName = (string)((const char*)refName); - aReference.RefLength = refLength; - References.push_back(aReference); - - // clean up calloc-ed temp variable - free(refName); - } -} - -// opens BAM file (and index) -bool BamReader::BamReaderPrivate::Open(const string& filename, const string& indexFilename) { - - Filename = filename; - IndexFilename = indexFilename; - - // open the BGZF file for reading, return false on failure - if ( !mBGZF.Open(filename, "rb") ) - return false; - - // retrieve header text & reference data - LoadHeaderData(); - LoadReferenceData(); - - // store file offset of first alignment - AlignmentsBeginOffset = mBGZF.Tell(); - - // open index file & load index data (if exists) - if ( !IndexFilename.empty() ) - LoadIndex(); - - // return success - return true; -} - -// returns BAM file pointer to beginning of alignment data -bool BamReader::BamReaderPrivate::Rewind(void) { - - // rewind to first alignment - if ( !mBGZF.Seek(AlignmentsBeginOffset) ) return false; - - // retrieve first alignment data - BamAlignment al; - if ( !LoadNextAlignment(al) ) return false; - - // reset default region info using first alignment in file - Region.LeftRefID = al.RefID; - Region.LeftPosition = al.Position; - Region.RightRefID = -1; - Region.RightPosition = -1; - IsLeftBoundSpecified = false; - IsRightBoundSpecified = false; - - // rewind back to before first alignment - // return success/fail of seek - return mBGZF.Seek(AlignmentsBeginOffset); -} - -// sets a region of interest (with left & right bound reference/position) -// attempts a Jump() to left bound as well -// returns success/failure of Jump() -bool BamReader::BamReaderPrivate::SetRegion(const BamRegion& region) { - - // save region of interest - Region = region; - - // set flags - if ( region.LeftRefID >= 0 && region.LeftPosition >= 0 ) - IsLeftBoundSpecified = true; - if ( region.RightRefID >= 0 && region.RightPosition >= 0 ) - IsRightBoundSpecified = true; - - // attempt jump to beginning of region, return success/fail of Jump() - return Jump( Region.LeftRefID, Region.LeftPosition ); -} diff --git a/BamReader.h b/BamReader.h deleted file mode 100644 index c93987b..0000000 --- a/BamReader.h +++ /dev/null @@ -1,98 +0,0 @@ -// *************************************************************************** -// BamReader.h (c) 2009 Derek Barnett, Michael Str�mberg -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 9 July 2010 (DB) -// --------------------------------------------------------------------------- -// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad -// Institute. -// --------------------------------------------------------------------------- -// Provides the basic functionality for reading BAM files -// *************************************************************************** - -#ifndef BAMREADER_H -#define BAMREADER_H - -// C++ includes -#include - -// BamTools includes -#include "BamAux.h" - -namespace BamTools { - -class BamReader { - - // constructor / destructor - public: - BamReader(void); - ~BamReader(void); - - // public interface - public: - - // ---------------------- - // BAM file operations - // ---------------------- - - // close BAM file - void Close(void); - // returns whether reader is open for reading or not - bool IsOpen(void) const; - // performs random-access jump to reference, position - bool Jump(int refID, int position = 0); - // opens BAM file (and optional BAM index file, if provided) - bool Open(const std::string& filename, const std::string& indexFilename = ""); - // returns file pointer to beginning of alignments - bool Rewind(void); - // sets a region of interest (with left & right bound reference/position) - // attempts a Jump() to left bound as well - // returns success/failure of Jump() - bool SetRegion(const BamRegion& region); - bool SetRegion(const int& leftRefID, const int& leftBound, const int& rightRefID, const int& rightBound); - - // ---------------------- - // access alignment data - // ---------------------- - - // retrieves next available alignment (returns success/fail) - bool GetNextAlignment(BamAlignment& bAlignment); - - // retrieves next available alignment core data (returns success/fail) - // ** DOES NOT parse any character data (read name, bases, qualities, tag data) - // these can be accessed, if necessary, from the supportData - // useful for operations requiring ONLY positional or other alignment-related information - bool GetNextAlignmentCore(BamAlignment& bAlignment); - - // ---------------------- - // access auxiliary data - // ---------------------- - - // returns SAM header text - const std::string GetHeaderText(void) const; - // returns number of reference sequences - int GetReferenceCount(void) const; - // returns vector of reference objects - const BamTools::RefVector& GetReferenceData(void) const; - // returns reference id (used for BamReader::Jump()) for the given reference name - int GetReferenceID(const std::string& refName) const; - // returns the name of the file associated with this BamReader - const std::string GetFilename(void) const; - - // ---------------------- - // BAM index operations - // ---------------------- - - // creates index for BAM file, saves to file (default = bamFilename + ".bai") - bool CreateIndex(bool useDefaultIndex = true); - - // private implementation - private: - struct BamReaderPrivate; - BamReaderPrivate* d; -}; - -} // namespace BamTools - -#endif // BAMREADER_H diff --git a/BamWriter.cpp b/BamWriter.cpp deleted file mode 100644 index f83ff1c..0000000 --- a/BamWriter.cpp +++ /dev/null @@ -1,432 +0,0 @@ -// *************************************************************************** -// BamWriter.cpp (c) 2009 Michael Str�mberg, Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 17 August 2010 (DB) -// --------------------------------------------------------------------------- -// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad -// Institute. -// --------------------------------------------------------------------------- -// Provides the basic functionality for producing BAM files -// *************************************************************************** - -#include - -#include "BGZF.h" -#include "BamWriter.h" -using namespace BamTools; -using namespace std; - -struct BamWriter::BamWriterPrivate { - - // data members - BgzfData mBGZF; - bool IsBigEndian; - - // constructor / destructor - BamWriterPrivate(void) { - IsBigEndian = SystemIsBigEndian(); - } - - ~BamWriterPrivate(void) { - mBGZF.Close(); - } - - // "public" interface - void Close(void); - bool Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed); - void SaveAlignment(const BamAlignment& al); - - // internal methods - const unsigned int CalculateMinimumBin(const int begin, int end) const; - void CreatePackedCigar(const vector& cigarOperations, string& packedCigar); - void EncodeQuerySequence(const string& query, string& encodedQuery); -}; - -// ----------------------------------------------------- -// BamWriter implementation -// ----------------------------------------------------- - -// constructor -BamWriter::BamWriter(void) { - d = new BamWriterPrivate; -} - -// destructor -BamWriter::~BamWriter(void) { - delete d; - d = 0; -} - -// closes the alignment archive -void BamWriter::Close(void) { - d->Close(); -} - -// opens the alignment archive -bool BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed) { - return d->Open(filename, samHeader, referenceSequences, isWriteUncompressed); -} - -// saves the alignment to the alignment archive -void BamWriter::SaveAlignment(const BamAlignment& al) { - d->SaveAlignment(al); -} - -// ----------------------------------------------------- -// BamWriterPrivate implementation -// ----------------------------------------------------- - -// closes the alignment archive -void BamWriter::BamWriterPrivate::Close(void) { - mBGZF.Close(); -} - -// calculates minimum bin for a BAM alignment interval -const unsigned int BamWriter::BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const { - --end; - if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14); - if( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17); - if( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20); - if( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23); - if( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26); - return 0; -} - -// creates a cigar string from the supplied alignment -void BamWriter::BamWriterPrivate::CreatePackedCigar(const vector& cigarOperations, string& packedCigar) { - - // initialize - const unsigned int numCigarOperations = cigarOperations.size(); - packedCigar.resize(numCigarOperations * BT_SIZEOF_INT); - - // pack the cigar data into the string - unsigned int* pPackedCigar = (unsigned int*)packedCigar.data(); - - unsigned int cigarOp; - vector::const_iterator coIter; - for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) { - - switch(coIter->Type) { - case 'M': - cigarOp = BAM_CMATCH; - break; - case 'I': - cigarOp = BAM_CINS; - break; - case 'D': - cigarOp = BAM_CDEL; - break; - case 'N': - cigarOp = BAM_CREF_SKIP; - break; - case 'S': - cigarOp = BAM_CSOFT_CLIP; - break; - case 'H': - cigarOp = BAM_CHARD_CLIP; - break; - case 'P': - cigarOp = BAM_CPAD; - break; - default: - printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type); - exit(1); - } - - *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp; - pPackedCigar++; - } -} - -// encodes the supplied query sequence into 4-bit notation -void BamWriter::BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) { - - // prepare the encoded query string - const unsigned int queryLen = query.size(); - const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5); - encodedQuery.resize(encodedQueryLen); - char* pEncodedQuery = (char*)encodedQuery.data(); - const char* pQuery = (const char*)query.data(); - - unsigned char nucleotideCode; - bool useHighWord = true; - - while(*pQuery) { - - switch(*pQuery) { - - case '=': - nucleotideCode = 0; - break; - - case 'A': - nucleotideCode = 1; - break; - - case 'C': - nucleotideCode = 2; - break; - - case 'G': - nucleotideCode = 4; - break; - - case 'T': - nucleotideCode = 8; - break; - - case 'N': - nucleotideCode = 15; - break; - - default: - printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery); - exit(1); - } - - // pack the nucleotide code - if(useHighWord) { - *pEncodedQuery = nucleotideCode << 4; - useHighWord = false; - } else { - *pEncodedQuery |= nucleotideCode; - pEncodedQuery++; - useHighWord = true; - } - - // increment the query position - pQuery++; - } -} - -// opens the alignment archive -bool BamWriter::BamWriterPrivate::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed) { - - // open the BGZF file for writing, return failure if error - if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) ) - return false; - - // ================ - // write the header - // ================ - - // write the BAM signature - const unsigned char SIGNATURE_LENGTH = 4; - const char* BAM_SIGNATURE = "BAM\1"; - mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH); - - // write the SAM header text length - uint32_t samHeaderLen = samHeader.size(); - if (IsBigEndian) SwapEndian_32(samHeaderLen); - mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT); - - // write the SAM header text - if(samHeaderLen > 0) - mBGZF.Write(samHeader.data(), samHeaderLen); - - // write the number of reference sequences - uint32_t numReferenceSequences = referenceSequences.size(); - if (IsBigEndian) SwapEndian_32(numReferenceSequences); - mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT); - - // ============================= - // write the sequence dictionary - // ============================= - - RefVector::const_iterator rsIter; - for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) { - - // write the reference sequence name length - uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1; - if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen); - mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT); - - // write the reference sequence name - mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen); - - // write the reference sequence length - int32_t referenceLength = rsIter->RefLength; - if (IsBigEndian) SwapEndian_32(referenceLength); - mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT); - } - - // return success - return true; -} - -// saves the alignment to the alignment archive -void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) { - - // if BamAlignment contains only the core data and a raw char data buffer - // (as a result of BamReader::GetNextAlignmentCore()) - if ( al.SupportData.HasCoreOnly ) { - - // write the block size - unsigned int blockSize = al.SupportData.BlockLength; - if (IsBigEndian) SwapEndian_32(blockSize); - mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); - - // assign the BAM core data - uint32_t buffer[8]; - buffer[0] = al.RefID; - buffer[1] = al.Position; - buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength; - buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations; - buffer[4] = al.SupportData.QuerySequenceLength; - buffer[5] = al.MateRefID; - buffer[6] = al.MatePosition; - buffer[7] = al.InsertSize; - - // swap BAM core endian-ness, if necessary - if ( IsBigEndian ) { - for ( int i = 0; i < 8; ++i ) - SwapEndian_32(buffer[i]); - } - - // write the BAM core - mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); - - // write the raw char data - mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE); - } - - // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc - // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code ) - else { - - // calculate char lengths - const unsigned int nameLength = al.Name.size() + 1; - const unsigned int numCigarOperations = al.CigarData.size(); - const unsigned int queryLength = al.QueryBases.size(); - const unsigned int tagDataLength = al.TagData.size(); - - // no way to tell if BamAlignment.Bin is already defined (no default, invalid value) - // force calculation of Bin before storing - const int endPosition = al.GetEndPosition(); - const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition); - - // create our packed cigar string - string packedCigar; - CreatePackedCigar(al.CigarData, packedCigar); - const unsigned int packedCigarLength = packedCigar.size(); - - // encode the query - string encodedQuery; - EncodeQuerySequence(al.QueryBases, encodedQuery); - const unsigned int encodedQueryLength = encodedQuery.size(); - - // write the block size - const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength; - unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize; - if (IsBigEndian) SwapEndian_32(blockSize); - mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); - - // assign the BAM core data - uint32_t buffer[8]; - buffer[0] = al.RefID; - buffer[1] = al.Position; - buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength; - buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations; - buffer[4] = queryLength; - buffer[5] = al.MateRefID; - buffer[6] = al.MatePosition; - buffer[7] = al.InsertSize; - - // swap BAM core endian-ness, if necessary - if ( IsBigEndian ) { - for ( int i = 0; i < 8; ++i ) - SwapEndian_32(buffer[i]); - } - - // write the BAM core - mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); - - // write the query name - mBGZF.Write(al.Name.c_str(), nameLength); - - // write the packed cigar - if ( IsBigEndian ) { - - char* cigarData = (char*)calloc(sizeof(char), packedCigarLength); - memcpy(cigarData, packedCigar.data(), packedCigarLength); - - for (unsigned int i = 0; i < packedCigarLength; ++i) { - if ( IsBigEndian ) - SwapEndian_32p(&cigarData[i]); - } - - mBGZF.Write(cigarData, packedCigarLength); - free(cigarData); - } - else - mBGZF.Write(packedCigar.data(), packedCigarLength); - - // write the encoded query sequence - mBGZF.Write(encodedQuery.data(), encodedQueryLength); - - // write the base qualities - string baseQualities(al.Qualities); - char* pBaseQualities = (char*)al.Qualities.data(); - for(unsigned int i = 0; i < queryLength; i++) { - pBaseQualities[i] -= 33; - } - mBGZF.Write(pBaseQualities, queryLength); - - // write the read group tag - if ( IsBigEndian ) { - - char* tagData = (char*)calloc(sizeof(char), tagDataLength); - memcpy(tagData, al.TagData.data(), tagDataLength); - - int i = 0; - while ( (unsigned int)i < tagDataLength ) { - - i += 2; // skip tag type (e.g. "RG", "NM", etc) - uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning - ++i; // skip value type - - switch (type) { - - case('A') : - case('C') : - ++i; - break; - - case('S') : - SwapEndian_16p(&tagData[i]); - i+=2; // sizeof(uint16_t) - break; - - case('F') : - case('I') : - SwapEndian_32p(&tagData[i]); - i+=4; // sizeof(uint32_t) - break; - - case('D') : - SwapEndian_64p(&tagData[i]); - i+=8; // sizeof(uint64_t) - break; - - case('H') : - case('Z') : - while (tagData[i]) { ++i; } - ++i; // increment one more for null terminator - break; - - default : - printf("ERROR: Invalid tag value type\n"); // shouldn't get here - free(tagData); - exit(1); - } - } - - mBGZF.Write(tagData, tagDataLength); - free(tagData); - } - else - mBGZF.Write(al.TagData.data(), tagDataLength); - } -} diff --git a/BamWriter.h b/BamWriter.h deleted file mode 100644 index b0cb6ce..0000000 --- a/BamWriter.h +++ /dev/null @@ -1,52 +0,0 @@ -// *************************************************************************** -// BamWriter.h (c) 2009 Michael Str�mberg, Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 17 August 2010 (DB) -// --------------------------------------------------------------------------- -// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad -// Institute. -// --------------------------------------------------------------------------- -// Provides the basic functionality for producing BAM files -// *************************************************************************** - -#ifndef BAMWRITER_H -#define BAMWRITER_H - -// C++ includes -#include - -// BamTools includes -#include "BamAux.h" - -namespace BamTools { - -class BamWriter { - - // constructor/destructor - public: - BamWriter(void); - ~BamWriter(void); - - // public interface - public: - // closes the alignment archive - void Close(void); - // opens the alignment archive - bool Open(const std::string& filename, - const std::string& samHeader, - const BamTools::RefVector& referenceSequences, - bool writeUncompressed = false); - // saves the alignment to the alignment archive - void SaveAlignment(const BamTools::BamAlignment& al); - - // private implementation - private: - struct BamWriterPrivate; - BamWriterPrivate* d; -}; - -} // namespace BamTools - -#endif // BAMWRITER_H diff --git a/Makefile b/Makefile index b3d285b..033c662 100644 --- a/Makefile +++ b/Makefile @@ -1,33 +1,41 @@ -CXX= g++ -CXXFLAGS= -Wall -O3 -D_FILE_OFFSET_BITS=64 -PROG= bamtools -API= BGZF.o \ - BamIndex.o \ - BamReader.o \ - BamWriter.o \ - BamMultiReader.o -UTILS= bamtools_fasta.o \ - bamtools_options.o \ - bamtools_pileup.o \ - bamtools_utilities.o -TOOLKIT= bamtools_convert.o \ - bamtools_count.o \ - bamtools_coverage.o \ - bamtools_filter.o \ - bamtools_header.o \ - bamtools_index.o \ - bamtools_merge.o \ - bamtools_random.o \ - bamtools_sort.o \ - bamtools_stats.o -MAIN= bamtools.o -OBJS= $(API) $(UTILS) $(TOOLKIT) $(MAIN) -LIBS= -lz - -all: $(PROG) - -bamtools: $(OBJS) - $(CXX) $(CXXFLAGS) -o $@ $(OBJS) $(LIBS) - -clean: - rm -fr gmon.out *.o *.a a.out *~ +# ========================== +# BamTools Makefile +# (c) 2010 Derek Barnett +# ========================== + +# define main directories +export OBJ_DIR = obj +export BIN_DIR = bin +export SRC_DIR = src + +# define compile/link flags +export CXX = g++ +export CXXFLAGS = -Wall -O3 -D_FILE_OFFSET_BITS=64 +export LIBS = -lz + +# define current BamTools version +export BAMTOOLS_VERSION = 0.7.0812 + +# define source subdirectories +SUBDIRS = $(SRC_DIR)/api \ + $(SRC_DIR)/utils \ + $(SRC_DIR)/toolkit + +all: + @echo "Building BamTools:" + @echo "Version: $$BAMTOOLS_VERSION" + @echo "=========================================================" + + @for dir in $(SUBDIRS); do \ + echo "- Building in $$dir"; \ + $(MAKE) --no-print-directory -C $$dir; \ + echo ""; \ + done + +.PHONY: all + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* + +.PHONY: clean diff --git a/bamtools.cpp b/bamtools.cpp deleted file mode 100644 index f75b93d..0000000 --- a/bamtools.cpp +++ /dev/null @@ -1,138 +0,0 @@ -// *************************************************************************** -// bamtools.cpp (c) 2010 Derek Barnett, Erik Garrison -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 22 July 2010 -// --------------------------------------------------------------------------- -// Integrates a number of BamTools functionalities into a single executable. -// *************************************************************************** - -// Std C/C++ includes -#include - -// BamTools includes -#include "bamtools_convert.h" -#include "bamtools_count.h" -#include "bamtools_coverage.h" -#include "bamtools_filter.h" -#include "bamtools_header.h" -#include "bamtools_index.h" -#include "bamtools_merge.h" -#include "bamtools_random.h" -#include "bamtools_sort.h" -#include "bamtools_stats.h" - -using namespace std; -using namespace BamTools; - -// ------------------------------------------ -// bamtools subtool names -static const string CONVERT = "convert"; -static const string COUNT = "count"; -static const string COVERAGE = "coverage"; -static const string FILTER = "filter"; -static const string HEADER = "header"; -static const string INDEX = "index"; -static const string MERGE = "merge"; -static const string RANDOM = "random"; -static const string SORT = "sort"; -static const string STATS = "stats"; - -// ------------------------------------------ -// bamtools help/version names -static const string HELP = "help"; -static const string LONG_HELP = "--help"; -static const string SHORT_HELP = "-h"; - -static const string VERSION = "version"; -static const string LONG_VERSION = "--version"; -static const string SHORT_VERSION = "-v"; - -// ------------------------------------------ -// Print help info -int Help(int argc, char* argv[]) { - - // 'bamtools help COMMAND' - if (argc > 2) { - - AbstractTool* tool(0); - if ( argv[2] == CONVERT ) tool = new ConvertTool; - if ( argv[2] == COUNT ) tool = new CountTool; - if ( argv[2] == COVERAGE ) tool = new CoverageTool; - if ( argv[2] == FILTER ) tool = new FilterTool; - if ( argv[2] == HEADER ) tool = new HeaderTool; - if ( argv[2] == INDEX ) tool = new IndexTool; - if ( argv[2] == MERGE ) tool = new MergeTool; - if ( argv[2] == RANDOM ) tool = new RandomTool; - if ( argv[2] == SORT ) tool = new SortTool; - if ( argv[2] == STATS ) tool = new StatsTool; - - // if tool known, print its help screen - if ( tool ) return tool->Help(); - } - - // either 'bamtools help' or unrecognized argument after 'help' - cerr << endl; - cerr << "usage: bamtools [--help] COMMAND [ARGS]" << endl; - cerr << endl; - cerr << "Available bamtools commands:" << endl; - cerr << "\tconvert Converts between BAM and a number of other formats" << endl; - cerr << "\tcount Prints number of alignments in BAM file" << endl; - cerr << "\tcoverage Prints coverage statistics from the input BAM file" << endl; - cerr << "\tfilter Filters BAM file(s) by user-specified criteria" << endl; - cerr << "\theader Prints BAM header information" << endl; - cerr << "\tindex Generates index for BAM file" << endl; - cerr << "\tmerge Merge multiple BAM files into single file" << endl; - cerr << "\trandom Grab a random subset of alignments" << endl; - cerr << "\tsort Sorts the BAM file according to some criteria" << endl; - cerr << "\tstats Prints general alignment statistics" << endl; - cerr << endl; - cerr << "See 'bamtools help COMMAND' for more information on a specific command." << endl; - cerr << endl; - return 0; -} - -// ------------------------------------------ -// Print version info -int Version(void) { - cout << endl; - cout << "bamtools v0.8.xx" << endl; - cout << "Part of BamTools API and toolkit" << endl; - cout << "Primary authors: Derek Barnett, Erik Garrison, Michael Stromberg" << endl; - cout << "(c) 2009-2010 Marth Lab, Biology Dept., Boston College" << endl; - cout << endl; - return 0; -} - -// ------------------------------------------ -// toolkit entry point -int main(int argc, char* argv[]) { - - // just 'bamtools' - if ( (argc == 1) ) return Help(argc, argv); - - // 'bamtools help', 'bamtools --help', or 'bamtools -h' - if ( (argv[1] == HELP) || (argv[1] == LONG_HELP) || (argv[1] == SHORT_HELP) ) return Help(argc, argv); - - // 'bamtools version', 'bamtools --version', or 'bamtools -v' - if ( (argv[1] == VERSION) || (argv[1] == LONG_VERSION) || (argv[1] == SHORT_VERSION) ) return Version(); - - // determine desired sub-tool - AbstractTool* tool(0); - if ( argv[1] == CONVERT ) tool = new ConvertTool; - if ( argv[1] == COUNT ) tool = new CountTool; - if ( argv[1] == COVERAGE ) tool = new CoverageTool; - if ( argv[1] == FILTER ) tool = new FilterTool; - if ( argv[1] == HEADER ) tool = new HeaderTool; - if ( argv[1] == INDEX ) tool = new IndexTool; - if ( argv[1] == MERGE ) tool = new MergeTool; - if ( argv[1] == RANDOM ) tool = new RandomTool; - if ( argv[1] == SORT ) tool = new SortTool; - if ( argv[1] == STATS ) tool = new StatsTool; - - // if found, run tool - if ( tool ) return tool->Run(argc, argv); - // no match found, show help - else return Help(argc, argv); -} diff --git a/bamtools_convert.cpp b/bamtools_convert.cpp deleted file mode 100644 index 1c5ba48..0000000 --- a/bamtools_convert.cpp +++ /dev/null @@ -1,619 +0,0 @@ -// *************************************************************************** -// bamtools_convert.cpp (c) 2010 Derek Barnett, Erik Garrison -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 22 July 2010 -// --------------------------------------------------------------------------- -// Converts between BAM and a number of other formats -// *************************************************************************** - -#include -#include -#include -#include -#include - -#include "bamtools_convert.h" -#include "bamtools_options.h" -#include "bamtools_pileup.h" -#include "bamtools_utilities.h" -#include "BGZF.h" -#include "BamReader.h" -#include "BamMultiReader.h" - -using namespace std; -using namespace BamTools; - -namespace BamTools { - - // format names - static const string FORMAT_BED = "bed"; - static const string FORMAT_BEDGRAPH = "bedgraph"; - static const string FORMAT_FASTA = "fasta"; - static const string FORMAT_FASTQ = "fastq"; - static const string FORMAT_JSON = "json"; - static const string FORMAT_SAM = "sam"; - static const string FORMAT_PILEUP = "pileup"; - static const string FORMAT_WIGGLE = "wig"; - - // other constants - static const unsigned int FASTA_LINE_MAX = 50; - -} // namespace BamTools - -struct ConvertTool::ConvertToolPrivate { - - // ctor & dtor - public: - ConvertToolPrivate(ConvertTool::ConvertSettings* settings); - ~ConvertToolPrivate(void); - - // interface - public: - bool Run(void); - - // internal methods - private: - void PrintBed(const BamAlignment& a); - void PrintBedGraph(const BamAlignment& a); - void PrintFasta(const BamAlignment& a); - void PrintFastq(const BamAlignment& a); - void PrintJson(const BamAlignment& a); - void PrintSam(const BamAlignment& a); - void PrintWiggle(const BamAlignment& a); - - // data members - private: - ConvertTool::ConvertSettings* m_settings; - RefVector m_references; - ostream m_out; -}; - -// --------------------------------------------- -// ConvertSettings implementation - -struct ConvertTool::ConvertSettings { - - // flags - bool HasInput; - bool HasOutput; - bool HasFormat; - bool HasRegion; - - // pileup flags - bool HasFastaFilename; - bool IsOmittingSamHeader; - bool IsPrintingPileupMapQualities; - - // options - vector InputFiles; - string OutputFilename; - string Format; - string Region; - - // pileup options - string FastaFilename; - - // constructor - ConvertSettings(void) - : HasInput(false) - , HasOutput(false) - , HasFormat(false) - , HasRegion(false) - , HasFastaFilename(false) - , IsOmittingSamHeader(false) - , IsPrintingPileupMapQualities(false) - , OutputFilename(Options::StandardOut()) - { } -}; - -// --------------------------------------------- -// ConvertTool implementation - -ConvertTool::ConvertTool(void) - : AbstractTool() - , m_settings(new ConvertSettings) - , m_impl(0) -{ - // set program details - Options::SetProgramInfo("bamtools convert", "converts BAM to a number of other formats", "-format [-in -in ...] [-out ] [other options]"); - - // set up options - OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); - Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn()); - Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, Options::StandardOut()); - Options::AddValueOption("-format", "FORMAT", "the output file format - see README for recognized formats", "", m_settings->HasFormat, m_settings->Format, IO_Opts); - - OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters"); - Options::AddValueOption("-region", "REGION", "genomic region. Index file is recommended for better performance, and is read automatically if it exists as .bai. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, FilterOpts); - - OptionGroup* PileupOpts = Options::CreateOptionGroup("Pileup Options"); - Options::AddValueOption("-fasta", "FASTA filename", "FASTA reference file", "", m_settings->HasFastaFilename, m_settings->FastaFilename, PileupOpts, ""); - Options::AddOption("-mapqual", "print the mapping qualities", m_settings->IsPrintingPileupMapQualities, PileupOpts); - - OptionGroup* SamOpts = Options::CreateOptionGroup("SAM Options"); - Options::AddOption("-noheader", "omit the SAM header from output", m_settings->IsOmittingSamHeader, SamOpts); -} - -ConvertTool::~ConvertTool(void) { - delete m_settings; - m_settings = 0; - - delete m_impl; - m_impl = 0; -} - -int ConvertTool::Help(void) { - Options::DisplayHelp(); - return 0; -} - -int ConvertTool::Run(int argc, char* argv[]) { - - // parse command line arguments - Options::Parse(argc, argv, 1); - - // run internal ConvertTool implementation, return success/fail - m_impl = new ConvertToolPrivate(m_settings); - - if ( m_impl->Run() ) - return 0; - else - return 1; -} - -// --------------------------------------------- -// ConvertToolPrivate implementation - -ConvertTool::ConvertToolPrivate::ConvertToolPrivate(ConvertTool::ConvertSettings* settings) - : m_settings(settings) - , m_out(cout.rdbuf()) // default output to cout -{ } - -ConvertTool::ConvertToolPrivate::~ConvertToolPrivate(void) { } - -bool ConvertTool::ConvertToolPrivate::Run(void) { - - bool convertedOk = true; - - // ------------------------------------ - // initialize conversion input/output - - // set to default input if none provided - if ( !m_settings->HasInput ) - m_settings->InputFiles.push_back(Options::StandardIn()); - - // open input files - BamMultiReader reader; - reader.Open(m_settings->InputFiles, false); - m_references = reader.GetReferenceData(); - - // set region if specified - BamRegion region; - if ( m_settings->HasRegion ) { - if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) { - if ( !reader.SetRegion(region) ) - cerr << "Could not set BamReader region to REGION: " << m_settings->Region << endl; - } - } - - // if output file given - ofstream outFile; - if ( m_settings->HasOutput ) { - - // open output file stream - outFile.open(m_settings->OutputFilename.c_str()); - if ( !outFile ) { - cerr << "Could not open " << m_settings->OutputFilename << " for output" << endl; - return false; - } - - // set m_out to file's streambuf - m_out.rdbuf(outFile.rdbuf()); - } - - // ------------------------ - // pileup is special case - if ( m_settings->Format == FORMAT_PILEUP ) { - - // initialize pileup input/output - Pileup pileup(&reader, &m_out); - - // --------------------------- - // configure pileup settings - - if ( m_settings->HasRegion ) - pileup.SetRegion(region); - - if ( m_settings->HasFastaFilename ) - pileup.SetFastaFilename(m_settings->FastaFilename); - - pileup.SetIsPrintingMapQualities( m_settings->IsPrintingPileupMapQualities ); - - // run pileup - convertedOk = pileup.Run(); - } - - // ------------------------------------- - // else determine 'simpler' format type - else { - - bool formatError = false; - void (BamTools::ConvertTool::ConvertToolPrivate::*pFunction)(const BamAlignment&) = 0; - if ( m_settings->Format == FORMAT_BED ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBed; - else if ( m_settings->Format == FORMAT_BEDGRAPH ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBedGraph; - else if ( m_settings->Format == FORMAT_FASTA ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFasta; - else if ( m_settings->Format == FORMAT_FASTQ ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFastq; - else if ( m_settings->Format == FORMAT_JSON ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintJson; - else if ( m_settings->Format == FORMAT_SAM ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintSam; - else if ( m_settings->Format == FORMAT_WIGGLE ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintWiggle; - else { - cerr << "Unrecognized format: " << m_settings->Format << endl; - cerr << "Please see help|README (?) for details on supported formats " << endl; - formatError = true; - convertedOk = false; - } - - // if SAM format & not omitting header, print SAM header - if ( (m_settings->Format == FORMAT_SAM) && !m_settings->IsOmittingSamHeader ) { - string headerText = reader.GetHeaderText(); - m_out << headerText; - } - - // ------------------------ - // do conversion - if ( !formatError ) { - BamAlignment a; - while ( reader.GetNextAlignment(a) ) { - (this->*pFunction)(a); - } - } - } - - // ------------------------ - // clean up & exit - reader.Close(); - if ( m_settings->HasOutput ) outFile.close(); - return convertedOk; -} - -// ---------------------------------------------------------- -// Conversion/output methods -// ---------------------------------------------------------- - -void ConvertTool::ConvertToolPrivate::PrintBed(const BamAlignment& a) { - - // tab-delimited, 0-based half-open - // (e.g. a 50-base read aligned to pos 10 could have BED coordinates (10, 60) instead of BAM coordinates (10, 59) ) - // - - m_out << m_references.at(a.RefID).RefName << "\t" - << a.Position << "\t" - << a.GetEndPosition() + 1 << "\t" - << a.Name << "\t" - << a.MapQuality << "\t" - << (a.IsReverseStrand() ? "-" : "+") << endl; -} - -void ConvertTool::ConvertToolPrivate::PrintBedGraph(const BamAlignment& a) { - ; -} - -// print BamAlignment in FASTA format -// N.B. - uses QueryBases NOT AlignedBases -void ConvertTool::ConvertToolPrivate::PrintFasta(const BamAlignment& a) { - - // >BamAlignment.Name - // BamAlignment.QueryBases (up to FASTA_LINE_MAX bases per line) - // ... - - // print header - m_out << "> " << a.Name << endl; - - // if sequence fits on single line - if ( a.QueryBases.length() <= FASTA_LINE_MAX ) - m_out << a.QueryBases << endl; - - // else split over multiple lines - else { - - size_t position = 0; - size_t seqLength = a.QueryBases.length(); - - // write subsequences to each line - while ( position < (seqLength - FASTA_LINE_MAX) ) { - m_out << a.QueryBases.substr(position, FASTA_LINE_MAX) << endl; - position += FASTA_LINE_MAX; - } - - // write final subsequence - m_out << a.QueryBases.substr(position) << endl; - } -} - -// print BamAlignment in FASTQ format -// N.B. - uses QueryBases NOT AlignedBases -void ConvertTool::ConvertToolPrivate::PrintFastq(const BamAlignment& a) { - - // @BamAlignment.Name - // BamAlignment.QueryBases - // + - // BamAlignment.Qualities - - m_out << "@" << a.Name << endl - << a.QueryBases << endl - << "+" << endl - << a.Qualities << endl; -} - -// print BamAlignment in JSON format -void ConvertTool::ConvertToolPrivate::PrintJson(const BamAlignment& a) { - - // write name & alignment flag - m_out << "{\"name\":\"" << a.Name << "\",\"alignmentFlag\":\"" << a.AlignmentFlag << "\","; - - // write reference name - if ( (a.RefID >= 0) && (a.RefID < (int)m_references.size()) ) - m_out << "\"reference\":\"" << m_references[a.RefID].RefName << "\","; - - // write position & map quality - m_out << "\"position\":" << a.Position+1 << ",\"mapQuality\":" << a.MapQuality << ","; - - // write CIGAR - const vector& cigarData = a.CigarData; - if ( !cigarData.empty() ) { - m_out << "\"cigar\":["; - vector::const_iterator cigarBegin = cigarData.begin(); - vector::const_iterator cigarIter = cigarBegin; - vector::const_iterator cigarEnd = cigarData.end(); - for ( ; cigarIter != cigarEnd; ++cigarIter ) { - const CigarOp& op = (*cigarIter); - if (cigarIter != cigarBegin) m_out << ","; - m_out << "\"" << op.Length << op.Type << "\""; - } - m_out << "],"; - } - - // write mate reference name, mate position, & insert size - if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size()) ) { - m_out << "\"mate\":{" - << "\"reference\":\"" << m_references[a.MateRefID].RefName << "\"," - << "\"position\":" << a.MatePosition+1 - << ",\"insertSize\":" << a.InsertSize << "},"; - } - - // write sequence - if ( !a.QueryBases.empty() ) - m_out << "\"queryBases\":\"" << a.QueryBases << "\","; - - // write qualities - if ( !a.Qualities.empty() ) { - string::const_iterator s = a.Qualities.begin(); - m_out << "\"qualities\":[" << static_cast(*s) - 33; - ++s; - for (; s != a.Qualities.end(); ++s) { - m_out << "," << static_cast(*s) - 33; - } - m_out << "],"; - } - - // write tag data - const char* tagData = a.TagData.c_str(); - const size_t tagDataLength = a.TagData.length(); - size_t index = 0; - if (index < tagDataLength) { - - m_out << "\"tags\":{"; - - while ( index < tagDataLength ) { - - if (index > 0) - m_out << ","; - - // write tag name - m_out << "\"" << a.TagData.substr(index, 2) << "\":"; - index += 2; - - // get data type - char type = a.TagData.at(index); - ++index; - - switch (type) { - case('A') : - m_out << "\"" << tagData[index] << "\""; - ++index; - break; - - case('C') : - m_out << (int)tagData[index]; - ++index; - break; - - case('c') : - m_out << (int)tagData[index]; - ++index; - break; - - case('S') : - m_out << BgzfData::UnpackUnsignedShort(&tagData[index]); - index += 2; - break; - - case('s') : - m_out << BgzfData::UnpackSignedShort(&tagData[index]); - index += 2; - break; - - case('I') : - m_out << BgzfData::UnpackUnsignedInt(&tagData[index]); - index += 4; - break; - - case('i') : - m_out << BgzfData::UnpackSignedInt(&tagData[index]); - index += 4; - break; - - case('f') : - m_out << BgzfData::UnpackFloat(&tagData[index]); - index += 4; - break; - - case('d') : - m_out << BgzfData::UnpackDouble(&tagData[index]); - index += 8; - break; - - case('Z') : - case('H') : - m_out << "\""; - while (tagData[index]) { - m_out << tagData[index]; - ++index; - } - m_out << "\""; - ++index; - break; - } - - if ( tagData[index] == '\0') - break; - } - - m_out << "}"; - } - - m_out << "}" << endl; - -} - -// print BamAlignment in SAM format -void ConvertTool::ConvertToolPrivate::PrintSam(const BamAlignment& a) { - - // tab-delimited - // [ :: [...] ] - - // write name & alignment flag - m_out << a.Name << "\t" << a.AlignmentFlag << "\t"; - - // write reference name - if ( (a.RefID >= 0) && (a.RefID < (int)m_references.size()) ) - m_out << m_references[a.RefID].RefName << "\t"; - else - m_out << "*\t"; - - // write position & map quality - m_out << a.Position+1 << "\t" << a.MapQuality << "\t"; - - // write CIGAR - const vector& cigarData = a.CigarData; - if ( cigarData.empty() ) m_out << "*\t"; - else { - vector::const_iterator cigarIter = cigarData.begin(); - vector::const_iterator cigarEnd = cigarData.end(); - for ( ; cigarIter != cigarEnd; ++cigarIter ) { - const CigarOp& op = (*cigarIter); - m_out << op.Length << op.Type; - } - m_out << "\t"; - } - - // write mate reference name, mate position, & insert size - if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size()) ) { - if ( a.MateRefID == a.RefID ) m_out << "=\t"; - else m_out << m_references[a.MateRefID].RefName << "\t"; - m_out << a.MatePosition+1 << "\t" << a.InsertSize << "\t"; - } - else m_out << "*\t0\t0\t"; - - // write sequence - if ( a.QueryBases.empty() ) m_out << "*\t"; - else m_out << a.QueryBases << "\t"; - - // write qualities - if ( a.Qualities.empty() ) m_out << "*"; - else m_out << a.Qualities; - - // write tag data - const char* tagData = a.TagData.c_str(); - const size_t tagDataLength = a.TagData.length(); - - size_t index = 0; - while ( index < tagDataLength ) { - - // write tag name - string tagName = a.TagData.substr(index, 2); - m_out << "\t" << tagName << ":"; - index += 2; - - // get data type - char type = a.TagData.at(index); - ++index; - switch (type) { - case('A') : - m_out << "A:" << tagData[index]; - ++index; - break; - - case('C') : - m_out << "i:" << (int)tagData[index]; - ++index; - break; - - case('c') : - m_out << "i:" << (int)tagData[index]; - ++index; - break; - - case('S') : - m_out << "i:" << BgzfData::UnpackUnsignedShort(&tagData[index]); - index += 2; - break; - - case('s') : - m_out << "i:" << BgzfData::UnpackSignedShort(&tagData[index]); - index += 2; - break; - - case('I') : - m_out << "i:" << BgzfData::UnpackUnsignedInt(&tagData[index]); - index += 4; - break; - - case('i') : - m_out << "i:" << BgzfData::UnpackSignedInt(&tagData[index]); - index += 4; - break; - - case('f') : - m_out << "f:" << BgzfData::UnpackFloat(&tagData[index]); - index += 4; - break; - - case('d') : - m_out << "d:" << BgzfData::UnpackDouble(&tagData[index]); - index += 8; - break; - - case('Z') : - case('H') : - m_out << type << ":"; - while (tagData[index]) { - m_out << tagData[index]; - ++index; - } - ++index; - break; - } - - if ( tagData[index] == '\0') - break; - } - - m_out << endl; -} - -void ConvertTool::ConvertToolPrivate::PrintWiggle(const BamAlignment& a) { - ; -} diff --git a/bamtools_convert.h b/bamtools_convert.h deleted file mode 100644 index 8dd6857..0000000 --- a/bamtools_convert.h +++ /dev/null @@ -1,38 +0,0 @@ -// *************************************************************************** -// bamtools_convert.h (c) 2010 Derek Barnett, Erik Garrison -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 9 July 2010 -// --------------------------------------------------------------------------- -// Converts between BAM and a number of other formats -// *************************************************************************** - -#ifndef BAMTOOLS_CONVERT_H -#define BAMTOOLS_CONVERT_H - -#include "bamtools_tool.h" - -namespace BamTools { - -class ConvertTool : public AbstractTool { - - public: - ConvertTool(void); - ~ConvertTool(void); - - public: - int Help(void); - int Run(int argc, char* argv[]); - - private: - struct ConvertSettings; - ConvertSettings* m_settings; - - struct ConvertToolPrivate; - ConvertToolPrivate* m_impl; -}; - -} // namespace BamTools - -#endif // BAMTOOLS_CONVERT_H \ No newline at end of file diff --git a/bamtools_count.cpp b/bamtools_count.cpp deleted file mode 100644 index 4bd7c82..0000000 --- a/bamtools_count.cpp +++ /dev/null @@ -1,192 +0,0 @@ -// *************************************************************************** -// bamtools_count.cpp (c) 2010 Derek Barnett, Erik Garrison -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 2 June 2010 -// --------------------------------------------------------------------------- -// Prints alignment count for BAM file -// -// ** Expand to multiple?? -// -// *************************************************************************** - -#include -#include -#include -#include - -#include "bamtools_count.h" -#include "bamtools_options.h" -#include "bamtools_utilities.h" -#include "BamReader.h" -#include "BamMultiReader.h" - -using namespace std; -using namespace BamTools; - -// --------------------------------------------- -// CountSettings implementation - -struct CountTool::CountSettings { - - // flags - bool HasInput; - bool HasRegion; - - // filenames - vector InputFiles; - string Region; - - // constructor - CountSettings(void) - : HasInput(false) - , HasRegion(false) - { } -}; - -// --------------------------------------------- -// CountTool implementation - -CountTool::CountTool(void) - : AbstractTool() - , m_settings(new CountSettings) -{ - // set program details - Options::SetProgramInfo("bamtools count", "prints alignment counts for a BAM file", "-in [-region ]"); - - // set up options - OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); - Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts); - //Options::AddValueOption("-index", "BAM index filename", "the BAM index file", "", m_settings->HasBamIndexFilename, m_settings->BamIndexFilename, IO_Opts); - - OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters"); - Options::AddValueOption("-region", "REGION", "genomic region. Index file is recommended for better performance, and is read automatically if it exists as .bai or .bti. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, FilterOpts); -} - -CountTool::~CountTool(void) { - delete m_settings; - m_settings = 0; -} - -int CountTool::Help(void) { - Options::DisplayHelp(); - return 0; -} - -int CountTool::Run(int argc, char* argv[]) { - - // parse command line arguments - Options::Parse(argc, argv, 1); - - if ( !m_settings->HasInput ) - m_settings->InputFiles.push_back(Options::StandardIn()); - - BamMultiReader reader; - reader.Open(m_settings->InputFiles, false, true); - - // alignment counter - int alignmentCount(0); - - // set up error handling - ostringstream errorStream(""); - bool foundError(false); - - // if no region specified, count entire file - if ( !m_settings->HasRegion ) { - BamAlignment al; - while ( reader.GetNextAlignmentCore(al) ) - ++alignmentCount; - } - - // more complicated - region specified - else { - - BamRegion region; - if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) { - - // check if there are index files *.bai/*.bti corresponding to the input files - bool hasDefaultIndex = false; - bool hasBamtoolsIndex = false; - bool hasNoIndex = false; - int defaultIndexCount = 0; - int bamtoolsIndexCount = 0; - for (vector::const_iterator f = m_settings->InputFiles.begin(); f != m_settings->InputFiles.end(); ++f) { - - if ( Utilities::FileExists(*f + ".bai") ) { - hasDefaultIndex = true; - ++defaultIndexCount; - } - - if ( Utilities::FileExists(*f + ".bti") ) { - hasBamtoolsIndex = true; - ++bamtoolsIndexCount; - } - - if ( !hasDefaultIndex && !hasBamtoolsIndex ) { - hasNoIndex = true; - cerr << "*WARNING - could not find index file for " << *f - << ", parsing whole file(s) to get alignment counts for target region" - << " (could be slow)" << endl; - break; - } - } - - // determine if index file types are heterogeneous - bool hasDifferentIndexTypes = false; - if ( defaultIndexCount > 0 && bamtoolsIndexCount > 0 ) { - hasDifferentIndexTypes = true; - cerr << "*WARNING - different index file formats found" - << ", parsing whole file(s) to get alignment counts for target region" - << " (could be slow)" << endl; - } - - // if any input file has no index, or if input files use different index formats - // can't use BamMultiReader to jump directly (**for now**) - if ( hasNoIndex || hasDifferentIndexTypes ) { - - // read through sequentially, counting all overlapping reads - BamAlignment al; - while( reader.GetNextAlignmentCore(al) ) { - if ( (al.RefID >= region.LeftRefID) && ( (al.Position + al.Length) >= region.LeftPosition ) && - (al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) ) - { - ++alignmentCount; - } - } - } - - // has index file for each input file (and same format) - else { - - // this is kind of a hack...? - BamMultiReader reader; - reader.Open(m_settings->InputFiles, true, true, hasDefaultIndex ); - - if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) { - foundError = true; - errorStream << "Could not set BamReader region to REGION: " << m_settings->Region << endl; - } else { - BamAlignment al; - while ( reader.GetNextAlignmentCore(al) ) - ++alignmentCount; - } - } - - } else { - foundError = true; - errorStream << "Could not parse REGION: " << m_settings->Region << endl; - errorStream << "Be sure REGION is in valid format (see README) and that coordinates are valid for selected references" << endl; - } - } - - // print errors OR results - if ( foundError ) - cerr << errorStream.str() << endl; - else - cout << alignmentCount << endl; - - // clean & exit - reader.Close(); - return (int)foundError; -} diff --git a/bamtools_count.h b/bamtools_count.h deleted file mode 100644 index e3d0c81..0000000 --- a/bamtools_count.h +++ /dev/null @@ -1,38 +0,0 @@ -// *************************************************************************** -// bamtools_count.h (c) 2010 Derek Barnett, Erik Garrison -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 1 June 2010 -// --------------------------------------------------------------------------- -// Prints alignment count for BAM file -// -// ** Expand to multiple?? -// -// *************************************************************************** - -#ifndef BAMTOOLS_COUNT_H -#define BAMTOOLS_COUNT_H - -#include "bamtools_tool.h" - -namespace BamTools { - -class CountTool : public AbstractTool { - - public: - CountTool(void); - ~CountTool(void); - - public: - int Help(void); - int Run(int argc, char* argv[]); - - private: - struct CountSettings; - CountSettings* m_settings; -}; - -} // namespace BamTools - -#endif // BAMTOOLS_COUNT_H diff --git a/bamtools_coverage.cpp b/bamtools_coverage.cpp deleted file mode 100644 index b587445..0000000 --- a/bamtools_coverage.cpp +++ /dev/null @@ -1,84 +0,0 @@ -// *************************************************************************** -// bamtools_coverage.cpp (c) 2010 Derek Barnett, Erik Garrison -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 1 June 2010 -// --------------------------------------------------------------------------- -// Prints coverage statistics for a single BAM file -// -// ** Expand to multiple?? -// -// *************************************************************************** - -#include -#include -#include - -#include "bamtools_coverage.h" -#include "bamtools_options.h" -#include "BamReader.h" - -using namespace std; -using namespace BamTools; - -// --------------------------------------------- -// CoverageSettings implementation - -struct CoverageTool::CoverageSettings { - - // flags - bool HasInputBamFilename; - - // filenames - std::string InputBamFilename; - - // constructor - CoverageSettings(void) - : HasInputBamFilename(false) - , InputBamFilename(Options::StandardIn()) - { } -}; - -// --------------------------------------------- -// CoverageTool implementation - -CoverageTool::CoverageTool(void) - : AbstractTool() - , m_settings(new CoverageSettings) -{ - // set program details - Options::SetProgramInfo("bamtools coverage", "prints coverage stats for a BAM file", "-in "); - - // set up options - OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); - Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputBamFilename, m_settings->InputBamFilename, IO_Opts, Options::StandardIn()); -} - -CoverageTool::~CoverageTool(void) { - delete m_settings; - m_settings = 0; -} - -int CoverageTool::Help(void) { - Options::DisplayHelp(); - return 0; -} - -int CoverageTool::Run(int argc, char* argv[]) { - - // parse command line arguments - Options::Parse(argc, argv, 1); - - //open our BAM reader - BamReader reader; - reader.Open(m_settings->InputBamFilename); - - // generate coverage stats - cerr << "Generating coverage stats for " << m_settings->InputBamFilename << endl; - cerr << "FEATURE NOT YET IMPLEMENTED!" << endl; - - // clean & exit - reader.Close(); - return 0; -} \ No newline at end of file diff --git a/bamtools_coverage.h b/bamtools_coverage.h deleted file mode 100644 index c3714c0..0000000 --- a/bamtools_coverage.h +++ /dev/null @@ -1,38 +0,0 @@ -// *************************************************************************** -// bamtools_coverage.h (c) 2010 Derek Barnett, Erik Garrison -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 1 June 2010 -// --------------------------------------------------------------------------- -// Prints coverage statistics for a single BAM file -// -// ** Expand to multiple?? -// -// *************************************************************************** - -#ifndef BAMTOOLS_COVERAGE_H -#define BAMTOOLS_COVERAGE_H - -#include "bamtools_tool.h" - -namespace BamTools { - -class CoverageTool : public AbstractTool { - - public: - CoverageTool(void); - ~CoverageTool(void); - - public: - int Help(void); - int Run(int argc, char* argv[]); - - private: - struct CoverageSettings; - CoverageSettings* m_settings; -}; - -} // namespace BamTools - -#endif // BAMTOOLS_COVERAGE_H diff --git a/bamtools_fasta.cpp b/bamtools_fasta.cpp deleted file mode 100644 index ef1c742..0000000 --- a/bamtools_fasta.cpp +++ /dev/null @@ -1,627 +0,0 @@ -// *************************************************************************** -// bamtools_fasta.cpp (c) 2010 Derek Barnett, Erik Garrison -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 13 July 2010 -// --------------------------------------------------------------------------- -// Provides FASTA reading/indexing functionality. -// *************************************************************************** - -#include -#include -#include -#include -#include -#include -#include -#include "bamtools_fasta.h" -using namespace std; -using namespace BamTools; - -struct Fasta::FastaPrivate { - - struct FastaIndexData { - string Name; - int32_t Length; - int64_t Offset; - int32_t LineLength; - int32_t ByteLength; // LineLength + newline character(s) - varies on OS where file was generated - }; - - // data members - FILE* Stream; - bool IsOpen; - - FILE* IndexStream; - bool HasIndex; - bool IsIndexOpen; - - vector Index; - - // ctor - FastaPrivate(void); - ~FastaPrivate(void); - - // 'public' API methods - bool Close(void); - bool CreateIndex(const string& indexFilename); - bool GetBase(const int& refId, const int& position, char& base); - bool GetSequence(const int& refId, const int& start, const int& stop, string& sequence); - bool Open(const string& filename, const string& indexFilename); - - // internal methods - private: - void Chomp(char* sequence); - bool GetNameFromHeader(const string& header, string& name); - bool GetNextHeader(string& header); - bool GetNextSequence(string& sequence); - bool LoadIndexData(void); - bool Rewind(void); - bool WriteIndexData(void); -}; - -Fasta::FastaPrivate::FastaPrivate(void) - : IsOpen(false) - , HasIndex(false) - , IsIndexOpen(false) -{ } - -Fasta::FastaPrivate::~FastaPrivate(void) { - Close(); -} - -// remove any trailing newlines -void Fasta::FastaPrivate::Chomp(char* sequence) { - - static const int CHAR_LF = 10; - static const int CHAR_CR = 13; - - size_t seqLength = strlen(sequence); - if ( seqLength == 0 ) return; - --seqLength; // ignore null terminator - - while ( sequence[seqLength] == CHAR_LF || - sequence[seqLength] == CHAR_CR - ) - { - sequence[seqLength] = 0; - --seqLength; - if (seqLength < 0) - break; - } -} - -bool Fasta::FastaPrivate::Close(void) { - - // close fasta file - if ( IsOpen ) { - fclose(Stream); - IsOpen = false; - } - - // close index file - if ( HasIndex && IsIndexOpen ) { - fclose(IndexStream); - HasIndex = false; - IsIndexOpen = false; - } - - // return success - return true; -} - -bool Fasta::FastaPrivate::CreateIndex(const string& indexFilename) { - - // check that file is open - if ( !IsOpen ) { - cerr << "FASTA error : cannot create index, FASTA file not open" << endl; - return false; - } - - // rewind FASTA file - if ( !Rewind() ) { - cerr << "FASTA error : could not rewind FASTA file" << endl; - return false; - } - - // clear out prior index data - Index.clear(); - - // ------------------------------------------- - // calculate lineLength & byteLength - - int lineLength = 0; - int byteLength = 0; - - // skip over header - char buffer[1024]; - if ( fgets(buffer, 1024, Stream) == 0 ) { - cerr << "FASTA error : could not read from file" << endl; - return false; - } - if ( feof(Stream) ) return false; - if ( buffer[0] != '>' ) { - cerr << "FASTA error : expected header ('>'), instead : " << buffer[0] << endl; - return false; - } - - // read in first line of sequence - char c = fgetc(Stream); - while ( (c >= 0) && (c != '\n') ) { - ++byteLength; - if (isgraph(c)) ++lineLength; - c = fgetc(Stream); - } - ++byteLength; // store newline - - // rewind FASTA file - if ( !Rewind() ) { - cerr << "FASTA error : could not rewind FASTA file" << endl; - return false; - } - - // iterate through fasta entries - int currentId = 0; - string header = ""; - string sequence = ""; - while ( GetNextHeader(header) ) { - - // --------------------------- - // build index entry data - FastaIndexData data; - - // store file offset of beginning of DNA sequence (after header) - data.Offset = ftello(Stream); - - // parse header, store sequence name in data.Name - if ( !GetNameFromHeader(header, data.Name) ) { - cerr << "FASTA error : could not parse read name from FASTA header" << endl; - return false; - } - - // retrieve FASTA sequence - if ( !GetNextSequence(sequence) ) { - cerr << "FASTA error : could not read in next sequence from FASTA file" << endl; - return false; - } - - // store sequence length & line/byte lengths - data.Length = sequence.length(); - data.LineLength = lineLength; - data.ByteLength = byteLength; - - // store index entry - Index.push_back(data); - - // update ref Id - ++currentId; - } - - // open index file - if ( !indexFilename.empty() ) { - IndexStream = fopen(indexFilename.c_str(), "wb"); - if ( !IndexStream ) { - cerr << "FASTA error : Could not open " << indexFilename << " for writing." << endl; - return false; - } - IsIndexOpen = true; - } - - // write index data - if ( !WriteIndexData() ) return false; - HasIndex = true; - - // close index file - fclose(IndexStream); - IsIndexOpen = false; - - // return succes status - return true; -} - -bool Fasta::FastaPrivate::GetBase(const int& refId, const int& position, char& base) { - - // make sure FASTA file is open - if ( !IsOpen ) { - cerr << "FASTA error : file not open for reading" << endl; - return false; - } - - // use index if available - if ( HasIndex && !Index.empty() ) { - - // validate reference id - if ( (refId < 0) || (refId >= (int)Index.size()) ) { - cerr << "FASTA error: invalid refId specified: " << refId << endl; - return false; - } - - // retrieve reference index data - const FastaIndexData& referenceData = Index.at(refId); - - // validate position - if ( (position < 0) || (position > referenceData.Length) ) { - cerr << "FASTA error: invalid position specified: " << position << endl; - return false; - } - - // seek to beginning of sequence data - if ( fseeko(Stream, referenceData.Offset, SEEK_SET) != 0 ) { - cerr << "FASTA error : could not sek in file" << endl; - return false; - } - - // retrieve sequence - string sequence = ""; - if ( !GetNextSequence(sequence) ) { - cerr << "FASTA error : could not retrieve base from FASTA file" << endl; - return false; - } - - // set base & return success - base = sequence.at(position); - return true; - } - - // else plow through sequentially - else { - - // rewind FASTA file - if ( !Rewind() ) { - cerr << "FASTA error : could not rewind FASTA file" << endl; - return false; - } - - // iterate through fasta entries - int currentId = 0; - string header = ""; - string sequence = ""; - - // get first entry - GetNextHeader(header); - GetNextSequence(sequence); - - while ( currentId != refId ) { - GetNextHeader(header); - GetNextSequence(sequence); - ++currentId; - } - - // get desired base from sequence - // TODO: error reporting on invalid position - if ( currentId == refId && (sequence.length() >= (size_t)position) ) { - base = sequence.at(position); - return true; - } - - // could not get sequence - return false; - } - - // return success - return true; -} - -bool Fasta::FastaPrivate::GetNameFromHeader(const string& header, string& name) { - - // get rid of the leading greater than sign - string s = header.substr(1); - - // extract the first non-whitespace segment - char* pName = (char*)s.data(); - unsigned int nameLen = (unsigned int)s.size(); - - unsigned int start = 0; - while ( (pName[start] == 32) || (pName[start] == 9) || (pName[start] == 10) || (pName[start] == 13) ) { - start++; - if ( start == nameLen ) - break; - } - - unsigned int stop = start; - if ( stop < nameLen ) { - while( (pName[stop] != 32) && (pName[stop] != 9) && (pName[stop] != 10) && (pName[stop] != 13) ) { - stop++; - if ( stop == nameLen ) - break; - } - } - - if ( start == stop ) { - cerr << "FASTA error : could not parse read name from FASTA header" << endl; - return false; - } - - name = s.substr(start, stop - start).c_str(); - return true; -} - -bool Fasta::FastaPrivate::GetNextHeader(string& header) { - - // validate input stream - if ( !IsOpen || feof(Stream) ) - return false; - - // read in header line - char buffer[1024]; - if ( fgets(buffer, 1024, Stream) == 0 ) { - cerr << "FASTA error : could not read from file" << endl; - return false; - } - - // make sure it's a FASTA header - if ( buffer[0] != '>' ) { - cerr << "FASTA error : expected header ('>'), instead : " << buffer[0] << endl; - return false; - } - - // import buffer contents to header string - stringstream headerBuffer(""); - headerBuffer << buffer; - header = headerBuffer.str(); - - // return success - return true; -} - -bool Fasta::FastaPrivate::GetNextSequence(string& sequence) { - - // validate input stream - if ( !IsOpen || feof(Stream) ) - return false; - - // read in sequence - char buffer[1024]; - ostringstream seqBuffer(""); - while(true) { - - char ch = fgetc(Stream); - ungetc(ch, Stream); - if( (ch == '>') || feof(Stream) ) - break; - - if ( fgets(buffer, 1024, Stream) == 0 ) { - cerr << "FASTA error : could not read from file" << endl; - return false; - } - - Chomp(buffer); - seqBuffer << buffer; - } - - // import buffer contents to sequence string - sequence = seqBuffer.str(); - - // return success - return true; -} - -bool Fasta::FastaPrivate::GetSequence(const int& refId, const int& start, const int& stop, string& sequence) { - - // make sure FASTA file is open - if ( !IsOpen ) { - cerr << "FASTA error : file not open for reading" << endl; - return false; - } - - // use index if available - if ( HasIndex && !Index.empty() ) { - - // validate reference id - if ( (refId < 0) || (refId >= (int)Index.size()) ) { - cerr << "FASTA error: invalid refId specified: " << refId << endl; - return false; - } - - // retrieve reference index data - const FastaIndexData& referenceData = Index.at(refId); - - // validate stop position - if ( (start < 0) || (start > stop) || (stop > referenceData.Length) ) { - cerr << "FASTA error: invalid start/stop positions specified: " << start << ", " << stop << endl; - return false; - } - - // seek to beginning of sequence data - if ( fseeko(Stream, referenceData.Offset, SEEK_SET) != 0 ) { - cerr << "FASTA error : could not sek in file" << endl; - return false; - } - - // retrieve full sequence - string fullSequence = ""; - if ( !GetNextSequence(fullSequence) ) { - cerr << "FASTA error : could not retrieve sequence from FASTA file" << endl; - return false; - } - - // set sub-sequence & return success - const int seqLength = (stop - start) + 1; - sequence = fullSequence.substr(start, seqLength); - return true; - } - - // else plow through sequentially - else { - - // rewind FASTA file - if ( !Rewind() ) { - cerr << "FASTA error : could not rewind FASTA file" << endl; - return false; - } - - // iterate through fasta entries - int currentId = 0; - string header = ""; - string fullSequence = ""; - - // get first entry - GetNextHeader(header); - GetNextSequence(fullSequence); - - while ( currentId != refId ) { - GetNextHeader(header); - GetNextSequence(fullSequence); - ++currentId; - } - - // get desired substring from sequence - // TODO: error reporting on invalid start/stop positions - if ( currentId == refId && (fullSequence.length() >= (size_t)stop) ) { - const int seqLength = (stop - start) + 1; - sequence = fullSequence.substr(start, seqLength); - return true; - } - - // could not get sequence - return false; - } - - // return success - return true; -} - -bool Fasta::FastaPrivate::LoadIndexData(void) { - - // skip if no index file available - if ( !IsIndexOpen ) return false; - - // clear any prior index data - Index.clear(); - - char buffer[1024]; - stringstream indexBuffer; - while ( true ) { - - char c = fgetc(IndexStream); - if ( (c == '\n') || feof(IndexStream) ) break; - ungetc(c, IndexStream); - - // clear index buffer - indexBuffer.str(""); - - // read line from index file - if ( fgets(buffer, 1024, IndexStream) == 0 ) { - cerr << "FASTA LoadIndexData() error : could not read from index file" << endl; - HasIndex = false; - return false; - } - - // store line in indexBuffer - indexBuffer << buffer; - - // retrieve fasta index data from line - FastaIndexData data; - indexBuffer >> data.Name; - indexBuffer >> data.Length; - indexBuffer >> data.Offset; - indexBuffer >> data.LineLength; - indexBuffer >> data.ByteLength; - - // store index entry - Index.push_back(data); - } - - return true; -} - -bool Fasta::FastaPrivate::Open(const string& filename, const string& indexFilename) { - - bool success = true; - - // open FASTA filename - Stream = fopen(filename.c_str(), "rb"); - if ( !Stream ) { - cerr << "FASTA error: Could not open " << filename << " for reading" << endl; - return false; - } - IsOpen = true; - success &= IsOpen; - - // open index file if it exists - if ( !indexFilename.empty() ) { - IndexStream = fopen(indexFilename.c_str(), "rb"); - if ( !IndexStream ) { - cerr << "FASTA error : Could not open " << indexFilename << " for reading." << endl; - return false; - } - IsIndexOpen = true; - success &= IsIndexOpen; - - // attempt to load index data - HasIndex = LoadIndexData(); - success &= HasIndex; - } - - // return success status - return success; -} - -bool Fasta::FastaPrivate::Rewind(void) { - if ( !IsOpen ) return false; - return ( fseeko(Stream, 0, SEEK_SET) == 0 ); -} - -bool Fasta::FastaPrivate::WriteIndexData(void) { - - // skip if no index file available - if ( !IsIndexOpen ) return false; - - // iterate over index entries - bool success = true; - stringstream indexBuffer; - vector::const_iterator indexIter = Index.begin(); - vector::const_iterator indexEnd = Index.end(); - for ( ; indexIter != indexEnd; ++indexIter ) { - - // clear stream - indexBuffer.str(""); - - // write data to stream - const FastaIndexData& data = (*indexIter); - indexBuffer << data.Name << "\t" - << data.Length << "\t" - << data.Offset << "\t" - << data.LineLength << "\t" - << data.ByteLength << endl; - - // write stream to file - success &= ( fputs(indexBuffer.str().c_str(), IndexStream) >= 0 ); - } - - // return success status - return success; -} - -// -------------------------------- -// Fasta implementation - -Fasta::Fasta(void) { - d = new FastaPrivate; -} - -Fasta::~Fasta(void) { - delete d; - d = 0; -} - -bool Fasta::Close(void) { - return d->Close(); -} - -bool Fasta::CreateIndex(const string& indexFilename) { - return d->CreateIndex(indexFilename); -} - -bool Fasta::GetBase(const int& refId, const int& position, char& base) { - return d->GetBase(refId, position, base); -} - -bool Fasta::GetSequence(const int& refId, const int& start, const int& stop, string& sequence) { - return d->GetSequence(refId, start, stop, sequence); -} - -bool Fasta::Open(const string& filename, const string& indexFilename) { - return d->Open(filename, indexFilename); -} diff --git a/bamtools_fasta.h b/bamtools_fasta.h deleted file mode 100644 index f64a087..0000000 --- a/bamtools_fasta.h +++ /dev/null @@ -1,47 +0,0 @@ -// *************************************************************************** -// bamtools_fasta.h (c) 2010 Derek Barnett, Erik Garrison -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 13 July 2010 -// --------------------------------------------------------------------------- -// Provides FASTA reading/indexing functionality. -// *************************************************************************** - -#ifndef BAMTOOLS_FASTA_H -#define BAMTOOLS_FASTA_H - -#include - -namespace BamTools { - -class Fasta { - - // ctor & dtor - public: - Fasta(void); - ~Fasta(void); - - // file-handling methods - public: - bool Close(void); - bool Open(const std::string& filename, const std::string& indexFilename = ""); - - // sequence access methods - public: - bool GetBase(const int& refID, const int& position, char& base); - bool GetSequence(const int& refId, const int& start, const int& stop, std::string& sequence); - - // index-handling methods - public: - bool CreateIndex(const std::string& indexFilename); - - // internal implementation - private: - struct FastaPrivate; - FastaPrivate* d; -}; - -} // BAMTOOLS_FASTA_H - -#endif // BAMTOOLS_FASTA_H \ No newline at end of file diff --git a/bamtools_filter.cpp b/bamtools_filter.cpp deleted file mode 100644 index 2249022..0000000 --- a/bamtools_filter.cpp +++ /dev/null @@ -1,89 +0,0 @@ -// *************************************************************************** -// bamtools_filter.cpp (c) 2010 Derek Barnett, Erik Garrison -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 1 June 2010 -// --------------------------------------------------------------------------- -// Filters a single BAM file (or filters multiple BAM files and merges) -// according to some user-specified criteria. -// *************************************************************************** - -#include -#include -#include - -#include "bamtools_filter.h" -#include "bamtools_options.h" -#include "BamReader.h" -#include "BamMultiReader.h" - -using namespace std; -using namespace BamTools; - -// --------------------------------------------- -// FilterSettings implementation - -struct FilterTool::FilterSettings { - - // flags - bool HasInputBamFilename; - bool HasOutputBamFilename; - - // filenames - vector InputFiles; - string OutputFilename; - - // constructor - FilterSettings(void) - : HasInputBamFilename(false) - , HasOutputBamFilename(false) - , OutputFilename(Options::StandardOut()) - { } -}; - -// --------------------------------------------- -// FilterTool implementation - -FilterTool::FilterTool(void) - : AbstractTool() - , m_settings(new FilterSettings) -{ - // set program details - Options::SetProgramInfo("bamtools filter", "filters BAM file(s)", "-in [-in ... ] -out "); - - // set up options - OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); - Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInputBamFilename, m_settings->InputFiles, IO_Opts, Options::StandardIn()); - Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts, Options::StandardOut()); -} - -FilterTool::~FilterTool(void) { - delete m_settings; - m_settings = 0; -} - -int FilterTool::Help(void) { - Options::DisplayHelp(); - return 0; -} - -int FilterTool::Run(int argc, char* argv[]) { - - // parse command line arguments - Options::Parse(argc, argv, 1); - - // set to default input if none provided - if ( !m_settings->HasInputBamFilename ) - m_settings->InputFiles.push_back(Options::StandardIn()); - - // open files - BamMultiReader reader; - reader.Open(m_settings->InputFiles, false); - - // do filtering - - // clean up & exit - reader.Close(); - return 0; -} \ No newline at end of file diff --git a/bamtools_filter.h b/bamtools_filter.h deleted file mode 100644 index fe8728b..0000000 --- a/bamtools_filter.h +++ /dev/null @@ -1,36 +0,0 @@ -// *************************************************************************** -// bamtools_filter.h (c) 2010 Derek Barnett, Erik Garrison -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 2 June 2010 -// --------------------------------------------------------------------------- -// Filters a single BAM file (or filters multiple BAM files and merges) -// according to some user-specified criteria. -// *************************************************************************** - -#ifndef BAMTOOLS_FILTER_H -#define BAMTOOLS_FILTER_H - -#include "bamtools_tool.h" - -namespace BamTools { - -class FilterTool : public AbstractTool { - - public: - FilterTool(void); - ~FilterTool(void); - - public: - int Help(void); - int Run(int argc, char* argv[]); - - private: - struct FilterSettings; - FilterSettings* m_settings; -}; - -} // namespace BamTools - -#endif // BAMTOOLS_FILTER_H diff --git a/bamtools_header.cpp b/bamtools_header.cpp deleted file mode 100644 index b75f9a0..0000000 --- a/bamtools_header.cpp +++ /dev/null @@ -1,85 +0,0 @@ -// *************************************************************************** -// bamtools_header.cpp (c) 2010 Derek Barnett, Erik Garrison -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 1 June 2010 -// --------------------------------------------------------------------------- -// Prints the SAM-style header from a single BAM file ( or merged header from -// multiple BAM files) to stdout -// *************************************************************************** - -#include -#include -#include - -#include "bamtools_header.h" -#include "bamtools_options.h" -#include "BamReader.h" -#include "BamMultiReader.h" - -using namespace std; -using namespace BamTools; - -// --------------------------------------------- -// HeaderSettings implementation - -struct HeaderTool::HeaderSettings { - - // flags - bool HasInputBamFilename; - - // filenames - vector InputFiles; - - // constructor - HeaderSettings(void) - : HasInputBamFilename(false) - { } -}; - -// --------------------------------------------- -// HeaderTool implementation - -HeaderTool::HeaderTool(void) - : AbstractTool() - , m_settings(new HeaderSettings) -{ - // set program details - Options::SetProgramInfo("bamtools header", "prints header from BAM file(s)", "-in [-in ... ] "); - - // set up options - OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); - Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInputBamFilename, m_settings->InputFiles, IO_Opts, Options::StandardIn()); -} - -HeaderTool::~HeaderTool(void) { - delete m_settings; - m_settings = 0; -} - -int HeaderTool::Help(void) { - Options::DisplayHelp(); - return 0; -} - -int HeaderTool::Run(int argc, char* argv[]) { - - // parse command line arguments - Options::Parse(argc, argv, 1); - - // set to default input if none provided - if ( !m_settings->HasInputBamFilename ) - m_settings->InputFiles.push_back(Options::StandardIn()); - - // open files - BamMultiReader reader; - if ( reader.Open(m_settings->InputFiles, false) ) { - // dump header contents to stdout - cout << reader.GetHeaderText() << endl; - } - - // clean up & exit - reader.Close(); - return 0; -} \ No newline at end of file diff --git a/bamtools_header.h b/bamtools_header.h deleted file mode 100644 index c52e090..0000000 --- a/bamtools_header.h +++ /dev/null @@ -1,36 +0,0 @@ -// *************************************************************************** -// bamtools_header.h (c) 2010 Derek Barnett, Erik Garrison -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 1 June 2010 -// --------------------------------------------------------------------------- -// Prints the SAM-style header from a single BAM file ( or merged header from -// multiple BAM files) to stdout -// *************************************************************************** - -#ifndef BAMTOOLS_HEADER_H -#define BAMTOOLS_HEADER_H - -#include "bamtools_tool.h" - -namespace BamTools { - -class HeaderTool : public AbstractTool { - - public: - HeaderTool(void); - ~HeaderTool(void); - - public: - int Help(void); - int Run(int argc, char* argv[]); - - private: - struct HeaderSettings; - HeaderSettings* m_settings; -}; - -} // namespace BamTools - -#endif // BAMTOOLS_HEADER_H diff --git a/bamtools_index.cpp b/bamtools_index.cpp deleted file mode 100644 index 41dd5c7..0000000 --- a/bamtools_index.cpp +++ /dev/null @@ -1,83 +0,0 @@ -// *************************************************************************** -// bamtools_index.cpp (c) 2010 Derek Barnett, Erik Garrison -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 7 July 2010 -// --------------------------------------------------------------------------- -// Creates a BAM index (".bai") file for the provided BAM file. -// *************************************************************************** - -#include -#include - -#include "bamtools_index.h" -#include "bamtools_options.h" -#include "BamReader.h" - -using namespace std; -using namespace BamTools; - -// --------------------------------------------- -// IndexSettings implementation - -struct IndexTool::IndexSettings { - - // flags - bool HasInputBamFilename; - bool IsUsingBamtoolsIndex; - - // filenames - string InputBamFilename; - - // constructor - IndexSettings(void) - : HasInputBamFilename(false) - , IsUsingBamtoolsIndex(false) - , InputBamFilename(Options::StandardIn()) - { } -}; - -// --------------------------------------------- -// IndexTool implementation - -IndexTool::IndexTool(void) - : AbstractTool() - , m_settings(new IndexSettings) -{ - // set program details - Options::SetProgramInfo("bamtools index", "creates index for BAM file", "[-in ] [-bti]"); - - // set up options - OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); - Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputBamFilename, m_settings->InputBamFilename, IO_Opts, Options::StandardIn()); - Options::AddOption("-bti", "use (non-standard) BamTools indexing scheme", m_settings->IsUsingBamtoolsIndex, IO_Opts); -} - -IndexTool::~IndexTool(void) { - delete m_settings; - m_settings = 0; -} - -int IndexTool::Help(void) { - Options::DisplayHelp(); - return 0; -} - -int IndexTool::Run(int argc, char* argv[]) { - - // parse command line arguments - Options::Parse(argc, argv, 1); - - // open our BAM reader - BamReader reader; - reader.Open(m_settings->InputBamFilename); - - // create index for BAM file - bool useDefaultIndex = !m_settings->IsUsingBamtoolsIndex; - reader.CreateIndex(useDefaultIndex); - - // clean & exit - reader.Close(); - return 0; -} diff --git a/bamtools_index.h b/bamtools_index.h deleted file mode 100644 index bb6d893..0000000 --- a/bamtools_index.h +++ /dev/null @@ -1,35 +0,0 @@ -// *************************************************************************** -// bamtools_index.h (c) 2010 Derek Barnett, Erik Garrison -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 1 June 2010 -// --------------------------------------------------------------------------- -// Creates a BAM index (".bai") file for the provided BAM file -// *************************************************************************** - -#ifndef BAMTOOLS_INDEX_H -#define BAMTOOLS_INDEX_H - -#include "bamtools_tool.h" - -namespace BamTools { - -class IndexTool : public AbstractTool { - - public: - IndexTool(void); - ~IndexTool(void); - - public: - int Help(void); - int Run(int argc, char* argv[]); - - private: - struct IndexSettings; - IndexSettings* m_settings; -}; - -} // namespace BamTools - -#endif // BAMTOOLS_INDEX_H diff --git a/bamtools_merge.cpp b/bamtools_merge.cpp deleted file mode 100644 index dcea172..0000000 --- a/bamtools_merge.cpp +++ /dev/null @@ -1,112 +0,0 @@ -// *************************************************************************** -// bamtools_merge.cpp (c) 2010 Derek Barnett, Erik Garrison -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 2 June 2010 -// --------------------------------------------------------------------------- -// Merges multiple BAM files into one. -// -// ** Provide selectable region? eg chr2:10000..20000 -// -// *************************************************************************** - -#include -#include -#include - -#include "bamtools_merge.h" -#include "bamtools_options.h" -#include "bamtools_utilities.h" -#include "BamMultiReader.h" -#include "BamWriter.h" - -using namespace std; -using namespace BamTools; - -// --------------------------------------------- -// MergeSettings implementation - -struct MergeTool::MergeSettings { - - // flags - bool HasInputBamFilename; - bool HasOutputBamFilename; -// bool HasRegion; - - // filenames - vector InputFiles; - - // other parameters - string OutputFilename; -// string Region; - - // constructor - MergeSettings(void) - : HasInputBamFilename(false) - , HasOutputBamFilename(false) -// , HasRegion(false) - , OutputFilename(Options::StandardOut()) - { } -}; - -// --------------------------------------------- -// MergeTool implementation - -MergeTool::MergeTool(void) - : AbstractTool() - , m_settings(new MergeSettings) -{ - // set program details - Options::SetProgramInfo("bamtools merge", "merges multiple BAM files into one", "[-in -in ...] [-out ]"); - - // set up options - OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); - Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInputBamFilename, m_settings->InputFiles, IO_Opts); - Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts); - -// OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters"); -// Options::AddValueOption("-region", "REGION", "genomic region. See README for more details", "", m_settings->HasRegion, m_settings->Region, FilterOpts); -} - -MergeTool::~MergeTool(void) { - delete m_settings; - m_settings = 0; -} - -int MergeTool::Help(void) { - Options::DisplayHelp(); - return 0; -} - -int MergeTool::Run(int argc, char* argv[]) { - - // parse command line arguments - Options::Parse(argc, argv, 1); - - // set to default input if none provided - if ( !m_settings->HasInputBamFilename ) m_settings->InputFiles.push_back(Options::StandardIn()); - - // opens the BAM files without checking for indexes - BamMultiReader reader; - reader.Open(m_settings->InputFiles, false, true); - - // retrieve header & reference dictionary info - std::string mergedHeader = reader.GetHeaderText(); - RefVector references = reader.GetReferenceData(); - - // open BamWriter - BamWriter writer; - writer.Open(m_settings->OutputFilename, mergedHeader, references); - - // store alignments to output file - BamAlignment bAlignment; - while (reader.GetNextAlignmentCore(bAlignment)) { - writer.SaveAlignment(bAlignment); - } - - // clean & exit - reader.Close(); - writer.Close(); - return 0; -} diff --git a/bamtools_merge.h b/bamtools_merge.h deleted file mode 100644 index 2962ce8..0000000 --- a/bamtools_merge.h +++ /dev/null @@ -1,35 +0,0 @@ -// *************************************************************************** -// bamtools_merge.h (c) 2010 Derek Barnett, Erik Garrison -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 1 June 2010 -// --------------------------------------------------------------------------- -// Merges multiple BAM files into one -// *************************************************************************** - -#ifndef BAMTOOLS_MERGE_H -#define BAMTOOLS_MERGE_H - -#include "bamtools_tool.h" - -namespace BamTools { - -class MergeTool : public AbstractTool { - - public: - MergeTool(void); - ~MergeTool(void); - - public: - int Help(void); - int Run(int argc, char* argv[]); - - private: - struct MergeSettings; - MergeSettings* m_settings; -}; - -} // namespace BamTools - -#endif // BAMTOOLS_MERGE_H diff --git a/bamtools_options.cpp b/bamtools_options.cpp deleted file mode 100644 index 931fbd8..0000000 --- a/bamtools_options.cpp +++ /dev/null @@ -1,277 +0,0 @@ -// *************************************************************************** -// bamtools_options.cpp (c) 2010 Derek Barnett, Erik Garrison -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 2 June 2010 -// --------------------------------------------------------------------------- -// Parses command line arguments and creates a help menu -// --------------------------------------------------------------------------- -// Modified from: -// The Mosaik suite's command line parser class: COptions -// (c) 2006 - 2009 Michael Str�mberg -// Marth Lab, Department of Biology, Boston College -// Dual licenced under the GNU General Public License 2.0+ license or as -// a commercial license with the Marth Lab. -// -// * Modified slightly to fit BamTools, otherwise code is same. -// * (BamTools namespace, added stdin/stdout) (DB) -// *************************************************************************** - -#include "bamtools_options.h" -#include -#include -#include -#include -#include -using namespace std; -using namespace BamTools; - -string Options::m_programName; // the program name -string Options::m_description; // the main description -string Options::m_exampleArguments; // the example arguments -vector Options::m_optionGroups; // stores the option groups -map Options::m_optionsMap; // stores the options in a map -string Options::m_stdin = "stdin"; // string representation of stdin -string Options::m_stdout = "stdout"; // string representation of stdout - -// adds a simple option to the parser -void Options::AddOption(const string& argument, const string& optionDescription, bool& foundArgument, OptionGroup* group) { - - Option o; - o.Argument = argument; - o.Description = optionDescription; - o.StoreValue = false; - group->Options.push_back(o); - - OptionValue ov; - ov.pFoundArgument = &foundArgument; - ov.StoreValue = false; - - m_optionsMap[argument] = ov; -} - -// creates an option group -OptionGroup* Options::CreateOptionGroup(const string& groupName) { - OptionGroup og; - og.Name = groupName; - m_optionGroups.push_back(og); - return &m_optionGroups[m_optionGroups.size() - 1]; -} - -// displays the help menu -void Options::DisplayHelp(void) { - - // initialize - char argumentBuffer[ARGUMENT_LENGTH + 1]; - ostringstream sb; - - char indentBuffer[MAX_LINE_LENGTH - DESC_LENGTH + 1]; - memset(indentBuffer, ' ', MAX_LINE_LENGTH - DESC_LENGTH); - indentBuffer[MAX_LINE_LENGTH - DESC_LENGTH] = 0; - - // display the menu - printf("Description: %s.\n\n", m_description.c_str()); - printf("Usage: "); - printf("%s", m_programName.c_str()); - printf(" %s\n\n", m_exampleArguments.c_str()); - - vector