+++ /dev/null
-// ***************************************************************************\r
-// BGZF.cpp (c) 2009 Derek Barnett, Michael Strömberg\r
-// Marth Lab, Department of Biology, Boston College\r
-// All rights reserved.\r
-// ---------------------------------------------------------------------------\r
-// Last modified: 16 August 2010 (DB)\r
-// ---------------------------------------------------------------------------\r
-// BGZF routines were adapted from the bgzf.c code developed at the Broad\r
-// Institute.\r
-// ---------------------------------------------------------------------------\r
-// Provides the basic functionality for reading & writing BGZF files\r
-// ***************************************************************************\r
-\r
-#include <algorithm>\r
-#include "BGZF.h"\r
-using namespace BamTools;\r
-using std::string;\r
-using std::min;\r
-\r
-// constructor: starts with no file attached and allocates both block buffers\r
-BgzfData::BgzfData(void)\r
-    : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)\r
-    , CompressedBlockSize(MAX_BLOCK_SIZE)\r
-    , BlockLength(0)\r
-    , BlockOffset(0)\r
-    , BlockAddress(0)\r
-    , IsOpen(false)\r
-    , IsWriteOnly(false)\r
-    , IsWriteUncompressed(false)\r
-    , Stream(NULL)\r
-    , UncompressedBlock(NULL)\r
-    , CompressedBlock(NULL)\r
-{\r
-    // the compressed buffer is allocated first, then the uncompressed one;\r
-    // an allocation failure is reported and terminates the process\r
-    try {\r
-        CompressedBlock   = new char[CompressedBlockSize];\r
-        UncompressedBlock = new char[UncompressedBlockSize];\r
-    }\r
-    catch ( std::bad_alloc& ) {\r
-        printf("BGZF ERROR: unable to allocate memory for our BGZF object.\n");\r
-        exit(1);\r
-    }\r
-}\r
-\r
-// destructor: releases both block buffers\r
-// (the stream itself is not closed here - see Close())\r
-BgzfData::~BgzfData(void) {\r
-    // delete[] on a null pointer is a no-op, so no guard is required\r
-    delete[] CompressedBlock;\r
-    delete[] UncompressedBlock;\r
-}\r
-\r
-// closes BGZF file\r
-void BgzfData::Close(void) {\r
-\r
- // skip if file not open, otherwise set flag\r
- if ( !IsOpen ) return;\r
-\r
- // if writing to file, flush the current BGZF block,\r
- // then write an empty block (as EOF marker)\r
- if ( IsWriteOnly ) {\r
- FlushBlock();\r
- int blockLength = DeflateBlock();\r
- fwrite(CompressedBlock, 1, blockLength, Stream);\r
- }\r
- \r
- // flush and close\r
- fflush(Stream);\r
- fclose(Stream);\r
- IsWriteUncompressed = false;\r
- IsOpen = false;\r
-}\r
-\r
-// compresses the current block\r
-int BgzfData::DeflateBlock(void) {\r
-\r
- // initialize the gzip header\r
- char* buffer = CompressedBlock;\r
- memset(buffer, 0, 18);\r
- buffer[0] = GZIP_ID1;\r
- buffer[1] = (char)GZIP_ID2;\r
- buffer[2] = CM_DEFLATE;\r
- buffer[3] = FLG_FEXTRA;\r
- buffer[9] = (char)OS_UNKNOWN;\r
- buffer[10] = BGZF_XLEN;\r
- buffer[12] = BGZF_ID1;\r
- buffer[13] = BGZF_ID2;\r
- buffer[14] = BGZF_LEN;\r
-\r
- // set compression level\r
- const int compressionLevel = ( IsWriteUncompressed ? 0 : Z_DEFAULT_COMPRESSION );\r
- \r
- // loop to retry for blocks that do not compress enough\r
- int inputLength = BlockOffset;\r
- int compressedLength = 0;\r
- unsigned int bufferSize = CompressedBlockSize;\r
-\r
- while ( true ) {\r
- \r
- // initialize zstream values\r
- z_stream zs;\r
- zs.zalloc = NULL;\r
- zs.zfree = NULL;\r
- zs.next_in = (Bytef*)UncompressedBlock;\r
- zs.avail_in = inputLength;\r
- zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];\r
- zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;\r
-\r
- // initialize the zlib compression algorithm\r
- if ( deflateInit2(&zs, compressionLevel, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK ) {\r
- printf("BGZF ERROR: zlib deflate initialization failed.\n");\r
- exit(1);\r
- }\r
-\r
- // compress the data\r
- int status = deflate(&zs, Z_FINISH);\r
- if ( status != Z_STREAM_END ) {\r
-\r
- deflateEnd(&zs);\r
-\r
- // reduce the input length and try again\r
- if ( status == Z_OK ) {\r
- inputLength -= 1024;\r
- if( inputLength < 0 ) {\r
- printf("BGZF ERROR: input reduction failed.\n");\r
- exit(1);\r
- }\r
- continue;\r
- }\r
-\r
- printf("BGZF ERROR: zlib::deflateEnd() failed.\n");\r
- exit(1);\r
- }\r
-\r
- // finalize the compression routine\r
- if ( deflateEnd(&zs) != Z_OK ) {\r
- printf("BGZF ERROR: zlib::deflateEnd() failed.\n");\r
- exit(1);\r
- }\r
-\r
- compressedLength = zs.total_out;\r
- compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;\r
- if ( compressedLength > MAX_BLOCK_SIZE ) {\r
- printf("BGZF ERROR: deflate overflow.\n");\r
- exit(1);\r
- }\r
-\r
- break;\r
- }\r
-\r
- // store the compressed length\r
- BgzfData::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));\r
-\r
- // store the CRC32 checksum\r
- unsigned int crc = crc32(0, NULL, 0);\r
- crc = crc32(crc, (Bytef*)UncompressedBlock, inputLength);\r
- BgzfData::PackUnsignedInt(&buffer[compressedLength - 8], crc);\r
- BgzfData::PackUnsignedInt(&buffer[compressedLength - 4], inputLength);\r
-\r
- // ensure that we have less than a block of data left\r
- int remaining = BlockOffset - inputLength;\r
- if ( remaining > 0 ) {\r
- if ( remaining > inputLength ) {\r
- printf("BGZF ERROR: after deflate, remainder too large.\n");\r
- exit(1);\r
- }\r
- memcpy(UncompressedBlock, UncompressedBlock + inputLength, remaining);\r
- }\r
-\r
- BlockOffset = remaining;\r
- return compressedLength;\r
-}\r
-\r
-// flushes the data in the BGZF block\r
-void BgzfData::FlushBlock(void) {\r
-\r
- // flush all of the remaining blocks\r
- while ( BlockOffset > 0 ) {\r
-\r
- // compress the data block\r
- int blockLength = DeflateBlock();\r
-\r
- // flush the data to our output stream\r
- int numBytesWritten = fwrite(CompressedBlock, 1, blockLength, Stream);\r
-\r
- if ( numBytesWritten != blockLength ) {\r
- printf("BGZF ERROR: expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten);\r
- exit(1);\r
- }\r
- \r
- BlockAddress += blockLength;\r
- }\r
-}\r
-\r
-// de-compresses the current block\r
-//\r
-// Inflates the block held in CompressedBlock into UncompressedBlock.\r
-// blockLength is the total length of the compressed block, header included.\r
-// Returns the number of uncompressed bytes produced, or -1 if any zlib call\r
-// fails (a message is printed in that case).\r
-int BgzfData::InflateBlock(const int& blockLength) {\r
-\r
-    // set up the zlib stream: input starts right after the 18-byte header\r
-    // (zalloc, zfree and opaque must all be set before inflateInit2)\r
-    z_stream zs;\r
-    zs.zalloc    = NULL;\r
-    zs.zfree     = NULL;\r
-    zs.opaque    = NULL;\r
-    zs.next_in   = (Bytef*)CompressedBlock + 18;\r
-    zs.avail_in  = blockLength - 16;\r
-    zs.next_out  = (Bytef*)UncompressedBlock;\r
-    zs.avail_out = UncompressedBlockSize;\r
-\r
-    int status = inflateInit2(&zs, GZIP_WINDOW_BITS);\r
-    if ( status != Z_OK ) {\r
-        printf("BGZF ERROR: could not decompress block - zlib::inflateInit() failed\n");\r
-        return -1;\r
-    }\r
-\r
-    // the whole block must inflate in a single call\r
-    status = inflate(&zs, Z_FINISH);\r
-    if ( status != Z_STREAM_END ) {\r
-        inflateEnd(&zs);\r
-        printf("BGZF ERROR: could not decompress block - zlib::inflate() failed\n");\r
-        return -1;\r
-    }\r
-\r
-    status = inflateEnd(&zs);\r
-    if ( status != Z_OK ) {\r
-        printf("BGZF ERROR: could not decompress block - zlib::inflateEnd() failed\n");\r
-        return -1;\r
-    }\r
-\r
-    return zs.total_out;\r
-}\r
-\r
-// opens the BGZF file (mode is either "rb" for reading, or "wb" for writing)\r
-//\r
-// filename may also be "stdin" (only with mode "rb") or "stdout" (only with\r
-// mode "wb"); any other pairing of those names with a mode is rejected.\r
-// isWriteUncompressed is stored in IsWriteUncompressed on success.\r
-// Returns true on success; on failure a message is printed, Stream is left\r
-// NULL and false is returned.\r
-bool BgzfData::Open(const string& filename, const char* mode, bool isWriteUncompressed ) {\r
-\r
-    // determine open mode\r
-    const bool isRead  = ( strcmp(mode, "rb") == 0 );\r
-    const bool isWrite = ( strcmp(mode, "wb") == 0 );\r
-    if ( !isRead && !isWrite ) {\r
-        printf("BGZF ERROR: unknown file mode: %s\n", mode);\r
-        return false;\r
-    }\r
-    IsWriteOnly = isWrite;\r
-\r
-    // ----------------------------------------------------------------\r
-    // open Stream to read to/write from file, stdin, or stdout\r
-    // stdin/stdout option contributed by Aaron Quinlan (2010-Jan-03)\r
-\r
-    // start from NULL so that an unsupported name/mode pairing\r
-    // (e.g. "stdin" with "wb") cannot pick up a stale stream\r
-    FILE* stream = NULL;\r
-\r
-    // read BGZF data from stdin\r
-    if ( filename == "stdin" ) {\r
-        if ( isRead ) stream = freopen(NULL, mode, stdin);\r
-    }\r
-\r
-    // write BGZF data to stdout\r
-    else if ( filename == "stdout" ) {\r
-        if ( isWrite ) stream = freopen(NULL, mode, stdout);\r
-    }\r
-\r
-    // read/write BGZF data to/from a file\r
-    else\r
-        stream = fopen(filename.c_str(), mode);\r
-\r
-    Stream = stream;\r
-    if ( !Stream ) {\r
-        printf("BGZF ERROR: unable to open file %s\n", filename.c_str() );\r
-        return false;\r
-    }\r
-\r
-    // set flags, return success\r
-    IsOpen = true;\r
-    IsWriteUncompressed = isWriteUncompressed;\r
-    return true;\r
-}\r
-\r
-// reads BGZF data into a byte buffer\r
-//\r
-// Copies up to dataLength uncompressed bytes into data, loading further\r
-// blocks with ReadBlock() as needed. Returns the number of bytes copied\r
-// (fewer than requested once no more data is available), 0 if the file is\r
-// not open for reading or dataLength is 0, and -1 if ReadBlock() fails.\r
-int BgzfData::Read(char* data, const unsigned int dataLength) {\r
-\r
-    if ( !IsOpen )         return 0;\r
-    if ( IsWriteOnly )     return 0;\r
-    if ( dataLength == 0 ) return 0;\r
-\r
-    char* dest = data;\r
-    unsigned int totalCopied = 0;\r
-\r
-    while ( totalCopied < dataLength ) {\r
-\r
-        // bytes still unread in the current uncompressed block\r
-        int available = BlockLength - BlockOffset;\r
-        if ( available <= 0 ) {\r
-\r
-            // current block exhausted: load the next one\r
-            if ( !ReadBlock() ) return -1;\r
-\r
-            // still nothing available means there is no more data\r
-            available = BlockLength - BlockOffset;\r
-            if ( available <= 0 ) break;\r
-        }\r
-\r
-        // copy whichever is smaller: what is still wanted, or what is available\r
-        const int wanted = (int)(dataLength - totalCopied);\r
-        const int chunk  = ( wanted < available ? wanted : available );\r
-        memcpy(dest, UncompressedBlock + BlockOffset, chunk);\r
-\r
-        BlockOffset += chunk;\r
-        dest        += chunk;\r
-        totalCopied += chunk;\r
-    }\r
-\r
-    // block fully consumed: record the address of the next block and reset\r
-    if ( BlockOffset == BlockLength ) {\r
-        BlockAddress = ftell64(Stream);\r
-        BlockOffset  = 0;\r
-        BlockLength  = 0;\r
-    }\r
-\r
-    return totalCopied;\r
-}\r
-\r
-// reads a BGZF block\r
-bool BgzfData::ReadBlock(void) {\r
-\r
- char header[BLOCK_HEADER_LENGTH];\r
- int64_t blockAddress = ftell64(Stream);\r
- \r
- int count = fread(header, 1, sizeof(header), Stream);\r
- if ( count == 0 ) {\r
- BlockLength = 0;\r
- return true;\r
- }\r
-\r
- if ( count != sizeof(header) ) {\r
- printf("BGZF ERROR: read block failed - could not read block header\n");\r
- return false;\r
- }\r
-\r
- if ( !BgzfData::CheckBlockHeader(header) ) {\r
- printf("BGZF ERROR: read block failed - invalid block header\n");\r
- return false;\r
- }\r
-\r
- int blockLength = BgzfData::UnpackUnsignedShort(&header[16]) + 1;\r
- char* compressedBlock = CompressedBlock;\r
- memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH);\r
- int remaining = blockLength - BLOCK_HEADER_LENGTH;\r
-\r
- count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, Stream);\r
- if ( count != remaining ) {\r
- printf("BGZF ERROR: read block failed - could not read data from block\n");\r
- return false;\r
- }\r
-\r
- count = InflateBlock(blockLength);\r
- if ( count < 0 ) { \r
- printf("BGZF ERROR: read block failed - could not decompress block data\n");\r
- return false;\r
- }\r
-\r
- if ( BlockLength != 0 )\r
- BlockOffset = 0;\r
-\r
- BlockAddress = blockAddress;\r
- BlockLength = count;\r
- return true;\r
-}\r
-\r
-// seek to position in BGZF file\r
-//\r
-// position packs two values: bits 16 and up hold the file address of a\r
-// compressed block, the low 16 bits hold an offset inside that block's\r
-// uncompressed data. Returns false if the file is not open or the\r
-// underlying seek fails; members are only updated on success.\r
-bool BgzfData::Seek(int64_t position) {\r
-\r
-    if ( !IsOpen ) return false;\r
-\r
-    const int64_t fileAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;\r
-    const int     withinBlock = (position & 0xFFFF);\r
-\r
-    if ( fseek64(Stream, fileAddress, SEEK_SET) != 0 ) {\r
-        printf("BGZF ERROR: unable to seek in file\n");\r
-        return false;\r
-    }\r
-\r
-    // BlockLength of 0 marks that no block is currently loaded\r
-    BlockLength  = 0;\r
-    BlockAddress = fileAddress;\r
-    BlockOffset  = withinBlock;\r
-    return true;\r
-}\r
-\r
-// get file position in BGZF file\r
-//\r
-// Returns BlockAddress shifted left by 16 bits, combined with the low\r
-// 16 bits of BlockOffset. Returns 0 if the file is not open.\r
-int64_t BgzfData::Tell(void) {\r
-    if ( !IsOpen ) return 0;\r
-    return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) );\r
-}\r
-\r
-// writes the supplied data into the BGZF buffer\r
-//\r
-// Appends dataLen bytes from data to the uncompressed block, calling\r
-// FlushBlock() every time the block becomes full. Returns the number of\r
-// bytes accepted, or 0 if the file is not open for writing.\r
-unsigned int BgzfData::Write(const char* data, const unsigned int dataLen) {\r
-\r
-    if ( !IsOpen || !IsWriteOnly ) return 0;\r
-\r
-    const unsigned int capacity = UncompressedBlockSize;\r
-    const char* src = data;\r
-    unsigned int consumed = 0;\r
-\r
-    while ( consumed < dataLen ) {\r
-\r
-        // copy as much as fits into the rest of the current block\r
-        const unsigned int room    = capacity - BlockOffset;\r
-        const unsigned int pending = dataLen - consumed;\r
-        const unsigned int chunk   = ( pending < room ? pending : room );\r
-        memcpy(UncompressedBlock + BlockOffset, src, chunk);\r
-\r
-        BlockOffset += chunk;\r
-        src         += chunk;\r
-        consumed    += chunk;\r
-\r
-        // block is full: compress it and write it out\r
-        if ( BlockOffset == capacity )\r
-            FlushBlock();\r
-    }\r
-\r
-    return consumed;\r
-}\r
+++ /dev/null
-// ***************************************************************************\r
-// BGZF.h (c) 2009 Derek Barnett, Michael Strömberg\r
-// Marth Lab, Department of Biology, Boston College\r
-// All rights reserved.\r
-// ---------------------------------------------------------------------------\r
-// Last modified: 16 August 2010 (DB)\r
-// ---------------------------------------------------------------------------\r
-// BGZF routines were adapted from the bgzf.c code developed at the Broad\r
-// Institute.\r
-// ---------------------------------------------------------------------------\r
-// Provides the basic functionality for reading & writing BGZF files\r
-// ***************************************************************************\r
-\r
-#ifndef BGZF_H\r
-#define BGZF_H\r
-\r
-// 'C' includes\r
-#include <cstdio>\r
-#include <cstdlib>\r
-#include <cstring>\r
-\r
-// C++ includes\r
-#include <string>\r
-\r
-// zlib includes\r
-#include "zlib.h"\r
-\r
-// Platform-specific large-file support\r
-// Defines ftell64()/fseek64() as thin wrappers over the platform's\r
-// 64-bit tell/seek calls. The BAMTOOLS_LFS guard keeps the macros from\r
-// being defined more than once.\r
-#ifndef BAMTOOLS_LFS\r
-#define BAMTOOLS_LFS\r
- #ifdef WIN32\r
- #define ftell64(a) _ftelli64(a)\r
- #define fseek64(a,b,c) _fseeki64(a,b,c)\r
- #else\r
- #define ftell64(a) ftello(a)\r
- #define fseek64(a,b,c) fseeko(a,b,c) \r
- #endif\r
-#endif // BAMTOOLS_LFS\r
-\r
-// Platform-specific type definitions\r
-// With _MSC_VER defined the fixed-width integer names are declared by hand;\r
-// everywhere else they come from <stdint.h>. The BAMTOOLS_TYPES guard keeps\r
-// this from being repeated when another header carries the same section.\r
-// NOTE(review): int8_t is declared as plain 'char' here, which is a distinct\r
-// type from 'signed char' - confirm this is intended before relying on it.\r
-#ifndef BAMTOOLS_TYPES\r
-#define BAMTOOLS_TYPES\r
- #ifdef _MSC_VER\r
- typedef char int8_t;\r
- typedef unsigned char uint8_t;\r
- typedef short int16_t;\r
- typedef unsigned short uint16_t;\r
- typedef int int32_t;\r
- typedef unsigned int uint32_t;\r
- typedef long long int64_t;\r
- typedef unsigned long long uint64_t;\r
- #else \r
- #include <stdint.h>\r
- #endif\r
-#endif // BAMTOOLS_TYPES\r
-\r
-namespace BamTools {\r
-\r
-// zlib constants\r
-// (values written into, and checked against, the BGZF block header,\r
-// plus the parameters handed to deflateInit2/inflateInit2)\r
-const int GZIP_ID1 = 31;\r
-const int GZIP_ID2 = 139;\r
-const int CM_DEFLATE = 8;\r
-const int FLG_FEXTRA = 4;\r
-const int OS_UNKNOWN = 255;\r
-const int BGZF_XLEN = 6;\r
-const int BGZF_ID1 = 66;\r
-const int BGZF_ID2 = 67;\r
-const int BGZF_LEN = 2;\r
-const int GZIP_WINDOW_BITS = -15;\r
-const int Z_DEFAULT_MEM_LEVEL = 8;\r
-\r
-// BGZF constants\r
-// (header/footer sizes in bytes, and the sizes used for the block buffers)\r
-const int BLOCK_HEADER_LENGTH = 18;\r
-const int BLOCK_FOOTER_LENGTH = 8;\r
-const int MAX_BLOCK_SIZE = 65536;\r
-const int DEFAULT_BLOCK_SIZE = 65536;\r
-\r
-// Reader/writer for BGZF files: owns one compressed and one uncompressed\r
-// block buffer plus the FILE* they are read from / written to.\r
-struct BgzfData {\r
-\r
- // data members\r
- public:\r
- unsigned int UncompressedBlockSize; // size of UncompressedBlock (DEFAULT_BLOCK_SIZE)\r
- unsigned int CompressedBlockSize; // size of CompressedBlock (MAX_BLOCK_SIZE)\r
- unsigned int BlockLength; // number of valid bytes in UncompressedBlock after a block was read\r
- unsigned int BlockOffset; // current read/write position inside UncompressedBlock\r
- uint64_t BlockAddress; // file address of the current block\r
- bool IsOpen; // true between a successful Open() and Close()\r
- bool IsWriteOnly; // true if opened with mode "wb"\r
- bool IsWriteUncompressed; // true if blocks are deflated with compression level 0\r
- FILE* Stream; // underlying file (or stdin/stdout) stream\r
- char* UncompressedBlock; // buffer for uncompressed block data\r
- char* CompressedBlock; // buffer for one compressed BGZF block\r
-\r
- // constructor & destructor\r
- public:\r
- BgzfData(void);\r
- ~BgzfData(void);\r
-\r
- // main interface methods\r
- public: \r
- // closes BGZF file\r
- void Close(void);\r
- // opens the BGZF file (mode is either "rb" for reading, or "wb" for writing)\r
- bool Open(const std::string& filename, const char* mode, bool isWriteUncompressed = false);\r
- // reads BGZF data into a byte buffer\r
- int Read(char* data, const unsigned int dataLength);\r
- // seek to position in BGZF file\r
- bool Seek(int64_t position);\r
- // get file position in BGZF file\r
- int64_t Tell(void);\r
- // writes the supplied data into the BGZF buffer\r
- unsigned int Write(const char* data, const unsigned int dataLen);\r
-\r
- // internal methods\r
- private:\r
- // compresses the current block\r
- int DeflateBlock(void);\r
- // flushes the data in the BGZF block\r
- void FlushBlock(void);\r
- // de-compresses the current block\r
- int InflateBlock(const int& blockLength);\r
- // reads a BGZF block\r
- bool ReadBlock(void);\r
- \r
- // static 'utility' methods\r
- public:\r
- // checks BGZF block header\r
- static inline bool CheckBlockHeader(char* header);\r
- // packs an unsigned integer into the specified buffer\r
- static inline void PackUnsignedInt(char* buffer, unsigned int value);\r
- // packs an unsigned short into the specified buffer\r
- static inline void PackUnsignedShort(char* buffer, unsigned short value);\r
- // unpacks a buffer into a double\r
- static inline double UnpackDouble(char* buffer);\r
- static inline double UnpackDouble(const char* buffer);\r
- // unpacks a buffer into a float\r
- static inline float UnpackFloat(char* buffer);\r
- static inline float UnpackFloat(const char* buffer);\r
- // unpacks a buffer into a signed int\r
- static inline signed int UnpackSignedInt(char* buffer);\r
- static inline signed int UnpackSignedInt(const char* buffer);\r
- // unpacks a buffer into a signed short\r
- static inline signed short UnpackSignedShort(char* buffer);\r
- static inline signed short UnpackSignedShort(const char* buffer);\r
- // unpacks a buffer into an unsigned int\r
- static inline unsigned int UnpackUnsignedInt(char* buffer);\r
- static inline unsigned int UnpackUnsignedInt(const char* buffer);\r
- // unpacks a buffer into an unsigned short\r
- static inline unsigned short UnpackUnsignedShort(char* buffer);\r
- static inline unsigned short UnpackUnsignedShort(const char* buffer);\r
-};\r
-\r
-// -------------------------------------------------------------\r
-// static 'utility' method implementations\r
-\r
-// checks BGZF block header\r
-// returns true only if every fixed header field holds its expected value\r
-inline\r
-bool BgzfData::CheckBlockHeader(char* header) {\r
-\r
-    // gzip identification bytes and compression method\r
-    if ( header[0] != GZIP_ID1 )         return false;\r
-    if ( header[1] != (char)GZIP_ID2 )   return false;\r
-    if ( header[2] != Z_DEFLATED )       return false;\r
-\r
-    // the FEXTRA flag bit must be set\r
-    if ( (header[3] & FLG_FEXTRA) == 0 ) return false;\r
-\r
-    // extra field: total length, BGZF subfield id bytes, subfield length\r
-    if ( BgzfData::UnpackUnsignedShort(&header[10]) != BGZF_XLEN ) return false;\r
-    if ( header[12] != BGZF_ID1 )        return false;\r
-    if ( header[13] != BGZF_ID2 )        return false;\r
-\r
-    return ( BgzfData::UnpackUnsignedShort(&header[14]) == BGZF_LEN );\r
-}\r
-\r
-// 'packs' an unsigned integer into the specified buffer\r
-// (4 bytes are written, least significant byte first)\r
-inline\r
-void BgzfData::PackUnsignedInt(char* buffer, unsigned int value) {\r
-    for ( int i = 0; i < 4; ++i )\r
-        buffer[i] = (char)(value >> (8 * i));\r
-}\r
-\r
-// 'packs' an unsigned short into the specified buffer\r
-// (2 bytes are written, least significant byte first)\r
-inline\r
-void BgzfData::PackUnsignedShort(char* buffer, unsigned short value) {\r
-    for ( int i = 0; i < 2; ++i )\r
-        buffer[i] = (char)(value >> (8 * i));\r
-}\r
-\r
-// 'unpacks' a buffer into a double (includes both non-const & const char* flavors)\r
-// the first 8 bytes of the buffer are copied unchanged into the result\r
-inline\r
-double BgzfData::UnpackDouble(char* buffer) {\r
-    // same work as the const flavor\r
-    return BgzfData::UnpackDouble( static_cast<const char*>(buffer) );\r
-}\r
-\r
-inline\r
-double BgzfData::UnpackDouble(const char* buffer) {\r
-    double value = 0;\r
-    memcpy(&value, buffer, 8);\r
-    return value;\r
-}\r
-\r
-// 'unpacks' a buffer into a float (includes both non-const & const char* flavors)\r
-// the first 4 bytes of the buffer are copied unchanged into the result\r
-inline\r
-float BgzfData::UnpackFloat(char* buffer) {\r
-    // same work as the const flavor\r
-    return BgzfData::UnpackFloat( static_cast<const char*>(buffer) );\r
-}\r
-\r
-inline\r
-float BgzfData::UnpackFloat(const char* buffer) {\r
-    float value = 0;\r
-    memcpy(&value, buffer, 4);\r
-    return value;\r
-}\r
-\r
-// 'unpacks' a buffer into a signed int (includes both non-const & const char* flavors)\r
-// the first 4 bytes of the buffer are copied unchanged into the result\r
-inline\r
-signed int BgzfData::UnpackSignedInt(char* buffer) {\r
-    // same work as the const flavor\r
-    return BgzfData::UnpackSignedInt( static_cast<const char*>(buffer) );\r
-}\r
-\r
-inline\r
-signed int BgzfData::UnpackSignedInt(const char* buffer) {\r
-    signed int value = 0;\r
-    memcpy(&value, buffer, 4);\r
-    return value;\r
-}\r
-\r
-// 'unpacks' a buffer into a signed short (includes both non-const & const char* flavors)\r
-// the first 2 bytes of the buffer are copied unchanged into the result\r
-inline\r
-signed short BgzfData::UnpackSignedShort(char* buffer) {\r
-    // same work as the const flavor\r
-    return BgzfData::UnpackSignedShort( static_cast<const char*>(buffer) );\r
-}\r
-\r
-inline\r
-signed short BgzfData::UnpackSignedShort(const char* buffer) {\r
-    signed short value = 0;\r
-    memcpy(&value, buffer, 2);\r
-    return value;\r
-}\r
-\r
-// 'unpacks' a buffer into an unsigned int (includes both non-const & const char* flavors)\r
-// the first 4 bytes of the buffer are copied unchanged into the result\r
-inline\r
-unsigned int BgzfData::UnpackUnsignedInt(char* buffer) {\r
-    // same work as the const flavor\r
-    return BgzfData::UnpackUnsignedInt( static_cast<const char*>(buffer) );\r
-}\r
-\r
-inline\r
-unsigned int BgzfData::UnpackUnsignedInt(const char* buffer) {\r
-    unsigned int value = 0;\r
-    memcpy(&value, buffer, 4);\r
-    return value;\r
-}\r
-\r
-// 'unpacks' a buffer into an unsigned short (includes both non-const & const char* flavors)\r
-// the first 2 bytes of the buffer are copied unchanged into the result\r
-inline\r
-unsigned short BgzfData::UnpackUnsignedShort(char* buffer) {\r
-    // same work as the const flavor\r
-    return BgzfData::UnpackUnsignedShort( static_cast<const char*>(buffer) );\r
-}\r
-\r
-inline\r
-unsigned short BgzfData::UnpackUnsignedShort(const char* buffer) {\r
-    unsigned short value = 0;\r
-    memcpy(&value, buffer, 2);\r
-    return value;\r
-}\r
-\r
-} // namespace BamTools\r
-\r
-#endif // BGZF_H\r
+++ /dev/null
-// ***************************************************************************\r
-// BamAux.h (c) 2009 Derek Barnett, Michael Strömberg\r
-// Marth Lab, Department of Biology, Boston College\r
-// All rights reserved.\r
-// ---------------------------------------------------------------------------\r
-// Last modified: 27 July 2010 (DB)\r
-// ---------------------------------------------------------------------------\r
-// Provides the basic constants, data structures, etc. for using BAM files\r
-// ***************************************************************************\r
-\r
-#ifndef BAMAUX_H\r
-#define BAMAUX_H\r
-\r
-// C includes\r
-#include <cctype>\r
-#include <cstdio>\r
-#include <cstdlib>\r
-#include <cstring>\r
-\r
-// C++ includes\r
-#include <exception>\r
-#include <map>\r
-#include <string>\r
-#include <utility>\r
-#include <vector>\r
-\r
-// Platform-specific type definitions\r
-// With _MSC_VER defined the fixed-width integer names are declared by hand;\r
-// everywhere else they come from <stdint.h>. The BAMTOOLS_TYPES guard keeps\r
-// this from being repeated when another header carries the same section.\r
-// NOTE(review): int8_t is declared as plain 'char' here, which is a distinct\r
-// type from 'signed char' - confirm this is intended before relying on it.\r
-#ifndef BAMTOOLS_TYPES\r
-#define BAMTOOLS_TYPES\r
- #ifdef _MSC_VER\r
- typedef char int8_t;\r
- typedef unsigned char uint8_t;\r
- typedef short int16_t;\r
- typedef unsigned short uint16_t;\r
- typedef int int32_t;\r
- typedef unsigned int uint32_t;\r
- typedef long long int64_t;\r
- typedef unsigned long long uint64_t;\r
- #else\r
- #include <stdint.h>\r
- #endif\r
-#endif // BAMTOOLS_TYPES\r
-\r
-namespace BamTools {\r
-\r
-// BAM constants\r
-// (BAM_C* are the numeric codes of the CIGAR operations; BAM_CIGAR_MASK\r
-// is a mask covering the low BAM_CIGAR_SHIFT bits)\r
-const int BAM_CORE_SIZE = 32;\r
-const int BAM_CMATCH = 0;\r
-const int BAM_CINS = 1;\r
-const int BAM_CDEL = 2;\r
-const int BAM_CREF_SKIP = 3;\r
-const int BAM_CSOFT_CLIP = 4;\r
-const int BAM_CHARD_CLIP = 5;\r
-const int BAM_CPAD = 6;\r
-const int BAM_CIGAR_SHIFT = 4;\r
-const int BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1);\r
-\r
-// BAM index constants\r
-const int MAX_BIN = 37450; // =(8^6-1)/7+1\r
-const int BAM_MIN_CHUNK_GAP = 32768;\r
-const int BAM_LIDX_SHIFT = 14;\r
-\r
-// Explicit variable sizes\r
-const int BT_SIZEOF_INT = 4;\r
-\r
-struct CigarOp;\r
-\r
-// A single BAM alignment record: the public data fields, flag query/set\r
-// methods and tag accessors, plus private raw data shared with\r
-// BamReader and BamWriter.\r
-struct BamAlignment {\r
-\r
- // constructors & destructor\r
- public:\r
- BamAlignment(void);\r
- BamAlignment(const BamAlignment& other);\r
- ~BamAlignment(void);\r
-\r
- // Queries against alignment flags\r
- public: \r
- bool IsDuplicate(void) const; // Returns true if this read is a PCR duplicate \r
- bool IsFailedQC(void) const; // Returns true if this read failed quality control \r
- bool IsFirstMate(void) const; // Returns true if alignment is first mate on read \r
- bool IsMapped(void) const; // Returns true if alignment is mapped \r
- bool IsMateMapped(void) const; // Returns true if alignment's mate is mapped \r
- bool IsMateReverseStrand(void) const; // Returns true if alignment's mate mapped to reverse strand \r
- bool IsPaired(void) const; // Returns true if alignment part of paired-end read \r
- bool IsPrimaryAlignment(void) const; // Returns true if reported position is primary alignment \r
- bool IsProperPair(void) const; // Returns true if alignment is part of read that satisfied paired-end resolution \r
- bool IsReverseStrand(void) const; // Returns true if alignment mapped to reverse strand\r
- bool IsSecondMate(void) const; // Returns true if alignment is second mate on read\r
-\r
- // Manipulate alignment flags\r
- // (descriptions follow the method names; the implementations are not part of this header)\r
- public: \r
- void SetIsDuplicate(bool ok); // Sets "PCR duplicate" flag \r
- void SetIsFailedQC(bool ok); // Sets "failed quality control" flag \r
- void SetIsFirstMate(bool ok); // Sets "alignment is first mate" flag \r
- void SetIsMateUnmapped(bool ok); // Sets "alignment's mate is unmapped" flag \r
- void SetIsMateReverseStrand(bool ok); // Sets "alignment's mate mapped to reverse strand" flag \r
- void SetIsPaired(bool ok); // Sets "alignment part of paired-end read" flag \r
- void SetIsProperPair(bool ok); // Sets "alignment is part of read that satisfied paired-end resolution" flag \r
- void SetIsReverseStrand(bool ok); // Sets "alignment mapped to reverse strand" flag \r
- void SetIsSecondaryAlignment(bool ok); // Sets "alignment is secondary (not primary)" flag \r
- void SetIsSecondMate(bool ok); // Sets "alignment is second mate on read" flag \r
- void SetIsUnmapped(bool ok); // Sets "alignment is unmapped" flag\r
-\r
- // Tag data access methods\r
- public:\r
- // -------------------------------------------------------------------------------------\r
- // N.B. - The following tag-modifying methods may not be used on BamAlignments fetched\r
- // using BamReader::GetNextAlignmentCore(). Attempting to use them will not result in \r
- // error message (to keep output clean) but will ALWAYS return false. Only user-\r
- // generated BamAlignments or those retrieved using BamReader::GetNextAlignment() are valid.\r
-\r
- // add tag data (create new TAG entry with TYPE and VALUE)\r
- // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details\r
- // returns true if new data added, false if error or TAG already exists\r
- // N.B. - will NOT modify existing tag. Use EditTag() instead\r
- bool AddTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H\r
- bool AddTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i\r
- bool AddTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i\r
- bool AddTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f\r
- \r
- // edit tag data (sets existing TAG with TYPE to VALUE or adds new TAG if not already present)\r
- // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details\r
- // returns true if edit was successful, false if error\r
- bool EditTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H\r
- bool EditTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i\r
- bool EditTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i\r
- bool EditTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f\r
-\r
- // specific tag data access methods - these only remain for legacy support\r
- bool GetEditDistance(uint32_t& editDistance) const; // get "NM" tag data (implemented as GetTag("NM", editDistance))\r
- bool GetReadGroup(std::string& readGroup) const; // get "RG" tag data (implemented as GetTag("RG", readGroup)) \r
- \r
- // generic tag data access methods \r
- bool GetTag(const std::string& tag, std::string& destination) const; // access variable-length char or hex strings \r
- bool GetTag(const std::string& tag, uint32_t& destination) const; // access unsigned integer data\r
- bool GetTag(const std::string& tag, int32_t& destination) const; // access signed integer data\r
- bool GetTag(const std::string& tag, float& destination) const; // access floating point data\r
- \r
- // remove tag data\r
- // returns true if removal was successful, false if error\r
- // N.B. - returns false if TAG does not exist (no removal can occur)\r
- bool RemoveTag(const std::string& tag);\r
-\r
- // Additional data access methods\r
- public:\r
- int GetEndPosition(bool usePadded = false) const; // calculates alignment end position, based on starting position and CIGAR operations\r
-\r
- // 'internal' utility methods \r
- private:\r
- static bool FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed);\r
- static bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed);\r
-\r
- // Data members\r
- public:\r
- std::string Name; // Read name\r
- int32_t Length; // Query length\r
- std::string QueryBases; // 'Original' sequence (as reported from sequencing machine)\r
- std::string AlignedBases; // 'Aligned' sequence (includes any indels, padding, clipping)\r
- std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values)\r
- std::string TagData; // Tag data (accessor methods will pull the requested information out)\r
- int32_t RefID; // ID number for reference sequence\r
- int32_t Position; // Position (0-based) where alignment starts\r
- uint16_t Bin; // Bin in BAM file where this alignment resides\r
- uint16_t MapQuality; // Mapping quality score\r
- uint32_t AlignmentFlag; // Alignment bit-flag - see Is<something>() methods to query this value, SetIs<something>() methods to manipulate \r
- std::vector<CigarOp> CigarData; // CIGAR operations for this alignment\r
- int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned\r
- int32_t MatePosition; // Position (0-based) where alignment's mate starts\r
- int32_t InsertSize; // Mate-pair insert size\r
- \r
- // internal data\r
- private:\r
- struct BamAlignmentSupportData {\r
- \r
- // data members\r
- std::string AllCharData;\r
- uint32_t BlockLength;\r
- uint32_t NumCigarOperations;\r
- uint32_t QueryNameLength;\r
- uint32_t QuerySequenceLength;\r
- bool HasCoreOnly;\r
- \r
- // constructor\r
- // (all lengths start at 0, HasCoreOnly at false, AllCharData empty)\r
- BamAlignmentSupportData(void)\r
- : BlockLength(0)\r
- , NumCigarOperations(0)\r
- , QueryNameLength(0)\r
- , QuerySequenceLength(0)\r
- , HasCoreOnly(false)\r
- { }\r
- };\r
- \r
- // contains raw character data & lengths\r
- BamAlignmentSupportData SupportData; \r
- \r
- // allow these classes access to BamAlignment private members (SupportData)\r
- // but client code should not need to touch this data\r
- friend class BamReader;\r
- friend class BamWriter;\r
-\r
- // Alignment flag query constants\r
- // Use the get/set methods above instead\r
- private:\r
- enum { PAIRED = 1\r
- , PROPER_PAIR = 2\r
- , UNMAPPED = 4\r
- , MATE_UNMAPPED = 8\r
- , REVERSE = 16\r
- , MATE_REVERSE = 32\r
- , READ_1 = 64\r
- , READ_2 = 128\r
- , SECONDARY = 256\r
- , QC_FAILED = 512\r
- , DUPLICATE = 1024 \r
- };\r
-};\r
-\r
-// ----------------------------------------------------------------\r
-// Auxiliary data structs & typedefs\r
-\r
-struct CigarOp {\r
- \r
- // data members\r
- char Type; // Operation type (MIDNSHP)\r
- uint32_t Length; // Operation length (number of bases)\r
- \r
- // constructor\r
- CigarOp(const char type = '\0', \r
- const uint32_t length = 0) \r
- : Type(type)\r
- , Length(length) \r
- { }\r
-};\r
-\r
-struct RefData {\r
- \r
- // data members\r
- std::string RefName; // Name of reference sequence\r
- int32_t RefLength; // Length of reference sequence\r
- bool RefHasAlignments; // True if BAM file contains alignments mapped to reference sequence\r
- \r
- // constructor\r
- RefData(const int32_t& length = 0, \r
- bool ok = false)\r
- : RefLength(length)\r
- , RefHasAlignments(ok)\r
- { }\r
-};\r
-\r
-typedef std::vector<RefData> RefVector;\r
-typedef std::vector<BamAlignment> BamAlignmentVector;\r
-\r
-struct BamRegion {\r
- \r
- // data members\r
- int LeftRefID;\r
- int LeftPosition;\r
- int RightRefID;\r
- int RightPosition;\r
- \r
- // constructor\r
- BamRegion(const int& leftID = -1, \r
- const int& leftPos = -1,\r
- const int& rightID = -1,\r
- const int& rightPos = -1)\r
- : LeftRefID(leftID)\r
- , LeftPosition(leftPos)\r
- , RightRefID(rightID)\r
- , RightPosition(rightPos)\r
- { }\r
-};\r
-\r
-// ----------------------------------------------------------------\r
-// Added: 3-35-2010 DWB\r
-// Fixed: Routines to provide endian-correctness\r
-// ----------------------------------------------------------------\r
-\r
-// returns true if system is big endian\r
-inline bool SystemIsBigEndian(void) {\r
- const uint16_t one = 0x0001;\r
- return ((*(char*) &one) == 0 );\r
-}\r
-\r
-// swaps endianness of 16-bit value 'in place'\r
-inline void SwapEndian_16(int16_t& x) {\r
- x = ((x >> 8) | (x << 8));\r
-}\r
-\r
-inline void SwapEndian_16(uint16_t& x) {\r
- x = ((x >> 8) | (x << 8));\r
-}\r
-\r
-// swaps endianness of 32-bit value 'in-place'\r
-inline void SwapEndian_32(int32_t& x) {\r
- x = ( (x >> 24) | \r
- ((x << 8) & 0x00FF0000) | \r
- ((x >> 8) & 0x0000FF00) | \r
- (x << 24)\r
- );\r
-}\r
-\r
-inline void SwapEndian_32(uint32_t& x) {\r
- x = ( (x >> 24) | \r
- ((x << 8) & 0x00FF0000) | \r
- ((x >> 8) & 0x0000FF00) | \r
- (x << 24)\r
- );\r
-}\r
-\r
-// swaps endianness of 64-bit value 'in-place'\r
-inline void SwapEndian_64(int64_t& x) {\r
- x = ( (x >> 56) | \r
- ((x << 40) & 0x00FF000000000000ll) |\r
- ((x << 24) & 0x0000FF0000000000ll) |\r
- ((x << 8) & 0x000000FF00000000ll) |\r
- ((x >> 8) & 0x00000000FF000000ll) |\r
- ((x >> 24) & 0x0000000000FF0000ll) |\r
- ((x >> 40) & 0x000000000000FF00ll) |\r
- (x << 56)\r
- );\r
-}\r
-\r
-inline void SwapEndian_64(uint64_t& x) {\r
- x = ( (x >> 56) | \r
- ((x << 40) & 0x00FF000000000000ll) |\r
- ((x << 24) & 0x0000FF0000000000ll) |\r
- ((x << 8) & 0x000000FF00000000ll) |\r
- ((x >> 8) & 0x00000000FF000000ll) |\r
- ((x >> 24) & 0x0000000000FF0000ll) |\r
- ((x >> 40) & 0x000000000000FF00ll) |\r
- (x << 56)\r
- );\r
-}\r
-\r
-// swaps endianness of 'next 2 bytes' in a char buffer (in-place)\r
-inline void SwapEndian_16p(char* data) {\r
- uint16_t& value = (uint16_t&)*data; \r
- SwapEndian_16(value);\r
-}\r
-\r
-// swaps endianness of 'next 4 bytes' in a char buffer (in-place)\r
-inline void SwapEndian_32p(char* data) {\r
- uint32_t& value = (uint32_t&)*data; \r
- SwapEndian_32(value);\r
-}\r
-\r
-// swaps endianness of 'next 8 bytes' in a char buffer (in-place)\r
-inline void SwapEndian_64p(char* data) {\r
- uint64_t& value = (uint64_t&)*data; \r
- SwapEndian_64(value);\r
-}\r
-\r
-// ----------------------------------------------------------------\r
-// BamAlignment member methods\r
-\r
-// constructors & destructor\r
-inline BamAlignment::BamAlignment(void) { }\r
-\r
-inline BamAlignment::BamAlignment(const BamAlignment& other)\r
- : Name(other.Name)\r
- , Length(other.Length)\r
- , QueryBases(other.QueryBases)\r
- , AlignedBases(other.AlignedBases)\r
- , Qualities(other.Qualities)\r
- , TagData(other.TagData)\r
- , RefID(other.RefID)\r
- , Position(other.Position)\r
- , Bin(other.Bin)\r
- , MapQuality(other.MapQuality)\r
- , AlignmentFlag(other.AlignmentFlag)\r
- , CigarData(other.CigarData)\r
- , MateRefID(other.MateRefID)\r
- , MatePosition(other.MatePosition)\r
- , InsertSize(other.InsertSize)\r
- , SupportData(other.SupportData)\r
-{ }\r
-\r
-inline BamAlignment::~BamAlignment(void) { }\r
-\r
-// Queries against alignment flags\r
-inline bool BamAlignment::IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); }\r
-inline bool BamAlignment::IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); }\r
-inline bool BamAlignment::IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); }\r
-inline bool BamAlignment::IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); }\r
-inline bool BamAlignment::IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); }\r
-inline bool BamAlignment::IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); }\r
-inline bool BamAlignment::IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); }\r
-inline bool BamAlignment::IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); }\r
-inline bool BamAlignment::IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); }\r
-inline bool BamAlignment::IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); }\r
-inline bool BamAlignment::IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); }\r
-\r
-// Manipulate alignment flags \r
-inline void BamAlignment::SetIsDuplicate(bool ok) { if (ok) AlignmentFlag |= DUPLICATE; else AlignmentFlag &= ~DUPLICATE; }\r
-inline void BamAlignment::SetIsFailedQC(bool ok) { if (ok) AlignmentFlag |= QC_FAILED; else AlignmentFlag &= ~QC_FAILED; }\r
-inline void BamAlignment::SetIsFirstMate(bool ok) { if (ok) AlignmentFlag |= READ_1; else AlignmentFlag &= ~READ_1; }\r
-inline void BamAlignment::SetIsMateUnmapped(bool ok) { if (ok) AlignmentFlag |= MATE_UNMAPPED; else AlignmentFlag &= ~MATE_UNMAPPED; }\r
-inline void BamAlignment::SetIsMateReverseStrand(bool ok) { if (ok) AlignmentFlag |= MATE_REVERSE; else AlignmentFlag &= ~MATE_REVERSE; }\r
-inline void BamAlignment::SetIsPaired(bool ok) { if (ok) AlignmentFlag |= PAIRED; else AlignmentFlag &= ~PAIRED; }\r
-inline void BamAlignment::SetIsProperPair(bool ok) { if (ok) AlignmentFlag |= PROPER_PAIR; else AlignmentFlag &= ~PROPER_PAIR; }\r
-inline void BamAlignment::SetIsReverseStrand(bool ok) { if (ok) AlignmentFlag |= REVERSE; else AlignmentFlag &= ~REVERSE; }\r
-inline void BamAlignment::SetIsSecondaryAlignment(bool ok) { if (ok) AlignmentFlag |= SECONDARY; else AlignmentFlag &= ~SECONDARY; }\r
-inline void BamAlignment::SetIsSecondMate(bool ok) { if (ok) AlignmentFlag |= READ_2; else AlignmentFlag &= ~READ_2; }\r
-inline void BamAlignment::SetIsUnmapped(bool ok) { if (ok) AlignmentFlag |= UNMAPPED; else AlignmentFlag &= ~UNMAPPED; }\r
-\r
-// calculates alignment end position, based on starting position and CIGAR operations\r
-inline \r
-int BamAlignment::GetEndPosition(bool usePadded) const {\r
-\r
- // initialize alignment end to starting position\r
- int alignEnd = Position;\r
-\r
- // iterate over cigar operations\r
- std::vector<CigarOp>::const_iterator cigarIter = CigarData.begin();\r
- std::vector<CigarOp>::const_iterator cigarEnd = CigarData.end();\r
- for ( ; cigarIter != cigarEnd; ++cigarIter) {\r
- const char cigarType = (*cigarIter).Type;\r
- if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) {\r
- alignEnd += (*cigarIter).Length;\r
- } \r
- else if ( usePadded && cigarType == 'I' ) {\r
- alignEnd += (*cigarIter).Length;\r
- }\r
- }\r
- return alignEnd;\r
-}\r
-\r
-inline\r
-bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const std::string& value) {\r
- \r
- if ( SupportData.HasCoreOnly ) return false;\r
- if ( tag.size() != 2 || type.size() != 1 ) return false;\r
- if ( type != "Z" && type != "H" ) return false;\r
- \r
- // localize the tag data\r
- char* pTagData = (char*)TagData.data();\r
- const unsigned int tagDataLength = TagData.size();\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag already exists, return false\r
- // use EditTag explicitly instead\r
- if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;\r
- \r
- // otherwise, copy tag data to temp buffer\r
- std::string newTag = tag + type + value;\r
- const int newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term\r
- char originalTagData[newTagDataLength];\r
- memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term\r
- \r
- // append newTag\r
- strcat(originalTagData + tagDataLength, newTag.data()); // removes original null-term, appends newTag + null-term\r
- \r
- // store temp buffer back in TagData\r
- const char* newTagData = (const char*)originalTagData;\r
- TagData.assign(newTagData, newTagDataLength);\r
- \r
- // return success\r
- return true;\r
-}\r
-\r
-inline\r
-bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const uint32_t& value) {\r
- \r
- if ( SupportData.HasCoreOnly ) return false;\r
- if ( tag.size() != 2 || type.size() != 1 ) return false;\r
- if ( type == "f" || type == "Z" || type == "H" ) return false;\r
- \r
- // localize the tag data\r
- char* pTagData = (char*)TagData.data();\r
- const unsigned int tagDataLength = TagData.size();\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag already exists, return false\r
- // use EditTag explicitly instead\r
- if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;\r
- \r
- // otherwise, convert value to string\r
- union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un;\r
- un.value = value;\r
-\r
- // copy original tag data to temp buffer\r
- std::string newTag = tag + type;\r
- const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new integer\r
- char originalTagData[newTagDataLength];\r
- memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term\r
- \r
- // append newTag\r
- strcat(originalTagData + tagDataLength, newTag.data());\r
- memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(unsigned int));\r
- \r
- // store temp buffer back in TagData\r
- const char* newTagData = (const char*)originalTagData;\r
- TagData.assign(newTagData, newTagDataLength);\r
- \r
- // return success\r
- return true;\r
-}\r
-\r
-inline\r
-bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const int32_t& value) {\r
- return AddTag(tag, type, (const uint32_t&)value);\r
-}\r
-\r
-inline\r
-bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const float& value) {\r
- \r
- if ( SupportData.HasCoreOnly ) return false;\r
- if ( tag.size() != 2 || type.size() != 1 ) return false;\r
- if ( type == "Z" || type == "H" ) return false;\r
- \r
- // localize the tag data\r
- char* pTagData = (char*)TagData.data();\r
- const unsigned int tagDataLength = TagData.size();\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag already exists, return false\r
- // use EditTag explicitly instead\r
- if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;\r
- \r
- // otherwise, convert value to string\r
- union { float value; char valueBuffer[sizeof(float)]; } un;\r
- un.value = value;\r
-\r
- // copy original tag data to temp buffer\r
- std::string newTag = tag + type;\r
- const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new float\r
- char originalTagData[newTagDataLength];\r
- memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term\r
- \r
- // append newTag\r
- strcat(originalTagData + tagDataLength, newTag.data());\r
- memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(float));\r
- \r
- // store temp buffer back in TagData\r
- const char* newTagData = (const char*)originalTagData;\r
- TagData.assign(newTagData, newTagDataLength);\r
- \r
- // return success\r
- return true;\r
-}\r
-\r
-inline\r
-bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const std::string& value) {\r
- \r
- if ( SupportData.HasCoreOnly ) return false;\r
- if ( tag.size() != 2 || type.size() != 1 ) return false;\r
- if ( type != "Z" && type != "H" ) return false;\r
- \r
- // localize the tag data\r
- char* pOriginalTagData = (char*)TagData.data();\r
- char* pTagData = pOriginalTagData;\r
- const unsigned int originalTagDataLength = TagData.size();\r
- \r
- unsigned int newTagDataLength = 0;\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag found, store data in readGroup, return success\r
- if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {\r
- \r
- // make sure array is more than big enough\r
- char newTagData[originalTagDataLength + value.size()]; \r
-\r
- // copy original tag data up til desired tag\r
- const unsigned int beginningTagDataLength = numBytesParsed;\r
- newTagDataLength += beginningTagDataLength;\r
- memcpy(newTagData, pOriginalTagData, numBytesParsed);\r
- \r
- // copy new VALUE in place of current tag data\r
- const unsigned int dataLength = strlen(value.c_str());\r
- memcpy(newTagData + beginningTagDataLength, (char*)value.c_str(), dataLength+1 );\r
- \r
- // skip to next tag (if tag for removal is last, return true) \r
- const char* pTagStorageType = pTagData - 1;\r
- if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true;\r
- \r
- // copy everything from current tag (the next one after tag for removal) to end\r
- const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);\r
- const unsigned int endTagOffset = beginningTagDataLength + dataLength + 1;\r
- const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;\r
- memcpy(newTagData + endTagOffset, pTagData, endTagDataLength);\r
- \r
- // ensure null-terminator\r
- newTagData[ endTagOffset + endTagDataLength + 1 ] = 0;\r
- \r
- // save new tag data\r
- TagData.assign(newTagData, endTagOffset + endTagDataLength);\r
- return true;\r
- }\r
- \r
- // tag not found, attempt AddTag\r
- else return AddTag(tag, type, value);\r
-}\r
-\r
-inline\r
-bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const uint32_t& value) {\r
- \r
- if ( SupportData.HasCoreOnly ) return false;\r
- if ( tag.size() != 2 || type.size() != 1 ) return false;\r
- if ( type == "f" || type == "Z" || type == "H" ) return false;\r
- \r
- // localize the tag data\r
- char* pOriginalTagData = (char*)TagData.data();\r
- char* pTagData = pOriginalTagData;\r
- const unsigned int originalTagDataLength = TagData.size();\r
- \r
- unsigned int newTagDataLength = 0;\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag found, store data in readGroup, return success\r
- if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {\r
- \r
- // make sure array is more than big enough\r
- char newTagData[originalTagDataLength + sizeof(value)]; \r
-\r
- // copy original tag data up til desired tag\r
- const unsigned int beginningTagDataLength = numBytesParsed;\r
- newTagDataLength += beginningTagDataLength;\r
- memcpy(newTagData, pOriginalTagData, numBytesParsed);\r
- \r
- // copy new VALUE in place of current tag data\r
- union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un;\r
- un.value = value;\r
- memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(unsigned int));\r
- \r
- // skip to next tag (if tag for removal is last, return true) \r
- const char* pTagStorageType = pTagData - 1;\r
- if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true;\r
- \r
- // copy everything from current tag (the next one after tag for removal) to end\r
- const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);\r
- const unsigned int endTagOffset = beginningTagDataLength + sizeof(unsigned int);\r
- const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;\r
- memcpy(newTagData + endTagOffset, pTagData, endTagDataLength);\r
- \r
- // ensure null-terminator\r
- newTagData[ endTagOffset + endTagDataLength + 1 ] = 0;\r
- \r
- // save new tag data\r
- TagData.assign(newTagData, endTagOffset + endTagDataLength);\r
- return true;\r
- }\r
- \r
- // tag not found, attempt AddTag\r
- else return AddTag(tag, type, value);\r
-}\r
-\r
-inline\r
-bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const int32_t& value) {\r
- return EditTag(tag, type, (const uint32_t&)value);\r
-}\r
-\r
-inline\r
-bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const float& value) {\r
- \r
- if ( SupportData.HasCoreOnly ) return false;\r
- if ( tag.size() != 2 || type.size() != 1 ) return false;\r
- if ( type == "Z" || type == "H" ) return false;\r
- \r
- // localize the tag data\r
- char* pOriginalTagData = (char*)TagData.data();\r
- char* pTagData = pOriginalTagData;\r
- const unsigned int originalTagDataLength = TagData.size();\r
- \r
- unsigned int newTagDataLength = 0;\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag found, store data in readGroup, return success\r
- if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {\r
- \r
- // make sure array is more than big enough\r
- char newTagData[originalTagDataLength + sizeof(value)]; \r
-\r
- // copy original tag data up til desired tag\r
- const unsigned int beginningTagDataLength = numBytesParsed;\r
- newTagDataLength += beginningTagDataLength;\r
- memcpy(newTagData, pOriginalTagData, numBytesParsed);\r
- \r
- // copy new VALUE in place of current tag data\r
- union { float value; char valueBuffer[sizeof(float)]; } un;\r
- un.value = value;\r
- memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(float));\r
- \r
- // skip to next tag (if tag for removal is last, return true) \r
- const char* pTagStorageType = pTagData - 1;\r
- if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true;\r
- \r
- // copy everything from current tag (the next one after tag for removal) to end\r
- const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);\r
- const unsigned int endTagOffset = beginningTagDataLength + sizeof(float);\r
- const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;\r
- memcpy(newTagData + endTagOffset, pTagData, endTagDataLength);\r
- \r
- // ensure null-terminator\r
- newTagData[ endTagOffset + endTagDataLength + 1 ] = 0;\r
- \r
- // save new tag data\r
- TagData.assign(newTagData, endTagOffset + endTagDataLength);\r
- return true;\r
- }\r
- \r
- // tag not found, attempt AddTag\r
- else return AddTag(tag, type, value);\r
-}\r
-\r
-// get "NM" tag data - originally contributed by Aaron Quinlan\r
-// stores data in 'editDistance', returns success/fail\r
-inline \r
-bool BamAlignment::GetEditDistance(uint32_t& editDistance) const { \r
- return GetTag("NM", (uint32_t&)editDistance);\r
-}\r
-\r
-// get "RG" tag data\r
-// stores data in 'readGroup', returns success/fail\r
-inline \r
-bool BamAlignment::GetReadGroup(std::string& readGroup) const {\r
- return GetTag("RG", readGroup);\r
-}\r
-\r
-inline\r
-bool BamAlignment::GetTag(const std::string& tag, std::string& destination) const {\r
-\r
- // make sure tag data exists\r
- if ( SupportData.HasCoreOnly || TagData.empty() ) \r
- return false;\r
-\r
- // localize the tag data\r
- char* pTagData = (char*)TagData.data();\r
- const unsigned int tagDataLength = TagData.size();\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag found, store data in readGroup, return success\r
- if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {\r
- const unsigned int dataLength = strlen(pTagData);\r
- destination.clear();\r
- destination.resize(dataLength);\r
- memcpy( (char*)destination.data(), pTagData, dataLength );\r
- return true;\r
- }\r
- \r
- // tag not found, return failure\r
- return false;\r
-}\r
-\r
-inline\r
-bool BamAlignment::GetTag(const std::string& tag, uint32_t& destination) const {\r
- \r
- // make sure tag data exists\r
- if ( SupportData.HasCoreOnly || TagData.empty() ) \r
- return false;\r
-\r
- // localize the tag data\r
- char* pTagData = (char*)TagData.data();\r
- const unsigned int tagDataLength = TagData.size();\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag found, determine data byte-length, store data in readGroup, return success\r
- if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {\r
- \r
- // determine data byte-length\r
- const char type = *(pTagData - 1);\r
- int destinationLength = 0;\r
- switch (type) {\r
- // 1 byte data\r
- case 'A':\r
- case 'c':\r
- case 'C':\r
- destinationLength = 1;\r
- break;\r
-\r
- // 2 byte data\r
- case 's':\r
- case 'S':\r
- destinationLength = 2;\r
- break;\r
-\r
- // 4 byte data\r
- case 'i':\r
- case 'I':\r
- destinationLength = 4;\r
- break;\r
-\r
- // unsupported type for integer destination (float or var-length strings)\r
- case 'f':\r
- case 'Z':\r
- case 'H':\r
- printf("ERROR: Cannot store tag of type %c in integer destination\n", type);\r
- return false;\r
-\r
- // unknown tag type\r
- default:\r
- printf("ERROR: Unknown tag storage class encountered: [%c]\n", type);\r
- return false;\r
- }\r
- \r
- // store in destination\r
- destination = 0;\r
- memcpy(&destination, pTagData, destinationLength);\r
- return true;\r
- }\r
- \r
- // tag not found, return failure\r
- return false;\r
-}\r
-\r
-inline\r
-bool BamAlignment::GetTag(const std::string& tag, int32_t& destination) const {\r
- return GetTag(tag, (uint32_t&)destination);\r
-}\r
-\r
-inline\r
-bool BamAlignment::GetTag(const std::string& tag, float& destination) const {\r
- \r
- // make sure tag data exists\r
- if ( SupportData.HasCoreOnly || TagData.empty() ) \r
- return false;\r
-\r
- // localize the tag data\r
- char* pTagData = (char*)TagData.data();\r
- const unsigned int tagDataLength = TagData.size();\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag found, determine data byte-length, store data in readGroup, return success\r
- if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {\r
- //pTagData += numBytesParsed;\r
- \r
- // determine data byte-length\r
- const char type = *(pTagData - 1);\r
- int destinationLength = 0;\r
- switch(type) {\r
-\r
- // 1 byte data\r
- case 'A':\r
- case 'c':\r
- case 'C':\r
- destinationLength = 1;\r
- break;\r
-\r
- // 2 byte data\r
- case 's':\r
- case 'S':\r
- destinationLength = 2;\r
- break;\r
-\r
- // 4 byte data\r
- case 'f':\r
- case 'i':\r
- case 'I':\r
- destinationLength = 4;\r
- break;\r
- \r
- // unsupported type (var-length strings)\r
- case 'Z':\r
- case 'H':\r
- printf("ERROR: Cannot store tag of type %c in integer destination\n", type);\r
- return false;\r
-\r
- // unknown tag type\r
- default:\r
- printf("ERROR: Unknown tag storage class encountered: [%c]\n", type);\r
- return false;\r
- }\r
- \r
- // store in destination\r
- destination = 0.0;\r
- memcpy(&destination, pTagData, destinationLength);\r
- return true;\r
- }\r
- \r
- // tag not found, return failure\r
- return false;\r
-}\r
-\r
-inline\r
-bool BamAlignment::RemoveTag(const std::string& tag) {\r
- \r
- // BamAlignments fetched using BamReader::GetNextAlignmentCore() are not allowed\r
- // also, return false if no data present to remove\r
- if ( SupportData.HasCoreOnly || TagData.empty() ) return false;\r
- \r
- // localize the tag data\r
- char* pOriginalTagData = (char*)TagData.data();\r
- char* pTagData = pOriginalTagData;\r
- const unsigned int originalTagDataLength = TagData.size();\r
- unsigned int newTagDataLength = 0;\r
- unsigned int numBytesParsed = 0;\r
- \r
- // if tag found, store data in readGroup, return success\r
- if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) {\r
- \r
- char newTagData[originalTagDataLength];\r
-\r
- // copy original tag data up til desired tag\r
- pTagData -= 3;\r
- numBytesParsed -= 3;\r
- const unsigned int beginningTagDataLength = numBytesParsed;\r
- newTagDataLength += beginningTagDataLength;\r
- memcpy(newTagData, pOriginalTagData, numBytesParsed);\r
- \r
- // skip to next tag (if tag for removal is last, return true) \r
- const char* pTagStorageType = pTagData + 2;\r
- pTagData += 3;\r
- numBytesParsed += 3;\r
- if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true;\r
- \r
- // copy everything from current tag (the next one after tag for removal) to end\r
- const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength);\r
- const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength;\r
- memcpy(newTagData + beginningTagDataLength, pTagData, endTagDataLength );\r
- \r
- // save new tag data\r
- TagData.assign(newTagData, beginningTagDataLength + endTagDataLength);\r
- return true;\r
- }\r
- \r
- // tag not found, no removal - return failure\r
- return false;\r
-}\r
-\r
-inline\r
-bool BamAlignment::FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) {\r
-\r
- while ( numBytesParsed < tagDataLength ) {\r
-\r
- const char* pTagType = pTagData;\r
- const char* pTagStorageType = pTagData + 2;\r
- pTagData += 3;\r
- numBytesParsed += 3;\r
-\r
- // check the current tag, return true on match\r
- if ( std::strncmp(pTagType, tag.c_str(), 2) == 0 ) \r
- return true;\r
-\r
- // get the storage class and find the next tag\r
- if ( *pTagStorageType == '\0' ) return false; \r
- if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false;\r
- if ( *pTagData == '\0' ) return false;\r
- }\r
- \r
- // checked all tags, none match\r
- return false;\r
-}\r
-\r
-inline\r
-bool BamAlignment::SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) {\r
- \r
- switch(storageType) {\r
-\r
- case 'A':\r
- case 'c':\r
- case 'C':\r
- ++numBytesParsed;\r
- ++pTagData;\r
- break;\r
-\r
- case 's':\r
- case 'S':\r
- numBytesParsed += 2;\r
- pTagData += 2;\r
- break;\r
-\r
- case 'f':\r
- case 'i':\r
- case 'I':\r
- numBytesParsed += 4;\r
- pTagData += 4;\r
- break;\r
-\r
- case 'Z':\r
- case 'H':\r
- while(*pTagData) {\r
- ++numBytesParsed;\r
- ++pTagData;\r
- }\r
- // increment for null-terminator\r
- ++numBytesParsed;\r
- ++pTagData;\r
- break;\r
-\r
- default: \r
- // error case\r
- printf("ERROR: Unknown tag storage class encountered: [%c]\n", storageType);\r
- return false;\r
- }\r
- \r
- // return success\r
- return true;\r
-}\r
-\r
-} // namespace BamTools\r
-\r
-#endif // BAMAUX_H\r
+++ /dev/null
-// ***************************************************************************
-// BamIndex.cpp (c) 2009 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 17 August 2010 (DB)
-// ---------------------------------------------------------------------------
-// Provides index functionality - both for the default (standardized) BAM
-// index format (.bai) as well as a BamTools-specific (nonstandard) index
-// format (.bti).
-// ***************************************************************************
-
-#include <cstdio>
-#include <cstdlib>
-#include <algorithm>
-// #include <iostream>
-#include <map>
-#include "BamIndex.h"
-#include "BamReader.h"
-#include "BGZF.h"
-using namespace std;
-using namespace BamTools;
-
-// -------------------------------
-// BamIndex implementation
-
-BamIndex::BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader, bool isBigEndian)
- : m_BGZF(bgzf)
- , m_reader(reader)
- , m_isBigEndian(isBigEndian)
-{
- if ( m_reader && m_reader->IsOpen() )
- m_references = m_reader->GetReferenceData();
-}
-
-bool BamIndex::HasAlignments(const int& referenceID) {
-
- // return false if invalid ID
- if ( (referenceID < 0) || (referenceID >= (int)m_references.size()) )
- return false;
-
- // else return status of reference (has alignments?)
- else
- return m_references.at(referenceID).RefHasAlignments;
-}
-
-// #########################################################################################
-// #########################################################################################
-
-// -------------------------------
-// BamDefaultIndex structs & typedefs
-
-namespace BamTools {
-
-// --------------------------------------------------
-// BamDefaultIndex data structures & typedefs
-struct Chunk {
-
- // data members
- uint64_t Start;
- uint64_t Stop;
-
- // constructor
- Chunk(const uint64_t& start = 0,
- const uint64_t& stop = 0)
- : Start(start)
- , Stop(stop)
- { }
-};
-
-bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) {
- return lhs.Start < rhs.Start;
-}
-
-typedef vector<Chunk> ChunkVector;
-typedef map<uint32_t, ChunkVector> BamBinMap;
-typedef vector<uint64_t> LinearOffsetVector;
-
-struct ReferenceIndex {
-
- // data members
- BamBinMap Bins;
- LinearOffsetVector Offsets;
-
- // constructor
- ReferenceIndex(const BamBinMap& binMap = BamBinMap(),
- const LinearOffsetVector& offsets = LinearOffsetVector())
- : Bins(binMap)
- , Offsets(offsets)
- { }
-};
-
-typedef vector<ReferenceIndex> BamDefaultIndexData;
-
-} // namespace BamTools
-
-// -------------------------------
-// BamDefaultIndex implementation
-
-struct BamDefaultIndex::BamDefaultIndexPrivate {
-
- // -------------------------
- // data members
-
- BamDefaultIndexData m_indexData;
- BamDefaultIndex* m_parent;
-
- // -------------------------
- // ctor & dtor
-
- BamDefaultIndexPrivate(BamDefaultIndex* parent) : m_parent(parent) { }
- ~BamDefaultIndexPrivate(void) { }
-
- // -------------------------
- // internal methods
-
- // calculate bins that overlap region
- int BinsFromRegion(const BamTools::BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[BamTools::MAX_BIN]);
- // saves BAM bin entry for index
- void InsertBinEntry(BamBinMap& binMap, const uint32_t& saveBin, const uint64_t& saveOffset, const uint64_t& lastOffset);
- // saves linear offset entry for index
- void InsertLinearOffset(LinearOffsetVector& offsets, const BamAlignment& bAlignment, const uint64_t& lastOffset);
- // simplifies index by merging 'chunks'
- void MergeChunks(void);
-
-};
-
-BamDefaultIndex::BamDefaultIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian)
- : BamIndex(bgzf, reader, isBigEndian)
-{
- d = new BamDefaultIndexPrivate(this);
-}
-
-BamDefaultIndex::~BamDefaultIndex(void) {
- d->m_indexData.clear();
- delete d;
- d = 0;
-}
-
-// Calculates the bins that overlap a region.
-//
-// region:                region of interest; only LeftRefID, LeftPosition and
-//                        (optionally) RightRefID / RightPosition are read.
-// isRightBoundSpecified: when true AND both bounds lie on the same reference,
-//                        RightPosition closes the region; otherwise the region
-//                        runs to the end of the left-bound reference.
-// bins:                  caller-provided buffer that receives the bin ids.
-//
-// Returns the number of bin ids written to 'bins'. Bin 0 is always stored.
-int BamDefaultIndex::BamDefaultIndexPrivate::BinsFromRegion(const BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[MAX_BIN]) {
-
-    // largest 0-based coordinate the 5-level binning scheme can address (2^29 - 1)
-    const uint32_t maxCoordinate = (1u << 29) - 1;
-
-    // get region boundaries
-    uint32_t begin = (unsigned int)region.LeftPosition;
-    uint32_t end;
-
-    // if right bound specified AND left&right bounds are on same reference
-    // OK to use right bound position
-    if ( isRightBoundSpecified && ( region.LeftRefID == region.RightRefID ) )
-        end = (unsigned int)region.RightPosition;
-
-    // otherwise, use end of left bound reference as cutoff
-    else
-        end = (unsigned int)m_parent->m_references.at(region.LeftRefID).RefLength - 1;
-
-    // a zero-length reference or a negative right bound wraps 'end' to a huge
-    // unsigned value; without this clamp the loops below would store far more
-    // ids than the bin scheme contains and run past the end of 'bins'
-    if ( end > maxCoordinate )
-        end = maxCoordinate;
-
-    // initialize list, bin '0' always a valid bin
-    int i = 0;
-    bins[i++] = 0;
-
-    // get rest of bins that contain this region, one loop per bin level
-    // (if begin > end the loops do not run and only bin 0 is reported)
-    unsigned int k;
-    for (k =    1 + (begin>>26); k <=    1 + (end>>26); ++k) { bins[i++] = k; }
-    for (k =    9 + (begin>>23); k <=    9 + (end>>23); ++k) { bins[i++] = k; }
-    for (k =   73 + (begin>>20); k <=   73 + (end>>20); ++k) { bins[i++] = k; }
-    for (k =  585 + (begin>>17); k <=  585 + (end>>17); ++k) { bins[i++] = k; }
-    for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { bins[i++] = k; }
-
-    // return number of bins stored
-    return i;
-}
-
-// Builds the in-memory index by reading every alignment from the attached reader.
-//
-// Returns false immediately if the reader or BGZF handle is missing or the BGZF
-// file is not open; otherwise returns the result of the final reader Rewind().
-// Terminates the process (exit(1)) if alignments on the same reference are found
-// out of coordinate order or if the BGZF file offset fails to advance.
-// NOTE(review): entries are appended to m_indexData without clearing it first, so
-// this looks intended to run on an empty index - confirm against callers.
-bool BamDefaultIndex::Build(void) {
-
- // be sure reader & BGZF file are valid & open for reading
- if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen )
- return false;
-
- // move file pointer to beginning of alignments
- m_reader->Rewind();
-
- // get reference count, reserve index space
- // (one empty ReferenceIndex per reference, addressed below by RefID)
- int numReferences = (int)m_references.size();
- for ( int i = 0; i < numReferences; ++i ) {
- d->m_indexData.push_back(ReferenceIndex());
- }
-
- // sets default constant for bin, ID, offset, coordinate variables
- const uint32_t defaultValue = 0xffffffffu;
-
- // bin data
- // saveBin: bin of the chunk currently being accumulated; lastBin: bin of the previous alignment
- uint32_t saveBin(defaultValue);
- uint32_t lastBin(defaultValue);
-
- // reference ID data
- // (0xffffffff converted to int32_t - presumably -1, i.e. "no reference yet")
- int32_t saveRefID(defaultValue);
- int32_t lastRefID(defaultValue);
-
- // offset data
- // saveOffset: file offset where the current chunk starts; lastOffset: offset of the alignment just read
- uint64_t saveOffset = m_BGZF->Tell();
- uint64_t lastOffset = saveOffset;
-
- // coordinate data
- int32_t lastCoordinate = defaultValue;
-
- BamAlignment bAlignment;
- while ( m_reader->GetNextAlignmentCore(bAlignment) ) {
-
- // change of chromosome, save ID, reset bin
- if ( lastRefID != bAlignment.RefID ) {
- lastRefID = bAlignment.RefID;
- lastBin = defaultValue;
- }
-
- // if lastCoordinate greater than BAM position - file not sorted properly
- else if ( lastCoordinate > bAlignment.Position ) {
- printf("BAM file not properly sorted:\n");
- printf("Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(), lastCoordinate, bAlignment.Position, bAlignment.RefID);
- exit(1);
- }
-
- // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions)
- if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) {
-
- // save linear offset entry (matched to BAM entry refID)
- ReferenceIndex& refIndex = d->m_indexData.at(bAlignment.RefID);
- LinearOffsetVector& offsets = refIndex.Offsets;
- d->InsertLinearOffset(offsets, bAlignment, lastOffset);
- }
-
- // if current BamAlignment bin != lastBin, "then possibly write the binning index"
- if ( bAlignment.Bin != lastBin ) {
-
- // if not first time through
- if ( saveBin != defaultValue ) {
-
- // save Bam bin entry (chunk covering the run of alignments that shared saveBin)
- ReferenceIndex& refIndex = d->m_indexData.at(saveRefID);
- BamBinMap& binMap = refIndex.Bins;
- d->InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);
- }
-
- // update saveOffset (the next chunk starts at the alignment just read)
- saveOffset = lastOffset;
-
- // update bin values
- saveBin = bAlignment.Bin;
- lastBin = bAlignment.Bin;
-
- // update saveRefID
- saveRefID = bAlignment.RefID;
-
- // if invalid RefID, break out (why?)
- // NOTE(review): presumably alignments without a reference come last in a sorted file - confirm
- if ( saveRefID < 0 ) { break; }
- }
-
- // make sure that current file pointer is beyond lastOffset
- if ( m_BGZF->Tell() <= (int64_t)lastOffset ) {
- printf("Error in BGZF offsets.\n");
- exit(1);
- }
-
- // update lastOffset
- lastOffset = m_BGZF->Tell();
-
- // update lastCoordinate
- lastCoordinate = bAlignment.Position;
- }
-
- // save any leftover BAM data (as long as refID is valid)
- if ( saveRefID >= 0 ) {
- // save Bam bin entry
- ReferenceIndex& refIndex = d->m_indexData.at(saveRefID);
- BamBinMap& binMap = refIndex.Bins;
- d->InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);
- }
-
- // simplify index by merging chunks
- d->MergeChunks();
-
- // iterate through references in index
- // store whether reference has data &
- // sort offsets in linear offset vector
- // NOTE(review): 'i' indexes m_references in step with m_indexData; this relies on both having the same length
- BamDefaultIndexData::iterator indexIter = d->m_indexData.begin();
- BamDefaultIndexData::iterator indexEnd = d->m_indexData.end();
- for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) {
-
- // get reference index data
- ReferenceIndex& refIndex = (*indexIter);
- BamBinMap& binMap = refIndex.Bins;
- LinearOffsetVector& offsets = refIndex.Offsets;
-
- // store whether reference has alignments or no
- m_references[i].RefHasAlignments = ( binMap.size() > 0 );
-
- // sort linear offsets
- sort(offsets.begin(), offsets.end());
- }
-
- // rewind file pointer to beginning of alignments, return success/fail
- return m_reader->Rewind();
-}
-
-// Collects the file offsets of all index chunks that may contain alignments
-// overlapping a region.
-//
-// region:                region of interest (its left reference selects the index entry).
-// isRightBoundSpecified: forwarded to BinsFromRegion() to decide where the region ends.
-// offsets:               receives the chunk start offsets; results are appended to
-//                        whatever the vector already holds, then the whole vector is sorted.
-//
-// Returns true if 'offsets' is non-empty on exit. The .at() lookups throw
-// std::out_of_range for a reference id that is not in the index.
-bool BamDefaultIndex::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets) {
-
-    // calculate which bins overlap this region
-    // (vector-owned scratch space: released automatically even if a lookup below throws,
-    //  and an allocation failure is reported via exception instead of a NULL pointer)
-    vector<uint16_t> bins(MAX_BIN, 0);
-    const int numBins = d->BinsFromRegion(region, isRightBoundSpecified, &bins[0]);
-
-    // get bins for this reference
-    const ReferenceIndex& refIndex = d->m_indexData.at(region.LeftRefID);
-    const BamBinMap& binMap = refIndex.Bins;
-
-    // get minimum offset to consider: chunks that end at or before the linear-index
-    // offset of the window holding the left position cannot overlap the region
-    const LinearOffsetVector& linearOffsets = refIndex.Offsets;
-    uint64_t minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() ) ? 0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT);
-
-    // store all alignment 'chunk' starts (file offsets) for bins in this region
-    for ( int i = 0; i < numBins; ++i ) {
-
-        BamBinMap::const_iterator binIter = binMap.find(bins[i]);
-        if ( binIter == binMap.end() )
-            continue;
-
-        const ChunkVector& chunks = (*binIter).second;
-        ChunkVector::const_iterator chunksIter = chunks.begin();
-        ChunkVector::const_iterator chunksEnd  = chunks.end();
-        for ( ; chunksIter != chunksEnd; ++chunksIter ) {
-
-            // if valid chunk found, store its file offset
-            const Chunk& chunk = (*chunksIter);
-            if ( chunk.Stop > minOffset )
-                offsets.push_back( chunk.Start );
-        }
-    }
-
-    // sort the offsets before returning
-    sort(offsets.begin(), offsets.end());
-
-    // return whether any offsets were found
-    return ( offsets.size() != 0 );
-}
-
-// Appends the chunk [saveOffset, lastOffset] to the chunk list stored for bin
-// 'saveBin', creating that bin's (initially empty) list on first use.
-void BamDefaultIndex::BamDefaultIndexPrivate::InsertBinEntry(BamBinMap& binMap,
-                                                             const uint32_t& saveBin,
-                                                             const uint64_t& saveOffset,
-                                                             const uint64_t& lastOffset)
-{
-    // operator[] yields the existing chunk list, or inserts an empty one for a new bin
-    ChunkVector& chunksForBin = binMap[saveBin];
-    chunksForBin.push_back( Chunk(saveOffset, lastOffset) );
-}
-
-// Records 'lastOffset' in the linear index for the windows an alignment reaches
-// into: every window after the one holding its start position, up to and
-// including the one holding its last base. Windows that already hold a non-zero
-// offset are left untouched.
-void BamDefaultIndex::BamDefaultIndexPrivate::InsertLinearOffset(LinearOffsetVector& offsets,
-                                                                 const BamAlignment& bAlignment,
-                                                                 const uint64_t& lastOffset)
-{
-    // window numbers (position >> BAM_LIDX_SHIFT) of the alignment's first and last base
-    const int firstWindow = bAlignment.Position >> BAM_LIDX_SHIFT;
-    const int lastWindow  = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT;
-
-    // grow the vector (zero-filled) so that 'lastWindow' is a valid index
-    const int currentSize = offsets.size();
-    if ( currentSize <= lastWindow )
-        offsets.resize(lastWindow + 1, 0);
-
-    // fill in windows that have no offset yet
-    int window = firstWindow + 1;
-    while ( window <= lastWindow ) {
-        if ( offsets[window] == 0 )
-            offsets[window] = lastOffset;
-        ++window;
-    }
-}
-
-// Loads a standard BAM index (.bai) file into memory.
-//
-// filename: path of the index file to read.
-//
-// Returns true once the whole file has been parsed. Returns false, after printing
-// a diagnostic, if the file cannot be opened, does not start with the "BAI\1"
-// magic bytes, or ends before all announced data has been read. One
-// ReferenceIndex per reference entry is appended to the in-memory index, and each
-// known reference with at least one bin is flagged as having alignments.
-bool BamDefaultIndex::Load(const string& filename) {
-
-    // open index file, abort on error
-    FILE* indexStream = fopen(filename.c_str(), "rb");
-    if ( !indexStream ) {
-        printf("ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str());
-        return false;
-    }
-
-    // scope guard: closes the stream on every return path, and reports short reads
-    // so that each field below is validated instead of being used uninitialized
-    struct IndexInput {
-        FILE* Stream;
-        explicit IndexInput(FILE* stream) : Stream(stream) { }
-        ~IndexInput(void) { fclose(Stream); }
-        bool Read(void* destination, size_t byteCount) {
-            if ( fread(destination, byteCount, 1, Stream) == 1 ) return true;
-            printf("Problem with index file - unexpected end of data.\n");
-            return false;
-        }
-    } input(indexStream);
-
-    // see if index is valid BAM index
-    char magic[4];
-    if ( fread(magic, 1, 4, indexStream) != 4 || strncmp(magic, "BAI\1", 4) ) {
-        printf("Problem with index file - invalid format.\n");
-        return false;
-    }
-
-    // get number of reference sequences
-    uint32_t numRefSeqs;
-    if ( !input.Read(&numRefSeqs, 4) ) return false;
-    if ( m_isBigEndian ) { SwapEndian_32(numRefSeqs); }
-
-    // initialize space for BamDefaultIndexData data structure
-    d->m_indexData.reserve(numRefSeqs);
-
-    // iterate over reference sequences
-    for ( unsigned int i = 0; i < numRefSeqs; ++i ) {
-
-        // get number of bins for this reference sequence
-        int32_t numBins;
-        if ( !input.Read(&numBins, 4) ) return false;
-        if ( m_isBigEndian ) { SwapEndian_32(numBins); }
-
-        // flag the reference as populated; the reference count comes from the file,
-        // so only touch entries that actually exist in m_references
-        if ( numBins > 0 && i < m_references.size() )
-            m_references[i].RefHasAlignments = true;
-
-        // initialize bin map
-        BamBinMap binMap;
-
-        // iterate over bins for that reference sequence
-        for ( int j = 0; j < numBins; ++j ) {
-
-            // get binID
-            uint32_t binID;
-            if ( !input.Read(&binID, 4) ) return false;
-
-            // get number of regionChunks in this bin
-            uint32_t numChunks;
-            if ( !input.Read(&numChunks, 4) ) return false;
-
-            if ( m_isBigEndian ) {
-                SwapEndian_32(binID);
-                SwapEndian_32(numChunks);
-            }
-
-            // initialize ChunkVector
-            ChunkVector regionChunks;
-            regionChunks.reserve(numChunks);
-
-            // iterate over regionChunks in this bin
-            for ( unsigned int k = 0; k < numChunks; ++k ) {
-
-                // get chunk boundaries (left, right)
-                uint64_t left;
-                uint64_t right;
-                if ( !input.Read(&left, 8) )  return false;
-                if ( !input.Read(&right, 8) ) return false;
-
-                if ( m_isBigEndian ) {
-                    SwapEndian_64(left);
-                    SwapEndian_64(right);
-                }
-
-                // save ChunkPair
-                regionChunks.push_back( Chunk(left, right) );
-            }
-
-            // sort chunks for this bin
-            sort( regionChunks.begin(), regionChunks.end(), ChunkLessThan );
-
-            // save binID, chunkVector for this bin
-            binMap.insert( pair<uint32_t, ChunkVector>(binID, regionChunks) );
-        }
-
-        // load linear index for this reference sequence
-
-        // get number of linear offsets
-        int32_t numLinearOffsets;
-        if ( !input.Read(&numLinearOffsets, 4) ) return false;
-        if ( m_isBigEndian ) { SwapEndian_32(numLinearOffsets); }
-
-        // initialize LinearOffsetVector (a negative count must not reach reserve())
-        LinearOffsetVector offsets;
-        if ( numLinearOffsets > 0 )
-            offsets.reserve(numLinearOffsets);
-
-        // iterate over linear offsets for this reference sequence
-        uint64_t linearOffset;
-        for ( int j = 0; j < numLinearOffsets; ++j ) {
-            // read a linear offset & store
-            if ( !input.Read(&linearOffset, 8) ) return false;
-            if ( m_isBigEndian ) { SwapEndian_64(linearOffset); }
-            offsets.push_back(linearOffset);
-        }
-
-        // sort linear offsets
-        sort( offsets.begin(), offsets.end() );
-
-        // store index data for that reference sequence
-        d->m_indexData.push_back( ReferenceIndex(binMap, offsets) );
-    }
-
-    // index file (.bai) is closed by the scope guard
-    return true;
-}
-
-// Merges 'alignment chunks' in each BAM bin (used for index building).
-// Walking a bin's chunks in stored order, a chunk is folded into the previously
-// kept chunk whenever that chunk's Stop and this chunk's Start agree after
-// shifting both right by 16 bits; otherwise it is kept as a new chunk.
-void BamDefaultIndex::BamDefaultIndexPrivate::MergeChunks(void) {
-
-    // visit every reference entry
-    for ( BamDefaultIndexData::iterator refIter = m_indexData.begin(); refIter != m_indexData.end(); ++refIter ) {
-
-        BamBinMap& bins = (*refIter).Bins;
-
-        // visit every bin of this reference
-        for ( BamBinMap::iterator binIter = bins.begin(); binIter != bins.end(); ++binIter ) {
-
-            ChunkVector& storedChunks = (*binIter).second;
-            if ( storedChunks.empty() )
-                continue;
-
-            // seed the result with the first chunk, then fold in the remaining ones
-            ChunkVector merged;
-            merged.push_back( storedChunks.front() );
-
-            for ( size_t c = 1; c < storedChunks.size(); ++c ) {
-
-                const Chunk& candidate = storedChunks[c];
-                Chunk& tail = merged.back();
-
-                if ( (tail.Stop >> 16) == (candidate.Start >> 16) )
-                    tail.Stop = candidate.Stop;      // extend the last kept chunk
-                else
-                    merged.push_back(candidate);     // start a new chunk
-            }
-
-            // replace the bin's chunk list with the merged version
-            storedChunks = merged;
-        }
-    }
-}
-
-// Writes the in-memory index data out to a standard BAM index (.bai) file.
-//
-// bamFilename: name of the original BAM file; ".bai" is appended to form the
-//              index filename.
-//
-// Returns true only if the file could be opened and every write, the final flush
-// and the close all succeeded; otherwise prints a diagnostic and returns false.
-// Multi-byte values are byte-swapped before writing when m_isBigEndian is set.
-bool BamDefaultIndex::Write(const std::string& bamFilename) {
-
-    string indexFilename = bamFilename + ".bai";
-    FILE* indexStream = fopen(indexFilename.c_str(), "wb");
-    if ( indexStream == 0 ) {
-        printf("ERROR: Could not open file to save index.\n");
-        return false;
-    }
-
-    // write BAM index header; 'allWritten' stays true only while every write succeeds
-    bool allWritten = ( fwrite("BAI\1", 1, 4, indexStream) == 4 );
-
-    // write number of reference sequences
-    int32_t numReferenceSeqs = d->m_indexData.size();
-    if ( m_isBigEndian ) { SwapEndian_32(numReferenceSeqs); }
-    allWritten &= ( fwrite(&numReferenceSeqs, 4, 1, indexStream) == 1 );
-
-    // iterate over reference sequences
-    BamDefaultIndexData::const_iterator indexIter = d->m_indexData.begin();
-    BamDefaultIndexData::const_iterator indexEnd  = d->m_indexData.end();
-    for ( ; indexIter != indexEnd; ++indexIter ) {
-
-        // get reference index data
-        const ReferenceIndex& refIndex = (*indexIter);
-        const BamBinMap& binMap = refIndex.Bins;
-        const LinearOffsetVector& offsets = refIndex.Offsets;
-
-        // write number of bins
-        int32_t binCount = binMap.size();
-        if ( m_isBigEndian ) { SwapEndian_32(binCount); }
-        allWritten &= ( fwrite(&binCount, 4, 1, indexStream) == 1 );
-
-        // iterate over bins
-        BamBinMap::const_iterator binIter = binMap.begin();
-        BamBinMap::const_iterator binEnd  = binMap.end();
-        for ( ; binIter != binEnd; ++binIter ) {
-
-            // get bin data (key and chunk vector)
-            uint32_t binKey = (*binIter).first;
-            const ChunkVector& binChunks = (*binIter).second;
-
-            // save BAM bin key
-            if ( m_isBigEndian ) { SwapEndian_32(binKey); }
-            allWritten &= ( fwrite(&binKey, 4, 1, indexStream) == 1 );
-
-            // save chunk count
-            int32_t chunkCount = binChunks.size();
-            if ( m_isBigEndian ) { SwapEndian_32(chunkCount); }
-            allWritten &= ( fwrite(&chunkCount, 4, 1, indexStream) == 1 );
-
-            // iterate over chunks
-            ChunkVector::const_iterator chunkIter = binChunks.begin();
-            ChunkVector::const_iterator chunkEnd  = binChunks.end();
-            for ( ; chunkIter != chunkEnd; ++chunkIter ) {
-
-                // get current chunk data (copied so the stored values are never swapped in place)
-                const Chunk& chunk = (*chunkIter);
-                uint64_t start = chunk.Start;
-                uint64_t stop  = chunk.Stop;
-
-                if ( m_isBigEndian ) {
-                    SwapEndian_64(start);
-                    SwapEndian_64(stop);
-                }
-
-                // save chunk offsets
-                allWritten &= ( fwrite(&start, 8, 1, indexStream) == 1 );
-                allWritten &= ( fwrite(&stop,  8, 1, indexStream) == 1 );
-            }
-        }
-
-        // write linear offsets size
-        int32_t offsetSize = offsets.size();
-        if ( m_isBigEndian ) { SwapEndian_32(offsetSize); }
-        allWritten &= ( fwrite(&offsetSize, 4, 1, indexStream) == 1 );
-
-        // iterate over linear offsets
-        LinearOffsetVector::const_iterator offsetIter = offsets.begin();
-        LinearOffsetVector::const_iterator offsetEnd  = offsets.end();
-        for ( ; offsetIter != offsetEnd; ++offsetIter ) {
-
-            // write linear offset value
-            uint64_t linearOffset = (*offsetIter);
-            if ( m_isBigEndian ) { SwapEndian_64(linearOffset); }
-            allWritten &= ( fwrite(&linearOffset, 8, 1, indexStream) == 1 );
-        }
-    }
-
-    // flush buffer and close file; buffered data may only fail to reach disk here
-    if ( fflush(indexStream) != 0 ) allWritten = false;
-    if ( fclose(indexStream) != 0 ) allWritten = false;
-
-    if ( !allWritten )
-        printf("ERROR: Could not write index data to file.\n");
-    return allWritten;
-}
-
-// #########################################################################################
-// #########################################################################################
-
-// -------------------------------------
-// BamToolsIndex implementation
-
-namespace BamTools {
-
-// One entry of the BamTools-specific index: describes the first alignment of a
-// block of alignments (see BamToolsIndex::Build).
-struct BamToolsIndexEntry {
-
- // data members
- // BGZF file offset at which the block starts
- int64_t Offset;
- // reference ID of the block's first alignment
- int RefID;
- // position of the block's first alignment
- int Position;
-
- // ctor
- // defaults: offset 0, reference ID -1, position -1
- BamToolsIndexEntry(const uint64_t& offset = 0,
- const int& id = -1,
- const int& position = -1)
- : Offset(offset)
- , RefID(id)
- , Position(position)
- { }
-};
-
-// complete BamTools index: block entries in the order they were built or loaded
-typedef vector<BamToolsIndexEntry> BamToolsIndexData;
-
-} // namespace BamTools
-
-// Private implementation data for BamToolsIndex, reached through the owning
-// object's 'd' pointer.
-struct BamToolsIndex::BamToolsIndexPrivate {
-
- // -------------------------
- // data members
- // block entries of the index
- BamToolsIndexData m_indexData;
- // owning BamToolsIndex
- BamToolsIndex* m_parent;
- // number of alignments per indexed block; 1000 unless replaced by the value read in Load()
- int32_t m_blockSize;
-
- // -------------------------
- // ctor & dtor
-
- BamToolsIndexPrivate(BamToolsIndex* parent)
- : m_parent(parent)
- , m_blockSize(1000)
- { }
-
- ~BamToolsIndexPrivate(void) { }
-
- // -------------------------
- // internal methods
-};
-
-// Forwards the BGZF handle, reader and endianness flag to the BamIndex base and
-// allocates the private implementation object that holds the index data.
-BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian)
-    : BamIndex(bgzf, reader, isBigEndian)
-    , d(new BamToolsIndexPrivate(this))
-{ }
-
-// Releases the private implementation object.
-BamToolsIndex::~BamToolsIndex(void) {
-    BamToolsIndexPrivate* impl = d;
-    d = 0;
-    delete impl;
-}
-
-// Builds the BamTools-specific index in memory by reading every alignment from
-// the attached reader.
-//
-// Alignments are grouped into consecutive blocks of m_blockSize alignments; for
-// every completed block one entry is stored holding the file offset, reference
-// ID and position of the block's first alignment.
-//
-// Returns false immediately if the reader or BGZF handle is missing or the BGZF
-// file is not open; otherwise returns the result of the final reader Rewind().
-bool BamToolsIndex::Build(void) {
-
-    // be sure reader & BGZF file are valid & open for reading
-    if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen )
-        return false;
-
-    // move file pointer to beginning of alignments
-    m_reader->Rewind();
-
-    // start from an empty index so that building twice does not duplicate entries
-    d->m_indexData.clear();
-
-    const int numReferences = (int)m_references.size();
-
-    // plow through alignments, store block offsets
-    int32_t currentBlockCount  = 0;
-    int64_t blockStartOffset   = m_BGZF->Tell();
-    int     blockStartId       = -1;
-    int     blockStartPosition = -1;
-    BamAlignment al;
-    while ( m_reader->GetNextAlignmentCore(al) ) {
-
-        // set reference flag; alignments without a valid reference (negative or
-        // unknown RefID) have no m_references entry and must not be used as an index
-        if ( al.RefID >= 0 && al.RefID < numReferences )
-            m_references[al.RefID].RefHasAlignments = true;
-
-        // if beginning of block, save first alignment's refID & position
-        if ( currentBlockCount == 0 ) {
-            blockStartId       = al.RefID;
-            blockStartPosition = al.Position;
-        }
-
-        // increment block counter
-        ++currentBlockCount;
-
-        // if block is full, get offset for next block, reset currentBlockCount
-        if ( currentBlockCount == d->m_blockSize ) {
-
-            d->m_indexData.push_back( BamToolsIndexEntry(blockStartOffset, blockStartId, blockStartPosition) );
-            blockStartOffset  = m_BGZF->Tell();
-            currentBlockCount = 0;
-        }
-    }
-
-    // rewind file pointer to beginning of alignments, return success/fail
-    return m_reader->Rewind();
-}
-
-// Finds the file offset of the last indexed block that starts at or before the
-// left bound of a region.
-//
-// region:                only LeftRefID and LeftPosition are used.
-// isRightBoundSpecified: N.B. - ignored by this index type.
-// offsets:               cleared, then receives the single offset found.
-//
-// Returns false if the index is empty or no block starts at or before the left
-// bound; returns true once an offset has been stored.
-bool BamToolsIndex::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets) {
-
-    // return false if no index data present
-    if ( d->m_indexData.empty() ) return false;
-
-    // clear any prior data
-    offsets.clear();
-
-    // calculate nearest index to jump to
-    int64_t previousOffset = -1;
-    BamToolsIndexData::const_iterator indexIter = d->m_indexData.begin();
-    BamToolsIndexData::const_iterator indexEnd  = d->m_indexData.end();
-    for ( ; indexIter != indexEnd; ++indexIter ) {
-
-        const BamToolsIndexEntry& entry = (*indexIter);
-
-        // a block whose first alignment has no valid reference (negative RefID) lies
-        // behind all referenced data - presumably the unmapped tail of a sorted file -
-        // so for a query on a real reference it is 'past' the region; a plain numeric
-        // comparison would wrongly rank it before every reference
-        if ( (entry.RefID < 0) && (region.LeftRefID >= 0) ) break;
-
-        // check if we are 'past' beginning of desired region
-        // if so, we will break out & use previously stored offset
-        if ( entry.RefID > region.LeftRefID ) break;
-        if ( (entry.RefID == region.LeftRefID) && (entry.Position > region.LeftPosition) ) break;
-
-        // not past desired region, so store current entry offset in previousOffset
-        previousOffset = entry.Offset;
-    }
-
-    // no index was found
-    if ( previousOffset == -1 )
-        return false;
-
-    // store offset & return success
-    offsets.push_back(previousOffset);
-    return true;
-}
-
-// Loads a BamTools-specific index (.bti) file into memory.
-//
-// filename: path of the index file to read.
-//
-// Returns true once the whole file has been parsed. Returns false, after printing
-// a diagnostic, if the file cannot be opened, does not start with the "BTI\1"
-// magic bytes, or ends before all announced entries have been read. The block
-// size stored in the file replaces the current one, entries are appended to the
-// in-memory index, and every known reference named by an entry is flagged as
-// having alignments.
-bool BamToolsIndex::Load(const string& filename) {
-
-    // open index file, abort on error
-    FILE* indexStream = fopen(filename.c_str(), "rb");
-    if( !indexStream ) {
-        printf("ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str());
-        return false;
-    }
-
-    // scope guard: closes the stream on every return path, and reports short reads
-    // so that each field below is validated instead of being used uninitialized
-    struct IndexInput {
-        FILE* Stream;
-        explicit IndexInput(FILE* stream) : Stream(stream) { }
-        ~IndexInput(void) { fclose(Stream); }
-        bool Read(void* destination, size_t byteCount) {
-            if ( fread(destination, byteCount, 1, Stream) == 1 ) return true;
-            printf("Problem with index file - unexpected end of data.\n");
-            return false;
-        }
-    } input(indexStream);
-
-    // see if index is valid BAM index
-    char magic[4];
-    if ( fread(magic, 1, 4, indexStream) != 4 || strncmp(magic, "BTI\1", 4) ) {
-        printf("Problem with index file - invalid format.\n");
-        return false;
-    }
-
-    // read in block size
-    if ( !input.Read(&d->m_blockSize, sizeof(d->m_blockSize)) ) return false;
-    if ( m_isBigEndian ) { SwapEndian_32(d->m_blockSize); }
-
-    // read in number of offsets
-    uint32_t numOffsets;
-    if ( !input.Read(&numOffsets, sizeof(numOffsets)) ) return false;
-    if ( m_isBigEndian ) { SwapEndian_32(numOffsets); }
-
-    // reserve space for index data
-    d->m_indexData.reserve(numOffsets);
-
-    const int numReferences = (int)m_references.size();
-
-    // iterate over index entries
-    for ( unsigned int i = 0; i < numOffsets; ++i ) {
-
-        uint64_t offset;
-        int id;
-        int position;
-
-        // read in data
-        if ( !input.Read(&offset,   sizeof(offset)) )   return false;
-        if ( !input.Read(&id,       sizeof(id)) )       return false;
-        if ( !input.Read(&position, sizeof(position)) ) return false;
-
-        // swap endian-ness if necessary
-        if ( m_isBigEndian ) {
-            SwapEndian_64(offset);
-            SwapEndian_32(id);
-            SwapEndian_32(position);
-        }
-
-        // save reference index entry
-        d->m_indexData.push_back( BamToolsIndexEntry(offset, id, position) );
-
-        // set reference flag; the id comes straight from the file (and may be negative
-        // for blocks that start without a reference), so validate it before indexing
-        // what about sparse references? wont be able to set flag?
-        if ( id >= 0 && id < numReferences )
-            m_references[id].RefHasAlignments = true;
-    }
-
-    // index file is closed by the scope guard
-    return true;
-}
-
-// Writes the in-memory index data out to a BamTools-specific index (.bti) file.
-//
-// bamFilename: name of the original BAM file; ".bti" is appended to form the
-//              index filename.
-//
-// Returns true only if the file could be opened and every write, the final flush
-// and the close all succeeded; otherwise prints a diagnostic and returns false.
-// Multi-byte values are byte-swapped before writing when m_isBigEndian is set.
-bool BamToolsIndex::Write(const std::string& bamFilename) {
-
-    string indexFilename = bamFilename + ".bti";
-    FILE* indexStream = fopen(indexFilename.c_str(), "wb");
-    if ( indexStream == 0 ) {
-        printf("ERROR: Could not open file to save index.\n");
-        return false;
-    }
-
-    // write BAM index header; 'allWritten' stays true only while every write succeeds
-    bool allWritten = ( fwrite("BTI\1", 1, 4, indexStream) == 4 );
-
-    // write block size
-    int32_t blockSize = d->m_blockSize;
-    if ( m_isBigEndian ) { SwapEndian_32(blockSize); }
-    allWritten &= ( fwrite(&blockSize, sizeof(blockSize), 1, indexStream) == 1 );
-
-    // write number of offset entries
-    uint32_t numOffsets = d->m_indexData.size();
-    if ( m_isBigEndian ) { SwapEndian_32(numOffsets); }
-    allWritten &= ( fwrite(&numOffsets, sizeof(numOffsets), 1, indexStream) == 1 );
-
-    // iterate over offset entries
-    BamToolsIndexData::const_iterator indexIter = d->m_indexData.begin();
-    BamToolsIndexData::const_iterator indexEnd  = d->m_indexData.end();
-    for ( ; indexIter != indexEnd; ++indexIter ) {
-
-        // get reference index data
-        const BamToolsIndexEntry& entry = (*indexIter);
-
-        // copy entry data (so the stored values are never swapped in place)
-        uint64_t offset   = entry.Offset;
-        int      id       = entry.RefID;
-        int      position = entry.Position;
-
-        // swap endian-ness if necessary
-        if ( m_isBigEndian ) {
-            SwapEndian_64(offset);
-            SwapEndian_32(id);
-            SwapEndian_32(position);
-        }
-
-        // write the reference index entry
-        allWritten &= ( fwrite(&offset,   sizeof(offset),   1, indexStream) == 1 );
-        allWritten &= ( fwrite(&id,       sizeof(id),       1, indexStream) == 1 );
-        allWritten &= ( fwrite(&position, sizeof(position), 1, indexStream) == 1 );
-    }
-
-    // flush file buffer and close file; buffered data may only fail to reach disk here
-    if ( fflush(indexStream) != 0 ) allWritten = false;
-    if ( fclose(indexStream) != 0 ) allWritten = false;
-
-    if ( !allWritten )
-        printf("ERROR: Could not write index data to file.\n");
-    return allWritten;
-}
+++ /dev/null
-// ***************************************************************************
-// BamIndex.h (c) 2009 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 17 August 2010 (DB)
-// ---------------------------------------------------------------------------
-// Provides index functionality - both for the default (standardized) BAM
-// index format (.bai) as well as a BamTools-specific (nonstandard) index
-// format (.bti).
-// ***************************************************************************
-
-#ifndef BAM_INDEX_H
-#define BAM_INDEX_H
-
-#include <string>
-#include <vector>
-#include "BamAux.h"
-
-namespace BamTools {
-
-class BamReader;
-class BgzfData;
-
-// --------------------------------------------------
-// BamIndex base class
-// Abstract interface shared by all index formats. Concrete subclasses implement
-// building, loading, querying and writing of their own index data.
-class BamIndex {
-
- public:
- // bgzf, reader: handles the index implementations use to read the BAM data
- // isBigEndian: when true, implementations byte-swap multi-byte values read from or written to index files
- BamIndex(BamTools::BgzfData* bgzf,
- BamTools::BamReader* reader,
- bool isBigEndian);
- virtual ~BamIndex(void) { }
-
- public:
- // creates index data (in-memory) from current reader data
- virtual bool Build(void) =0;
- // calculates offset(s) for a given region; returns true if any offset was stored in 'offsets'
- virtual bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector<int64_t>& offsets) =0;
- // loads existing data from file into memory
- virtual bool Load(const std::string& filename) =0;
- // returns whether reference has alignments or no
- virtual bool HasAlignments(const int& referenceID);
- // writes in-memory index data out to file
- // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
- virtual bool Write(const std::string& bamFilename) =0;
-
- protected:
- // BGZF file handle supplied at construction
- BamTools::BgzfData* m_BGZF;
- // reader supplied at construction; index building pulls alignments from it
- BamTools::BamReader* m_reader;
- // reference sequence entries; subclasses set RefHasAlignments on them while building/loading
- BamTools::RefVector m_references;
- // true if multi-byte index file values must be byte-swapped
- bool m_isBigEndian;
-};
-
-// --------------------------------------------------
-// BamDefaultIndex class
-//
-// implements default (per SAM/BAM spec) index file ops
-class BamDefaultIndex : public BamIndex {
-
-    // ctor & dtor
-    public:
-        BamDefaultIndex(BamTools::BgzfData*  bgzf,
-                        BamTools::BamReader* reader,
-                        bool isBigEndian);
-        ~BamDefaultIndex(void);
-
-    // interface (implements BamIndex virtual methods)
-    public:
-        // creates index data (in-memory) from current reader data
-        bool Build(void);
-        // calculates offset(s) for a given region
-        bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector<int64_t>& offsets);
-        // loads existing data from file into memory
-        bool Load(const std::string& filename);
-        // writes in-memory index data out to file
-        // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
-        bool Write(const std::string& bamFilename);
-
-    // copying is disabled (declared, never defined): 'd' is owned by exactly one
-    // object and deleted in its destructor, so a compiler-generated copy would
-    // lead to a double delete
-    private:
-        BamDefaultIndex(const BamDefaultIndex&);
-        BamDefaultIndex& operator=(const BamDefaultIndex&);
-
-    // internal implementation
-    private:
-        struct BamDefaultIndexPrivate;
-        BamDefaultIndexPrivate* d;
-};
-
-// --------------------------------------------------
-// BamToolsIndex class
-//
-// implements BamTools-specific index file ops
-class BamToolsIndex : public BamIndex {
-
-    // ctor & dtor
-    public:
-        BamToolsIndex(BamTools::BgzfData*  bgzf,
-                      BamTools::BamReader* reader,
-                      bool isBigEndian);
-        ~BamToolsIndex(void);
-
-    // interface (implements BamIndex virtual methods)
-    public:
-        // creates index data (in-memory) from current reader data
-        bool Build(void);
-        // calculates offset(s) for a given region
-        bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector<int64_t>& offsets);
-        // loads existing data from file into memory
-        bool Load(const std::string& filename);
-        // writes in-memory index data out to file
-        // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
-        bool Write(const std::string& bamFilename);
-
-    // copying is disabled (declared, never defined): 'd' is owned by exactly one
-    // object and deleted in its destructor, so a compiler-generated copy would
-    // lead to a double delete
-    private:
-        BamToolsIndex(const BamToolsIndex&);
-        BamToolsIndex& operator=(const BamToolsIndex&);
-
-    // internal implementation
-    private:
-        struct BamToolsIndexPrivate;
-        BamToolsIndexPrivate* d;
-};
-
-} // namespace BamTools
-
-#endif // BAM_INDEX_H
\ No newline at end of file
+++ /dev/null
-// ***************************************************************************
-// BamMultiReader.cpp (c) 2010 Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 20 July 2010 (DB)
-// ---------------------------------------------------------------------------
-// Uses BGZF routines that were adapted from the bgzf.c code developed at the Broad
-// Institute.
-// ---------------------------------------------------------------------------
-// Functionality for simultaneously reading multiple BAM files.
-//
-// This functionality allows applications to work on very large sets of files
-// without requiring intermediate merge, sort, and index steps for each file
-// subset. It also improves the performance of our merge system as it
-// precludes the need to sort merged files.
-// ***************************************************************************
-
-// C++ includes
-#include <algorithm>
-#include <iterator>
-#include <string>
-#include <vector>
-#include <iostream>
-#include <sstream>
-
-// BamTools includes
-#include "BGZF.h"
-#include "BamMultiReader.h"
-using namespace BamTools;
-using namespace std;
-
-// -----------------------------------------------------
-// BamMultiReader implementation
-// -----------------------------------------------------
-
-// constructor
-// starts with the current reference ID and the current left position both set to 0
-BamMultiReader::BamMultiReader(void)
- : CurrentRefID(0)
- , CurrentLeft(0)
-{ }
-
-// destructor: closes all BAM files, then frees every reader together with the
-// alignment buffer paired with it
-BamMultiReader::~BamMultiReader(void) {
-
-    Close();
-
-    typedef vector<pair<BamReader*, BamAlignment*> >::iterator ReaderIterator;
-    for ( ReaderIterator entry = readers.begin(); entry != readers.end(); ++entry ) {
-        delete entry->first;     // reader object
-        delete entry->second;    // its alignment buffer
-    }
-}
-
-// closes every BAM file managed by this multi-reader
-void BamMultiReader::Close(void) {
-    typedef vector<pair<BamReader*, BamAlignment*> >::iterator ReaderIterator;
-    for ( ReaderIterator entry = readers.begin(); entry != readers.end(); ++entry )
-        entry->first->Close();
-}
-
-// updates the reference id stored in the BamMultiReader
-// to reflect the current state of the readers
-//
-// The pending alignments are kept sorted by (reference ID, position), so the
-// front entry always carries the lowest reference ID; that ID becomes the
-// current one. Does nothing when no alignments are pending.
-void BamMultiReader::UpdateReferenceID(void) {
-
-    // nothing pending: begin() must not be dereferenced
-    if ( alignments.empty() )
-        return;
-
-    // adopt the front alignment's reference ID directly; counting upwards until the
-    // values match gives the same result when the ID increased, but cannot reach an
-    // ID that is lower than the current one (e.g. a negative RefID or a region set
-    // on an earlier reference)
-    CurrentRefID = alignments.begin()->second.second->RefID;
-}
-
-// reports whether at least one reader still has a pending alignment
-bool BamMultiReader::HasOpenReaders() {
-    return !alignments.empty();
-}
-
-// Retrieves the next alignment among all files.
-// Copies the lowest pending alignment into 'nextAlignment', then refills the
-// pending set from the reader that supplied it. Returns false once no reader has
-// a pending alignment left.
-bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) {
-
-    // all files are at EOF: no more alignments to process
-    if ( !HasOpenReaders() ) return false;
-
-    // keep the current reference sequence id in step with the pending alignments
-    UpdateReferenceID();
-
-    // the front of the alignment index holds the lowest alignment and its reader
-    BamReader*    sourceReader    = alignments.begin()->second.first;
-    BamAlignment* lowestAlignment = alignments.begin()->second.second;
-
-    // hand a copy of the lowest alignment to the caller
-    nextAlignment = *lowestAlignment;
-
-    // drop the consumed entry from the alignment index
-    alignments.erase(alignments.begin());
-
-    // refill from the same reader; at end of file the reader simply contributes nothing further
-    if ( sourceReader->GetNextAlignment(*lowestAlignment) ) {
-        alignments.insert(make_pair(make_pair(lowestAlignment->RefID, lowestAlignment->Position),
-                                    make_pair(sourceReader, lowestAlignment)));
-    }
-
-    return true;
-}
-
-// Retrieves the next alignment among all files without parsing character data
-// from alignments.
-// Copies the lowest pending alignment into 'nextAlignment', then refills the
-// pending set from the reader that supplied it (core data only). Returns false
-// once no reader has a pending alignment left.
-bool BamMultiReader::GetNextAlignmentCore(BamAlignment& nextAlignment) {
-
-    // all files are at EOF: no more alignments to process
-    if ( !HasOpenReaders() ) return false;
-
-    // keep the current reference sequence id in step with the pending alignments
-    UpdateReferenceID();
-
-    // the front of the alignment index holds the lowest alignment and its reader
-    BamReader*    sourceReader    = alignments.begin()->second.first;
-    BamAlignment* lowestAlignment = alignments.begin()->second.second;
-
-    // hand a copy of the lowest alignment to the caller
-    nextAlignment = *lowestAlignment;
-
-    // drop the consumed entry from the alignment index
-    alignments.erase(alignments.begin());
-
-    // refill from the same reader; at end of file the reader simply contributes nothing further
-    if ( sourceReader->GetNextAlignmentCore(*lowestAlignment) ) {
-        alignments.insert(make_pair(make_pair(lowestAlignment->RefID, lowestAlignment->Position),
-                                    make_pair(sourceReader, lowestAlignment)));
-    }
-
-    return true;
-}
-
-// Jumps to the specified region (refID, leftBound) in all BAM files.
-// Prints an error and terminates the process if any reader fails to jump, so a
-// normal return always reports success.
-bool BamMultiReader::Jump(int refID, int position) {
-
-    CurrentRefID = refID;
-    CurrentLeft  = position;
-
-    typedef vector<pair<BamReader*, BamAlignment*> >::iterator ReaderIterator;
-    for ( ReaderIterator entry = readers.begin(); entry != readers.end(); ++entry ) {
-        BamReader* reader = entry->first;
-        if ( reader->Jump(refID, position) )
-            continue;
-
-        // a single failed jump is fatal
-        cerr << "ERROR: could not jump " << reader->GetFilename() << " to " << refID << ":" << position << endl;
-        exit(1);
-    }
-
-    // every reader is positioned: reload the pending alignments
-    UpdateAlignments();
-    return true;
-}
-
-bool BamMultiReader::SetRegion(const int& leftRefID, const int& leftPosition, const int& rightRefID, const int& rightPosition) {
-
- BamRegion region(leftRefID, leftPosition, rightRefID, rightPosition);
-
- return SetRegion(region);
-
-}
-
-bool BamMultiReader::SetRegion(const BamRegion& region) {
-
- Region = region;
-
- // NB: While it may make sense to track readers in which we can
- // successfully SetRegion, In practice a failure of SetRegion means "no
- // alignments here." It makes sense to simply accept the failure,
- // UpdateAlignments(), and continue.
-
- for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
- it->first->SetRegion(region);
- }
-
- UpdateAlignments();
-
- return true;
-
-}
-
-void BamMultiReader::UpdateAlignments(void) {
- // Update Alignments
- alignments.clear();
- for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
- BamReader* br = it->first;
- BamAlignment* ba = it->second;
- if (br->GetNextAlignment(*ba)) {
- alignments.insert(make_pair(make_pair(ba->RefID, ba->Position),
- make_pair(br, ba)));
- } else {
- // assume BamReader end of region / EOF
- }
- }
-}
-
-// opens BAM files
-bool BamMultiReader::Open(const vector<string> filenames, bool openIndexes, bool coreMode, bool useDefaultIndex) {
-
- // for filename in filenames
- fileNames = filenames; // save filenames in our multireader
- for (vector<string>::const_iterator it = filenames.begin(); it != filenames.end(); ++it) {
- string filename = *it;
- BamReader* reader = new BamReader;
-
- bool openedOK = true;
- if (openIndexes) {
- if (useDefaultIndex)
- openedOK = reader->Open(filename, filename + ".bai");
- else
- openedOK = reader->Open(filename, filename + ".bti");
- } else {
- openedOK = reader->Open(filename); // for merging, jumping is disallowed
- }
-
- // if file opened ok, check that it can be read
- if ( openedOK ) {
-
- bool fileOK = true;
- BamAlignment* alignment = new BamAlignment;
- if (coreMode) {
- fileOK &= reader->GetNextAlignmentCore(*alignment);
- } else {
- fileOK &= reader->GetNextAlignment(*alignment);
- }
-
- if (fileOK) {
- readers.push_back(make_pair(reader, alignment)); // store pointers to our readers for cleanup
- alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position),
- make_pair(reader, alignment)));
- } else {
- cerr << "WARNING: could not read first alignment in " << filename << ", ignoring file" << endl;
- // if only file available & could not be read, return failure
- if ( filenames.size() == 1 ) return false;
- }
-
- }
-
- // TODO; any more error handling on openedOK ??
- else
- return false;
- }
-
- // files opened ok, at least one alignment could be read,
- // now need to check that all files use same reference data
- ValidateReaders();
- return true;
-}
-
-void BamMultiReader::PrintFilenames(void) {
- for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
- BamReader* reader = it->first;
- cout << reader->GetFilename() << endl;
- }
-}
-
-// for debugging
-void BamMultiReader::DumpAlignmentIndex(void) {
- for (AlignmentIndex::const_iterator it = alignments.begin(); it != alignments.end(); ++it) {
- cerr << it->first.first << ":" << it->first.second << " " << it->second.first->GetFilename() << endl;
- }
-}
-
-// returns BAM file pointers to beginning of alignment data
-bool BamMultiReader::Rewind(void) {
- bool result = true;
- for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
- BamReader* reader = it->first;
- result &= reader->Rewind();
- }
- return result;
-}
-
-// saves index data to BAM index files (".bai"/".bti") where necessary, returns success/fail
-bool BamMultiReader::CreateIndexes(bool useDefaultIndex) {
- bool result = true;
- for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
- BamReader* reader = it->first;
- result &= reader->CreateIndex(useDefaultIndex);
- }
- return result;
-}
-
-// makes a virtual, unified header for all the bam files in the multireader
-const string BamMultiReader::GetHeaderText(void) const {
-
- string mergedHeader = "";
- map<string, bool> readGroups;
-
- // foreach extraction entry (each BAM file)
- for (vector<pair<BamReader*, BamAlignment*> >::const_iterator rs = readers.begin(); rs != readers.end(); ++rs) {
-
- map<string, bool> currentFileReadGroups;
-
- BamReader* reader = rs->first;
-
- stringstream header(reader->GetHeaderText());
- vector<string> lines;
- string item;
- while (getline(header, item))
- lines.push_back(item);
-
- for (vector<string>::const_iterator it = lines.begin(); it != lines.end(); ++it) {
-
- // get next line from header, skip if empty
- string headerLine = *it;
- if ( headerLine.empty() ) { continue; }
-
- // if first file, save HD & SQ entries
- if ( rs == readers.begin() ) {
- if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) {
- mergedHeader.append(headerLine.c_str());
- mergedHeader.append(1, '\n');
- }
- }
-
- // (for all files) append RG entries if they are unique
- if ( headerLine.find("@RG") == 0 ) {
- stringstream headerLineSs(headerLine);
- string part, readGroupPart, readGroup;
- while(std::getline(headerLineSs, part, '\t')) {
- stringstream partSs(part);
- string subtag;
- std::getline(partSs, subtag, ':');
- if (subtag == "ID") {
- std::getline(partSs, readGroup, ':');
- break;
- }
- }
- if (readGroups.find(readGroup) == readGroups.end()) { // prevents duplicate @RG entries
- mergedHeader.append(headerLine.c_str() );
- mergedHeader.append(1, '\n');
- readGroups[readGroup] = true;
- currentFileReadGroups[readGroup] = true;
- } else {
- // warn iff we are reading one file and discover duplicated @RG tags in the header
- // otherwise, we emit no warning, as we might be merging multiple BAM files with identical @RG tags
- if (currentFileReadGroups.find(readGroup) != currentFileReadGroups.end()) {
- cerr << "WARNING: duplicate @RG tag " << readGroup
- << " entry in header of " << reader->GetFilename() << endl;
- }
- }
- }
- }
- }
-
- // return merged header text
- return mergedHeader;
-}
-
-// ValidateReaders checks that all the readers point to BAM files representing
-// alignments against the same set of reference sequences, and that the
-// sequences are identically ordered. If these checks fail the operation of
-// the multireader is undefined, so we force program exit.
-void BamMultiReader::ValidateReaders(void) const {
- int firstRefCount = readers.front().first->GetReferenceCount();
- BamTools::RefVector firstRefData = readers.front().first->GetReferenceData();
- for (vector<pair<BamReader*, BamAlignment*> >::const_iterator it = readers.begin(); it != readers.end(); ++it) {
- BamReader* reader = it->first;
- BamTools::RefVector currentRefData = reader->GetReferenceData();
- BamTools::RefVector::const_iterator f = firstRefData.begin();
- BamTools::RefVector::const_iterator c = currentRefData.begin();
- if (reader->GetReferenceCount() != firstRefCount || firstRefData.size() != currentRefData.size()) {
- cerr << "ERROR: mismatched number of references in " << reader->GetFilename()
- << " expected " << firstRefCount
- << " reference sequences but only found " << reader->GetReferenceCount() << endl;
- exit(1);
- }
- // this will be ok; we just checked above that we have identically-sized sets of references
- // here we simply check if they are all, in fact, equal in content
- while (f != firstRefData.end()) {
- if (f->RefName != c->RefName || f->RefLength != c->RefLength) {
- cerr << "ERROR: mismatched references found in " << reader->GetFilename()
- << " expected: " << endl;
- for (BamTools::RefVector::const_iterator a = firstRefData.begin(); a != firstRefData.end(); ++a)
- cerr << a->RefName << " " << a->RefLength << endl;
- cerr << "but found: " << endl;
- for (BamTools::RefVector::const_iterator a = currentRefData.begin(); a != currentRefData.end(); ++a)
- cerr << a->RefName << " " << a->RefLength << endl;
- exit(1);
- }
- ++f; ++c;
- }
- }
-}
-
-// NB: The following functions assume that we have identical references for all
-// BAM files. We enforce this by invoking the above validation function
-// (ValidateReaders) to verify that our reference data is the same across all
-// files on Open, so we will not encounter a situation in which there is a
-// mismatch and we are still live.
-
-// returns the number of reference sequences
-const int BamMultiReader::GetReferenceCount(void) const {
- return readers.front().first->GetReferenceCount();
-}
-
-// returns vector of reference objects
-const BamTools::RefVector BamMultiReader::GetReferenceData(void) const {
- return readers.front().first->GetReferenceData();
-}
-
-const int BamMultiReader::GetReferenceID(const string& refName) const {
- return readers.front().first->GetReferenceID(refName);
-}
+++ /dev/null
-// ***************************************************************************\r
-// BamMultiReader.h (c) 2010 Erik Garrison\r
-// Marth Lab, Department of Biology, Boston College\r
-// All rights reserved.\r
-// ---------------------------------------------------------------------------\r
-// Last modified: 20 July 2010 (DB)\r
-// ---------------------------------------------------------------------------\r
-// Functionality for simultaneously reading multiple BAM files\r
-// ***************************************************************************\r
-\r
-#ifndef BAMMULTIREADER_H\r
-#define BAMMULTIREADER_H\r
-\r
-// C++ includes\r
-#include <string>\r
-#include <map>\r
-#include <utility> // for pair\r
-#include <sstream>\r
-\r
-using namespace std;\r
-\r
-// BamTools includes\r
-#include "BamAux.h"\r
-#include "BamReader.h"\r
-\r
-namespace BamTools {\r
-\r
-// index mapping reference/position pairings to bamreaders and their alignments\r
-typedef multimap<pair<int, int>, pair<BamReader*, BamAlignment*> > AlignmentIndex;\r
-\r
-\r
-class BamMultiReader {\r
-\r
- // constructor / destructor\r
- public:\r
- BamMultiReader(void);\r
- ~BamMultiReader(void);\r
-\r
- // public interface\r
- public:\r
-\r
- // positioning\r
- int CurrentRefID;\r
- int CurrentLeft;\r
-\r
- // region under analysis, specified using SetRegion\r
- BamRegion Region;\r
-\r
- // ----------------------\r
- // BAM file operations\r
- // ----------------------\r
-\r
- // close BAM files\r
- void Close(void);\r
-\r
- // opens BAM files (and optional BAM index files, if provided)\r
- // @openIndexes - triggers index opening, useful for suppressing\r
- // error messages during merging of files in which we may not have\r
- // indexes.\r
- // @coreMode - setup our first alignments using GetNextAlignmentCore();\r
- // also useful for merging\r
- bool Open(const vector<string> filenames, bool openIndexes = true, bool coreMode = false, bool useDefaultIndex = true);\r
-\r
- // performs random-access jump to reference, position\r
- bool Jump(int refID, int position = 0);\r
-\r
- // sets the target region\r
- bool SetRegion(const BamRegion& region);\r
- bool SetRegion(const int&, const int&, const int&, const int&); // convenience function to above\r
-\r
- // returns file pointers to beginning of alignments\r
- bool Rewind(void);\r
-\r
- // ----------------------\r
- // access alignment data\r
- // ----------------------\r
- // updates the reference id marker to match the lower limit of our readers\r
- void UpdateReferenceID(void);\r
-\r
- // retrieves next available alignment (returns success/fail) from all files\r
- bool GetNextAlignment(BamAlignment&);\r
- // retrieves next available alignment (returns success/fail) from all files\r
- // and populates the support data with information about the alignment\r
- // *** BUT DOES NOT PARSE CHARACTER DATA FROM THE ALIGNMENT\r
- bool GetNextAlignmentCore(BamAlignment&);\r
- // ... should this be private?\r
- bool HasOpenReaders(void);\r
-\r
- // ----------------------\r
- // access auxiliary data\r
- // ----------------------\r
-\r
- // returns unified SAM header text for all files\r
- const string GetHeaderText(void) const;\r
- // returns number of reference sequences\r
- const int GetReferenceCount(void) const;\r
- // returns vector of reference objects\r
- const BamTools::RefVector GetReferenceData(void) const;\r
- // returns reference id (used for BamMultiReader::Jump()) for the given reference name\r
- const int GetReferenceID(const std::string& refName) const;\r
- // validates that we have a congruent set of BAM files that are aligned against the same reference sequences\r
- void ValidateReaders() const;\r
-\r
- // ----------------------\r
- // BAM index operations\r
- // ----------------------\r
-\r
- // creates index for BAM files which lack them, saves to files (default = bamFilename + ".bai")\r
- bool CreateIndexes(bool useDefaultIndex = true);\r
-\r
- //const int GetReferenceID(const string& refName) const;\r
-\r
- // utility\r
- void PrintFilenames(void);\r
- void DumpAlignmentIndex(void);\r
- void UpdateAlignments(void); // updates our alignment cache\r
-\r
- // private implementation\r
- private:\r
-\r
- // the set of readers and alignments which we operate on, maintained throughout the life of this class\r
- vector<pair<BamReader*, BamAlignment*> > readers;\r
-\r
- // readers and alignments sorted by reference id and position, to keep track of the lowest (next) alignment\r
- // when a reader reaches EOF, its entry is removed from this index\r
- AlignmentIndex alignments;\r
-\r
- vector<string> fileNames;\r
-};\r
-\r
-} // namespace BamTools\r
-\r
-#endif // BAMMULTIREADER_H\r
+++ /dev/null
-// ***************************************************************************\r
-// BamReader.cpp (c) 2009 Derek Barnett, Michael Str�mberg\r
-// Marth Lab, Department of Biology, Boston College\r
-// All rights reserved.\r
-// ---------------------------------------------------------------------------\r
-// Last modified: 15 July 2010 (DB)\r
-// ---------------------------------------------------------------------------\r
-// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
-// Institute.\r
-// ---------------------------------------------------------------------------\r
-// Provides the basic functionality for reading BAM files\r
-// ***************************************************************************\r
-\r
-// C++ includes\r
-#include <algorithm>\r
-#include <iterator>\r
-#include <string>\r
-#include <vector>\r
-#include <iostream>\r
-\r
-// BamTools includes\r
-#include "BGZF.h"\r
-#include "BamReader.h"\r
-#include "BamIndex.h"\r
-using namespace BamTools;\r
-using namespace std;\r
-\r
-struct BamReader::BamReaderPrivate {\r
-\r
- // -------------------------------\r
- // structs, enums, typedefs\r
- // -------------------------------\r
- enum RegionState { BEFORE_REGION = 0\r
- , WITHIN_REGION\r
- , AFTER_REGION\r
- };\r
-\r
- // -------------------------------\r
- // data members\r
- // -------------------------------\r
-\r
- // general file data\r
- BgzfData mBGZF;\r
- string HeaderText;\r
- //BamIndex Index;\r
- BamIndex* NewIndex;\r
- RefVector References;\r
- bool IsIndexLoaded;\r
- int64_t AlignmentsBeginOffset;\r
- string Filename;\r
- string IndexFilename;\r
- \r
- // system data\r
- bool IsBigEndian;\r
-\r
- // user-specified region values\r
- BamRegion Region;\r
- bool IsLeftBoundSpecified;\r
- bool IsRightBoundSpecified;\r
- \r
- bool IsRegionSpecified;\r
- int CurrentRefID;\r
- int CurrentLeft;\r
-\r
- // parent BamReader\r
- BamReader* Parent;\r
- \r
- // BAM character constants\r
- const char* DNA_LOOKUP;\r
- const char* CIGAR_LOOKUP;\r
-\r
- // -------------------------------\r
- // constructor & destructor\r
- // -------------------------------\r
- BamReaderPrivate(BamReader* parent);\r
- ~BamReaderPrivate(void);\r
-\r
- // -------------------------------\r
- // "public" interface\r
- // -------------------------------\r
-\r
- // file operations\r
- void Close(void);\r
- bool Jump(int refID, int position = 0);\r
- bool Open(const string& filename, const string& indexFilename = "");\r
- bool Rewind(void);\r
- bool SetRegion(const BamRegion& region);\r
-\r
- // access alignment data\r
- bool GetNextAlignment(BamAlignment& bAlignment);\r
- bool GetNextAlignmentCore(BamAlignment& bAlignment);\r
-\r
- // access auxiliary data\r
- int GetReferenceID(const string& refName) const;\r
-\r
- // index operations\r
- bool CreateIndex(bool useDefaultIndex);\r
-\r
- // -------------------------------\r
- // internal methods\r
- // -------------------------------\r
-\r
- // *** reading alignments and auxiliary data *** //\r
-\r
- // fills out character data for BamAlignment data\r
- bool BuildCharData(BamAlignment& bAlignment);\r
- // checks to see if alignment overlaps current region\r
- RegionState IsOverlap(BamAlignment& bAlignment);\r
- // retrieves header text from BAM file\r
- void LoadHeaderData(void);\r
- // retrieves BAM alignment under file pointer\r
- bool LoadNextAlignment(BamAlignment& bAlignment);\r
- // builds reference data structure from BAM file\r
- void LoadReferenceData(void);\r
-\r
- // *** index file handling *** //\r
-\r
- // clear out inernal index data structure\r
- void ClearIndex(void);\r
- // loads index from BAM index file\r
- bool LoadIndex(void);\r
-};\r
-\r
-// -----------------------------------------------------\r
-// BamReader implementation (wrapper around BRPrivate)\r
-// -----------------------------------------------------\r
-// constructor\r
-BamReader::BamReader(void) {\r
- d = new BamReaderPrivate(this);\r
-}\r
-\r
-// destructor\r
-BamReader::~BamReader(void) {\r
- delete d;\r
- d = 0;\r
-}\r
-\r
-// file operations\r
-void BamReader::Close(void) { d->Close(); }\r
-bool BamReader::IsOpen(void) const { return d->mBGZF.IsOpen; }\r
-bool BamReader::Jump(int refID, int position) { \r
- d->Region.LeftRefID = refID;\r
- d->Region.LeftPosition = position;\r
- d->IsLeftBoundSpecified = true;\r
- d->IsRightBoundSpecified = false;\r
- return d->Jump(refID, position); \r
-}\r
-bool BamReader::Open(const string& filename, const string& indexFilename) { return d->Open(filename, indexFilename); }\r
-bool BamReader::Rewind(void) { return d->Rewind(); }\r
-bool BamReader::SetRegion(const BamRegion& region) { return d->SetRegion(region); }\r
-bool BamReader::SetRegion(const int& leftRefID, const int& leftBound, const int& rightRefID, const int& rightBound) {\r
- return d->SetRegion( BamRegion(leftRefID, leftBound, rightRefID, rightBound) );\r
-}\r
-\r
-// access alignment data\r
-bool BamReader::GetNextAlignment(BamAlignment& bAlignment) { return d->GetNextAlignment(bAlignment); }\r
-bool BamReader::GetNextAlignmentCore(BamAlignment& bAlignment) { return d->GetNextAlignmentCore(bAlignment); }\r
-\r
-// access auxiliary data\r
-const string BamReader::GetHeaderText(void) const { return d->HeaderText; }\r
-int BamReader::GetReferenceCount(void) const { return d->References.size(); }\r
-const RefVector& BamReader::GetReferenceData(void) const { return d->References; }\r
-int BamReader::GetReferenceID(const string& refName) const { return d->GetReferenceID(refName); }\r
-const std::string BamReader::GetFilename(void) const { return d->Filename; }\r
-\r
-// index operations\r
-bool BamReader::CreateIndex(bool useDefaultIndex) { return d->CreateIndex(useDefaultIndex); }\r
-\r
-// -----------------------------------------------------\r
-// BamReaderPrivate implementation\r
-// -----------------------------------------------------\r
-\r
-// constructor\r
-BamReader::BamReaderPrivate::BamReaderPrivate(BamReader* parent)\r
- : NewIndex(0)\r
- , IsIndexLoaded(false)\r
- , AlignmentsBeginOffset(0)\r
- , IsLeftBoundSpecified(false)\r
- , IsRightBoundSpecified(false)\r
- , IsRegionSpecified(false)\r
- , CurrentRefID(0)\r
- , CurrentLeft(0)\r
- , Parent(parent)\r
- , DNA_LOOKUP("=ACMGRSVTWYHKDBN")\r
- , CIGAR_LOOKUP("MIDNSHP")\r
-{ \r
- IsBigEndian = SystemIsBigEndian();\r
-}\r
-\r
-// destructor\r
-BamReader::BamReaderPrivate::~BamReaderPrivate(void) {\r
- Close();\r
-}\r
-\r
-bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) {\r
- \r
- // calculate character lengths/offsets\r
- const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;\r
- const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength;\r
- const unsigned int seqDataOffset = bAlignment.SupportData.QueryNameLength + (bAlignment.SupportData.NumCigarOperations * 4);\r
- const unsigned int qualDataOffset = seqDataOffset + (bAlignment.SupportData.QuerySequenceLength+1)/2;\r
- const unsigned int tagDataOffset = qualDataOffset + bAlignment.SupportData.QuerySequenceLength;\r
- const unsigned int tagDataLength = dataLength - tagDataOffset;\r
- \r
- // set up char buffers\r
- const char* allCharData = bAlignment.SupportData.AllCharData.data();\r
- uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);\r
- const char* seqData = ((const char*)allCharData) + seqDataOffset;\r
- const char* qualData = ((const char*)allCharData) + qualDataOffset;\r
- char* tagData = ((char*)allCharData) + tagDataOffset;\r
- \r
- // store alignment name (depends on null char as terminator)\r
- bAlignment.Name.assign((const char*)(allCharData)); \r
- \r
- // save CigarOps \r
- CigarOp op;\r
- bAlignment.CigarData.clear();\r
- bAlignment.CigarData.reserve(bAlignment.SupportData.NumCigarOperations);\r
- for (unsigned int i = 0; i < bAlignment.SupportData.NumCigarOperations; ++i) {\r
-\r
- // swap if necessary\r
- if ( IsBigEndian ) { SwapEndian_32(cigarData[i]); }\r
- \r
- // build CigarOp structure\r
- op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);\r
- op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];\r
-\r
- // save CigarOp\r
- bAlignment.CigarData.push_back(op);\r
- }\r
- \r
- \r
- // save query sequence\r
- bAlignment.QueryBases.clear();\r
- bAlignment.QueryBases.reserve(bAlignment.SupportData.QuerySequenceLength);\r
- for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) {\r
- char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];\r
- bAlignment.QueryBases.append(1, singleBase);\r
- }\r
- \r
- // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character\r
- bAlignment.Qualities.clear();\r
- bAlignment.Qualities.reserve(bAlignment.SupportData.QuerySequenceLength);\r
- for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) {\r
- char singleQuality = (char)(qualData[i]+33);\r
- bAlignment.Qualities.append(1, singleQuality);\r
- }\r
- \r
- // if QueryBases is empty (and this is a allowed case)\r
- if ( bAlignment.QueryBases.empty() ) \r
- bAlignment.AlignedBases = bAlignment.QueryBases;\r
- \r
- // if QueryBases contains data, then build AlignedBases using CIGAR data\r
- else {\r
- \r
- // resize AlignedBases\r
- bAlignment.AlignedBases.clear();\r
- bAlignment.AlignedBases.reserve(bAlignment.SupportData.QuerySequenceLength);\r
- \r
- // iterate over CigarOps\r
- int k = 0;\r
- vector<CigarOp>::const_iterator cigarIter = bAlignment.CigarData.begin();\r
- vector<CigarOp>::const_iterator cigarEnd = bAlignment.CigarData.end();\r
- for ( ; cigarIter != cigarEnd; ++cigarIter ) {\r
- \r
- const CigarOp& op = (*cigarIter);\r
- switch(op.Type) {\r
- \r
- case ('M') :\r
- case ('I') :\r
- bAlignment.AlignedBases.append(bAlignment.QueryBases.substr(k, op.Length)); // for 'M', 'I' - write bases\r
- // fall through\r
- \r
- case ('S') :\r
- k += op.Length; // for 'S' - soft clip, skip over query bases\r
- break;\r
- \r
- case ('D') :\r
- bAlignment.AlignedBases.append(op.Length, '-'); // for 'D' - write gap character\r
- break;\r
- \r
- case ('P') :\r
- bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character\r
- break;\r
- \r
- case ('N') :\r
- bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in original query sequence\r
- break;\r
- \r
- case ('H') :\r
- break; // for 'H' - hard clip, do nothing to AlignedBases, move to next op\r
- \r
- default:\r
- printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here\r
- exit(1);\r
- }\r
- }\r
- }\r
- \r
- // -----------------------\r
- // Added: 3-25-2010 DB\r
- // Fixed: endian-correctness for tag data\r
- // -----------------------\r
- if ( IsBigEndian ) {\r
- int i = 0;\r
- while ( (unsigned int)i < tagDataLength ) {\r
- \r
- i += 2; // skip tag type (e.g. "RG", "NM", etc)\r
- uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning \r
- ++i; // skip value type\r
- \r
- switch (type) {\r
- \r
- case('A') :\r
- case('C') : \r
- ++i;\r
- break;\r
-\r
- case('S') : \r
- SwapEndian_16p(&tagData[i]); \r
- i += sizeof(uint16_t);\r
- break;\r
- \r
- case('F') :\r
- case('I') : \r
- SwapEndian_32p(&tagData[i]);\r
- i += sizeof(uint32_t);\r
- break;\r
- \r
- case('D') : \r
- SwapEndian_64p(&tagData[i]);\r
- i += sizeof(uint64_t);\r
- break;\r
- \r
- case('H') :\r
- case('Z') : \r
- while (tagData[i]) { ++i; }\r
- ++i; // increment one more for null terminator\r
- break;\r
- \r
- default : \r
- printf("ERROR: Invalid tag value type\n"); // shouldn't get here\r
- exit(1);\r
- }\r
- }\r
- }\r
- \r
- // store TagData\r
- bAlignment.TagData.clear();\r
- bAlignment.TagData.resize(tagDataLength);\r
- memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLength);\r
- \r
- // clear the core-only flag\r
- bAlignment.SupportData.HasCoreOnly = false;\r
- \r
- // return success\r
- return true;\r
-}\r
-\r
-// clear index data structure\r
-void BamReader::BamReaderPrivate::ClearIndex(void) {\r
- delete NewIndex;\r
- NewIndex = 0;\r
-}\r
-\r
-// closes the BAM file\r
-void BamReader::BamReaderPrivate::Close(void) {\r
- \r
- // close BGZF file stream\r
- mBGZF.Close();\r
- \r
- // clear out index data\r
- ClearIndex();\r
- \r
- // clear out header data\r
- HeaderText.clear();\r
- \r
- // clear out region flags\r
- IsLeftBoundSpecified = false;\r
- IsRightBoundSpecified = false;\r
- IsRegionSpecified = false;\r
-}\r
-\r
-// create BAM index from BAM file (keep structure in memory) and write to default index output file\r
-bool BamReader::BamReaderPrivate::CreateIndex(bool useDefaultIndex) {\r
-\r
- // clear out prior index data\r
- ClearIndex();\r
- \r
- // create default index\r
- if ( useDefaultIndex )\r
- NewIndex = new BamDefaultIndex(&mBGZF, Parent, IsBigEndian);\r
- // create BamTools 'custom' index\r
- else\r
- NewIndex = new BamToolsIndex(&mBGZF, Parent, IsBigEndian);\r
- \r
- bool ok = true;\r
- ok &= NewIndex->Build();\r
- ok &= NewIndex->Write(Filename); \r
- \r
- // return success/fail\r
- return ok;\r
-}\r
-\r
-// get next alignment (from specified region, if given)\r
-bool BamReader::BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) {\r
-\r
- // if valid alignment found, attempt to parse char data, and return success/failure\r
- if ( GetNextAlignmentCore(bAlignment) )\r
- return BuildCharData(bAlignment);\r
- \r
- // no valid alignment found\r
- else\r
- return false;\r
-}\r
-\r
-// retrieves next available alignment core data (returns success/fail)\r
-// ** DOES NOT parse any character data (read name, bases, qualities, tag data)\r
-// these can be accessed, if necessary, from the supportData \r
-// useful for operations requiring ONLY positional or other alignment-related information\r
-bool BamReader::BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) {\r
-\r
- // if valid alignment available\r
- if ( LoadNextAlignment(bAlignment) ) {\r
-\r
- // set core-only flag\r
- bAlignment.SupportData.HasCoreOnly = true;\r
- \r
- // if region not specified, return success\r
- if ( !IsLeftBoundSpecified ) return true;\r
-\r
- // determine region state (before, within, after)\r
- BamReader::BamReaderPrivate::RegionState state = IsOverlap(bAlignment);\r
- \r
- // if alignment lies after region, return false\r
- if ( state == AFTER_REGION ) \r
- return false;\r
-\r
- while ( state != WITHIN_REGION ) {\r
- // if no valid alignment available (likely EOF) return failure\r
- if ( !LoadNextAlignment(bAlignment) ) return false;\r
- // if alignment lies after region, return false (no available read within region)\r
- state = IsOverlap(bAlignment);\r
- if ( state == AFTER_REGION) return false;\r
- \r
- }\r
-\r
- // return success (alignment found that overlaps region)\r
- return true;\r
- }\r
-\r
- // no valid alignment\r
- else\r
- return false;\r
-}\r
-\r
-// returns RefID for given RefName (returns References.size() if not found)\r
-int BamReader::BamReaderPrivate::GetReferenceID(const string& refName) const {\r
-\r
- // retrieve names from reference data\r
- vector<string> refNames;\r
- RefVector::const_iterator refIter = References.begin();\r
- RefVector::const_iterator refEnd = References.end();\r
- for ( ; refIter != refEnd; ++refIter) {\r
- refNames.push_back( (*refIter).RefName );\r
- }\r
-\r
- // return 'index-of' refName ( if not found, returns refNames.size() )\r
- return distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName));\r
-}\r
-\r
-// returns region state - whether alignment ends before, overlaps, or starts after currently specified region\r
-// this *internal* method should ONLY called when (at least) IsLeftBoundSpecified == true\r
-BamReader::BamReaderPrivate::RegionState BamReader::BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) {\r
- \r
- // --------------------------------------------------\r
- // check alignment start against right bound cutoff\r
- \r
- // if full region of interest was given\r
- if ( IsRightBoundSpecified ) {\r
- \r
- // read starts on right bound reference, but AFTER right bound position\r
- if ( bAlignment.RefID == Region.RightRefID && bAlignment.Position > Region.RightPosition )\r
- return AFTER_REGION;\r
- \r
- // if read starts on reference AFTER right bound, return false\r
- if ( bAlignment.RefID > Region.RightRefID ) \r
- return AFTER_REGION;\r
- }\r
- \r
- // --------------------------------------------------------\r
- // no right bound given OR read starts before right bound\r
- // so, check if it overlaps left bound \r
- \r
- // if read starts on left bound reference AND after left boundary, return success\r
- if ( bAlignment.RefID == Region.LeftRefID && bAlignment.Position >= Region.LeftPosition)\r
- return WITHIN_REGION;\r
- \r
- // if read is on any reference sequence before left bound, return false\r
- if ( bAlignment.RefID < Region.LeftRefID )\r
- return BEFORE_REGION;\r
-\r
- // --------------------------------------------------------\r
- // read is on left bound reference, but starts before left bound position\r
-\r
- // if it overlaps, return WITHIN_REGION\r
- if ( bAlignment.GetEndPosition() >= Region.LeftPosition )\r
- return WITHIN_REGION;\r
- // else begins before left bound position\r
- else\r
- return BEFORE_REGION;\r
-}\r
-\r
-// jumps to specified region(refID, leftBound) in BAM file, returns success/fail\r
-bool BamReader::BamReaderPrivate::Jump(int refID, int position) {\r
-\r
- // -----------------------------------------------------------------------\r
- // check for existing index \r
- if ( NewIndex == 0 ) return false; \r
- // see if reference has alignments\r
- if ( !NewIndex->HasAlignments(refID) ) return false; \r
- // make sure position is valid\r
- if ( position > References.at(refID).RefLength ) return false;\r
- \r
- // determine possible offsets\r
- vector<int64_t> offsets;\r
- if ( !NewIndex->GetOffsets(Region, IsRightBoundSpecified, offsets) ) {\r
- printf("ERROR: Could not jump: unable to calculate offset for specified region.\n");\r
- return false;\r
- }\r
- \r
- // iterate through offsets\r
- BamAlignment bAlignment;\r
- bool result = true;\r
- for ( vector<int64_t>::const_iterator o = offsets.begin(); o != offsets.end(); ++o) {\r
- \r
- // attempt seek & load first available alignment\r
- result &= mBGZF.Seek(*o);\r
- LoadNextAlignment(bAlignment);\r
- \r
- // if this alignment corresponds to desired position\r
- // return success of seeking back to 'current offset'\r
- if ( (bAlignment.RefID == refID && bAlignment.Position + bAlignment.Length > position) || (bAlignment.RefID > refID) ) {\r
- if ( o != offsets.begin() ) --o;\r
- return mBGZF.Seek(*o);\r
- }\r
- }\r
- \r
- return result;\r
-}\r
-\r
-// load BAM header data\r
-void BamReader::BamReaderPrivate::LoadHeaderData(void) {\r
-\r
- // check to see if proper BAM header\r
- char buffer[4];\r
- if (mBGZF.Read(buffer, 4) != 4) {\r
- printf("Could not read header type\n");\r
- exit(1);\r
- }\r
-\r
- if (strncmp(buffer, "BAM\001", 4)) {\r
- printf("wrong header type!\n");\r
- exit(1);\r
- }\r
-\r
- // get BAM header text length\r
- mBGZF.Read(buffer, 4);\r
- unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer);\r
- if ( IsBigEndian ) { SwapEndian_32(headerTextLength); }\r
- \r
- // get BAM header text\r
- char* headerText = (char*)calloc(headerTextLength + 1, 1);\r
- mBGZF.Read(headerText, headerTextLength);\r
- HeaderText = (string)((const char*)headerText);\r
-\r
- // clean up calloc-ed temp variable\r
- free(headerText);\r
-}\r
-\r
-// load existing index data from BAM index file (".bai"), return success/fail\r
-bool BamReader::BamReaderPrivate::LoadIndex(void) {\r
-\r
- // clear out any existing index data\r
- ClearIndex();\r
-\r
- // skip if index file empty\r
- if ( IndexFilename.empty() )\r
- return false;\r
-\r
- // check supplied filename for index type\r
- size_t defaultExtensionFound = IndexFilename.find(".bai");\r
- size_t customExtensionFound = IndexFilename.find(".bti");\r
- \r
- // if SAM/BAM default (".bai")\r
- if ( defaultExtensionFound != string::npos )\r
- NewIndex = new BamDefaultIndex(&mBGZF, Parent, IsBigEndian);\r
- \r
- // if BamTools custom index (".bti")\r
- else if ( customExtensionFound != string::npos )\r
- NewIndex = new BamToolsIndex(&mBGZF, Parent, IsBigEndian);\r
- \r
- // else unknown\r
- else {\r
- printf("ERROR: Unknown index file extension.\n");\r
- return false;\r
- }\r
- \r
- // return success of loading index data\r
- return NewIndex->Load(IndexFilename);\r
-}\r
-\r
-// populates BamAlignment with alignment data under file pointer, returns success/fail\r
-bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) {\r
-\r
- // read in the 'block length' value, make sure it's not zero\r
- char buffer[4];\r
- mBGZF.Read(buffer, 4);\r
- bAlignment.SupportData.BlockLength = BgzfData::UnpackUnsignedInt(buffer);\r
- if ( IsBigEndian ) { SwapEndian_32(bAlignment.SupportData.BlockLength); }\r
- if ( bAlignment.SupportData.BlockLength == 0 ) { return false; }\r
-\r
- // read in core alignment data, make sure the right size of data was read\r
- char x[BAM_CORE_SIZE];\r
- if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }\r
-\r
- if ( IsBigEndian ) {\r
- for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) ) { \r
- SwapEndian_32p(&x[i]); \r
- }\r
- }\r
- \r
- // set BamAlignment 'core' and 'support' data\r
- bAlignment.RefID = BgzfData::UnpackSignedInt(&x[0]); \r
- bAlignment.Position = BgzfData::UnpackSignedInt(&x[4]);\r
- \r
- unsigned int tempValue = BgzfData::UnpackUnsignedInt(&x[8]);\r
- bAlignment.Bin = tempValue >> 16;\r
- bAlignment.MapQuality = tempValue >> 8 & 0xff;\r
- bAlignment.SupportData.QueryNameLength = tempValue & 0xff;\r
-\r
- tempValue = BgzfData::UnpackUnsignedInt(&x[12]);\r
- bAlignment.AlignmentFlag = tempValue >> 16;\r
- bAlignment.SupportData.NumCigarOperations = tempValue & 0xffff;\r
-\r
- bAlignment.SupportData.QuerySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]);\r
- bAlignment.MateRefID = BgzfData::UnpackSignedInt(&x[20]);\r
- bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]);\r
- bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[28]);\r
- \r
- // set BamAlignment length\r
- bAlignment.Length = bAlignment.SupportData.QuerySequenceLength;\r
- \r
- // read in character data - make sure proper data size was read\r
- bool readCharDataOK = false;\r
- const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;\r
- char* allCharData = (char*)calloc(sizeof(char), dataLength);\r
- \r
- if ( mBGZF.Read(allCharData, dataLength) == (signed int)dataLength) { \r
- \r
- // store 'allCharData' in supportData structure\r
- bAlignment.SupportData.AllCharData.assign((const char*)allCharData, dataLength);\r
- \r
- // set success flag\r
- readCharDataOK = true;\r
- }\r
-\r
- free(allCharData);\r
- return readCharDataOK;\r
-}\r
-\r
-// loads reference data from BAM file\r
-void BamReader::BamReaderPrivate::LoadReferenceData(void) {\r
-\r
- // get number of reference sequences\r
- char buffer[4];\r
- mBGZF.Read(buffer, 4);\r
- unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer);\r
- if ( IsBigEndian ) { SwapEndian_32(numberRefSeqs); }\r
- if (numberRefSeqs == 0) { return; }\r
- References.reserve((int)numberRefSeqs);\r
-\r
- // iterate over all references in header\r
- for (unsigned int i = 0; i != numberRefSeqs; ++i) {\r
-\r
- // get length of reference name\r
- mBGZF.Read(buffer, 4);\r
- unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer);\r
- if ( IsBigEndian ) { SwapEndian_32(refNameLength); }\r
- char* refName = (char*)calloc(refNameLength, 1);\r
-\r
- // get reference name and reference sequence length\r
- mBGZF.Read(refName, refNameLength);\r
- mBGZF.Read(buffer, 4);\r
- int refLength = BgzfData::UnpackSignedInt(buffer);\r
- if ( IsBigEndian ) { SwapEndian_32(refLength); }\r
-\r
- // store data for reference\r
- RefData aReference;\r
- aReference.RefName = (string)((const char*)refName);\r
- aReference.RefLength = refLength;\r
- References.push_back(aReference);\r
-\r
- // clean up calloc-ed temp variable\r
- free(refName);\r
- }\r
-}\r
-\r
-// opens BAM file (and index)\r
-bool BamReader::BamReaderPrivate::Open(const string& filename, const string& indexFilename) {\r
-\r
- Filename = filename;\r
- IndexFilename = indexFilename;\r
-\r
- // open the BGZF file for reading, return false on failure\r
- if ( !mBGZF.Open(filename, "rb") ) \r
- return false;\r
- \r
- // retrieve header text & reference data\r
- LoadHeaderData();\r
- LoadReferenceData();\r
-\r
- // store file offset of first alignment\r
- AlignmentsBeginOffset = mBGZF.Tell();\r
-\r
- // open index file & load index data (if exists)\r
- if ( !IndexFilename.empty() )\r
- LoadIndex();\r
- \r
- // return success\r
- return true;\r
-}\r
-\r
-// returns BAM file pointer to beginning of alignment data\r
-bool BamReader::BamReaderPrivate::Rewind(void) {\r
- \r
- // rewind to first alignment\r
- if ( !mBGZF.Seek(AlignmentsBeginOffset) ) return false;\r
- \r
- // retrieve first alignment data\r
- BamAlignment al;\r
- if ( !LoadNextAlignment(al) ) return false;\r
- \r
- // reset default region info using first alignment in file\r
- Region.LeftRefID = al.RefID;\r
- Region.LeftPosition = al.Position;\r
- Region.RightRefID = -1;\r
- Region.RightPosition = -1;\r
- IsLeftBoundSpecified = false;\r
- IsRightBoundSpecified = false; \r
-\r
- // rewind back to before first alignment\r
- // return success/fail of seek\r
- return mBGZF.Seek(AlignmentsBeginOffset);\r
-}\r
-\r
-// sets a region of interest (with left & right bound reference/position)\r
-// attempts a Jump() to left bound as well\r
-// returns success/failure of Jump()\r
-bool BamReader::BamReaderPrivate::SetRegion(const BamRegion& region) {\r
- \r
- // save region of interest\r
- Region = region;\r
- \r
- // set flags\r
- if ( region.LeftRefID >= 0 && region.LeftPosition >= 0 ) \r
- IsLeftBoundSpecified = true;\r
- if ( region.RightRefID >= 0 && region.RightPosition >= 0 ) \r
- IsRightBoundSpecified = true;\r
- \r
- // attempt jump to beginning of region, return success/fail of Jump()\r
- return Jump( Region.LeftRefID, Region.LeftPosition );\r
-}\r
+++ /dev/null
-// ***************************************************************************\r
-// BamReader.h (c) 2009 Derek Barnett, Michael Str�mberg\r
-// Marth Lab, Department of Biology, Boston College\r
-// All rights reserved.\r
-// ---------------------------------------------------------------------------\r
-// Last modified: 9 July 2010 (DB)\r
-// ---------------------------------------------------------------------------\r
-// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
-// Institute.\r
-// ---------------------------------------------------------------------------\r
-// Provides the basic functionality for reading BAM files\r
-// ***************************************************************************\r
-\r
-#ifndef BAMREADER_H\r
-#define BAMREADER_H\r
-\r
-// C++ includes\r
-#include <string>\r
-\r
-// BamTools includes\r
-#include "BamAux.h"\r
-\r
-namespace BamTools {\r
- \r
-class BamReader {\r
-\r
- // constructor / destructor\r
- public:\r
- BamReader(void);\r
- ~BamReader(void);\r
-\r
- // public interface\r
- public:\r
-\r
- // ----------------------\r
- // BAM file operations\r
- // ----------------------\r
-\r
- // close BAM file\r
- void Close(void);\r
- // returns whether reader is open for reading or not\r
- bool IsOpen(void) const;\r
- // performs random-access jump to reference, position\r
- bool Jump(int refID, int position = 0);\r
- // opens BAM file (and optional BAM index file, if provided)\r
- bool Open(const std::string& filename, const std::string& indexFilename = "");\r
- // returns file pointer to beginning of alignments\r
- bool Rewind(void);\r
- // sets a region of interest (with left & right bound reference/position)\r
- // attempts a Jump() to left bound as well\r
- // returns success/failure of Jump()\r
- bool SetRegion(const BamRegion& region);\r
- bool SetRegion(const int& leftRefID, const int& leftBound, const int& rightRefID, const int& rightBound);\r
-\r
- // ----------------------\r
- // access alignment data\r
- // ----------------------\r
-\r
- // retrieves next available alignment (returns success/fail)\r
- bool GetNextAlignment(BamAlignment& bAlignment);\r
- \r
- // retrieves next available alignment core data (returns success/fail)\r
- // ** DOES NOT parse any character data (read name, bases, qualities, tag data)\r
- // these can be accessed, if necessary, from the supportData \r
- // useful for operations requiring ONLY positional or other alignment-related information\r
- bool GetNextAlignmentCore(BamAlignment& bAlignment);\r
-\r
- // ----------------------\r
- // access auxiliary data\r
- // ----------------------\r
-\r
- // returns SAM header text\r
- const std::string GetHeaderText(void) const;\r
- // returns number of reference sequences\r
- int GetReferenceCount(void) const;\r
- // returns vector of reference objects\r
- const BamTools::RefVector& GetReferenceData(void) const;\r
- // returns reference id (used for BamReader::Jump()) for the given reference name\r
- int GetReferenceID(const std::string& refName) const;\r
- // returns the name of the file associated with this BamReader\r
- const std::string GetFilename(void) const;\r
-\r
- // ----------------------\r
- // BAM index operations\r
- // ----------------------\r
-\r
- // creates index for BAM file, saves to file (default = bamFilename + ".bai")\r
- bool CreateIndex(bool useDefaultIndex = true);\r
- \r
- // private implementation\r
- private:\r
- struct BamReaderPrivate;\r
- BamReaderPrivate* d;\r
-};\r
-\r
-} // namespace BamTools\r
-\r
-#endif // BAMREADER_H\r
+++ /dev/null
-// ***************************************************************************\r
-// BamWriter.cpp (c) 2009 Michael Str�mberg, Derek Barnett\r
-// Marth Lab, Department of Biology, Boston College\r
-// All rights reserved.\r
-// ---------------------------------------------------------------------------\r
-// Last modified: 17 August 2010 (DB)\r
-// ---------------------------------------------------------------------------\r
-// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
-// Institute.\r
-// ---------------------------------------------------------------------------\r
-// Provides the basic functionality for producing BAM files\r
-// ***************************************************************************\r
-\r
-#include <iostream>\r
-\r
-#include "BGZF.h"\r
-#include "BamWriter.h"\r
-using namespace BamTools;\r
-using namespace std;\r
-\r
-struct BamWriter::BamWriterPrivate {\r
-\r
- // data members\r
- BgzfData mBGZF;\r
- bool IsBigEndian;\r
- \r
- // constructor / destructor\r
- BamWriterPrivate(void) { \r
- IsBigEndian = SystemIsBigEndian(); \r
- }\r
- \r
- ~BamWriterPrivate(void) {\r
- mBGZF.Close();\r
- }\r
-\r
- // "public" interface\r
- void Close(void);\r
- bool Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed);\r
- void SaveAlignment(const BamAlignment& al);\r
-\r
- // internal methods\r
- const unsigned int CalculateMinimumBin(const int begin, int end) const;\r
- void CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar);\r
- void EncodeQuerySequence(const string& query, string& encodedQuery);\r
-};\r
-\r
-// -----------------------------------------------------\r
-// BamWriter implementation\r
-// -----------------------------------------------------\r
-\r
-// constructor\r
-BamWriter::BamWriter(void) {\r
- d = new BamWriterPrivate;\r
-}\r
-\r
-// destructor\r
-BamWriter::~BamWriter(void) {\r
- delete d;\r
- d = 0;\r
-}\r
-\r
-// closes the alignment archive\r
-void BamWriter::Close(void) { \r
- d->Close(); \r
-}\r
-\r
-// opens the alignment archive\r
-bool BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed) {\r
- return d->Open(filename, samHeader, referenceSequences, isWriteUncompressed);\r
-}\r
-\r
-// saves the alignment to the alignment archive\r
-void BamWriter::SaveAlignment(const BamAlignment& al) { \r
- d->SaveAlignment(al);\r
-}\r
-\r
-// -----------------------------------------------------\r
-// BamWriterPrivate implementation\r
-// -----------------------------------------------------\r
-\r
-// closes the alignment archive\r
-void BamWriter::BamWriterPrivate::Close(void) {\r
- mBGZF.Close();\r
-}\r
-\r
-// calculates minimum bin for a BAM alignment interval\r
-const unsigned int BamWriter::BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const { \r
- --end;\r
- if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14);\r
- if( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17);\r
- if( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20);\r
- if( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23);\r
- if( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26);\r
- return 0;\r
-}\r
-\r
-// creates a cigar string from the supplied alignment\r
-void BamWriter::BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {\r
-\r
- // initialize\r
- const unsigned int numCigarOperations = cigarOperations.size();\r
- packedCigar.resize(numCigarOperations * BT_SIZEOF_INT);\r
-\r
- // pack the cigar data into the string\r
- unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();\r
-\r
- unsigned int cigarOp;\r
- vector<CigarOp>::const_iterator coIter;\r
- for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) {\r
-\r
- switch(coIter->Type) {\r
- case 'M':\r
- cigarOp = BAM_CMATCH;\r
- break;\r
- case 'I':\r
- cigarOp = BAM_CINS;\r
- break;\r
- case 'D':\r
- cigarOp = BAM_CDEL;\r
- break;\r
- case 'N':\r
- cigarOp = BAM_CREF_SKIP;\r
- break;\r
- case 'S':\r
- cigarOp = BAM_CSOFT_CLIP;\r
- break;\r
- case 'H':\r
- cigarOp = BAM_CHARD_CLIP;\r
- break;\r
- case 'P':\r
- cigarOp = BAM_CPAD;\r
- break;\r
- default:\r
- printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type);\r
- exit(1);\r
- }\r
-\r
- *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;\r
- pPackedCigar++;\r
- }\r
-}\r
-\r
-// encodes the supplied query sequence into 4-bit notation\r
-void BamWriter::BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {\r
-\r
- // prepare the encoded query string\r
- const unsigned int queryLen = query.size();\r
- const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);\r
- encodedQuery.resize(encodedQueryLen);\r
- char* pEncodedQuery = (char*)encodedQuery.data();\r
- const char* pQuery = (const char*)query.data();\r
-\r
- unsigned char nucleotideCode;\r
- bool useHighWord = true;\r
-\r
- while(*pQuery) {\r
-\r
- switch(*pQuery) {\r
- \r
- case '=':\r
- nucleotideCode = 0;\r
- break;\r
- \r
- case 'A':\r
- nucleotideCode = 1;\r
- break;\r
- \r
- case 'C':\r
- nucleotideCode = 2;\r
- break;\r
- \r
- case 'G':\r
- nucleotideCode = 4;\r
- break;\r
- \r
- case 'T':\r
- nucleotideCode = 8;\r
- break;\r
- \r
- case 'N':\r
- nucleotideCode = 15;\r
- break;\r
- \r
- default:\r
- printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);\r
- exit(1);\r
- }\r
-\r
- // pack the nucleotide code\r
- if(useHighWord) {\r
- *pEncodedQuery = nucleotideCode << 4;\r
- useHighWord = false;\r
- } else {\r
- *pEncodedQuery |= nucleotideCode;\r
- pEncodedQuery++;\r
- useHighWord = true;\r
- }\r
-\r
- // increment the query position\r
- pQuery++;\r
- }\r
-}\r
-\r
-// opens the alignment archive\r
-bool BamWriter::BamWriterPrivate::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed) {\r
-\r
- // open the BGZF file for writing, return failure if error\r
- if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) )\r
- return false;\r
-\r
- // ================\r
- // write the header\r
- // ================\r
-\r
- // write the BAM signature\r
- const unsigned char SIGNATURE_LENGTH = 4;\r
- const char* BAM_SIGNATURE = "BAM\1";\r
- mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH);\r
-\r
- // write the SAM header text length\r
- uint32_t samHeaderLen = samHeader.size();\r
- if (IsBigEndian) SwapEndian_32(samHeaderLen);\r
- mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT);\r
-\r
- // write the SAM header text\r
- if(samHeaderLen > 0) \r
- mBGZF.Write(samHeader.data(), samHeaderLen);\r
-\r
- // write the number of reference sequences\r
- uint32_t numReferenceSequences = referenceSequences.size();\r
- if (IsBigEndian) SwapEndian_32(numReferenceSequences);\r
- mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);\r
-\r
- // =============================\r
- // write the sequence dictionary\r
- // =============================\r
-\r
- RefVector::const_iterator rsIter;\r
- for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {\r
-\r
- // write the reference sequence name length\r
- uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;\r
- if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen);\r
- mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);\r
-\r
- // write the reference sequence name\r
- mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);\r
-\r
- // write the reference sequence length\r
- int32_t referenceLength = rsIter->RefLength;\r
- if (IsBigEndian) SwapEndian_32(referenceLength);\r
- mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);\r
- }\r
- \r
- // return success\r
- return true;\r
-}\r
-\r
-// saves the alignment to the alignment archive\r
-void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) {\r
-\r
- // if BamAlignment contains only the core data and a raw char data buffer\r
- // (as a result of BamReader::GetNextAlignmentCore())\r
- if ( al.SupportData.HasCoreOnly ) {\r
- \r
- // write the block size\r
- unsigned int blockSize = al.SupportData.BlockLength;\r
- if (IsBigEndian) SwapEndian_32(blockSize);\r
- mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
-\r
- // assign the BAM core data\r
- uint32_t buffer[8];\r
- buffer[0] = al.RefID;\r
- buffer[1] = al.Position;\r
- buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;\r
- buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;\r
- buffer[4] = al.SupportData.QuerySequenceLength;\r
- buffer[5] = al.MateRefID;\r
- buffer[6] = al.MatePosition;\r
- buffer[7] = al.InsertSize;\r
- \r
- // swap BAM core endian-ness, if necessary\r
- if ( IsBigEndian ) { \r
- for ( int i = 0; i < 8; ++i )\r
- SwapEndian_32(buffer[i]); \r
- }\r
- \r
- // write the BAM core\r
- mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
- \r
- // write the raw char data\r
- mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE); \r
- }\r
- \r
- // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc\r
- // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code )\r
- else {\r
- \r
- // calculate char lengths\r
- const unsigned int nameLength = al.Name.size() + 1;\r
- const unsigned int numCigarOperations = al.CigarData.size();\r
- const unsigned int queryLength = al.QueryBases.size();\r
- const unsigned int tagDataLength = al.TagData.size();\r
- \r
- // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)\r
- // force calculation of Bin before storing\r
- const int endPosition = al.GetEndPosition();\r
- const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);\r
- \r
- // create our packed cigar string\r
- string packedCigar;\r
- CreatePackedCigar(al.CigarData, packedCigar);\r
- const unsigned int packedCigarLength = packedCigar.size();\r
-\r
- // encode the query\r
- string encodedQuery;\r
- EncodeQuerySequence(al.QueryBases, encodedQuery);\r
- const unsigned int encodedQueryLength = encodedQuery.size(); \r
- \r
- // write the block size\r
- const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength;\r
- unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;\r
- if (IsBigEndian) SwapEndian_32(blockSize);\r
- mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
-\r
- // assign the BAM core data\r
- uint32_t buffer[8];\r
- buffer[0] = al.RefID;\r
- buffer[1] = al.Position;\r
- buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;\r
- buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;\r
- buffer[4] = queryLength;\r
- buffer[5] = al.MateRefID;\r
- buffer[6] = al.MatePosition;\r
- buffer[7] = al.InsertSize;\r
- \r
- // swap BAM core endian-ness, if necessary\r
- if ( IsBigEndian ) { \r
- for ( int i = 0; i < 8; ++i )\r
- SwapEndian_32(buffer[i]); \r
- }\r
- \r
- // write the BAM core\r
- mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
- \r
- // write the query name\r
- mBGZF.Write(al.Name.c_str(), nameLength);\r
-\r
- // write the packed cigar\r
- if ( IsBigEndian ) {\r
- \r
- char* cigarData = (char*)calloc(sizeof(char), packedCigarLength);\r
- memcpy(cigarData, packedCigar.data(), packedCigarLength);\r
- \r
- for (unsigned int i = 0; i < packedCigarLength; ++i) {\r
- if ( IsBigEndian )\r
- SwapEndian_32p(&cigarData[i]); \r
- }\r
- \r
- mBGZF.Write(cigarData, packedCigarLength);\r
- free(cigarData); \r
- } \r
- else \r
- mBGZF.Write(packedCigar.data(), packedCigarLength);\r
-\r
- // write the encoded query sequence\r
- mBGZF.Write(encodedQuery.data(), encodedQueryLength);\r
-\r
- // write the base qualities\r
- string baseQualities(al.Qualities);\r
- char* pBaseQualities = (char*)al.Qualities.data();\r
- for(unsigned int i = 0; i < queryLength; i++) { \r
- pBaseQualities[i] -= 33; \r
- }\r
- mBGZF.Write(pBaseQualities, queryLength);\r
-\r
- // write the read group tag\r
- if ( IsBigEndian ) {\r
- \r
- char* tagData = (char*)calloc(sizeof(char), tagDataLength);\r
- memcpy(tagData, al.TagData.data(), tagDataLength);\r
- \r
- int i = 0;\r
- while ( (unsigned int)i < tagDataLength ) {\r
- \r
- i += 2; // skip tag type (e.g. "RG", "NM", etc)\r
- uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning \r
- ++i; // skip value type\r
- \r
- switch (type) {\r
- \r
- case('A') :\r
- case('C') : \r
- ++i;\r
- break;\r
- \r
- case('S') : \r
- SwapEndian_16p(&tagData[i]); \r
- i+=2; // sizeof(uint16_t)\r
- break;\r
- \r
- case('F') :\r
- case('I') : \r
- SwapEndian_32p(&tagData[i]);\r
- i+=4; // sizeof(uint32_t)\r
- break;\r
- \r
- case('D') : \r
- SwapEndian_64p(&tagData[i]);\r
- i+=8; // sizeof(uint64_t)\r
- break;\r
- \r
- case('H') :\r
- case('Z') : \r
- while (tagData[i]) { ++i; }\r
- ++i; // increment one more for null terminator\r
- break;\r
- \r
- default : \r
- printf("ERROR: Invalid tag value type\n"); // shouldn't get here\r
- free(tagData);\r
- exit(1); \r
- }\r
- }\r
- \r
- mBGZF.Write(tagData, tagDataLength);\r
- free(tagData);\r
- } \r
- else \r
- mBGZF.Write(al.TagData.data(), tagDataLength); \r
- }\r
-}\r
+++ /dev/null
-// ***************************************************************************\r
-// BamWriter.h (c) 2009 Michael Str�mberg, Derek Barnett\r
-// Marth Lab, Department of Biology, Boston College\r
-// All rights reserved.\r
-// ---------------------------------------------------------------------------\r
-// Last modified: 17 August 2010 (DB)\r
-// ---------------------------------------------------------------------------\r
-// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
-// Institute.\r
-// ---------------------------------------------------------------------------\r
-// Provides the basic functionality for producing BAM files\r
-// ***************************************************************************\r
-\r
-#ifndef BAMWRITER_H\r
-#define BAMWRITER_H\r
-\r
-// C++ includes\r
-#include <string>\r
-\r
-// BamTools includes\r
-#include "BamAux.h"\r
-\r
-namespace BamTools {\r
-\r
-class BamWriter {\r
-\r
- // constructor/destructor\r
- public:\r
- BamWriter(void);\r
- ~BamWriter(void);\r
-\r
- // public interface\r
- public:\r
- // closes the alignment archive\r
- void Close(void);\r
- // opens the alignment archive\r
- bool Open(const std::string& filename, \r
- const std::string& samHeader, \r
- const BamTools::RefVector& referenceSequences, \r
- bool writeUncompressed = false);\r
- // saves the alignment to the alignment archive\r
- void SaveAlignment(const BamTools::BamAlignment& al);\r
-\r
- // private implementation\r
- private:\r
- struct BamWriterPrivate;\r
- BamWriterPrivate* d;\r
-};\r
-\r
-} // namespace BamTools\r
-\r
-#endif // BAMWRITER_H\r
-CXX= g++\r
-CXXFLAGS= -Wall -O3 -D_FILE_OFFSET_BITS=64\r
-PROG= bamtools\r
-API= BGZF.o \
- BamIndex.o \
- BamReader.o \
- BamWriter.o \
- BamMultiReader.o
-UTILS= bamtools_fasta.o \
- bamtools_options.o \
- bamtools_pileup.o \
- bamtools_utilities.o
-TOOLKIT= bamtools_convert.o \
- bamtools_count.o \
- bamtools_coverage.o \
- bamtools_filter.o \
- bamtools_header.o \
- bamtools_index.o \
- bamtools_merge.o \
- bamtools_random.o \
- bamtools_sort.o \
- bamtools_stats.o
-MAIN= bamtools.o
-OBJS= $(API) $(UTILS) $(TOOLKIT) $(MAIN)
-LIBS= -lz
-\r
-all: $(PROG)\r
-\r
-bamtools: $(OBJS)\r
- $(CXX) $(CXXFLAGS) -o $@ $(OBJS) $(LIBS)
-\r
-clean:\r
- rm -fr gmon.out *.o *.a a.out *~\r
+# ==========================
+# BamTools Makefile
+# (c) 2010 Derek Barnett
+# ==========================
+
+# define main directories
+# (exported so the recursive makes started by 'all' inherit them)
+export OBJ_DIR = obj
+export BIN_DIR = bin
+export SRC_DIR = src
+
+# define compile/link flags
+# NOTE: keep these lines free of trailing carriage returns; a stray CR would
+# become part of the exported value on make implementations that do not strip it
+export CXX = g++
+export CXXFLAGS = -Wall -O3 -D_FILE_OFFSET_BITS=64
+export LIBS = -lz
+
+# define current BamTools version
+export BAMTOOLS_VERSION = 0.7.0812
+
+# define source subdirectories (built in the order listed)
+SUBDIRS = $(SRC_DIR)/api \
+ $(SRC_DIR)/utils \
+ $(SRC_DIR)/toolkit
+
+# Default target: print a banner, then run make in each entry of SUBDIRS,
+# in order. The loop stops at the first sub-make that fails and passes its
+# exit status up, so a broken build is not reported as success.
+all:
+	@echo "Building BamTools:"
+	@echo "Version: $$BAMTOOLS_VERSION"
+	@echo "========================================================="
+
+	@for dir in $(SUBDIRS); do \
+		echo "- Building in $$dir"; \
+		$(MAKE) --no-print-directory -C $$dir || exit $$?; \
+		echo ""; \
+	done
+
+.PHONY: all
+
+# Deletes the files directly under $(OBJ_DIR) and $(BIN_DIR).
+# 'rm -f' is used without '-r', so subdirectories are not removed, and a
+# glob that matches nothing does not produce an error.
+clean:
+	@echo "Cleaning up."
+	@rm -f $(OBJ_DIR)/* $(BIN_DIR)/*
+
+.PHONY: clean
+++ /dev/null
-// ***************************************************************************
-// bamtools.cpp (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 22 July 2010
-// ---------------------------------------------------------------------------
-// Integrates a number of BamTools functionalities into a single executable.
-// ***************************************************************************
-
-// Std C/C++ includes
-#include <iostream>
-
-// BamTools includes
-#include "bamtools_convert.h"
-#include "bamtools_count.h"
-#include "bamtools_coverage.h"
-#include "bamtools_filter.h"
-#include "bamtools_header.h"
-#include "bamtools_index.h"
-#include "bamtools_merge.h"
-#include "bamtools_random.h"
-#include "bamtools_sort.h"
-#include "bamtools_stats.h"
-
-using namespace std;
-using namespace BamTools;
-
-// ------------------------------------------
-// bamtools subtool names
-static const string CONVERT = "convert";
-static const string COUNT = "count";
-static const string COVERAGE = "coverage";
-static const string FILTER = "filter";
-static const string HEADER = "header";
-static const string INDEX = "index";
-static const string MERGE = "merge";
-static const string RANDOM = "random";
-static const string SORT = "sort";
-static const string STATS = "stats";
-
-// ------------------------------------------
-// bamtools help/version names
-static const string HELP = "help";
-static const string LONG_HELP = "--help";
-static const string SHORT_HELP = "-h";
-
-static const string VERSION = "version";
-static const string LONG_VERSION = "--version";
-static const string SHORT_VERSION = "-v";
-
-// ------------------------------------------
-// Print help info
-int Help(int argc, char* argv[]) {
-
- // 'bamtools help COMMAND'
- if (argc > 2) {
-
- AbstractTool* tool(0);
- if ( argv[2] == CONVERT ) tool = new ConvertTool;
- if ( argv[2] == COUNT ) tool = new CountTool;
- if ( argv[2] == COVERAGE ) tool = new CoverageTool;
- if ( argv[2] == FILTER ) tool = new FilterTool;
- if ( argv[2] == HEADER ) tool = new HeaderTool;
- if ( argv[2] == INDEX ) tool = new IndexTool;
- if ( argv[2] == MERGE ) tool = new MergeTool;
- if ( argv[2] == RANDOM ) tool = new RandomTool;
- if ( argv[2] == SORT ) tool = new SortTool;
- if ( argv[2] == STATS ) tool = new StatsTool;
-
- // if tool known, print its help screen
- if ( tool ) return tool->Help();
- }
-
- // either 'bamtools help' or unrecognized argument after 'help'
- cerr << endl;
- cerr << "usage: bamtools [--help] COMMAND [ARGS]" << endl;
- cerr << endl;
- cerr << "Available bamtools commands:" << endl;
- cerr << "\tconvert Converts between BAM and a number of other formats" << endl;
- cerr << "\tcount Prints number of alignments in BAM file" << endl;
- cerr << "\tcoverage Prints coverage statistics from the input BAM file" << endl;
- cerr << "\tfilter Filters BAM file(s) by user-specified criteria" << endl;
- cerr << "\theader Prints BAM header information" << endl;
- cerr << "\tindex Generates index for BAM file" << endl;
- cerr << "\tmerge Merge multiple BAM files into single file" << endl;
- cerr << "\trandom Grab a random subset of alignments" << endl;
- cerr << "\tsort Sorts the BAM file according to some criteria" << endl;
- cerr << "\tstats Prints general alignment statistics" << endl;
- cerr << endl;
- cerr << "See 'bamtools help COMMAND' for more information on a specific command." << endl;
- cerr << endl;
- return 0;
-}
-
-// ------------------------------------------
-// Print version info
-int Version(void) {
- cout << endl;
- cout << "bamtools v0.8.xx" << endl;
- cout << "Part of BamTools API and toolkit" << endl;
- cout << "Primary authors: Derek Barnett, Erik Garrison, Michael Stromberg" << endl;
- cout << "(c) 2009-2010 Marth Lab, Biology Dept., Boston College" << endl;
- cout << endl;
- return 0;
-}
-
-// ------------------------------------------
-// toolkit entry point
-int main(int argc, char* argv[]) {
-
- // just 'bamtools'
- if ( (argc == 1) ) return Help(argc, argv);
-
- // 'bamtools help', 'bamtools --help', or 'bamtools -h'
- if ( (argv[1] == HELP) || (argv[1] == LONG_HELP) || (argv[1] == SHORT_HELP) ) return Help(argc, argv);
-
- // 'bamtools version', 'bamtools --version', or 'bamtools -v'
- if ( (argv[1] == VERSION) || (argv[1] == LONG_VERSION) || (argv[1] == SHORT_VERSION) ) return Version();
-
- // determine desired sub-tool
- AbstractTool* tool(0);
- if ( argv[1] == CONVERT ) tool = new ConvertTool;
- if ( argv[1] == COUNT ) tool = new CountTool;
- if ( argv[1] == COVERAGE ) tool = new CoverageTool;
- if ( argv[1] == FILTER ) tool = new FilterTool;
- if ( argv[1] == HEADER ) tool = new HeaderTool;
- if ( argv[1] == INDEX ) tool = new IndexTool;
- if ( argv[1] == MERGE ) tool = new MergeTool;
- if ( argv[1] == RANDOM ) tool = new RandomTool;
- if ( argv[1] == SORT ) tool = new SortTool;
- if ( argv[1] == STATS ) tool = new StatsTool;
-
- // if found, run tool
- if ( tool ) return tool->Run(argc, argv);
- // no match found, show help
- else return Help(argc, argv);
-}
+++ /dev/null
-// ***************************************************************************
-// bamtools_convert.cpp (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 22 July 2010
-// ---------------------------------------------------------------------------
-// Converts between BAM and a number of other formats
-// ***************************************************************************
-
-#include <fstream>
-#include <iostream>
-#include <sstream>
-#include <string>
-#include <vector>
-
-#include "bamtools_convert.h"
-#include "bamtools_options.h"
-#include "bamtools_pileup.h"
-#include "bamtools_utilities.h"
-#include "BGZF.h"
-#include "BamReader.h"
-#include "BamMultiReader.h"
-
-using namespace std;
-using namespace BamTools;
-
-namespace BamTools {
-
- // format names
- static const string FORMAT_BED = "bed";
- static const string FORMAT_BEDGRAPH = "bedgraph";
- static const string FORMAT_FASTA = "fasta";
- static const string FORMAT_FASTQ = "fastq";
- static const string FORMAT_JSON = "json";
- static const string FORMAT_SAM = "sam";
- static const string FORMAT_PILEUP = "pileup";
- static const string FORMAT_WIGGLE = "wig";
-
- // other constants
- static const unsigned int FASTA_LINE_MAX = 50;
-
-} // namespace BamTools
-
-struct ConvertTool::ConvertToolPrivate {
-
- // ctor & dtor
- public:
- ConvertToolPrivate(ConvertTool::ConvertSettings* settings);
- ~ConvertToolPrivate(void);
-
- // interface
- public:
- bool Run(void);
-
- // internal methods
- private:
- void PrintBed(const BamAlignment& a);
- void PrintBedGraph(const BamAlignment& a);
- void PrintFasta(const BamAlignment& a);
- void PrintFastq(const BamAlignment& a);
- void PrintJson(const BamAlignment& a);
- void PrintSam(const BamAlignment& a);
- void PrintWiggle(const BamAlignment& a);
-
- // data members
- private:
- ConvertTool::ConvertSettings* m_settings;
- RefVector m_references;
- ostream m_out;
-};
-
-// ---------------------------------------------
-// ConvertSettings implementation
-
-struct ConvertTool::ConvertSettings {
-
- // flags
- bool HasInput;
- bool HasOutput;
- bool HasFormat;
- bool HasRegion;
-
- // pileup flags
- bool HasFastaFilename;
- bool IsOmittingSamHeader;
- bool IsPrintingPileupMapQualities;
-
- // options
- vector<string> InputFiles;
- string OutputFilename;
- string Format;
- string Region;
-
- // pileup options
- string FastaFilename;
-
- // constructor
- ConvertSettings(void)
- : HasInput(false)
- , HasOutput(false)
- , HasFormat(false)
- , HasRegion(false)
- , HasFastaFilename(false)
- , IsOmittingSamHeader(false)
- , IsPrintingPileupMapQualities(false)
- , OutputFilename(Options::StandardOut())
- { }
-};
-
-// ---------------------------------------------
-// ConvertTool implementation
-
-ConvertTool::ConvertTool(void)
- : AbstractTool()
- , m_settings(new ConvertSettings)
- , m_impl(0)
-{
- // set program details
- Options::SetProgramInfo("bamtools convert", "converts BAM to a number of other formats", "-format <FORMAT> [-in <filename> -in <filename> ...] [-out <filename>] [other options]");
-
- // set up options
- OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
- Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn());
- Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
- Options::AddValueOption("-format", "FORMAT", "the output file format - see README for recognized formats", "", m_settings->HasFormat, m_settings->Format, IO_Opts);
-
- OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters");
- Options::AddValueOption("-region", "REGION", "genomic region. Index file is recommended for better performance, and is read automatically if it exists as <filename>.bai. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, FilterOpts);
-
- OptionGroup* PileupOpts = Options::CreateOptionGroup("Pileup Options");
- Options::AddValueOption("-fasta", "FASTA filename", "FASTA reference file", "", m_settings->HasFastaFilename, m_settings->FastaFilename, PileupOpts, "");
- Options::AddOption("-mapqual", "print the mapping qualities", m_settings->IsPrintingPileupMapQualities, PileupOpts);
-
- OptionGroup* SamOpts = Options::CreateOptionGroup("SAM Options");
- Options::AddOption("-noheader", "omit the SAM header from output", m_settings->IsOmittingSamHeader, SamOpts);
-}
-
-ConvertTool::~ConvertTool(void) {
- delete m_settings;
- m_settings = 0;
-
- delete m_impl;
- m_impl = 0;
-}
-
-int ConvertTool::Help(void) {
- Options::DisplayHelp();
- return 0;
-}
-
-int ConvertTool::Run(int argc, char* argv[]) {
-
- // parse command line arguments
- Options::Parse(argc, argv, 1);
-
- // run internal ConvertTool implementation, return success/fail
- m_impl = new ConvertToolPrivate(m_settings);
-
- if ( m_impl->Run() )
- return 0;
- else
- return 1;
-}
-
-// ---------------------------------------------
-// ConvertToolPrivate implementation
-
-ConvertTool::ConvertToolPrivate::ConvertToolPrivate(ConvertTool::ConvertSettings* settings)
- : m_settings(settings)
- , m_out(cout.rdbuf()) // default output to cout
-{ }
-
-ConvertTool::ConvertToolPrivate::~ConvertToolPrivate(void) { }
-
-bool ConvertTool::ConvertToolPrivate::Run(void) {
-
- bool convertedOk = true;
-
- // ------------------------------------
- // initialize conversion input/output
-
- // set to default input if none provided
- if ( !m_settings->HasInput )
- m_settings->InputFiles.push_back(Options::StandardIn());
-
- // open input files
- BamMultiReader reader;
- reader.Open(m_settings->InputFiles, false);
- m_references = reader.GetReferenceData();
-
- // set region if specified
- BamRegion region;
- if ( m_settings->HasRegion ) {
- if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
- if ( !reader.SetRegion(region) )
- cerr << "Could not set BamReader region to REGION: " << m_settings->Region << endl;
- }
- }
-
- // if output file given
- ofstream outFile;
- if ( m_settings->HasOutput ) {
-
- // open output file stream
- outFile.open(m_settings->OutputFilename.c_str());
- if ( !outFile ) {
- cerr << "Could not open " << m_settings->OutputFilename << " for output" << endl;
- return false;
- }
-
- // set m_out to file's streambuf
- m_out.rdbuf(outFile.rdbuf());
- }
-
- // ------------------------
- // pileup is special case
- if ( m_settings->Format == FORMAT_PILEUP ) {
-
- // initialize pileup input/output
- Pileup pileup(&reader, &m_out);
-
- // ---------------------------
- // configure pileup settings
-
- if ( m_settings->HasRegion )
- pileup.SetRegion(region);
-
- if ( m_settings->HasFastaFilename )
- pileup.SetFastaFilename(m_settings->FastaFilename);
-
- pileup.SetIsPrintingMapQualities( m_settings->IsPrintingPileupMapQualities );
-
- // run pileup
- convertedOk = pileup.Run();
- }
-
- // -------------------------------------
- // else determine 'simpler' format type
- else {
-
- bool formatError = false;
- void (BamTools::ConvertTool::ConvertToolPrivate::*pFunction)(const BamAlignment&) = 0;
- if ( m_settings->Format == FORMAT_BED ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBed;
- else if ( m_settings->Format == FORMAT_BEDGRAPH ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBedGraph;
- else if ( m_settings->Format == FORMAT_FASTA ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFasta;
- else if ( m_settings->Format == FORMAT_FASTQ ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFastq;
- else if ( m_settings->Format == FORMAT_JSON ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintJson;
- else if ( m_settings->Format == FORMAT_SAM ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintSam;
- else if ( m_settings->Format == FORMAT_WIGGLE ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintWiggle;
- else {
- cerr << "Unrecognized format: " << m_settings->Format << endl;
- cerr << "Please see help|README (?) for details on supported formats " << endl;
- formatError = true;
- convertedOk = false;
- }
-
- // if SAM format & not omitting header, print SAM header
- if ( (m_settings->Format == FORMAT_SAM) && !m_settings->IsOmittingSamHeader ) {
- string headerText = reader.GetHeaderText();
- m_out << headerText;
- }
-
- // ------------------------
- // do conversion
- if ( !formatError ) {
- BamAlignment a;
- while ( reader.GetNextAlignment(a) ) {
- (this->*pFunction)(a);
- }
- }
- }
-
- // ------------------------
- // clean up & exit
- reader.Close();
- if ( m_settings->HasOutput ) outFile.close();
- return convertedOk;
-}
-
-// ----------------------------------------------------------
-// Conversion/output methods
-// ----------------------------------------------------------
-
-void ConvertTool::ConvertToolPrivate::PrintBed(const BamAlignment& a) {
-
- // tab-delimited, 0-based half-open
- // (e.g. a 50-base read aligned to pos 10 could have BED coordinates (10, 60) instead of BAM coordinates (10, 59) )
- // <chromName> <chromStart> <chromEnd> <readName> <score> <strand>
-
- m_out << m_references.at(a.RefID).RefName << "\t"
- << a.Position << "\t"
- << a.GetEndPosition() + 1 << "\t"
- << a.Name << "\t"
- << a.MapQuality << "\t"
- << (a.IsReverseStrand() ? "-" : "+") << endl;
-}
-
-void ConvertTool::ConvertToolPrivate::PrintBedGraph(const BamAlignment& a) {
- ;
-}
-
-// print BamAlignment in FASTA format
-// N.B. - uses QueryBases NOT AlignedBases
-void ConvertTool::ConvertToolPrivate::PrintFasta(const BamAlignment& a) {
-
- // >BamAlignment.Name
- // BamAlignment.QueryBases (up to FASTA_LINE_MAX bases per line)
- // ...
-
- // print header
- m_out << "> " << a.Name << endl;
-
- // if sequence fits on single line
- if ( a.QueryBases.length() <= FASTA_LINE_MAX )
- m_out << a.QueryBases << endl;
-
- // else split over multiple lines
- else {
-
- size_t position = 0;
- size_t seqLength = a.QueryBases.length();
-
- // write subsequences to each line
- while ( position < (seqLength - FASTA_LINE_MAX) ) {
- m_out << a.QueryBases.substr(position, FASTA_LINE_MAX) << endl;
- position += FASTA_LINE_MAX;
- }
-
- // write final subsequence
- m_out << a.QueryBases.substr(position) << endl;
- }
-}
-
-// print BamAlignment in FASTQ format
-// N.B. - uses QueryBases NOT AlignedBases
-void ConvertTool::ConvertToolPrivate::PrintFastq(const BamAlignment& a) {
-
- // @BamAlignment.Name
- // BamAlignment.QueryBases
- // +
- // BamAlignment.Qualities
-
- m_out << "@" << a.Name << endl
- << a.QueryBases << endl
- << "+" << endl
- << a.Qualities << endl;
-}
-
-// print BamAlignment in JSON format
-void ConvertTool::ConvertToolPrivate::PrintJson(const BamAlignment& a) {
-
- // write name & alignment flag
- m_out << "{\"name\":\"" << a.Name << "\",\"alignmentFlag\":\"" << a.AlignmentFlag << "\",";
-
- // write reference name
- if ( (a.RefID >= 0) && (a.RefID < (int)m_references.size()) )
- m_out << "\"reference\":\"" << m_references[a.RefID].RefName << "\",";
-
- // write position & map quality
- m_out << "\"position\":" << a.Position+1 << ",\"mapQuality\":" << a.MapQuality << ",";
-
- // write CIGAR
- const vector<CigarOp>& cigarData = a.CigarData;
- if ( !cigarData.empty() ) {
- m_out << "\"cigar\":[";
- vector<CigarOp>::const_iterator cigarBegin = cigarData.begin();
- vector<CigarOp>::const_iterator cigarIter = cigarBegin;
- vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
- for ( ; cigarIter != cigarEnd; ++cigarIter ) {
- const CigarOp& op = (*cigarIter);
- if (cigarIter != cigarBegin) m_out << ",";
- m_out << "\"" << op.Length << op.Type << "\"";
- }
- m_out << "],";
- }
-
- // write mate reference name, mate position, & insert size
- if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size()) ) {
- m_out << "\"mate\":{"
- << "\"reference\":\"" << m_references[a.MateRefID].RefName << "\","
- << "\"position\":" << a.MatePosition+1
- << ",\"insertSize\":" << a.InsertSize << "},";
- }
-
- // write sequence
- if ( !a.QueryBases.empty() )
- m_out << "\"queryBases\":\"" << a.QueryBases << "\",";
-
- // write qualities
- if ( !a.Qualities.empty() ) {
- string::const_iterator s = a.Qualities.begin();
- m_out << "\"qualities\":[" << static_cast<short>(*s) - 33;
- ++s;
- for (; s != a.Qualities.end(); ++s) {
- m_out << "," << static_cast<short>(*s) - 33;
- }
- m_out << "],";
- }
-
- // write tag data
- const char* tagData = a.TagData.c_str();
- const size_t tagDataLength = a.TagData.length();
- size_t index = 0;
- if (index < tagDataLength) {
-
- m_out << "\"tags\":{";
-
- while ( index < tagDataLength ) {
-
- if (index > 0)
- m_out << ",";
-
- // write tag name
- m_out << "\"" << a.TagData.substr(index, 2) << "\":";
- index += 2;
-
- // get data type
- char type = a.TagData.at(index);
- ++index;
-
- switch (type) {
- case('A') :
- m_out << "\"" << tagData[index] << "\"";
- ++index;
- break;
-
- case('C') :
- m_out << (int)tagData[index];
- ++index;
- break;
-
- case('c') :
- m_out << (int)tagData[index];
- ++index;
- break;
-
- case('S') :
- m_out << BgzfData::UnpackUnsignedShort(&tagData[index]);
- index += 2;
- break;
-
- case('s') :
- m_out << BgzfData::UnpackSignedShort(&tagData[index]);
- index += 2;
- break;
-
- case('I') :
- m_out << BgzfData::UnpackUnsignedInt(&tagData[index]);
- index += 4;
- break;
-
- case('i') :
- m_out << BgzfData::UnpackSignedInt(&tagData[index]);
- index += 4;
- break;
-
- case('f') :
- m_out << BgzfData::UnpackFloat(&tagData[index]);
- index += 4;
- break;
-
- case('d') :
- m_out << BgzfData::UnpackDouble(&tagData[index]);
- index += 8;
- break;
-
- case('Z') :
- case('H') :
- m_out << "\"";
- while (tagData[index]) {
- m_out << tagData[index];
- ++index;
- }
- m_out << "\"";
- ++index;
- break;
- }
-
- if ( tagData[index] == '\0')
- break;
- }
-
- m_out << "}";
- }
-
- m_out << "}" << endl;
-
-}
-
-// print BamAlignment in SAM format
-void ConvertTool::ConvertToolPrivate::PrintSam(const BamAlignment& a) {
-
- // tab-delimited
- // <QNAME> <FLAG> <RNAME> <POS> <MAPQ> <CIGAR> <MRNM> <MPOS> <ISIZE> <SEQ> <QUAL> [ <TAG>:<VTYPE>:<VALUE> [...] ]
-
- // write name & alignment flag
- m_out << a.Name << "\t" << a.AlignmentFlag << "\t";
-
- // write reference name
- if ( (a.RefID >= 0) && (a.RefID < (int)m_references.size()) )
- m_out << m_references[a.RefID].RefName << "\t";
- else
- m_out << "*\t";
-
- // write position & map quality
- m_out << a.Position+1 << "\t" << a.MapQuality << "\t";
-
- // write CIGAR
- const vector<CigarOp>& cigarData = a.CigarData;
- if ( cigarData.empty() ) m_out << "*\t";
- else {
- vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
- vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
- for ( ; cigarIter != cigarEnd; ++cigarIter ) {
- const CigarOp& op = (*cigarIter);
- m_out << op.Length << op.Type;
- }
- m_out << "\t";
- }
-
- // write mate reference name, mate position, & insert size
- if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size()) ) {
- if ( a.MateRefID == a.RefID ) m_out << "=\t";
- else m_out << m_references[a.MateRefID].RefName << "\t";
- m_out << a.MatePosition+1 << "\t" << a.InsertSize << "\t";
- }
- else m_out << "*\t0\t0\t";
-
- // write sequence
- if ( a.QueryBases.empty() ) m_out << "*\t";
- else m_out << a.QueryBases << "\t";
-
- // write qualities
- if ( a.Qualities.empty() ) m_out << "*";
- else m_out << a.Qualities;
-
- // write tag data
- const char* tagData = a.TagData.c_str();
- const size_t tagDataLength = a.TagData.length();
-
- size_t index = 0;
- while ( index < tagDataLength ) {
-
- // write tag name
- string tagName = a.TagData.substr(index, 2);
- m_out << "\t" << tagName << ":";
- index += 2;
-
- // get data type
- char type = a.TagData.at(index);
- ++index;
- switch (type) {
- case('A') :
- m_out << "A:" << tagData[index];
- ++index;
- break;
-
- case('C') :
- m_out << "i:" << (int)tagData[index];
- ++index;
- break;
-
- case('c') :
- m_out << "i:" << (int)tagData[index];
- ++index;
- break;
-
- case('S') :
- m_out << "i:" << BgzfData::UnpackUnsignedShort(&tagData[index]);
- index += 2;
- break;
-
- case('s') :
- m_out << "i:" << BgzfData::UnpackSignedShort(&tagData[index]);
- index += 2;
- break;
-
- case('I') :
- m_out << "i:" << BgzfData::UnpackUnsignedInt(&tagData[index]);
- index += 4;
- break;
-
- case('i') :
- m_out << "i:" << BgzfData::UnpackSignedInt(&tagData[index]);
- index += 4;
- break;
-
- case('f') :
- m_out << "f:" << BgzfData::UnpackFloat(&tagData[index]);
- index += 4;
- break;
-
- case('d') :
- m_out << "d:" << BgzfData::UnpackDouble(&tagData[index]);
- index += 8;
- break;
-
- case('Z') :
- case('H') :
- m_out << type << ":";
- while (tagData[index]) {
- m_out << tagData[index];
- ++index;
- }
- ++index;
- break;
- }
-
- if ( tagData[index] == '\0')
- break;
- }
-
- m_out << endl;
-}
-
-void ConvertTool::ConvertToolPrivate::PrintWiggle(const BamAlignment& a) {
- ;
-}
+++ /dev/null
-// ***************************************************************************
-// bamtools_convert.h (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 9 July 2010
-// ---------------------------------------------------------------------------
-// Converts between BAM and a number of other formats
-// ***************************************************************************
-
-#ifndef BAMTOOLS_CONVERT_H
-#define BAMTOOLS_CONVERT_H
-
-#include "bamtools_tool.h"
-
-namespace BamTools {
-
-class ConvertTool : public AbstractTool {
-
- public:
- ConvertTool(void);
- ~ConvertTool(void);
-
- public:
- int Help(void);
- int Run(int argc, char* argv[]);
-
- private:
- struct ConvertSettings;
- ConvertSettings* m_settings;
-
- struct ConvertToolPrivate;
- ConvertToolPrivate* m_impl;
-};
-
-} // namespace BamTools
-
-#endif // BAMTOOLS_CONVERT_H
\ No newline at end of file
+++ /dev/null
-// ***************************************************************************
-// bamtools_count.cpp (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 2 June 2010
-// ---------------------------------------------------------------------------
-// Prints alignment count for BAM file
-//
-// ** Expand to multiple??
-//
-// ***************************************************************************
-
-#include <iostream>
-#include <sstream>
-#include <string>
-#include <vector>
-
-#include "bamtools_count.h"
-#include "bamtools_options.h"
-#include "bamtools_utilities.h"
-#include "BamReader.h"
-#include "BamMultiReader.h"
-
-using namespace std;
-using namespace BamTools;
-
-// ---------------------------------------------
-// CountSettings implementation
-
-struct CountTool::CountSettings {
-
- // flags
- bool HasInput;
- bool HasRegion;
-
- // filenames
- vector<string> InputFiles;
- string Region;
-
- // constructor
- CountSettings(void)
- : HasInput(false)
- , HasRegion(false)
- { }
-};
-
-// ---------------------------------------------
-// CountTool implementation
-
-CountTool::CountTool(void)
- : AbstractTool()
- , m_settings(new CountSettings)
-{
- // set program details
- Options::SetProgramInfo("bamtools count", "prints alignment counts for a BAM file", "-in <filename> [-region <REGION>]");
-
- // set up options
- OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
- Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts);
- //Options::AddValueOption("-index", "BAM index filename", "the BAM index file", "", m_settings->HasBamIndexFilename, m_settings->BamIndexFilename, IO_Opts);
-
- OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters");
- Options::AddValueOption("-region", "REGION", "genomic region. Index file is recommended for better performance, and is read automatically if it exists as <filename>.bai or <filename>.bti. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, FilterOpts);
-}
-
-CountTool::~CountTool(void) {
- delete m_settings;
- m_settings = 0;
-}
-
-int CountTool::Help(void) {
- Options::DisplayHelp();
- return 0;
-}
-
-int CountTool::Run(int argc, char* argv[]) {
-
- // parse command line arguments
- Options::Parse(argc, argv, 1);
-
- if ( !m_settings->HasInput )
- m_settings->InputFiles.push_back(Options::StandardIn());
-
- BamMultiReader reader;
- reader.Open(m_settings->InputFiles, false, true);
-
- // alignment counter
- int alignmentCount(0);
-
- // set up error handling
- ostringstream errorStream("");
- bool foundError(false);
-
- // if no region specified, count entire file
- if ( !m_settings->HasRegion ) {
- BamAlignment al;
- while ( reader.GetNextAlignmentCore(al) )
- ++alignmentCount;
- }
-
- // more complicated - region specified
- else {
-
- BamRegion region;
- if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
-
- // check if there are index files *.bai/*.bti corresponding to the input files
- bool hasDefaultIndex = false;
- bool hasBamtoolsIndex = false;
- bool hasNoIndex = false;
- int defaultIndexCount = 0;
- int bamtoolsIndexCount = 0;
- for (vector<string>::const_iterator f = m_settings->InputFiles.begin(); f != m_settings->InputFiles.end(); ++f) {
-
- if ( Utilities::FileExists(*f + ".bai") ) {
- hasDefaultIndex = true;
- ++defaultIndexCount;
- }
-
- if ( Utilities::FileExists(*f + ".bti") ) {
- hasBamtoolsIndex = true;
- ++bamtoolsIndexCount;
- }
-
- if ( !hasDefaultIndex && !hasBamtoolsIndex ) {
- hasNoIndex = true;
- cerr << "*WARNING - could not find index file for " << *f
- << ", parsing whole file(s) to get alignment counts for target region"
- << " (could be slow)" << endl;
- break;
- }
- }
-
- // determine if index file types are heterogeneous
- bool hasDifferentIndexTypes = false;
- if ( defaultIndexCount > 0 && bamtoolsIndexCount > 0 ) {
- hasDifferentIndexTypes = true;
- cerr << "*WARNING - different index file formats found"
- << ", parsing whole file(s) to get alignment counts for target region"
- << " (could be slow)" << endl;
- }
-
- // if any input file has no index, or if input files use different index formats
- // can't use BamMultiReader to jump directly (**for now**)
- if ( hasNoIndex || hasDifferentIndexTypes ) {
-
- // read through sequentially, counting all overlapping reads
- BamAlignment al;
- while( reader.GetNextAlignmentCore(al) ) {
- if ( (al.RefID >= region.LeftRefID) && ( (al.Position + al.Length) >= region.LeftPosition ) &&
- (al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) )
- {
- ++alignmentCount;
- }
- }
- }
-
- // has index file for each input file (and same format)
- else {
-
- // this is kind of a hack...?
- BamMultiReader reader;
- reader.Open(m_settings->InputFiles, true, true, hasDefaultIndex );
-
- if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) {
- foundError = true;
- errorStream << "Could not set BamReader region to REGION: " << m_settings->Region << endl;
- } else {
- BamAlignment al;
- while ( reader.GetNextAlignmentCore(al) )
- ++alignmentCount;
- }
- }
-
- } else {
- foundError = true;
- errorStream << "Could not parse REGION: " << m_settings->Region << endl;
- errorStream << "Be sure REGION is in valid format (see README) and that coordinates are valid for selected references" << endl;
- }
- }
-
- // print errors OR results
- if ( foundError )
- cerr << errorStream.str() << endl;
- else
- cout << alignmentCount << endl;
-
- // clean & exit
- reader.Close();
- return (int)foundError;
-}
+++ /dev/null
-// ***************************************************************************
-// bamtools_count.h (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 1 June 2010
-// ---------------------------------------------------------------------------
-// Prints alignment count for BAM file
-//
-// ** Expand to multiple??
-//
-// ***************************************************************************
-
-#ifndef BAMTOOLS_COUNT_H
-#define BAMTOOLS_COUNT_H
-
-#include "bamtools_tool.h"
-
-namespace BamTools {
-
-class CountTool : public AbstractTool {
-
- public:
- CountTool(void);
- ~CountTool(void);
-
- public:
- int Help(void);
- int Run(int argc, char* argv[]);
-
- private:
- struct CountSettings;
- CountSettings* m_settings;
-};
-
-} // namespace BamTools
-
-#endif // BAMTOOLS_COUNT_H
+++ /dev/null
-// ***************************************************************************
-// bamtools_coverage.cpp (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 1 June 2010
-// ---------------------------------------------------------------------------
-// Prints coverage statistics for a single BAM file
-//
-// ** Expand to multiple??
-//
-// ***************************************************************************
-
-#include <iostream>
-#include <string>
-#include <vector>
-
-#include "bamtools_coverage.h"
-#include "bamtools_options.h"
-#include "BamReader.h"
-
-using namespace std;
-using namespace BamTools;
-
-// ---------------------------------------------
-// CoverageSettings implementation
-
-struct CoverageTool::CoverageSettings {
-
- // flags
- bool HasInputBamFilename;
-
- // filenames
- std::string InputBamFilename;
-
- // constructor
- CoverageSettings(void)
- : HasInputBamFilename(false)
- , InputBamFilename(Options::StandardIn())
- { }
-};
-
-// ---------------------------------------------
-// CoverageTool implementation
-
-CoverageTool::CoverageTool(void)
- : AbstractTool()
- , m_settings(new CoverageSettings)
-{
- // set program details
- Options::SetProgramInfo("bamtools coverage", "prints coverage stats for a BAM file", "-in <filename> ");
-
- // set up options
- OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
- Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputBamFilename, m_settings->InputBamFilename, IO_Opts, Options::StandardIn());
-}
-
-CoverageTool::~CoverageTool(void) {
- delete m_settings;
- m_settings = 0;
-}
-
-int CoverageTool::Help(void) {
- Options::DisplayHelp();
- return 0;
-}
-
-int CoverageTool::Run(int argc, char* argv[]) {
-
- // parse command line arguments
- Options::Parse(argc, argv, 1);
-
- //open our BAM reader
- BamReader reader;
- reader.Open(m_settings->InputBamFilename);
-
- // generate coverage stats
- cerr << "Generating coverage stats for " << m_settings->InputBamFilename << endl;
- cerr << "FEATURE NOT YET IMPLEMENTED!" << endl;
-
- // clean & exit
- reader.Close();
- return 0;
-}
\ No newline at end of file
+++ /dev/null
-// ***************************************************************************
-// bamtools_coverage.h (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 1 June 2010
-// ---------------------------------------------------------------------------
-// Prints coverage statistics for a single BAM file
-//
-// ** Expand to multiple??
-//
-// ***************************************************************************
-
-#ifndef BAMTOOLS_COVERAGE_H
-#define BAMTOOLS_COVERAGE_H
-
-#include "bamtools_tool.h"
-
-namespace BamTools {
-
-class CoverageTool : public AbstractTool {
-
- public:
- CoverageTool(void);
- ~CoverageTool(void);
-
- public:
- int Help(void);
- int Run(int argc, char* argv[]);
-
- private:
- struct CoverageSettings;
- CoverageSettings* m_settings;
-};
-
-} // namespace BamTools
-
-#endif // BAMTOOLS_COVERAGE_H
+++ /dev/null
-// ***************************************************************************
-// bamtools_fasta.cpp (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 13 July 2010
-// ---------------------------------------------------------------------------
-// Provides FASTA reading/indexing functionality.
-// ***************************************************************************
-
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <fstream>
-#include <iostream>
-#include <sstream>
-#include <vector>
-#include "bamtools_fasta.h"
-using namespace std;
-using namespace BamTools;
-
-struct Fasta::FastaPrivate {
-
- struct FastaIndexData {
- string Name;
- int32_t Length;
- int64_t Offset;
- int32_t LineLength;
- int32_t ByteLength; // LineLength + newline character(s) - varies on OS where file was generated
- };
-
- // data members
- FILE* Stream;
- bool IsOpen;
-
- FILE* IndexStream;
- bool HasIndex;
- bool IsIndexOpen;
-
- vector<FastaIndexData> Index;
-
- // ctor
- FastaPrivate(void);
- ~FastaPrivate(void);
-
- // 'public' API methods
- bool Close(void);
- bool CreateIndex(const string& indexFilename);
- bool GetBase(const int& refId, const int& position, char& base);
- bool GetSequence(const int& refId, const int& start, const int& stop, string& sequence);
- bool Open(const string& filename, const string& indexFilename);
-
- // internal methods
- private:
- void Chomp(char* sequence);
- bool GetNameFromHeader(const string& header, string& name);
- bool GetNextHeader(string& header);
- bool GetNextSequence(string& sequence);
- bool LoadIndexData(void);
- bool Rewind(void);
- bool WriteIndexData(void);
-};
-
-Fasta::FastaPrivate::FastaPrivate(void)
- : IsOpen(false)
- , HasIndex(false)
- , IsIndexOpen(false)
-{ }
-
-Fasta::FastaPrivate::~FastaPrivate(void) {
- Close();
-}
-
-// remove any trailing newlines
-void Fasta::FastaPrivate::Chomp(char* sequence) {
-
- static const int CHAR_LF = 10;
- static const int CHAR_CR = 13;
-
- size_t seqLength = strlen(sequence);
- if ( seqLength == 0 ) return;
- --seqLength; // ignore null terminator
-
- while ( sequence[seqLength] == CHAR_LF ||
- sequence[seqLength] == CHAR_CR
- )
- {
- sequence[seqLength] = 0;
- --seqLength;
- if (seqLength < 0)
- break;
- }
-}
-
-bool Fasta::FastaPrivate::Close(void) {
-
- // close fasta file
- if ( IsOpen ) {
- fclose(Stream);
- IsOpen = false;
- }
-
- // close index file
- if ( HasIndex && IsIndexOpen ) {
- fclose(IndexStream);
- HasIndex = false;
- IsIndexOpen = false;
- }
-
- // return success
- return true;
-}
-
-bool Fasta::FastaPrivate::CreateIndex(const string& indexFilename) {
-
- // check that file is open
- if ( !IsOpen ) {
- cerr << "FASTA error : cannot create index, FASTA file not open" << endl;
- return false;
- }
-
- // rewind FASTA file
- if ( !Rewind() ) {
- cerr << "FASTA error : could not rewind FASTA file" << endl;
- return false;
- }
-
- // clear out prior index data
- Index.clear();
-
- // -------------------------------------------
- // calculate lineLength & byteLength
-
- int lineLength = 0;
- int byteLength = 0;
-
- // skip over header
- char buffer[1024];
- if ( fgets(buffer, 1024, Stream) == 0 ) {
- cerr << "FASTA error : could not read from file" << endl;
- return false;
- }
- if ( feof(Stream) ) return false;
- if ( buffer[0] != '>' ) {
- cerr << "FASTA error : expected header ('>'), instead : " << buffer[0] << endl;
- return false;
- }
-
- // read in first line of sequence
- char c = fgetc(Stream);
- while ( (c >= 0) && (c != '\n') ) {
- ++byteLength;
- if (isgraph(c)) ++lineLength;
- c = fgetc(Stream);
- }
- ++byteLength; // store newline
-
- // rewind FASTA file
- if ( !Rewind() ) {
- cerr << "FASTA error : could not rewind FASTA file" << endl;
- return false;
- }
-
- // iterate through fasta entries
- int currentId = 0;
- string header = "";
- string sequence = "";
- while ( GetNextHeader(header) ) {
-
- // ---------------------------
- // build index entry data
- FastaIndexData data;
-
- // store file offset of beginning of DNA sequence (after header)
- data.Offset = ftello(Stream);
-
- // parse header, store sequence name in data.Name
- if ( !GetNameFromHeader(header, data.Name) ) {
- cerr << "FASTA error : could not parse read name from FASTA header" << endl;
- return false;
- }
-
- // retrieve FASTA sequence
- if ( !GetNextSequence(sequence) ) {
- cerr << "FASTA error : could not read in next sequence from FASTA file" << endl;
- return false;
- }
-
- // store sequence length & line/byte lengths
- data.Length = sequence.length();
- data.LineLength = lineLength;
- data.ByteLength = byteLength;
-
- // store index entry
- Index.push_back(data);
-
- // update ref Id
- ++currentId;
- }
-
- // open index file
- if ( !indexFilename.empty() ) {
- IndexStream = fopen(indexFilename.c_str(), "wb");
- if ( !IndexStream ) {
- cerr << "FASTA error : Could not open " << indexFilename << " for writing." << endl;
- return false;
- }
- IsIndexOpen = true;
- }
-
- // write index data
- if ( !WriteIndexData() ) return false;
- HasIndex = true;
-
- // close index file
- fclose(IndexStream);
- IsIndexOpen = false;
-
- // return succes status
- return true;
-}
-
-bool Fasta::FastaPrivate::GetBase(const int& refId, const int& position, char& base) {
-
- // make sure FASTA file is open
- if ( !IsOpen ) {
- cerr << "FASTA error : file not open for reading" << endl;
- return false;
- }
-
- // use index if available
- if ( HasIndex && !Index.empty() ) {
-
- // validate reference id
- if ( (refId < 0) || (refId >= (int)Index.size()) ) {
- cerr << "FASTA error: invalid refId specified: " << refId << endl;
- return false;
- }
-
- // retrieve reference index data
- const FastaIndexData& referenceData = Index.at(refId);
-
- // validate position
- if ( (position < 0) || (position > referenceData.Length) ) {
- cerr << "FASTA error: invalid position specified: " << position << endl;
- return false;
- }
-
- // seek to beginning of sequence data
- if ( fseeko(Stream, referenceData.Offset, SEEK_SET) != 0 ) {
- cerr << "FASTA error : could not sek in file" << endl;
- return false;
- }
-
- // retrieve sequence
- string sequence = "";
- if ( !GetNextSequence(sequence) ) {
- cerr << "FASTA error : could not retrieve base from FASTA file" << endl;
- return false;
- }
-
- // set base & return success
- base = sequence.at(position);
- return true;
- }
-
- // else plow through sequentially
- else {
-
- // rewind FASTA file
- if ( !Rewind() ) {
- cerr << "FASTA error : could not rewind FASTA file" << endl;
- return false;
- }
-
- // iterate through fasta entries
- int currentId = 0;
- string header = "";
- string sequence = "";
-
- // get first entry
- GetNextHeader(header);
- GetNextSequence(sequence);
-
- while ( currentId != refId ) {
- GetNextHeader(header);
- GetNextSequence(sequence);
- ++currentId;
- }
-
- // get desired base from sequence
- // TODO: error reporting on invalid position
- if ( currentId == refId && (sequence.length() >= (size_t)position) ) {
- base = sequence.at(position);
- return true;
- }
-
- // could not get sequence
- return false;
- }
-
- // return success
- return true;
-}
-
-bool Fasta::FastaPrivate::GetNameFromHeader(const string& header, string& name) {
-
- // get rid of the leading greater than sign
- string s = header.substr(1);
-
- // extract the first non-whitespace segment
- char* pName = (char*)s.data();
- unsigned int nameLen = (unsigned int)s.size();
-
- unsigned int start = 0;
- while ( (pName[start] == 32) || (pName[start] == 9) || (pName[start] == 10) || (pName[start] == 13) ) {
- start++;
- if ( start == nameLen )
- break;
- }
-
- unsigned int stop = start;
- if ( stop < nameLen ) {
- while( (pName[stop] != 32) && (pName[stop] != 9) && (pName[stop] != 10) && (pName[stop] != 13) ) {
- stop++;
- if ( stop == nameLen )
- break;
- }
- }
-
- if ( start == stop ) {
- cerr << "FASTA error : could not parse read name from FASTA header" << endl;
- return false;
- }
-
- name = s.substr(start, stop - start).c_str();
- return true;
-}
-
-bool Fasta::FastaPrivate::GetNextHeader(string& header) {
-
- // validate input stream
- if ( !IsOpen || feof(Stream) )
- return false;
-
- // read in header line
- char buffer[1024];
- if ( fgets(buffer, 1024, Stream) == 0 ) {
- cerr << "FASTA error : could not read from file" << endl;
- return false;
- }
-
- // make sure it's a FASTA header
- if ( buffer[0] != '>' ) {
- cerr << "FASTA error : expected header ('>'), instead : " << buffer[0] << endl;
- return false;
- }
-
- // import buffer contents to header string
- stringstream headerBuffer("");
- headerBuffer << buffer;
- header = headerBuffer.str();
-
- // return success
- return true;
-}
-
-bool Fasta::FastaPrivate::GetNextSequence(string& sequence) {
-
- // validate input stream
- if ( !IsOpen || feof(Stream) )
- return false;
-
- // read in sequence
- char buffer[1024];
- ostringstream seqBuffer("");
- while(true) {
-
- char ch = fgetc(Stream);
- ungetc(ch, Stream);
- if( (ch == '>') || feof(Stream) )
- break;
-
- if ( fgets(buffer, 1024, Stream) == 0 ) {
- cerr << "FASTA error : could not read from file" << endl;
- return false;
- }
-
- Chomp(buffer);
- seqBuffer << buffer;
- }
-
- // import buffer contents to sequence string
- sequence = seqBuffer.str();
-
- // return success
- return true;
-}
-
-bool Fasta::FastaPrivate::GetSequence(const int& refId, const int& start, const int& stop, string& sequence) {
-
- // make sure FASTA file is open
- if ( !IsOpen ) {
- cerr << "FASTA error : file not open for reading" << endl;
- return false;
- }
-
- // use index if available
- if ( HasIndex && !Index.empty() ) {
-
- // validate reference id
- if ( (refId < 0) || (refId >= (int)Index.size()) ) {
- cerr << "FASTA error: invalid refId specified: " << refId << endl;
- return false;
- }
-
- // retrieve reference index data
- const FastaIndexData& referenceData = Index.at(refId);
-
- // validate stop position
- if ( (start < 0) || (start > stop) || (stop > referenceData.Length) ) {
- cerr << "FASTA error: invalid start/stop positions specified: " << start << ", " << stop << endl;
- return false;
- }
-
- // seek to beginning of sequence data
- if ( fseeko(Stream, referenceData.Offset, SEEK_SET) != 0 ) {
- cerr << "FASTA error : could not sek in file" << endl;
- return false;
- }
-
- // retrieve full sequence
- string fullSequence = "";
- if ( !GetNextSequence(fullSequence) ) {
- cerr << "FASTA error : could not retrieve sequence from FASTA file" << endl;
- return false;
- }
-
- // set sub-sequence & return success
- const int seqLength = (stop - start) + 1;
- sequence = fullSequence.substr(start, seqLength);
- return true;
- }
-
- // else plow through sequentially
- else {
-
- // rewind FASTA file
- if ( !Rewind() ) {
- cerr << "FASTA error : could not rewind FASTA file" << endl;
- return false;
- }
-
- // iterate through fasta entries
- int currentId = 0;
- string header = "";
- string fullSequence = "";
-
- // get first entry
- GetNextHeader(header);
- GetNextSequence(fullSequence);
-
- while ( currentId != refId ) {
- GetNextHeader(header);
- GetNextSequence(fullSequence);
- ++currentId;
- }
-
- // get desired substring from sequence
- // TODO: error reporting on invalid start/stop positions
- if ( currentId == refId && (fullSequence.length() >= (size_t)stop) ) {
- const int seqLength = (stop - start) + 1;
- sequence = fullSequence.substr(start, seqLength);
- return true;
- }
-
- // could not get sequence
- return false;
- }
-
- // return success
- return true;
-}
-
-bool Fasta::FastaPrivate::LoadIndexData(void) {
-
- // skip if no index file available
- if ( !IsIndexOpen ) return false;
-
- // clear any prior index data
- Index.clear();
-
- char buffer[1024];
- stringstream indexBuffer;
- while ( true ) {
-
- char c = fgetc(IndexStream);
- if ( (c == '\n') || feof(IndexStream) ) break;
- ungetc(c, IndexStream);
-
- // clear index buffer
- indexBuffer.str("");
-
- // read line from index file
- if ( fgets(buffer, 1024, IndexStream) == 0 ) {
- cerr << "FASTA LoadIndexData() error : could not read from index file" << endl;
- HasIndex = false;
- return false;
- }
-
- // store line in indexBuffer
- indexBuffer << buffer;
-
- // retrieve fasta index data from line
- FastaIndexData data;
- indexBuffer >> data.Name;
- indexBuffer >> data.Length;
- indexBuffer >> data.Offset;
- indexBuffer >> data.LineLength;
- indexBuffer >> data.ByteLength;
-
- // store index entry
- Index.push_back(data);
- }
-
- return true;
-}
-
-bool Fasta::FastaPrivate::Open(const string& filename, const string& indexFilename) {
-
- bool success = true;
-
- // open FASTA filename
- Stream = fopen(filename.c_str(), "rb");
- if ( !Stream ) {
- cerr << "FASTA error: Could not open " << filename << " for reading" << endl;
- return false;
- }
- IsOpen = true;
- success &= IsOpen;
-
- // open index file if it exists
- if ( !indexFilename.empty() ) {
- IndexStream = fopen(indexFilename.c_str(), "rb");
- if ( !IndexStream ) {
- cerr << "FASTA error : Could not open " << indexFilename << " for reading." << endl;
- return false;
- }
- IsIndexOpen = true;
- success &= IsIndexOpen;
-
- // attempt to load index data
- HasIndex = LoadIndexData();
- success &= HasIndex;
- }
-
- // return success status
- return success;
-}
-
-bool Fasta::FastaPrivate::Rewind(void) {
- if ( !IsOpen ) return false;
- return ( fseeko(Stream, 0, SEEK_SET) == 0 );
-}
-
-bool Fasta::FastaPrivate::WriteIndexData(void) {
-
- // skip if no index file available
- if ( !IsIndexOpen ) return false;
-
- // iterate over index entries
- bool success = true;
- stringstream indexBuffer;
- vector<FastaIndexData>::const_iterator indexIter = Index.begin();
- vector<FastaIndexData>::const_iterator indexEnd = Index.end();
- for ( ; indexIter != indexEnd; ++indexIter ) {
-
- // clear stream
- indexBuffer.str("");
-
- // write data to stream
- const FastaIndexData& data = (*indexIter);
- indexBuffer << data.Name << "\t"
- << data.Length << "\t"
- << data.Offset << "\t"
- << data.LineLength << "\t"
- << data.ByteLength << endl;
-
- // write stream to file
- success &= ( fputs(indexBuffer.str().c_str(), IndexStream) >= 0 );
- }
-
- // return success status
- return success;
-}
-
-// --------------------------------
-// Fasta implementation
-
-Fasta::Fasta(void) {
- d = new FastaPrivate;
-}
-
-Fasta::~Fasta(void) {
- delete d;
- d = 0;
-}
-
-bool Fasta::Close(void) {
- return d->Close();
-}
-
-bool Fasta::CreateIndex(const string& indexFilename) {
- return d->CreateIndex(indexFilename);
-}
-
-bool Fasta::GetBase(const int& refId, const int& position, char& base) {
- return d->GetBase(refId, position, base);
-}
-
-bool Fasta::GetSequence(const int& refId, const int& start, const int& stop, string& sequence) {
- return d->GetSequence(refId, start, stop, sequence);
-}
-
-bool Fasta::Open(const string& filename, const string& indexFilename) {
- return d->Open(filename, indexFilename);
-}
+++ /dev/null
-// ***************************************************************************
-// bamtools_fasta.h (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 13 July 2010
-// ---------------------------------------------------------------------------
-// Provides FASTA reading/indexing functionality.
-// ***************************************************************************
-
-#ifndef BAMTOOLS_FASTA_H
-#define BAMTOOLS_FASTA_H
-
-#include <string>
-
-namespace BamTools {
-
-class Fasta {
-
- // ctor & dtor
- public:
- Fasta(void);
- ~Fasta(void);
-
- // file-handling methods
- public:
- bool Close(void);
- bool Open(const std::string& filename, const std::string& indexFilename = "");
-
- // sequence access methods
- public:
- bool GetBase(const int& refID, const int& position, char& base);
- bool GetSequence(const int& refId, const int& start, const int& stop, std::string& sequence);
-
- // index-handling methods
- public:
- bool CreateIndex(const std::string& indexFilename);
-
- // internal implementation
- private:
- struct FastaPrivate;
- FastaPrivate* d;
-};
-
-} // BAMTOOLS_FASTA_H
-
-#endif // BAMTOOLS_FASTA_H
\ No newline at end of file
+++ /dev/null
-// ***************************************************************************
-// bamtools_filter.cpp (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 1 June 2010
-// ---------------------------------------------------------------------------
-// Filters a single BAM file (or filters multiple BAM files and merges)
-// according to some user-specified criteria.
-// ***************************************************************************
-
-#include <iostream>
-#include <string>
-#include <vector>
-
-#include "bamtools_filter.h"
-#include "bamtools_options.h"
-#include "BamReader.h"
-#include "BamMultiReader.h"
-
-using namespace std;
-using namespace BamTools;
-
-// ---------------------------------------------
-// FilterSettings implementation
-
-struct FilterTool::FilterSettings {
-
- // flags
- bool HasInputBamFilename;
- bool HasOutputBamFilename;
-
- // filenames
- vector<string> InputFiles;
- string OutputFilename;
-
- // constructor
- FilterSettings(void)
- : HasInputBamFilename(false)
- , HasOutputBamFilename(false)
- , OutputFilename(Options::StandardOut())
- { }
-};
-
-// ---------------------------------------------
-// FilterTool implementation
-
-FilterTool::FilterTool(void)
- : AbstractTool()
- , m_settings(new FilterSettings)
-{
- // set program details
- Options::SetProgramInfo("bamtools filter", "filters BAM file(s)", "-in <filename> [-in <filename> ... ] -out <filename> ");
-
- // set up options
- OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
- Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInputBamFilename, m_settings->InputFiles, IO_Opts, Options::StandardIn());
- Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
-}
-
-FilterTool::~FilterTool(void) {
- delete m_settings;
- m_settings = 0;
-}
-
-int FilterTool::Help(void) {
- Options::DisplayHelp();
- return 0;
-}
-
-int FilterTool::Run(int argc, char* argv[]) {
-
- // parse command line arguments
- Options::Parse(argc, argv, 1);
-
- // set to default input if none provided
- if ( !m_settings->HasInputBamFilename )
- m_settings->InputFiles.push_back(Options::StandardIn());
-
- // open files
- BamMultiReader reader;
- reader.Open(m_settings->InputFiles, false);
-
- // do filtering
-
- // clean up & exit
- reader.Close();
- return 0;
-}
\ No newline at end of file
+++ /dev/null
-// ***************************************************************************
-// bamtools_filter.h (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 2 June 2010
-// ---------------------------------------------------------------------------
-// Filters a single BAM file (or filters multiple BAM files and merges)
-// according to some user-specified criteria.
-// ***************************************************************************
-
-#ifndef BAMTOOLS_FILTER_H
-#define BAMTOOLS_FILTER_H
-
-#include "bamtools_tool.h"
-
-namespace BamTools {
-
-class FilterTool : public AbstractTool {
-
- public:
- FilterTool(void);
- ~FilterTool(void);
-
- public:
- int Help(void);
- int Run(int argc, char* argv[]);
-
- private:
- struct FilterSettings;
- FilterSettings* m_settings;
-};
-
-} // namespace BamTools
-
-#endif // BAMTOOLS_FILTER_H
+++ /dev/null
-// ***************************************************************************
-// bamtools_header.cpp (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 1 June 2010
-// ---------------------------------------------------------------------------
-// Prints the SAM-style header from a single BAM file ( or merged header from
-// multiple BAM files) to stdout
-// ***************************************************************************
-
-#include <iostream>
-#include <string>
-#include <vector>
-
-#include "bamtools_header.h"
-#include "bamtools_options.h"
-#include "BamReader.h"
-#include "BamMultiReader.h"
-
-using namespace std;
-using namespace BamTools;
-
-// ---------------------------------------------
-// HeaderSettings implementation
-
-struct HeaderTool::HeaderSettings {
-
- // flags
- bool HasInputBamFilename;
-
- // filenames
- vector<string> InputFiles;
-
- // constructor
- HeaderSettings(void)
- : HasInputBamFilename(false)
- { }
-};
-
-// ---------------------------------------------
-// HeaderTool implementation
-
-HeaderTool::HeaderTool(void)
- : AbstractTool()
- , m_settings(new HeaderSettings)
-{
- // set program details
- Options::SetProgramInfo("bamtools header", "prints header from BAM file(s)", "-in <filename> [-in <filename> ... ] ");
-
- // set up options
- OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
- Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInputBamFilename, m_settings->InputFiles, IO_Opts, Options::StandardIn());
-}
-
-HeaderTool::~HeaderTool(void) {
- delete m_settings;
- m_settings = 0;
-}
-
-int HeaderTool::Help(void) {
- Options::DisplayHelp();
- return 0;
-}
-
-int HeaderTool::Run(int argc, char* argv[]) {
-
- // parse command line arguments
- Options::Parse(argc, argv, 1);
-
- // set to default input if none provided
- if ( !m_settings->HasInputBamFilename )
- m_settings->InputFiles.push_back(Options::StandardIn());
-
- // open files
- BamMultiReader reader;
- if ( reader.Open(m_settings->InputFiles, false) ) {
- // dump header contents to stdout
- cout << reader.GetHeaderText() << endl;
- }
-
- // clean up & exit
- reader.Close();
- return 0;
-}
\ No newline at end of file
+++ /dev/null
-// ***************************************************************************
-// bamtools_header.h (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 1 June 2010
-// ---------------------------------------------------------------------------
-// Prints the SAM-style header from a single BAM file ( or merged header from
-// multiple BAM files) to stdout
-// ***************************************************************************
-
-#ifndef BAMTOOLS_HEADER_H
-#define BAMTOOLS_HEADER_H
-
-#include "bamtools_tool.h"
-
-namespace BamTools {
-
-class HeaderTool : public AbstractTool {
-
- public:
- HeaderTool(void);
- ~HeaderTool(void);
-
- public:
- int Help(void);
- int Run(int argc, char* argv[]);
-
- private:
- struct HeaderSettings;
- HeaderSettings* m_settings;
-};
-
-} // namespace BamTools
-
-#endif // BAMTOOLS_HEADER_H
+++ /dev/null
-// ***************************************************************************
-// bamtools_index.cpp (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 7 July 2010
-// ---------------------------------------------------------------------------
-// Creates a BAM index (".bai") file for the provided BAM file.
-// ***************************************************************************
-
-#include <iostream>
-#include <string>
-
-#include "bamtools_index.h"
-#include "bamtools_options.h"
-#include "BamReader.h"
-
-using namespace std;
-using namespace BamTools;
-
-// ---------------------------------------------
-// IndexSettings implementation
-
-struct IndexTool::IndexSettings {
-
- // flags
- bool HasInputBamFilename;
- bool IsUsingBamtoolsIndex;
-
- // filenames
- string InputBamFilename;
-
- // constructor
- IndexSettings(void)
- : HasInputBamFilename(false)
- , IsUsingBamtoolsIndex(false)
- , InputBamFilename(Options::StandardIn())
- { }
-};
-
-// ---------------------------------------------
-// IndexTool implementation
-
-IndexTool::IndexTool(void)
- : AbstractTool()
- , m_settings(new IndexSettings)
-{
- // set program details
- Options::SetProgramInfo("bamtools index", "creates index for BAM file", "[-in <filename>] [-bti]");
-
- // set up options
- OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
- Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputBamFilename, m_settings->InputBamFilename, IO_Opts, Options::StandardIn());
- Options::AddOption("-bti", "use (non-standard) BamTools indexing scheme", m_settings->IsUsingBamtoolsIndex, IO_Opts);
-}
-
-IndexTool::~IndexTool(void) {
- delete m_settings;
- m_settings = 0;
-}
-
-int IndexTool::Help(void) {
- Options::DisplayHelp();
- return 0;
-}
-
-int IndexTool::Run(int argc, char* argv[]) {
-
- // parse command line arguments
- Options::Parse(argc, argv, 1);
-
- // open our BAM reader
- BamReader reader;
- reader.Open(m_settings->InputBamFilename);
-
- // create index for BAM file
- bool useDefaultIndex = !m_settings->IsUsingBamtoolsIndex;
- reader.CreateIndex(useDefaultIndex);
-
- // clean & exit
- reader.Close();
- return 0;
-}
+++ /dev/null
-// ***************************************************************************
-// bamtools_index.h (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 1 June 2010
-// ---------------------------------------------------------------------------
-// Creates a BAM index (".bai") file for the provided BAM file
-// ***************************************************************************
-
-#ifndef BAMTOOLS_INDEX_H
-#define BAMTOOLS_INDEX_H
-
-#include "bamtools_tool.h"
-
-namespace BamTools {
-
-class IndexTool : public AbstractTool {
-
- public:
- IndexTool(void);
- ~IndexTool(void);
-
- public:
- int Help(void);
- int Run(int argc, char* argv[]);
-
- private:
- struct IndexSettings;
- IndexSettings* m_settings;
-};
-
-} // namespace BamTools
-
-#endif // BAMTOOLS_INDEX_H
+++ /dev/null
-// ***************************************************************************
-// bamtools_merge.cpp (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 2 June 2010
-// ---------------------------------------------------------------------------
-// Merges multiple BAM files into one.
-//
-// ** Provide selectable region? eg chr2:10000..20000
-//
-// ***************************************************************************
-
-#include <iostream>
-#include <string>
-#include <vector>
-
-#include "bamtools_merge.h"
-#include "bamtools_options.h"
-#include "bamtools_utilities.h"
-#include "BamMultiReader.h"
-#include "BamWriter.h"
-
-using namespace std;
-using namespace BamTools;
-
-// ---------------------------------------------
-// MergeSettings implementation
-
-struct MergeTool::MergeSettings {
-
- // flags
- bool HasInputBamFilename;
- bool HasOutputBamFilename;
-// bool HasRegion;
-
- // filenames
- vector<string> InputFiles;
-
- // other parameters
- string OutputFilename;
-// string Region;
-
- // constructor
- MergeSettings(void)
- : HasInputBamFilename(false)
- , HasOutputBamFilename(false)
-// , HasRegion(false)
- , OutputFilename(Options::StandardOut())
- { }
-};
-
-// ---------------------------------------------
-// MergeTool implementation
-
-MergeTool::MergeTool(void)
- : AbstractTool()
- , m_settings(new MergeSettings)
-{
- // set program details
- Options::SetProgramInfo("bamtools merge", "merges multiple BAM files into one", "[-in <filename> -in <filename> ...] [-out <filename>]");
-
- // set up options
- OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
- Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInputBamFilename, m_settings->InputFiles, IO_Opts);
- Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts);
-
-// OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters");
-// Options::AddValueOption("-region", "REGION", "genomic region. See README for more details", "", m_settings->HasRegion, m_settings->Region, FilterOpts);
-}
-
-MergeTool::~MergeTool(void) {
- delete m_settings;
- m_settings = 0;
-}
-
-int MergeTool::Help(void) {
- Options::DisplayHelp();
- return 0;
-}
-
-int MergeTool::Run(int argc, char* argv[]) {
-
- // parse command line arguments
- Options::Parse(argc, argv, 1);
-
- // set to default input if none provided
- if ( !m_settings->HasInputBamFilename ) m_settings->InputFiles.push_back(Options::StandardIn());
-
- // opens the BAM files without checking for indexes
- BamMultiReader reader;
- reader.Open(m_settings->InputFiles, false, true);
-
- // retrieve header & reference dictionary info
- std::string mergedHeader = reader.GetHeaderText();
- RefVector references = reader.GetReferenceData();
-
- // open BamWriter
- BamWriter writer;
- writer.Open(m_settings->OutputFilename, mergedHeader, references);
-
- // store alignments to output file
- BamAlignment bAlignment;
- while (reader.GetNextAlignmentCore(bAlignment)) {
- writer.SaveAlignment(bAlignment);
- }
-
- // clean & exit
- reader.Close();
- writer.Close();
- return 0;
-}
+++ /dev/null
-// ***************************************************************************
-// bamtools_merge.h (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 1 June 2010
-// ---------------------------------------------------------------------------
-// Merges multiple BAM files into one
-// ***************************************************************************
-
-#ifndef BAMTOOLS_MERGE_H
-#define BAMTOOLS_MERGE_H
-
-#include "bamtools_tool.h"
-
-namespace BamTools {
-
-class MergeTool : public AbstractTool {
-
- public:
- MergeTool(void);
- ~MergeTool(void);
-
- public:
- int Help(void);
- int Run(int argc, char* argv[]);
-
- private:
- struct MergeSettings;
- MergeSettings* m_settings;
-};
-
-} // namespace BamTools
-
-#endif // BAMTOOLS_MERGE_H
+++ /dev/null
-// ***************************************************************************
-// bamtools_options.cpp (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 2 June 2010
-// ---------------------------------------------------------------------------
-// Parses command line arguments and creates a help menu
-// ---------------------------------------------------------------------------
-// Modified from:
-// The Mosaik suite's command line parser class: COptions
-// (c) 2006 - 2009 Michael Str�mberg
-// Marth Lab, Department of Biology, Boston College
-// Dual licenced under the GNU General Public License 2.0+ license or as
-// a commercial license with the Marth Lab.
-//
-// * Modified slightly to fit BamTools, otherwise code is same.
-// * (BamTools namespace, added stdin/stdout) (DB)
-// ***************************************************************************
-
-#include "bamtools_options.h"
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <iomanip>
-#include <sstream>
-using namespace std;
-using namespace BamTools;
-
-string Options::m_programName; // the program name
-string Options::m_description; // the main description
-string Options::m_exampleArguments; // the example arguments
-vector<OptionGroup> Options::m_optionGroups; // stores the option groups
-map<string, OptionValue> Options::m_optionsMap; // stores the options in a map
-string Options::m_stdin = "stdin"; // string representation of stdin
-string Options::m_stdout = "stdout"; // string representation of stdout
-
-// adds a simple option to the parser
-void Options::AddOption(const string& argument, const string& optionDescription, bool& foundArgument, OptionGroup* group) {
-
- Option o;
- o.Argument = argument;
- o.Description = optionDescription;
- o.StoreValue = false;
- group->Options.push_back(o);
-
- OptionValue ov;
- ov.pFoundArgument = &foundArgument;
- ov.StoreValue = false;
-
- m_optionsMap[argument] = ov;
-}
-
-// creates an option group
-OptionGroup* Options::CreateOptionGroup(const string& groupName) {
- OptionGroup og;
- og.Name = groupName;
- m_optionGroups.push_back(og);
- return &m_optionGroups[m_optionGroups.size() - 1];
-}
-
-// displays the help menu
-void Options::DisplayHelp(void) {
-
- // initialize
- char argumentBuffer[ARGUMENT_LENGTH + 1];
- ostringstream sb;
-
- char indentBuffer[MAX_LINE_LENGTH - DESC_LENGTH + 1];
- memset(indentBuffer, ' ', MAX_LINE_LENGTH - DESC_LENGTH);
- indentBuffer[MAX_LINE_LENGTH - DESC_LENGTH] = 0;
-
- // display the menu
- printf("Description: %s.\n\n", m_description.c_str());
- printf("Usage: ");
- printf("%s", m_programName.c_str());
- printf(" %s\n\n", m_exampleArguments.c_str());
-
- vector<Option>::const_iterator optionIter;
- vector<OptionGroup>::const_iterator groupIter;
- for (groupIter = m_optionGroups.begin(); groupIter != m_optionGroups.end(); ++groupIter) {
-
- printf("%s:\n", groupIter->Name.c_str());
-
- for (optionIter = groupIter->Options.begin(); optionIter != groupIter->Options.end(); ++optionIter) {
-
- if (optionIter->StoreValue)
- snprintf(argumentBuffer, ARGUMENT_LENGTH + 1, " %s <%s>", optionIter->Argument.c_str(), optionIter->ValueDescription.c_str());
- else
- snprintf(argumentBuffer, ARGUMENT_LENGTH + 1, " %s", optionIter->Argument.c_str());
- printf("%-35s ", argumentBuffer);
-
- string description = optionIter->Description;
-
- // handle default values
- if (optionIter->HasDefaultValue) {
-
- sb.str("");
- sb << description << " [";
-
- if (optionIter->DefaultValue.is_type<unsigned int>()) {
- sb << (unsigned int)optionIter->DefaultValue;
- } else if (optionIter->DefaultValue.is_type<unsigned char>()) {
- sb << (unsigned short)(unsigned char)optionIter->DefaultValue;
- } else if (optionIter->DefaultValue.is_type<float>()) {
- sb << std::fixed << std::setprecision(2) << (float)optionIter->DefaultValue;
- } else if (optionIter->DefaultValue.is_type<double>()) {
- sb << std::fixed << std::setprecision(4) << (double)optionIter->DefaultValue;
- } else if (optionIter->DefaultValue.is_type<std::string>()) {
- const std::string stringValue = optionIter->DefaultValue;
- sb << stringValue;
- } else {
- printf("ERROR: Found an unsupported data type for argument %s when casting the default value.\n", optionIter->Argument.c_str());
- exit(1);
- }
-
- sb << "]";
- description = sb.str();
- }
-
- if ( description.size() <= DESC_LENGTH_FIRST_ROW ) {
- printf("%s\n", description.c_str());
- } else {
-
- // handle the first row
- const char* pDescription = description.data();
- unsigned int cutIndex = DESC_LENGTH_FIRST_ROW;
- while(pDescription[cutIndex] != ' ')
- cutIndex--;
- printf("%s\n", description.substr(0, cutIndex).c_str());
- description = description.substr(cutIndex + 1);
-
- // handle subsequent rows
- while(description.size() > DESC_LENGTH) {
- pDescription = description.data();
- cutIndex = DESC_LENGTH;
- while(pDescription[cutIndex] != ' ')
- cutIndex--;
- printf("%s%s\n", indentBuffer, description.substr(0, cutIndex).c_str());
- description = description.substr(cutIndex + 1);
- }
-
- // handle last row
- printf("%s%s\n", indentBuffer, description.c_str());
- }
- }
-
- printf("\n");
- }
-
- printf("Help:\n");
- printf(" --help, -h shows this help text\n");
- exit(1);
-}
-
-// parses the command line
-void Options::Parse(int argc, char* argv[], int offset) {
-
- // initialize
- map<string, OptionValue>::const_iterator ovMapIter;
- map<string, OptionValue>::const_iterator checkMapIter;
- const int LAST_INDEX = argc - 1;
- ostringstream errorBuilder;
- bool foundError = false;
- char* end_ptr = NULL;
- const string ERROR_SPACER(7, ' ');
-
- // check if we should show the help menu
- bool showHelpMenu = false;
- if (argc > 1) {
- for (int i = 1; i < argc; i++) {
- const std::string argument = argv[i];
- if ( (argument == "-h") || (argument == "--help") || (argument == "help") )
- showHelpMenu = true;
- }
- } else showHelpMenu = true;
-
- if (showHelpMenu)
- DisplayHelp();
-
- // check each argument
- for (int i = offset+1; i < argc; i++) {
-
- const string argument = argv[i];
- ovMapIter = m_optionsMap.find(argument);
-
- if (ovMapIter == m_optionsMap.end()) {
- errorBuilder << ERROR_SPACER << "An unrecognized argument was found: " << argument << std::endl;
- foundError = true;
- } else {
-
- *ovMapIter->second.pFoundArgument = true;
-
- // grab the value
- if (ovMapIter->second.StoreValue) {
-
- if (i < LAST_INDEX) {
-
- // check if the next argument is really a command line option
- const string val = argv[i + 1];
- checkMapIter = m_optionsMap.find(val);
-
- if (checkMapIter == m_optionsMap.end()) {
-
- ++i;
-
- if (ovMapIter->second.VariantValue.is_type<unsigned int>()) {
- const unsigned int uint32 = (unsigned int)strtoul(val.c_str(), &end_ptr, 10);
- unsigned int* varValue = (unsigned int*)ovMapIter->second.pValue;
- *varValue = uint32;
- } else if (ovMapIter->second.VariantValue.is_type<unsigned char>()) {
- const unsigned char uint8 = (unsigned char)strtoul(val.c_str(), &end_ptr, 10);
- unsigned char* varValue = (unsigned char*)ovMapIter->second.pValue;
- *varValue = uint8;
- } else if (ovMapIter->second.VariantValue.is_type<uint64_t>()) {
- const uint64_t uint64 = strtoui64(val.c_str(), &end_ptr, 10);
- uint64_t* varValue = (uint64_t*)ovMapIter->second.pValue;
- *varValue = uint64;
- } else if (ovMapIter->second.VariantValue.is_type<double>()) {
- const double d = strtod(val.c_str(), &end_ptr);
- double* varValue = (double*)ovMapIter->second.pValue;
- *varValue = d;
- } else if (ovMapIter->second.VariantValue.is_type<float>()) {
- const float f = (float)strtod(val.c_str(), &end_ptr);
- float* varValue = (float*)ovMapIter->second.pValue;
- *varValue = f;
- } else if (ovMapIter->second.VariantValue.is_type<string>()) {
- string* pStringValue = (string*)ovMapIter->second.pValue;
- *pStringValue = val;
- } else if (ovMapIter->second.VariantValue.is_type<vector<string> >()) {
- vector<string>* pVectorValue = (vector<string>*)ovMapIter->second.pValue;
- pVectorValue->push_back(val);
- } else {
- printf("ERROR: Found an unsupported data type for argument %s when parsing the arguments.\n", argument.c_str());
- exit(1);
- }
- } else {
- errorBuilder << ERROR_SPACER << "The argument (" << argument << ") expects a value, but none was found." << endl;
- foundError = true;
- }
- } else {
- errorBuilder << ERROR_SPACER << "The argument (" << argument << ") expects a value, but none was found." << endl;
- foundError = true;
- }
- }
- }
- }
-
- // check if we missed any required parameters
- for (ovMapIter = m_optionsMap.begin(); ovMapIter != m_optionsMap.end(); ++ovMapIter) {
- if (ovMapIter->second.IsRequired && !*ovMapIter->second.pFoundArgument) {
- errorBuilder << ERROR_SPACER << ovMapIter->second.ValueTypeDescription << " was not specified. Please use the " << ovMapIter->first << " parameter." << endl;
- foundError = true;
- }
- }
-
- // print the errors if any were found
- if (foundError) {
- printf("ERROR: Some problems were encountered when parsing the command line options:\n");
- printf("%s\n", errorBuilder.str().c_str());
- printf("For a complete list of command line options, type \"%s help %s\"\n", argv[0], argv[1]);
- exit(1);
- }
-}
-
-// sets the program info
-void Options::SetProgramInfo(const string& programName, const string& description, const string& arguments) {
- m_programName = programName;
- m_description = description;
- m_exampleArguments = arguments;
-}
-
-// return string representations of stdin
-const string& Options::StandardIn(void) { return m_stdin; }
-
-// return string representations of stdout
-const string& Options::StandardOut(void) { return m_stdout; }
+++ /dev/null
-// ***************************************************************************
-// bamtools_options.h (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 1 June 2010
-// ---------------------------------------------------------------------------
-// Parses command line arguments and creates a help menu
-// ---------------------------------------------------------------------------
-// Modified from:
-// The Mosaik suite's command line parser class: COptions
-// (c) 2006 - 2009 Michael Str�mberg
-// Marth Lab, Department of Biology, Boston College
-// Dual licenced under the GNU General Public License 2.0+ license or as
-// a commercial license with the Marth Lab.
-//
-// * Modified to fit BamTools code-style, otherwise code is same. (DB)
-// ***************************************************************************
-
-#ifndef BAMTOOLS_OPTIONS_H
-#define BAMTOOLS_OPTIONS_H
-
-#include <map>
-#include <string>
-#include <vector>
-#include "bamtools_variant.h"
-
-#ifndef WIN32
- #include <stdint.h>
-#endif
-
-namespace BamTools {
-
-#define ARGUMENT_LENGTH 35
-#define DESC_LENGTH_FIRST_ROW 50
-#define DESC_LENGTH 39
-#define MAX_LINE_LENGTH 78
-
-#ifdef WIN32
- #define snprintf _snprintf
- typedef __int64 int64_t;
- typedef unsigned __int64 uint64_t;
- #define strtoui64 _strtoui64
-#else
- #define strtoui64 strtoull
-#endif
-
-struct Option {
-
- // data members
- std::string Argument;
- std::string ValueDescription;
- std::string Description;
- bool StoreValue;
- bool HasDefaultValue;
- Variant DefaultValue;
-
- // constructor
- Option(void)
- : StoreValue(true)
- , HasDefaultValue(false)
- { }
-};
-
-struct OptionValue {
-
- // data members
- bool* pFoundArgument;
- void* pValue;
- std::string ValueTypeDescription;
- bool UseVector;
- bool StoreValue;
- bool IsRequired;
- Variant VariantValue;
-
- // constructor
- OptionValue(void)
- : pFoundArgument(NULL)
- , pValue(NULL)
- , UseVector(false)
- , StoreValue(true)
- , IsRequired(false)
- { }
-};
-
-struct OptionGroup {
- std::string Name;
- std::vector<Option> Options;
-};
-
-class Options {
-
- // add option/argument rules
- public:
- // adds a simple option to the parser
- static void AddOption(const std::string& argument,
- const std::string& optionDescription,
- bool& foundArgument,
- OptionGroup* group);
-
- // adds a value option to the parser
- template<typename T>
- static void AddValueOption(const std::string& argument,
- const std::string& valueDescription,
- const std::string& optionDescription,
- const std::string& valueTypeDescription,
- bool& foundArgument,
- T& val,
- OptionGroup* group);
-
- // adds a value option to the parser (with a default value)
- template<typename T, typename D>
- static void AddValueOption(const std::string& argument,
- const std::string& valueDescription,
- const std::string& optionDescription,
- const std::string& valueTypeDescription,
- bool& foundArgument,
- T& val,
- OptionGroup* group,
- D& defaultValue);
-
- // other API methods
- public:
- // creates an option group
- static OptionGroup* CreateOptionGroup(const std::string& groupName);
- // displays the help menu
- static void DisplayHelp(void);
- // parses the command line
- static void Parse(int argc, char* argv[], int offset = 0);
- // sets the program info
- static void SetProgramInfo(const std::string& programName, const std::string& description, const std::string& arguments);
- // returns string representation of stdin
- static const std::string& StandardIn(void);
- // returns string representation of stdout
- static const std::string& StandardOut(void);
-
- // static data members
- private:
- // the program name
- static std::string m_programName;
- // the main description
- static std::string m_description;
- // the example arguments
- static std::string m_exampleArguments;
- // stores the option groups
- static std::vector<OptionGroup> m_optionGroups;
- // stores the options in a map
- static std::map<std::string, OptionValue> m_optionsMap;
- // string representation of stdin
- static std::string m_stdin;
- // string representation of stdout
- static std::string m_stdout;
-};
-
-// adds a value option to the parser
-template<typename T>
-void Options::AddValueOption(const std::string& argument,
- const std::string& valueDescription,
- const std::string& optionDescription,
- const std::string& valueTypeDescription,
- bool& foundArgument,
- T& val,
- OptionGroup* group)
-{
- Option o;
- o.Argument = argument;
- o.ValueDescription = valueDescription;
- o.Description = optionDescription;
- group->Options.push_back(o);
-
- OptionValue ov;
- ov.pFoundArgument = &foundArgument;
- ov.pValue = (void*)&val;
- ov.VariantValue = val;
- ov.IsRequired = (valueTypeDescription.empty() ? false : true);
- ov.ValueTypeDescription = valueTypeDescription;
- m_optionsMap[argument] = ov;
-}
-
-// adds a value option to the parser (with a default value)
-template<typename T, typename D>
-void Options::AddValueOption(const std::string& argument,
- const std::string& valueDescription,
- const std::string& optionDescription,
- const std::string& valueTypeDescription,
- bool& foundArgument,
- T& val,
- OptionGroup* group,
- D& defaultValue)
-{
- Option o;
- o.Argument = argument;
- o.ValueDescription = valueDescription;
- o.Description = optionDescription;
- o.DefaultValue = defaultValue;
- o.HasDefaultValue = true;
- group->Options.push_back(o);
-
- OptionValue ov;
- ov.pFoundArgument = &foundArgument;
- ov.pValue = (void*)&val;
- ov.VariantValue = val;
- ov.IsRequired = (valueTypeDescription.empty() ? false : true);
- ov.ValueTypeDescription = valueTypeDescription;
- m_optionsMap[argument] = ov;
-}
-
-} // namespace BamTools
-
-#endif // BAMTOOLS_OPTIONS_H
\ No newline at end of file
+++ /dev/null
-// ***************************************************************************
-// bamtools_pileup.cpp (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 13 July 2010
-// ---------------------------------------------------------------------------
-// Provides pileup conversion functionality.
-//
-// The 'assembly' aspect of pileup makes this more complicated than the
-// simpler one-to-one conversion methods for other formats.
-// ***************************************************************************
-
-#include <vector>
-#include "BamMultiReader.h"
-#include "bamtools_pileup.h"
-using namespace std;
-using namespace BamTools;
-
-struct Pileup::PileupPrivate {
-
- // ---------------------
- // data members
-
- // IO & settings
- BamMultiReader* Reader;
- ostream* OutStream;
- string FastaFilename;
- bool IsPrintingMapQualities;
- BamRegion Region;
-
- // parsing data
- int CurrentId;
- int CurrentPosition;
- vector<BamAlignment> CurrentData;
- RefVector References;
-
- // ----------------------
- // ctor
-
- PileupPrivate(BamMultiReader* reader, ostream* outStream)
- : Reader(reader)
- , OutStream(outStream)
- , FastaFilename("")
- , IsPrintingMapQualities(false)
- { }
-
- // ----------------------
- // internal methods
-
- void PrintCurrentData(void);
- bool Run(void);
-};
-
-void Pileup::PileupPrivate::PrintCurrentData(void) {
-
- // remove any data that ends before CurrentPosition
- size_t i = 0;
- while ( i < CurrentData.size() ) {
- if ( CurrentData[i].GetEndPosition() < CurrentPosition )
- CurrentData.erase(CurrentData.begin() + i);
- else
- ++i;
- }
-
- // if not data remains, return
- if ( CurrentData.empty() ) return;
-
- // initialize empty strings
- string bases = "";
- string baseQuals = "";
- string mapQuals = "";
-
- // iterate over alignments
- vector<BamAlignment>::const_iterator dataIter = CurrentData.begin();
- vector<BamAlignment>::const_iterator dataEnd = CurrentData.end();
- for ( ; dataIter != dataEnd; ++dataIter ) {
-
- // retrieve alignment
- const BamAlignment& al = (*dataIter);
-
- // determine current base character & store
- const char base = al.AlignedBases[CurrentPosition -al.Position];
- if ( al.IsReverseStrand() )
- bases.push_back( tolower(base) );
- else
- bases.push_back( toupper(base) );
-
- // determine current base quality & store
- baseQuals.push_back( al.Qualities[CurrentPosition - al.Position] );
-
- // if using mapQuals, determine current mapQual & store
- if ( IsPrintingMapQualities ) {
- int mapQuality = (int)(al.MapQuality + 33);
- if ( mapQuality > 126 ) mapQuality = 126;
- mapQuals.push_back((char)mapQuality);
- }
- }
-
- // print results to OutStream
- const string& refName = References[CurrentId].RefName;
- const char refBase = 'N';
-
- *OutStream << refName << "\t" << CurrentPosition << "\t" << refBase << "\t" << CurrentData.size() << "\t" << bases << "\t" << baseQuals;
- if ( IsPrintingMapQualities ) *OutStream << "\t" << mapQuals;
- *OutStream << endl;
-}
-
-bool Pileup::PileupPrivate::Run(void) {
-
- // -----------------------------
- // validate input & output
-
- if ( !Reader ) {
- cerr << "Pileup::Run() : Invalid multireader" << endl;
- return false;
- }
-
- if ( !OutStream) {
- cerr << "Pileup::Run() : Invalid output stream" << endl;
- return false;
- }
-
- References = Reader->GetReferenceData();
-
- // -----------------------------
- // process input data
-
- // get first entry
- BamAlignment al;
- if ( !Reader->GetNextAlignment(al) ) {
- cerr << "Pileup::Run() : Could not read from multireader" << endl;
- return false;
- }
-
- // set initial markers & store first entry
- CurrentId = al.RefID;
- CurrentPosition = al.Position;
- CurrentData.clear();
- CurrentData.push_back(al);
-
- // iterate over remaining data
- while ( Reader->GetNextAlignment(al) ) {
-
- // if same reference
- if ( al.RefID == CurrentId ) {
-
- // if same position, store and move on
- if ( al.Position == CurrentPosition )
- CurrentData.push_back(al);
-
- // if less than CurrentPosition - sorting error => ABORT
- else if ( al.Position < CurrentPosition ) {
- cerr << "Pileup::Run() : Data not sorted correctly!" << endl;
- return false;
- }
-
- // else print pileup data until 'catching up' to CurrentPosition
- else {
- while ( al.Position > CurrentPosition ) {
- PrintCurrentData();
- ++CurrentPosition;
- }
- CurrentData.push_back(al);
- }
- }
-
- // if reference ID less than CurrentID - sorting error => ABORT
- else if ( al.RefID < CurrentId ) {
- cerr << "Pileup::Run() : Data not sorted correctly!" << endl;
- return false;
- }
-
- // else moved forward onto next reference
- else {
-
- // print any remaining pileup data from previous reference
- while ( !CurrentData.empty() ) {
- PrintCurrentData();
- ++CurrentPosition;
- }
-
- // store first entry on this new reference, update markers
- CurrentData.clear();
- CurrentData.push_back(al);
- CurrentId = al.RefID;
- CurrentPosition = al.Position;
- }
- }
-
- // ------------------------------------
- // handle any remaining data entries
-
- while ( !CurrentData.empty() ) {
- PrintCurrentData();
- ++CurrentPosition;
- }
-
- // -------------------------
- // return success
-
- return true;
-}
-
-// ----------------------------------------------------------
-// Pileup implementation
-
-Pileup::Pileup(BamMultiReader* reader, ostream* outStream) {
- d = new PileupPrivate(reader, outStream);
-}
-
-Pileup::~Pileup(void) {
- delete d;
- d = 0;
-}
-
-bool Pileup::Run(void) {
- return d->Run();
-}
-
-void Pileup::SetFastaFilename(const string& filename) {
- d->FastaFilename = filename;
-}
-
-void Pileup::SetIsPrintingMapQualities(bool ok) {
- d->IsPrintingMapQualities = ok;
-}
-
-void Pileup::SetRegion(const BamRegion& region) {
- d->Region = region;
-}
+++ /dev/null
-// ***************************************************************************
-// bamtools_pileup.h (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 13 July 2010
-// ---------------------------------------------------------------------------
-// Provides pileup conversion functionality.
-//
-// The 'assembly' aspect of pileup makes this more complicated than the
-// simpler one-to-one conversion methods for other formats.
-// ***************************************************************************
-
-#ifndef BAMTOOLS_PILEUP_H
-#define BAMTOOLS_PILEUP_H
-
-#include <iostream>
-#include <string>
-
-namespace BamTools {
-
-class BamMultiReader;
-class BamRegion;
-
-class Pileup {
-
- public:
- Pileup(BamMultiReader* reader, std::ostream* outStream);
- ~Pileup(void);
-
- public:
- bool Run(void);
- void SetFastaFilename(const std::string& filename);
- void SetIsPrintingMapQualities(bool ok);
- void SetRegion(const BamRegion& region);
-
- private:
- struct PileupPrivate;
- PileupPrivate* d;
-};
-
-} // namespace BamTools
-
-#endif // BAMTOOLS_PILEUP_H
\ No newline at end of file
+++ /dev/null
-// ***************************************************************************
-// bamtools_random.cpp (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 20 July 2010 (DB)
-// ---------------------------------------------------------------------------
-// Grab a random subset of alignments.
-// ***************************************************************************
-
-#include <ctime>
-#include <cstdlib>
-#include <iostream>
-#include <string>
-#include <vector>
-#include "bamtools_random.h"
-#include "bamtools_options.h"
-#include "bamtools_utilities.h"
-#include "BamMultiReader.h"
-#include "BamWriter.h"
-using namespace std;
-using namespace BamTools;
-
-namespace BamTools {
-
- // define constants
- const unsigned int RANDOM_MAX_ALIGNMENT_COUNT = 10000;
-
-} // namespace BamTools
-
-// ---------------------------------------------
-// RandomSettings implementation
-
-struct RandomTool::RandomSettings {
-
- // flags
- bool HasAlignmentCount;
- bool HasInput;
- bool HasOutput;
- bool HasRegion;
-
- // parameters
- unsigned int AlignmentCount;
- vector<string> InputFiles;
- string OutputFilename;
- string Region;
-
- // constructor
- RandomSettings(void)
- : HasAlignmentCount(false)
- , HasInput(false)
- , HasOutput(false)
- , HasRegion(false)
- , AlignmentCount(RANDOM_MAX_ALIGNMENT_COUNT)
- { }
-};
-
-// ---------------------------------------------
-// RandomTool implementation
-
-RandomTool::RandomTool(void)
- : AbstractTool()
- , m_settings(new RandomSettings)
-{
- // set program details
- Options::SetProgramInfo("bamtools random", "grab a random subset of alignments", "[-in <filename> -in <filename> ...] [-out <filename>] [-region <REGION>]");
-
- // set up options
- OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
- Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn());
- Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
-
- OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters");
- Options::AddValueOption("-n", "count", "number of alignments to grab. Note - no duplicate checking is performed (currently)", "", m_settings->HasAlignmentCount, m_settings->AlignmentCount, FilterOpts, RANDOM_MAX_ALIGNMENT_COUNT);
- Options::AddValueOption("-region", "REGION", "limit source of random alignment subset to a particular genomic region. Index file is recommended for better performance, and is read automatically if it exists as <filename>.bai or <filename>.bti. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, FilterOpts);
-}
-
-RandomTool::~RandomTool(void) {
- delete m_settings;
- m_settings = 0;
-}
-
-int RandomTool::Help(void) {
- Options::DisplayHelp();
- return 0;
-}
-
-int RandomTool::Run(int argc, char* argv[]) {
-
- // TODO: Handle BAM input WITHOUT index files.
-
- // parse command line arguments
- Options::Parse(argc, argv, 1);
-
- // set to default input if none provided
- if ( !m_settings->HasInput )
- m_settings->InputFiles.push_back(Options::StandardIn());
-
- // open our BAM reader
- BamMultiReader reader;
- reader.Open(m_settings->InputFiles);
- string headerText = reader.GetHeaderText();
- RefVector references = reader.GetReferenceData();
-
- // check that reference data is available, used for generating random jumps
- if ( references.empty() ) {
- cerr << "No reference data available... quitting." << endl;
- reader.Close();
- return 1;
- }
-
- // see if user specified a REGION
- BamRegion region;
- if ( m_settings->HasRegion ) {
- if ( Utilities::ParseRegionString(m_settings->Region, reader, region) )
- reader.SetRegion(region);
- }
-
- // open out BAM writer
- BamWriter writer;
- writer.Open(m_settings->OutputFilename, headerText, references);
-
- // seed our random number generator
- srand (time(NULL) );
-
- // grab random alignments
- BamAlignment al;
- unsigned int i = 0;
- while ( i < m_settings->AlignmentCount ) {
-
- int randomRefId = 0;
- int randomPosition = 0;
-
- // use REGION constraints to generate random refId & position
- if ( m_settings->HasRegion ) {
-
- int lowestRefId = region.LeftRefID;
- int highestRefId = region.RightRefID;
- int rangeRefId = (highestRefId - lowestRefId) + 1;
- randomRefId = lowestRefId + (int)(rangeRefId * (double)(rand()/((double)RAND_MAX + 1)));
-
- int lowestPosition = ( (randomRefId == region.LeftRefID) ? region.LeftPosition : 0 );
- int highestPosition = ( (randomRefId == region.RightRefID) ? region.RightPosition : references.at(randomRefId).RefLength - 1 );
- int rangePosition = (highestPosition - lowestPosition) + 1;
- randomPosition = lowestPosition + (int)(rangePosition * (double)(rand()/((double)RAND_MAX + 1)));
- }
-
- // otherwise generate 'normal' random refId & position
- else {
-
- // generate random refId
- int lowestRefId = 0;
- int highestRefId = references.size() - 1;
- int rangeRefId = (highestRefId - lowestRefId) + 1;
- randomRefId = lowestRefId + (int)(rangeRefId * (double)(rand()/((double)RAND_MAX + 1)));
-
- // generate random position
- int lowestPosition = 0;
- int highestPosition = references.at(randomRefId).RefLength - 1;
- int rangePosition = (highestPosition - lowestPosition) + 1;
- randomPosition = lowestPosition + (int)(rangePosition * (double)(rand()/((double)RAND_MAX + 1)));
- }
-
- // if jump & read successful, save alignment
- if ( reader.Jump(randomRefId, randomPosition) ) {
- while ( reader.GetNextAlignmentCore(al) ) {
- if ( al.RefID == randomRefId && al.Position >= randomPosition ) {
- writer.SaveAlignment(al);
- ++i;
- break;
- }
- }
- }
- }
-
- // close reader & writer
- reader.Close();
- writer.Close();
- return 0;
-}
\ No newline at end of file
+++ /dev/null
-// ***************************************************************************
-// bamtools_random.h (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 20 July 2010 (DB)
-// ---------------------------------------------------------------------------
-// Grab a random subset of alignments.
-// ***************************************************************************
-
-#ifndef BAMTOOLS_RANDOM_H
-#define BAMTOOLS_RANDOM_H
-
-#include "bamtools_tool.h"
-
-namespace BamTools {
-
-class RandomTool : public AbstractTool {
-
- public:
- RandomTool(void);
- ~RandomTool(void);
-
- public:
- int Help(void);
- int Run(int argc, char* argv[]);
-
- private:
- struct RandomSettings;
- RandomSettings* m_settings;
-};
-
-} // namespace BamTools
-
-#endif // BAMTOOLS_RANDOM _H
+++ /dev/null
-// ***************************************************************************
-// bamtools_sort.cpp (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 21 June 2010 (DB)
-// ---------------------------------------------------------------------------
-// Sorts an input BAM file (default by position) and stores in a new BAM file.
-// ***************************************************************************
-
-#include <cstdio>
-#include <algorithm>
-#include <iostream>
-#include <sstream>
-#include <string>
-#include <vector>
-
-#include "bamtools_sort.h"
-#include "bamtools_options.h"
-#include "BamReader.h"
-#include "BamMultiReader.h"
-#include "BamWriter.h"
-
-using namespace std;
-using namespace BamTools;
-
-namespace BamTools {
-
- // defaults
- //
- // ** These defaults should be tweaked & 'optimized' per testing ** //
- // I say 'optimized' because each system will naturally perform
- // differently. We will attempt to determine a sensible
- // compromise that should perform well on average.
- const unsigned int SORT_DEFAULT_MAX_BUFFER_COUNT = 10000; // max numberOfAlignments for buffer
- const unsigned int SORT_DEFAULT_MAX_BUFFER_MEMORY = 1024; // Mb
-
- // -----------------------------------
- // comparison objects (for sorting)
-
- struct SortLessThanPosition {
- bool operator() (const BamAlignment& lhs, const BamAlignment& rhs) {
- if ( lhs.RefID != rhs.RefID )
- return lhs.RefID < rhs.RefID;
- else
- return lhs.Position < rhs.Position;
- }
- };
-
- struct SortLessThanName {
- bool operator() (const BamAlignment& lhs, const BamAlignment& rhs) {
- return lhs.Name < rhs.Name;
- }
- };
-
-} // namespace BamTools
-
-// ---------------------------------------------
-// SortToolPrivate declaration
-class SortTool::SortToolPrivate {
-
- // ctor & dtor
- public:
- SortToolPrivate(SortTool::SortSettings* settings);
- ~SortToolPrivate(void);
-
- // 'public' interface
- public:
- bool Run(void);
-
- // internal methods
- private:
- void ClearBuffer(vector<BamAlignment>& buffer);
- bool GenerateSortedRuns(void);
- bool HandleBufferContents(vector<BamAlignment>& buffer);
- bool MergeSortedRuns(void);
- bool WriteTempFile(const vector<BamAlignment>& buffer, const string& tempFilename);
- void SortBuffer(vector<BamAlignment>& buffer);
-
- // data members
- private:
- SortTool::SortSettings* m_settings;
- string m_tempFilenameStub;
- int m_numberOfRuns;
- string m_headerText;
- RefVector m_references;
- vector<string> m_tempFilenames;
-};
-
-// ---------------------------------------------
-// SortSettings implementation
-
-struct SortTool::SortSettings {
-
- // flags
- bool HasInputBamFilename;
- bool HasMaxBufferCount;
- bool HasMaxBufferMemory;
- bool HasOutputBamFilename;
- bool IsSortingByName;
-
- // filenames
- string InputBamFilename;
- string OutputBamFilename;
-
- // parameters
- unsigned int MaxBufferCount;
- unsigned int MaxBufferMemory;
-
- // constructor
- SortSettings(void)
- : HasInputBamFilename(false)
- , HasMaxBufferCount(false)
- , HasMaxBufferMemory(false)
- , HasOutputBamFilename(false)
- , IsSortingByName(false)
- , InputBamFilename(Options::StandardIn())
- , OutputBamFilename(Options::StandardOut())
- , MaxBufferCount(SORT_DEFAULT_MAX_BUFFER_COUNT)
- , MaxBufferMemory(SORT_DEFAULT_MAX_BUFFER_MEMORY)
- { }
-};
-
-// ---------------------------------------------
-// SortTool implementation
-
-SortTool::SortTool(void)
- : AbstractTool()
- , m_settings(new SortSettings)
- , m_impl(0)
-{
- // set program details
- Options::SetProgramInfo("bamtools sort", "sorts a BAM file", "[-in <filename>] [-out <filename>]");
-
- // set up options
- OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
- Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputBamFilename, m_settings->InputBamFilename, IO_Opts, Options::StandardIn());
- Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputBamFilename, IO_Opts, Options::StandardOut());
-
- OptionGroup* SortOpts = Options::CreateOptionGroup("Sorting Methods");
- Options::AddOption("-byname", "sort by alignment name", m_settings->IsSortingByName, SortOpts);
-
- OptionGroup* MemOpts = Options::CreateOptionGroup("Memory Settings");
- Options::AddValueOption("-n", "count", "max number of alignments per tempfile", "", m_settings->HasMaxBufferCount, m_settings->MaxBufferCount, MemOpts, SORT_DEFAULT_MAX_BUFFER_COUNT);
- Options::AddValueOption("-mem", "Mb", "max memory to use", "", m_settings->HasMaxBufferMemory, m_settings->MaxBufferMemory, MemOpts, SORT_DEFAULT_MAX_BUFFER_MEMORY);
-}
-
-SortTool::~SortTool(void) {
-
- delete m_settings;
- m_settings = 0;
-
- delete m_impl;
- m_impl = 0;
-}
-
-// Calls Options::DisplayHelp() and always returns 0.
-int SortTool::Help(void) {
- Options::DisplayHelp();
- return 0;
-}
-
-// Parses the command line arguments, then runs the private sort implementation.
-// Returns 0 if the implementation reports success, 1 otherwise.
-int SortTool::Run(int argc, char* argv[]) {
-
-    // parse command line arguments
-    Options::Parse(argc, argv, 1);
-
-    // drop any implementation left over from an earlier call,
-    // so that calling Run() more than once does not leak it
-    delete m_impl;
-    m_impl = new SortToolPrivate(m_settings);
-
-    // run internal SortTool implementation, return success/fail
-    return ( m_impl->Run() ? 0 : 1 );
-}
-
-// ---------------------------------------------
-// SortToolPrivate implementation
-
-// constructor
-// Stores the settings pointer and, when it is non-null, derives the
-// temp-file name stub from the input filename.
-SortTool::SortToolPrivate::SortToolPrivate(SortTool::SortSettings* settings)
-    : m_settings(settings)
-    , m_numberOfRuns(0)
-{
-    // set filename stub depending on inputfile path
-    // that way multiple sort runs don't trip on each other's temp files
-    if ( m_settings ) {
-
-        // search for the LAST ".bam": a ".bam" earlier in the path
-        // (for example inside a directory name) must not cut the stub short
-        const string& inputFilename = m_settings->InputBamFilename;
-        const size_t extensionFound = inputFilename.rfind(".bam");
-        if ( extensionFound != string::npos )
-            m_tempFilenameStub = inputFilename.substr(0, extensionFound);
-
-        // the suffix is appended whether or not an extension was found
-        m_tempFilenameStub.append(".sort.temp.");
-    }
-}
-
-// destructor
-// the body is empty: nothing is released here
-SortTool::SortToolPrivate::~SortToolPrivate(void) { }
-
-// generates multiple sorted temp BAM files from single unsorted BAM file
-// Returns false if the input file cannot be opened or if handling any
-// buffer reports failure; returns true otherwise.
-bool SortTool::SortToolPrivate::GenerateSortedRuns(void) {
-
-    // open input BAM file, give up right away if that fails
-    BamReader inputReader;
-    if ( !inputReader.Open(m_settings->InputBamFilename) )
-        return false;
-
-    // get basic data that will be shared by all temp/output files
-    m_headerText = inputReader.GetHeaderText();
-    m_references = inputReader.GetReferenceData();
-
-    // set up alignments buffer
-    vector<BamAlignment> buffer;
-    buffer.reserve(m_settings->MaxBufferCount);
-
-    // while data available (and no buffer has failed so far)
-    bool success = true;
-    BamAlignment al;
-    while ( success && inputReader.GetNextAlignmentCore(al) ) {
-
-        // store alignments in buffer
-        buffer.push_back(al);
-
-        // if buffer is full, handle contents (sort & write to temp file)
-        if ( buffer.size() == m_settings->MaxBufferCount )
-            success = HandleBufferContents(buffer);
-    }
-
-    // handle any remaining buffer contents
-    if ( success && !buffer.empty() )
-        success = HandleBufferContents(buffer);
-
-    // close reader & report whether every run was handled successfully
-    inputReader.Close();
-    return success;
-}
-
-// Sorts the buffered alignments, writes them to the next numbered temp file,
-// records that filename for the merge step and empties the buffer.
-// Returns the result of writing the temp file.
-bool SortTool::SortToolPrivate::HandleBufferContents(vector<BamAlignment>& buffer ) {
-
-    // put the alignments in the requested order
-    SortBuffer(buffer);
-
-    // temp filename = stub followed by the current run number
-    stringstream nameBuilder;
-    nameBuilder << m_tempFilenameStub << m_numberOfRuns;
-    const string tempFilename = nameBuilder.str();
-
-    // write the sorted run, remember its filename for merging later
-    const bool wroteRun = WriteTempFile( buffer, tempFilename );
-    m_tempFilenames.push_back(tempFilename);
-
-    // get ready for the next run
-    buffer.clear();
-    ++m_numberOfRuns;
-
-    return wroteRun;
-}
-
-// merges sorted temp BAM files into single sorted output BAM file
-// Returns false when there are temp files but they cannot be opened for
-// merging (no output is written in that case); returns true otherwise.
-// The temp files are removed in either case.
-bool SortTool::SortToolPrivate::MergeSortedRuns(void) {
-
-    // open up multi reader for all of our temp files
-    // this might get broken up if we do a multi-pass system later ??
-    BamMultiReader multiReader;
-    const bool readersOpened = multiReader.Open(m_tempFilenames, false, true);
-
-    // a failed open only counts as an error if there was something to open;
-    // with no temp files at all we still emit an output file, as before
-    const bool mergeFailed = ( !readersOpened && !m_tempFilenames.empty() );
-
-    if ( !mergeFailed ) {
-
-        // open writer for our completely sorted output BAM file
-        BamWriter mergedWriter;
-        mergedWriter.Open(m_settings->OutputBamFilename, m_headerText, m_references);
-
-        // while data available in temp files
-        BamAlignment al;
-        while ( multiReader.GetNextAlignmentCore(al) ) {
-            mergedWriter.SaveAlignment(al);
-        }
-
-        // close readers & writer
-        multiReader.Close();
-        mergedWriter.Close();
-    }
-    else
-        multiReader.Close();
-
-    // delete all temp files
-    vector<string>::const_iterator tempIter = m_tempFilenames.begin();
-    vector<string>::const_iterator tempEnd  = m_tempFilenames.end();
-    for ( ; tempIter != tempEnd; ++tempIter ) {
-        const string& tempFilename = (*tempIter);
-        remove(tempFilename.c_str());
-    }
-
-    return !mergeFailed;
-}
-
-// Performs the whole sort; returns true only if both phases succeed.
-bool SortTool::SortToolPrivate::Run(void) {
-
-    // single pass: first chunk the input file into smaller sorted temp files,
-    // then merge them into the output (BamMultiReader handles the merging).
-    // The merge is only attempted when the first phase succeeded.
-    return ( GenerateSortedRuns() && MergeSortedRuns() );
-}
-
-// Sorts the buffer in place, by name or by position depending on the settings.
-void SortTool::SortToolPrivate::SortBuffer(vector<BamAlignment>& buffer) {
-
-    // ** add further custom sort options later ?? **
-
-    // position sort unless sorting by name was requested
-    if ( !m_settings->IsSortingByName ) {
-        sort ( buffer.begin(), buffer.end(), SortLessThanPosition() );
-        return;
-    }
-
-    sort ( buffer.begin(), buffer.end(), SortLessThanName() );
-}
-
-
-// Writes every alignment in 'buffer', in order, to a new BAM file named
-// 'tempFilename' (using the stored header text and references).
-// Always returns true.
-bool SortTool::SortToolPrivate::WriteTempFile(const vector<BamAlignment>& buffer, const string& tempFilename) {
-
-    // open temp file for writing
-    BamWriter tempWriter;
-    tempWriter.Open(tempFilename, m_headerText, m_references);
-
-    // save the alignments one by one, front to back
-    const size_t alignmentCount = buffer.size();
-    for ( size_t i = 0; i < alignmentCount; ++i )
-        tempWriter.SaveAlignment( buffer[i] );
-
-    // close temp file & return success
-    tempWriter.Close();
-    return true;
-}
\ No newline at end of file
+++ /dev/null
-// ***************************************************************************
-// bamtools_sort.h (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 21 June 2010 (DB)
-// ---------------------------------------------------------------------------
-// Sorts a BAM file.
-// ***************************************************************************
-
-#ifndef BAMTOOLS_SORT_H
-#define BAMTOOLS_SORT_H
-
-#include "bamtools_tool.h"
-
-namespace BamTools {
-
-// Tool that sorts a BAM file; implements the AbstractTool interface.
-class SortTool : public AbstractTool {
-
- public:
- SortTool(void);
- ~SortTool(void);
-
- public:
- int Help(void);
- int Run(int argc, char* argv[]);
-
- private:
- struct SortSettings; // forward declaration only, not defined in this header
- SortSettings* m_settings;
-
- struct SortToolPrivate; // forward declaration only, not defined in this header
- SortToolPrivate* m_impl;
-};
-
-} // namespace BamTools
-
-#endif // BAMTOOLS_SORT_H
+++ /dev/null
-// ***************************************************************************
-// bamtools_stats.cpp (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 22 July 2010
-// ---------------------------------------------------------------------------
-// Prints general alignment statistics for BAM file(s).
-// ***************************************************************************
-
-#include <cmath>
-#include <algorithm>
-#include <functional>
-#include <iostream>
-#include <numeric>
-#include <string>
-#include <vector>
-
-#include "bamtools_stats.h"
-#include "bamtools_options.h"
-#include "BamMultiReader.h"
-using namespace std;
-using namespace BamTools;
-
-// ---------------------------------------------
-// StatsSettings implementation
-
-// Settings for the stats tool; both flags start out false.
-struct StatsTool::StatsSettings {
-
- // flags
- bool HasInput; // registered with the "-in" option
- bool IsShowingInsertSizeSummary; // registered with the "-insert" option
-
- // filenames
- vector<string> InputFiles;
-
- // constructor
- StatsSettings(void)
- : HasInput(false)
- , IsShowingInsertSizeSummary(false)
- { }
-};
-
-// ---------------------------------------------
-// StatsToolPrivate implementation
-
-// Private implementation of the stats tool: holds the running counters
-// and the methods that update and print them.
-struct StatsTool::StatsToolPrivate {
-
- // ctor & dtor
- public:
- StatsToolPrivate(StatsTool::StatsSettings* _settings);
- ~StatsToolPrivate(void);
-
- // 'public' interface
- public:
- bool Run(void);
-
- // internal methods
- private:
- bool CalculateMedian(vector<int>& data, double& median);
- void PrintStats(void);
- void ProcessAlignment(const BamAlignment& al);
-
- // data members
- private:
- StatsTool::StatsSettings* settings; // stored as passed in; this struct's destructor does not delete it
- unsigned int numReads; // every alignment processed
- unsigned int numPaired;
- unsigned int numProperPair;
- unsigned int numMapped;
- unsigned int numBothMatesMapped;
- unsigned int numForwardStrand;
- unsigned int numReverseStrand;
- unsigned int numFirstMate;
- unsigned int numSecondMate;
- unsigned int numSingletons; // paired & mapped, but mate not mapped
- unsigned int numFailedQC;
- unsigned int numDuplicates;
- vector<int> insertSizes; // absolute, non-zero insert sizes of first mates
-};
-
-// Stores the settings pointer, zeroes every counter and reserves room
-// for 100000 insert sizes up front.
-StatsTool::StatsToolPrivate::StatsToolPrivate(StatsTool::StatsSettings* _settings)
- : settings(_settings)
- , numReads(0)
- , numPaired(0)
- , numProperPair(0)
- , numMapped(0)
- , numBothMatesMapped(0)
- , numForwardStrand(0)
- , numReverseStrand(0)
- , numFirstMate(0)
- , numSecondMate(0)
- , numSingletons(0)
- , numFailedQC(0)
- , numDuplicates(0)
-{
- insertSizes.reserve(100000);
-}
-
-StatsTool::StatsToolPrivate::~StatsToolPrivate(void) { } // empty body: nothing is released here
-
-// Computes the median of 'data' and stores it in 'median'.
-// Returns false (leaving 'median' untouched) if 'data' is empty, true otherwise.
-// 'median' is a double because, for an even number of elements, the result
-// is the average of the two middle elements.
-// NOTE: 'data' is partially reordered by this call.
-bool StatsTool::StatsToolPrivate::CalculateMedian(vector<int>& data, double& median) {
-
-    // check that data exists
-    if ( data.empty() ) return false;
-
-    // move the (upper) middle element into its sorted position
-    const size_t dataSize = data.size();
-    vector<int>::iterator target = data.begin() + (dataSize / 2);
-    nth_element(data.begin(), target, data.end());
-
-    // odd number of elements: the middle element is the median
-    if ( (dataSize % 2) != 0 ) {
-        median = (double)(*target);
-        return true;
-    }
-
-    // even number of elements: nth_element left everything before 'target'
-    // no greater than *target, so the lower middle element is simply the
-    // largest value in that front part (no second partitioning pass needed)
-    const double upperMiddle = (double)(*target);
-    const double lowerMiddle = (double)(*max_element(data.begin(), target));
-    median = (upperMiddle + lowerMiddle) / 2.0;
-    return true;
-}
-
-// returns 'part' as a percentage of 'whole', or 0 when 'whole' is zero
-// (avoids printing "nan%" when no reads were processed)
-static float PercentOf(const unsigned int part, const unsigned int whole) {
-    if ( whole == 0 ) return 0.0f;
-    return ((float)part / whole) * 100;
-}
-
-// print BAM file alignment stats
-// Writes the collected counters to cout. The paired-end section is printed
-// only when at least one paired read was seen, and the insert size section
-// only when the insert size summary was requested in the settings.
-void StatsTool::StatsToolPrivate::PrintStats(void) {
-
-    cout << endl;
-    cout << "**********************************************" << endl;
-    cout << "Stats for BAM file(s): " << endl;
-    cout << "**********************************************" << endl;
-    cout << endl;
-    cout << "Total reads: " << numReads << endl;
-    cout << "Mapped reads: " << numMapped << "\t(" << PercentOf(numMapped, numReads) << "%)" << endl;
-    cout << "Forward strand: " << numForwardStrand << "\t(" << PercentOf(numForwardStrand, numReads) << "%)" << endl;
-    cout << "Reverse strand: " << numReverseStrand << "\t(" << PercentOf(numReverseStrand, numReads) << "%)" << endl;
-    cout << "Failed QC: " << numFailedQC << "\t(" << PercentOf(numFailedQC, numReads) << "%)" << endl;
-    cout << "Duplicates: " << numDuplicates << "\t(" << PercentOf(numDuplicates, numReads) << "%)" << endl;
-    cout << "Paired-end reads: " << numPaired << "\t(" << PercentOf(numPaired, numReads) << "%)" << endl;
-
-    // paired-end details, relative to the number of paired reads
-    if ( numPaired != 0 ) {
-        cout << "'Proper-pairs': " << numProperPair << "\t(" << PercentOf(numProperPair, numPaired) << "%)" << endl;
-        cout << "Both pairs mapped: " << numBothMatesMapped << "\t(" << PercentOf(numBothMatesMapped, numPaired) << "%)" << endl;
-        cout << "Read 1: " << numFirstMate << endl;
-        cout << "Read 2: " << numSecondMate << endl;
-        cout << "Singletons: " << numSingletons << "\t(" << PercentOf(numSingletons, numPaired) << "%)" << endl;
-    }
-
-    if ( settings->IsShowingInsertSizeSummary ) {
-
-        // the average is only printed when insert sizes were collected
-        double avgInsertSize = 0.0;
-        if ( !insertSizes.empty() ) {
-            avgInsertSize = ( accumulate(insertSizes.begin(), insertSizes.end(), 0.0) / (double)insertSizes.size() );
-            cout << "Average insert size (absolute value): " << avgInsertSize << endl;
-        }
-
-        // CalculateMedian() returns false for an empty vector
-        double medianInsertSize = 0.0;
-        if ( CalculateMedian(insertSizes, medianInsertSize) )
-            cout << "Median insert size (absolute value): " << medianInsertSize << endl;
-    }
-    cout << endl;
-}
-
-// use current input alignment to update BAM file alignment stats
-void StatsTool::StatsToolPrivate::ProcessAlignment(const BamAlignment& al) {
-
-    // every alignment counts towards the total
-    ++numReads;
-
-    // flags that do not depend on pairing
-    if ( al.IsDuplicate() ) ++numDuplicates;
-    if ( al.IsFailedQC() )  ++numFailedQC;
-    if ( al.IsMapped() )    ++numMapped;
-
-    // exactly one of the two strand counters is bumped
-    ++( al.IsReverseStrand() ? numReverseStrand : numForwardStrand );
-
-    // everything below applies to paired-end alignments only
-    if ( !al.IsPaired() ) return;
-    ++numPaired;
-
-    // first mate / second mate
-    if ( al.IsFirstMate() )  ++numFirstMate;
-    if ( al.IsSecondMate() ) ++numSecondMate;
-
-    // a mapped alignment is either part of a fully mapped pair or a singleton
-    if ( al.IsMapped() )
-        ++( al.IsMateMapped() ? numBothMatesMapped : numSingletons );
-
-    // explicit proper pair flag
-    if ( al.IsProperPair() ) ++numProperPair;
-
-    // remember the absolute insert size of first mates (zero sizes are skipped),
-    // but only when the insert size summary was requested
-    const bool wantInsertSize = settings->IsShowingInsertSizeSummary && al.IsFirstMate() && (al.InsertSize != 0);
-    if ( wantInsertSize )
-        insertSizes.push_back( abs(al.InsertSize) );
-}
-
-// Opens the input files, feeds every alignment to ProcessAlignment() and
-// prints the stats. Returns false (after an error message on cerr) if the
-// input could not be opened, true otherwise.
-bool StatsTool::StatsToolPrivate::Run() {
-
-    // opens the BAM files without checking for indexes
-    BamMultiReader reader;
-    const bool opened = reader.Open(settings->InputFiles, false, true);
-
-    if ( opened ) {
-
-        // plow through file, keeping track of stats
-        BamAlignment al;
-        while ( reader.GetNextAlignmentCore(al) )
-            ProcessAlignment(al);
-
-        // print stats
-        PrintStats();
-    }
-    else
-        cerr << "Could not open input BAM file(s)... quitting." << endl;
-
-    // the reader is closed on both paths
-    reader.Close();
-    return opened;
-}
-
-// ---------------------------------------------
-// StatsTool implementation
-
-StatsTool::StatsTool(void)
-    : AbstractTool()
-    , m_settings(new StatsSettings)
-    , m_impl(0)
-{
-    // describe the tool for the usage screen
-    Options::SetProgramInfo("bamtools stats", "prints general alignment statistics", "[-in <filename> -in <filename> ... ]");
-
-    // all options below are registered against fields of the settings object
-    StatsSettings& settings = *m_settings;
-
-    // input filenames
-    OptionGroup* ioGroup = Options::CreateOptionGroup("Input & Output");
-    Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", settings.HasInput, settings.InputFiles, ioGroup, Options::StandardIn());
-
-    // optional extras
-    OptionGroup* extraGroup = Options::CreateOptionGroup("Additional Stats");
-    Options::AddOption("-insert", "summarize insert size data", settings.IsShowingInsertSizeSummary, extraGroup);
-}
-
-StatsTool::~StatsTool(void) {
-
-    // release the settings first, then the private implementation
-    delete m_settings;
-    delete m_impl;
-
-    // leave both pointers null
-    m_settings = 0;
-    m_impl = 0;
-}
-
-// Calls Options::DisplayHelp() and always returns 0.
-int StatsTool::Help(void) {
- Options::DisplayHelp();
- return 0;
-}
-
-// Parses the command line arguments, then runs the private stats implementation.
-// Returns 0 if the implementation reports success, 1 otherwise.
-int StatsTool::Run(int argc, char* argv[]) {
-
-    // parse command line arguments
-    Options::Parse(argc, argv, 1);
-
-    // set to default input if none provided
-    if ( !m_settings->HasInput )
-        m_settings->InputFiles.push_back(Options::StandardIn());
-
-    // drop any implementation left over from an earlier call,
-    // so that calling Run() more than once does not leak it
-    delete m_impl;
-    m_impl = new StatsToolPrivate(m_settings);
-
-    // run internal StatsTool implementation, return success/fail
-    return ( m_impl->Run() ? 0 : 1 );
-}
+++ /dev/null
-// ***************************************************************************
-// bamtools_stats.h (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 1 June 2010
-// ---------------------------------------------------------------------------
-// Prints general statistics for a single BAM file.
-//
-// ** Expand to multiple? **
-//
-// ***************************************************************************
-
-#ifndef BAMTOOLS_STATS_H
-#define BAMTOOLS_STATS_H
-
-#include "bamtools_tool.h"
-
-namespace BamTools {
-
-// Tool that prints general alignment statistics; implements the AbstractTool interface.
-class StatsTool : public AbstractTool {
-
- public:
- StatsTool(void);
- ~StatsTool(void);
-
- public:
- int Help(void);
- int Run(int argc, char* argv[]);
-
- private:
- struct StatsSettings; // forward declaration only, not defined in this header
- StatsSettings* m_settings;
-
- struct StatsToolPrivate; // forward declaration only, not defined in this header
- StatsToolPrivate* m_impl;
-};
-
-} // namespace BamTools
-
-#endif // BAMTOOLS_STATS_H
+++ /dev/null
-// ***************************************************************************
-// bamtools_tool.h (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 2 June 2010
-// ---------------------------------------------------------------------------
-// Base class for all other BamTools sub-tools
-// All derived classes must provide Help() and Run() methods
-// ***************************************************************************
-
-#ifndef BAMTOOLS_ABSTRACTTOOL_H
-#define BAMTOOLS_ABSTRACTTOOL_H
-
-#include <string>
-
-namespace BamTools {
-
-// Abstract interface shared by the BamTools sub-tools.
-// Help() and Run() are pure virtual, so every derived tool must implement both.
-class AbstractTool {
-
- public:
- AbstractTool(void) { }
- virtual ~AbstractTool(void) { } // virtual, so deleting through an AbstractTool* is safe
-
- public:
- virtual int Help(void) =0;
- virtual int Run(int argc, char* argv[]) =0;
-};
-
-} // namespace BamTools
-
-#endif // BAMTOOLS_ABSTRACTTOOL_H
\ No newline at end of file
+++ /dev/null
-// ***************************************************************************
-// bamtools_utilities.cpp (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 2 June 2010
-// ---------------------------------------------------------------------------
-// Provides general utilities used by BamTools sub-tools.
-// ***************************************************************************
-
-#include <cstdlib>
-#include <sys/stat.h>
-#include "bamtools_utilities.h"
-#include "BamReader.h"
-#include "BamMultiReader.h"
-
-using namespace std;
-using namespace BamTools;
-
-// Splits a region string into chromosome names and numeric positions.
-// Forms handled:
-//   "chrom"                      entire chromosome (startPos 0, no stop)
-//   "chrom:start"                start position only (no stop)
-//   "chrom:start..stop"          range on a single chromosome
-//   "chromA:start..chromB:stop"  range across two chromosomes
-// A stopPos of -1 means "no stop position given".
-// Positions are converted with atoi(), so non-numeric text yields 0.
-// Returns false only when regionString is empty.
-static bool SplitRegionString(const std::string& regionString,
-                              std::string& startChrom, int& startPos,
-                              std::string& stopChrom, int& stopPos)
-{
-    // check first for empty string
-    if ( regionString.empty() )
-        return false;
-
-    // non-empty string, look for a colon
-    const size_t foundFirstColon = regionString.find(':');
-
-    // no colon found: the whole string is the chromosome name
-    if ( foundFirstColon == string::npos ) {
-        startChrom = regionString;
-        startPos   = 0;
-        stopChrom  = regionString;
-        stopPos    = -1;
-        return true;
-    }
-
-    // colon found: everything before it is the start chromosome
-    startChrom = regionString.substr(0, foundFirstColon);
-
-    // look for ".." after the colon
-    const size_t foundRangeDots = regionString.find("..", foundFirstColon+1);
-
-    // no dots found: a start position but no range
-    if ( foundRangeDots == string::npos ) {
-        startPos  = atoi( regionString.substr(foundFirstColon+1).c_str() );
-        stopChrom = startChrom;
-        stopPos   = -1;
-        return true;
-    }
-
-    // ".." found: start position sits between the first colon and the dots
-    startPos = atoi( regionString.substr(foundFirstColon+1, foundRangeDots-foundFirstColon-1).c_str() );
-
-    // look for second colon
-    const size_t foundSecondColon = regionString.find(':', foundRangeDots+1);
-
-    // no second colon: "standard" chrom:start..stop format (on single chrom)
-    if ( foundSecondColon == string::npos ) {
-        stopChrom = startChrom;
-        stopPos   = atoi( regionString.substr(foundRangeDots+2).c_str() );
-    }
-
-    // second colon found: range requested across 2 chroms
-    else {
-        stopChrom = regionString.substr(foundRangeDots+2, foundSecondColon-(foundRangeDots+2));
-        stopPos   = atoi( regionString.substr(foundSecondColon+1).c_str() );
-    }
-    return true;
-}
-
-// Validates the parsed chromosome names and positions against the reader's
-// reference data and, if everything checks out, fills in 'region'.
-// ReaderType only needs GetReferenceData() and GetReferenceID(); both
-// BamReader and BamMultiReader are used this way below.
-// Returns false (leaving 'region' untouched) if a chromosome is unknown
-// or a position is negative or lies beyond the end of its reference.
-template<typename ReaderType>
-static bool StoreValidatedRegion(const ReaderType& reader,
-                                 const std::string& startChrom, const int startPos,
-                                 const std::string& stopChrom, int stopPos,
-                                 BamRegion& region)
-{
-    const RefVector& references = reader.GetReferenceData();
-    const int referenceCount = (int)references.size();
-
-    // reject any start ID outside [0, referenceCount), so at() below cannot throw
-    const int startRefID = reader.GetReferenceID(startChrom);
-    if ( startRefID < 0 || startRefID >= referenceCount ) return false;
-
-    // if startPos is negative or larger than reference, return false
-    const RefData& startReference = references.at(startRefID);
-    if ( startPos < 0 || startPos > startReference.RefLength ) return false;
-
-    // same range check for the stop ID
-    const int stopRefID = reader.GetReferenceID(stopChrom);
-    if ( stopRefID < 0 || stopRefID >= referenceCount ) return false;
-
-    // if stopPos is larger than reference, or negative without being
-    // the "not specified" marker (-1), return false
-    const RefData& stopReference = references.at(stopRefID);
-    if ( stopPos < -1 || stopPos > stopReference.RefLength ) return false;
-
-    // if no stopPosition specified, set to reference end
-    if ( stopPos == -1 ) stopPos = stopReference.RefLength;
-
-    // set up Region struct & return
-    region.LeftRefID     = startRefID;
-    region.LeftPosition  = startPos;
-    region.RightRefID    = stopRefID;
-    region.RightPosition = stopPos;
-    return true;
-}
-
-// Parses a region string, does validation (valid ID's, positions), stores in Region struct
-// Returns success (true/false)
-bool Utilities::ParseRegionString(const std::string& regionString, const BamReader& reader, BamRegion& region) {
-
-    string startChrom;
-    string stopChrom;
-    int startPos = 0;
-    int stopPos  = -1;
-
-    if ( !SplitRegionString(regionString, startChrom, startPos, stopChrom, stopPos) )
-        return false;
-    return StoreValidatedRegion(reader, startChrom, startPos, stopChrom, stopPos, region);
-}
-
-// Same as ParseRegionString() above, but accepts a BamMultiReader
-bool Utilities::ParseRegionString(const std::string& regionString, const BamMultiReader& reader, BamRegion& region) {
-
-    string startChrom;
-    string stopChrom;
-    int startPos = 0;
-    int stopPos  = -1;
-
-    if ( !SplitRegionString(regionString, startChrom, startPos, stopChrom, stopPos) )
-        return false;
-    return StoreValidatedRegion(reader, startChrom, startPos, stopChrom, stopPos, region);
-}
-
-// Reports whether stat() succeeds on 'filename'.
-bool Utilities::FileExists(const std::string& filename) {
-
-    // stat() returns 0 on success; the file details themselves are not used
-    struct stat fileDetails;
-    const int statResult = stat(filename.c_str(), &fileDetails);
-    return ( statResult == 0 );
-}
+++ /dev/null
-// ***************************************************************************
-// bamtools_utilities.h (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 2 June 2010
-// ---------------------------------------------------------------------------
-// Provides general utilities used by BamTools sub-tools.
-// ***************************************************************************
-
-#ifndef BAMTOOLS_UTILITIES_H
-#define BAMTOOLS_UTILITIES_H
-
-#include <string>
-#include "BamAux.h"
-
-namespace BamTools {
-
-class BamReader;
-class BamMultiReader;
-
-// Collection of static helper functions; declares no data members.
-class Utilities {
-
- public:
- // Parses a region string, uses reader to do validation (valid ID's, positions), stores in Region struct
- // Returns success (true/false)
- static bool ParseRegionString(const std::string& regionString, const BamReader& reader, BamRegion& region);
- // Same as above, but accepts a BamMultiReader
- static bool ParseRegionString(const std::string& regionString, const BamMultiReader& reader, BamRegion& region);
-
- // check if a file exists
- static bool FileExists(const std::string& fname);
-};
-
-} // namespace BamTools
-
-#endif // BAMTOOLS_UTILITIES_H
+++ /dev/null
-// ***************************************************************************
-// bamtools_variant.h (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Last modified: 2 June 2010
-// ---------------------------------------------------------------------------
-// Provides a template-based variant type
-// ---------------------------------------------------------------------------
-// Modified from:
-// variant_t - An Improved Variant Type Based on Member Templates
-// (c) 2000 Fernando Cacciola
-// Dr. Dobb's (http://www.ddj.com/cpp/184401293)
-//
-// * Modified to be in BamTools namespace, otherwise code is same. (DB)
-// ***************************************************************************
-
-#ifndef BAMTOOLS_VARIANT_H
-#define BAMTOOLS_VARIANT_H
-
-#include <stdexcept>
-#include <typeinfo>
-#include <string>
-
-namespace BamTools {
-
-// Reference-counted holder for a single value of any copyable type.
-// Copies of a Variant share the same stored value; the value is destroyed
-// when the last Variant referring to it goes away.
-class Variant {
-
-    public:
-        // creates an empty Variant (holds no value)
-        Variant(void) : data (NULL) { }
-
-        // shares the value held by 'other' (if any)
-        Variant(const Variant& other) {
-            if(other.data != NULL)
-                other.data->AddRef();
-            data = other.data;
-        }
-
-        ~Variant(void) {
-            if(data != NULL) data->Release();
-        }
-
-        // NOTE: This code takes care of self-assignment.
-        // DO NOT CHANGE THE ORDER of the statements.
-        // (the new value is referenced BEFORE the old one is released)
-        Variant& operator=(const Variant& rhs) {
-            if(rhs.data != NULL)
-                rhs.data->AddRef();
-            if(data != NULL)
-                data->Release();
-            data = rhs.data;
-            return * this;
-        }
-
-        // This member template constructor allows you to
-        // instance a variant_t object with a value of any type.
-        template<typename T>
-        Variant(T v)
-            : data(new Impl<T>(v))
-        {
-            data->AddRef();
-        }
-
-        // This generic conversion operator let you retrieve
-        // the value held. To avoid template specialization conflicts,
-        // it returns an instance of type T, which will be a COPY
-        // of the value contained.
-        // Throws std::invalid_argument if the held value is not a T
-        // (this includes an empty Variant).
-        template<typename T>
-        operator T() const {
-            return CastFromBase<T>(data)->data;
-        }
-
-        // This forms returns a REFERENCE and not a COPY, which
-        // will be significant in some cases.
-        // Throws std::invalid_argument if the held value is not a T
-        // (this includes an empty Variant).
-        template<typename T>
-        const T& get(void) const {
-            return CastFromBase<T>(data)->data;
-        }
-
-        // Returns true if the held value was stored as a T.
-        // An empty Variant holds no value, so the answer is false; checking
-        // first also keeps typeid() away from a null pointer, which would
-        // throw std::bad_typeid.
-        template<typename T>
-        bool is_type(void) const {
-            if(data == NULL) return false;
-            return typeid(*data)==typeid(Impl<T>);
-        }
-
-        // Same check, with the type taken from the argument; the value of
-        // 'v' itself is not used. The held object is an Impl<T>, so the
-        // comparison has to be made against Impl<T>, not against T.
-        template<typename T>
-        bool is_type(T v) const {
-            (void)v;
-            return is_type<T>();
-        }
-
-    private:
-        // common base of all holders: carries the reference count
-        struct ImplBase {
-
-            ImplBase() : refs(0) {}
-            virtual ~ImplBase() {}
-
-            void AddRef(void) { refs ++; }
-            // the holder deletes itself when the last reference is released
-            void Release(void) {
-                --refs;
-                if(refs == 0) delete this;
-            }
-
-            size_t refs;
-        };
-
-        // holder for a value of type T
-        template<typename T>
-        struct Impl : ImplBase {
-            Impl(T v) : data (v) { }
-            ~Impl(void) { }
-            T data;
-        };
-
-        // The following method is static because it doesn't
-        // operate on variant_t instances.
-        template<typename T>
-        static Impl<T>* CastFromBase(ImplBase* v) {
-            // This downcast will fail if T is other than the T used
-            // with the constructor of variant_t (or if v is NULL).
-            Impl<T>* p = dynamic_cast<Impl<T>*> (v);
-            if (p == NULL)
-                throw std::invalid_argument(typeid(T).name()+std::string(" is not a valid type"));
-            return p;
-        }
-
-        ImplBase* data;
-};
-
-} // namespace BamTools
-
-#endif // BAMTOOLS_VARIANT_H
--- /dev/null
+// ***************************************************************************\r
+// BGZF.cpp (c) 2009 Derek Barnett, Michael Str�mberg\r
+// Marth Lab, Department of Biology, Boston College\r
+// All rights reserved.\r
+// ---------------------------------------------------------------------------\r
+// Last modified: 16 August 2010 (DB)\r
+// ---------------------------------------------------------------------------\r
+// BGZF routines were adapted from the bgzf.c code developed at the Broad\r
+// Institute.\r
+// ---------------------------------------------------------------------------\r
+// Provides the basic functionality for reading & writing BGZF files\r
+// ***************************************************************************\r
+\r
+#include <algorithm>\r
+#include "BGZF.h"\r
+using namespace BamTools;\r
+using std::string;\r
+using std::min;\r
+\r
+// constructor\r
+// Initializes all members, then allocates the compressed and uncompressed\r
+// block buffers. If allocation throws std::bad_alloc, an error message is\r
+// printed and the program is terminated with exit(1).\r
+BgzfData::BgzfData(void)\r
+ : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)\r
+ , CompressedBlockSize(MAX_BLOCK_SIZE)\r
+ , BlockLength(0)\r
+ , BlockOffset(0)\r
+ , BlockAddress(0)\r
+ , IsOpen(false)\r
+ , IsWriteOnly(false)\r
+ , IsWriteUncompressed(false)\r
+ , Stream(NULL)\r
+ , UncompressedBlock(NULL)\r
+ , CompressedBlock(NULL)\r
+{\r
+ try {\r
+ CompressedBlock = new char[CompressedBlockSize];\r
+ UncompressedBlock = new char[UncompressedBlockSize];\r
+ } catch( std::bad_alloc& ba ) {\r
+ printf("BGZF ERROR: unable to allocate memory for our BGZF object.\n");\r
+ exit(1);\r
+ }\r
+}\r
+\r
+// destructor\r
+// Frees both block buffers. delete[] on a null pointer does nothing,\r
+// so the pointers do not need to be tested first.\r
+BgzfData::~BgzfData(void) {\r
+    delete[] CompressedBlock;\r
+    delete[] UncompressedBlock;\r
+}\r
+\r
+// closes BGZF file\r
+// Does nothing when no file is open.\r
+void BgzfData::Close(void) {\r
+\r
+    if ( IsOpen ) {\r
+\r
+        // when writing: flush the current BGZF block first,\r
+        // then write an empty block (as EOF marker)\r
+        if ( IsWriteOnly ) {\r
+            FlushBlock();\r
+            const int eofBlockLength = DeflateBlock();\r
+            fwrite(CompressedBlock, 1, eofBlockLength, Stream);\r
+        }\r
+\r
+        // flush and close the underlying stream\r
+        fflush(Stream);\r
+        fclose(Stream);\r
+\r
+        // reset the state flags\r
+        IsWriteUncompressed = false;\r
+        IsOpen = false;\r
+    }\r
+}\r
+\r
+// Compresses the data buffered in UncompressedBlock into CompressedBlock as\r
+// one complete BGZF block: gzip header with the BGZF extra field, raw deflate\r
+// data, then a footer holding the CRC32 and the uncompressed length.\r
+// If the output does not fit, the input is shortened in 1024-byte steps and\r
+// retried; the unconsumed tail is moved to the front of UncompressedBlock.\r
+// Returns the total length (in bytes) of the block now in CompressedBlock and\r
+// sets BlockOffset to the number of input bytes left over.\r
+// Every failure is fatal: a message is printed and exit(1) is called.\r
+int BgzfData::DeflateBlock(void) {\r
+\r
+    // initialize the gzip header\r
+    char* buffer = CompressedBlock;\r
+    memset(buffer, 0, 18);\r
+    buffer[0] = GZIP_ID1;\r
+    buffer[1] = (char)GZIP_ID2;\r
+    buffer[2] = CM_DEFLATE;\r
+    buffer[3] = FLG_FEXTRA;\r
+    buffer[9] = (char)OS_UNKNOWN;\r
+    buffer[10] = BGZF_XLEN;\r
+    buffer[12] = BGZF_ID1;\r
+    buffer[13] = BGZF_ID2;\r
+    buffer[14] = BGZF_LEN;\r
+\r
+    // set compression level (zlib level 0 stores the data without compressing it)\r
+    const int compressionLevel = ( IsWriteUncompressed ? 0 : Z_DEFAULT_COMPRESSION );\r
+\r
+    // loop to retry for blocks that do not compress enough\r
+    int inputLength = BlockOffset;\r
+    int compressedLength = 0;\r
+    unsigned int bufferSize = CompressedBlockSize;\r
+\r
+    while ( true ) {\r
+\r
+        // initialize zstream values; zlib requires zalloc, zfree and opaque\r
+        // to be set by the caller before deflateInit2()\r
+        z_stream zs;\r
+        zs.zalloc = NULL;\r
+        zs.zfree = NULL;\r
+        zs.opaque = NULL;\r
+        zs.next_in = (Bytef*)UncompressedBlock;\r
+        zs.avail_in = inputLength;\r
+        zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];\r
+        zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;\r
+\r
+        // initialize the zlib compression algorithm\r
+        if ( deflateInit2(&zs, compressionLevel, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK ) {\r
+            printf("BGZF ERROR: zlib deflate initialization failed.\n");\r
+            exit(1);\r
+        }\r
+\r
+        // compress the data\r
+        int status = deflate(&zs, Z_FINISH);\r
+        if ( status != Z_STREAM_END ) {\r
+\r
+            deflateEnd(&zs);\r
+\r
+            // Z_OK here means the output buffer filled up before the input\r
+            // was consumed: reduce the input length and try again\r
+            if ( status == Z_OK ) {\r
+                inputLength -= 1024;\r
+                if( inputLength < 0 ) {\r
+                    printf("BGZF ERROR: input reduction failed.\n");\r
+                    exit(1);\r
+                }\r
+                continue;\r
+            }\r
+\r
+            // any other status is a genuine deflate() failure\r
+            printf("BGZF ERROR: zlib::deflate() failed.\n");\r
+            exit(1);\r
+        }\r
+\r
+        // finalize the compression routine\r
+        if ( deflateEnd(&zs) != Z_OK ) {\r
+            printf("BGZF ERROR: zlib::deflateEnd() failed.\n");\r
+            exit(1);\r
+        }\r
+\r
+        compressedLength = zs.total_out;\r
+        compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;\r
+        if ( compressedLength > MAX_BLOCK_SIZE ) {\r
+            printf("BGZF ERROR: deflate overflow.\n");\r
+            exit(1);\r
+        }\r
+\r
+        break;\r
+    }\r
+\r
+    // store the compressed length (written as total block size minus one)\r
+    BgzfData::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));\r
+\r
+    // store the CRC32 checksum and the uncompressed length in the footer\r
+    unsigned int crc = crc32(0, NULL, 0);\r
+    crc = crc32(crc, (Bytef*)UncompressedBlock, inputLength);\r
+    BgzfData::PackUnsignedInt(&buffer[compressedLength - 8], crc);\r
+    BgzfData::PackUnsignedInt(&buffer[compressedLength - 4], inputLength);\r
+\r
+    // ensure that we have less than a block of data left\r
+    int remaining = BlockOffset - inputLength;\r
+    if ( remaining > 0 ) {\r
+        if ( remaining > inputLength ) {\r
+            printf("BGZF ERROR: after deflate, remainder too large.\n");\r
+            exit(1);\r
+        }\r
+        // remaining <= inputLength, so source and destination cannot overlap\r
+        memcpy(UncompressedBlock, UncompressedBlock + inputLength, remaining);\r
+    }\r
+\r
+    BlockOffset = remaining;\r
+    return compressedLength;\r
+}\r
+\r
+// Writes all buffered data to the output stream, one compressed BGZF block\r
+// at a time, until BlockOffset reaches zero. A short write is fatal.\r
+void BgzfData::FlushBlock(void) {\r
+\r
+    while ( BlockOffset != 0 ) {\r
+\r
+        // deflate as much of the buffer as fits into a single block\r
+        const int compressedLength = DeflateBlock();\r
+\r
+        // emit the block; anything short of a complete write is an error\r
+        const int written = fwrite(CompressedBlock, 1, compressedLength, Stream);\r
+        if ( written != compressedLength ) {\r
+            printf("BGZF ERROR: expected to write %u bytes during flushing, but wrote %u bytes.\n", compressedLength, written);\r
+            exit(1);\r
+        }\r
+\r
+        // account for the compressed bytes just written\r
+        BlockAddress += compressedLength;\r
+    }\r
+}\r
+\r
+// De-compresses the BGZF block held in CompressedBlock into UncompressedBlock.\r
+// blockLength is the total size of the compressed block, header and footer included.\r
+// Returns the number of uncompressed bytes, or -1 (after printing a message) on failure.\r
+int BgzfData::InflateBlock(const int& blockLength) {\r
+\r
+    // a block shorter than header + footer cannot contain any deflate data\r
+    if ( blockLength < BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH ) {\r
+        printf("BGZF ERROR: could not decompress block - invalid block length\n");\r
+        return -1;\r
+    }\r
+\r
+    // zlib requires zalloc, zfree and opaque to be set by the caller before inflateInit2()\r
+    z_stream zs;\r
+    zs.zalloc = NULL;\r
+    zs.zfree = NULL;\r
+    zs.opaque = NULL;\r
+\r
+    // the deflate data starts right after the header; never offer zlib\r
+    // more bytes than the block actually contains\r
+    zs.next_in = (Bytef*)CompressedBlock + BLOCK_HEADER_LENGTH;\r
+    zs.avail_in = blockLength - BLOCK_HEADER_LENGTH;\r
+    zs.next_out = (Bytef*)UncompressedBlock;\r
+    zs.avail_out = UncompressedBlockSize;\r
+\r
+    int status = inflateInit2(&zs, GZIP_WINDOW_BITS);\r
+    if ( status != Z_OK ) {\r
+        printf("BGZF ERROR: could not decompress block - zlib::inflateInit() failed\n");\r
+        return -1;\r
+    }\r
+\r
+    // the whole block must inflate in a single call\r
+    status = inflate(&zs, Z_FINISH);\r
+    if ( status != Z_STREAM_END ) {\r
+        inflateEnd(&zs);\r
+        printf("BGZF ERROR: could not decompress block - zlib::inflate() failed\n");\r
+        return -1;\r
+    }\r
+\r
+    status = inflateEnd(&zs);\r
+    if ( status != Z_OK ) {\r
+        printf("BGZF ERROR: could not decompress block - zlib::inflateEnd() failed\n");\r
+        return -1;\r
+    }\r
+\r
+    return zs.total_out;\r
+}\r
+\r
+// Opens a BGZF file. mode is "rb" for reading or "wb" for writing.\r
+// The special names "stdin" (read mode only) and "stdout" (write mode only)\r
+// select the corresponding standard stream instead of a file on disk.\r
+// isWriteUncompressed is stored in IsWriteUncompressed on success.\r
+// Returns true on success; on failure prints a message and returns false.\r
+bool BgzfData::Open(const string& filename, const char* mode, bool isWriteUncompressed ) {\r
+\r
+    // determine open mode\r
+    if ( strcmp(mode, "rb") == 0 )\r
+        IsWriteOnly = false;\r
+    else if ( strcmp(mode, "wb") == 0)\r
+        IsWriteOnly = true;\r
+    else {\r
+        printf("BGZF ERROR: unknown file mode: %s\n", mode);\r
+        return false;\r
+    }\r
+\r
+    // ----------------------------------------------------------------\r
+    // open Stream to read to/write from file, stdin, or stdout\r
+    // stdin/stdout option contributed by Aaron Quinlan (2010-Jan-03)\r
+\r
+    // start from a null handle: if none of the branches below applies\r
+    // (e.g. "stdin" with "wb"), a previous value must not pass for success\r
+    Stream = NULL;\r
+\r
+    // read/write BGZF data to/from a file\r
+    if ( (filename != "stdin") && (filename != "stdout") )\r
+        Stream = fopen(filename.c_str(), mode);\r
+\r
+    // read BGZF data from stdin\r
+    else if ( (filename == "stdin") && !IsWriteOnly )\r
+        Stream = freopen(NULL, mode, stdin);\r
+\r
+    // write BGZF data to stdout\r
+    else if ( (filename == "stdout") && IsWriteOnly )\r
+        Stream = freopen(NULL, mode, stdout);\r
+\r
+    if ( !Stream ) {\r
+        printf("BGZF ERROR: unable to open file %s\n", filename.c_str() );\r
+        return false;\r
+    }\r
+\r
+    // set flags, return success\r
+    IsOpen = true;\r
+    IsWriteUncompressed = isWriteUncompressed;\r
+    return true;\r
+}\r
+\r
+// Reads up to dataLength uncompressed bytes into data.\r
+// Returns the number of bytes copied (fewer than requested when no more data\r
+// is available), 0 if the file is not open for reading or dataLength is 0,\r
+// and -1 if a block could not be read.\r
+int BgzfData::Read(char* data, const unsigned int dataLength) {\r
+\r
+ if ( !IsOpen || IsWriteOnly || dataLength == 0 ) return 0;\r
+\r
+ char* output = data;\r
+ unsigned int numBytesRead = 0;\r
+ while ( numBytesRead < dataLength ) {\r
+\r
+ // BlockLength and BlockOffset are unsigned; storing the difference in an int\r
+ // makes it negative when BlockOffset > BlockLength (Seek() sets BlockLength to 0\r
+ // and BlockOffset to the requested offset), which forces a block to be loaded\r
+ int bytesAvailable = BlockLength - BlockOffset;\r
+ if ( bytesAvailable <= 0 ) {\r
+ if ( !ReadBlock() ) return -1; \r
+ bytesAvailable = BlockLength - BlockOffset;\r
+ // still nothing to read after loading a block: stop and return what we have\r
+ if ( bytesAvailable <= 0 ) break;\r
+ }\r
+\r
+ // copy as much as the caller still wants, limited by what the block holds\r
+ char* buffer = UncompressedBlock;\r
+ int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable );\r
+ memcpy(output, buffer + BlockOffset, copyLength);\r
+\r
+ BlockOffset += copyLength;\r
+ output += copyLength;\r
+ numBytesRead += copyLength;\r
+ }\r
+\r
+ // the current block is used up: record the current stream position as the\r
+ // block address and reset the in-block state\r
+ if ( BlockOffset == BlockLength ) {\r
+ BlockAddress = ftell64(Stream);\r
+ BlockOffset = 0;\r
+ BlockLength = 0;\r
+ }\r
+\r
+ return numBytesRead;\r
+}\r
+\r
+// Reads the next BGZF block from Stream and inflates it into UncompressedBlock.\r
+// On success BlockAddress is set to the stream position where the block started\r
+// and BlockLength to its uncompressed size. Reaching end-of-file before any\r
+// header byte is read counts as success with BlockLength set to 0.\r
+// Returns false (after printing a message) on a truncated, invalid or\r
+// undecodable block.\r
+bool BgzfData::ReadBlock(void) {\r
+\r
+    char header[BLOCK_HEADER_LENGTH];\r
+    int64_t blockAddress = ftell64(Stream);\r
+\r
+    // no bytes at all: treat as end of data\r
+    int count = fread(header, 1, sizeof(header), Stream);\r
+    if ( count == 0 ) {\r
+        BlockLength = 0;\r
+        return true;\r
+    }\r
+\r
+    if ( count != sizeof(header) ) {\r
+        printf("BGZF ERROR: read block failed - could not read block header\n");\r
+        return false;\r
+    }\r
+\r
+    if ( !BgzfData::CheckBlockHeader(header) ) {\r
+        printf("BGZF ERROR: read block failed - invalid block header\n");\r
+        return false;\r
+    }\r
+\r
+    // the header stores the total block size minus one\r
+    int blockLength = BgzfData::UnpackUnsignedShort(&header[16]) + 1;\r
+\r
+    // the size comes from the file and must not be trusted: a value smaller\r
+    // than the header would make 'remaining' negative (a huge fread count),\r
+    // and a value larger than the buffer would overflow CompressedBlock\r
+    if ( blockLength < BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH ||\r
+         (unsigned int)blockLength > CompressedBlockSize ) {\r
+        printf("BGZF ERROR: read block failed - invalid block length\n");\r
+        return false;\r
+    }\r
+\r
+    // keep the header in front of the data so the buffer holds the whole block\r
+    char* compressedBlock = CompressedBlock;\r
+    memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH);\r
+    int remaining = blockLength - BLOCK_HEADER_LENGTH;\r
+\r
+    count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, Stream);\r
+    if ( count != remaining ) {\r
+        printf("BGZF ERROR: read block failed - could not read data from block\n");\r
+        return false;\r
+    }\r
+\r
+    count = InflateBlock(blockLength);\r
+    if ( count < 0 ) {\r
+        printf("BGZF ERROR: read block failed - could not decompress block data\n");\r
+        return false;\r
+    }\r
+\r
+    // BlockLength is 0 right after Seek(); in that case BlockOffset holds the\r
+    // requested in-block offset and must be preserved\r
+    if ( BlockLength != 0 )\r
+        BlockOffset = 0;\r
+\r
+    BlockAddress = blockAddress;\r
+    BlockLength = count;\r
+    return true;\r
+}\r
+\r
+// Seeks to a position in the BGZF file.\r
+// position packs two values: bits 16 and up give the file offset handed to\r
+// fseek64, the low 16 bits give the offset inside that block's uncompressed data.\r
+// Returns false if the file is not open or the underlying seek fails.\r
+bool BgzfData::Seek(int64_t position) {\r
+\r
+    if ( IsOpen ) {\r
+\r
+        // split the packed position into its two components\r
+        const int64_t fileOffset = (position >> 16) & 0xFFFFFFFFFFFFLL;\r
+        const int withinBlock = (position & 0xFFFF);\r
+\r
+        if ( fseek64(Stream, fileOffset, SEEK_SET) == 0 ) {\r
+            // BlockLength of 0 marks "no block loaded"; the next Read() loads one\r
+            BlockAddress = fileOffset;\r
+            BlockOffset = withinBlock;\r
+            BlockLength = 0;\r
+            return true;\r
+        }\r
+\r
+        printf("BGZF ERROR: unable to seek in file\n");\r
+    }\r
+\r
+    return false;\r
+}\r
+\r
+// Returns the current position in the BGZF file: BlockAddress shifted left\r
+// by 16 bits, combined with the low 16 bits of BlockOffset.\r
+// Returns 0 when no file is open.\r
+int64_t BgzfData::Tell(void) {\r
+    if ( !IsOpen ) return 0;\r
+    return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) );\r
+}\r
+\r
+// Appends dataLen bytes from data to the uncompressed block buffer, flushing\r
+// the buffer to the stream every time it becomes full.\r
+// Returns the number of bytes accepted, or 0 if the file is not open for writing.\r
+unsigned int BgzfData::Write(const char* data, const unsigned int dataLen) {\r
+\r
+    if ( !(IsOpen && IsWriteOnly) ) return 0;\r
+\r
+    const unsigned int capacity = UncompressedBlockSize;\r
+    unsigned int written = 0;\r
+\r
+    while ( written < dataLen ) {\r
+\r
+        // take whichever is smaller: the free space in the block or the data still pending\r
+        const unsigned int room = capacity - BlockOffset;\r
+        const unsigned int pending = dataLen - written;\r
+        const unsigned int chunk = ( pending < room ? pending : room );\r
+\r
+        memcpy(UncompressedBlock + BlockOffset, data + written, chunk);\r
+        BlockOffset += chunk;\r
+        written += chunk;\r
+\r
+        // block is full: compress it and write it out\r
+        if ( BlockOffset == capacity )\r
+            FlushBlock();\r
+    }\r
+\r
+    return written;\r
+}\r
--- /dev/null
+// ***************************************************************************\r
+// BGZF.h (c) 2009 Derek Barnett, Michael Str�mberg\r
+// Marth Lab, Department of Biology, Boston College\r
+// All rights reserved.\r
+// ---------------------------------------------------------------------------\r
+// Last modified: 16 August 2010 (DB)\r
+// ---------------------------------------------------------------------------\r
+// BGZF routines were adapted from the bgzf.c code developed at the Broad\r
+// Institute.\r
+// ---------------------------------------------------------------------------\r
+// Provides the basic functionality for reading & writing BGZF files\r
+// ***************************************************************************\r
+\r
+#ifndef BGZF_H\r
+#define BGZF_H\r
+\r
+// 'C' includes\r
+#include <cstdio>\r
+#include <cstdlib>\r
+#include <cstring>\r
+\r
+// C++ includes\r
+#include <string>\r
+\r
+// zlib includes\r
+#include "zlib.h"\r
+\r
+// Platform-specific large-file support\r
+// Defines ftell64/fseek64 as aliases for the platform's tell/seek functions:\r
+// _ftelli64/_fseeki64 when WIN32 is defined, ftello/fseeko otherwise.\r
+// The BAMTOOLS_LFS guard keeps the macros from being defined twice.\r
+#ifndef BAMTOOLS_LFS\r
+#define BAMTOOLS_LFS\r
+ #ifdef WIN32\r
+ #define ftell64(a) _ftelli64(a)\r
+ #define fseek64(a,b,c) _fseeki64(a,b,c)\r
+ #else\r
+ #define ftell64(a) ftello(a)\r
+ #define fseek64(a,b,c) fseeko(a,b,c) \r
+ #endif\r
+#endif // BAMTOOLS_LFS\r
+\r
+// Platform-specific type definitions\r
+// Provides the fixed-width integer type names: hand-written typedefs when\r
+// _MSC_VER is defined, <stdint.h> otherwise. The BAMTOOLS_TYPES guard is\r
+// shared with BamAux.h so the typedefs are only introduced once.\r
+// NOTE(review): int8_t is mapped to plain 'char', whose signedness depends on\r
+// compiler settings - confirm this is intended.\r
+#ifndef BAMTOOLS_TYPES\r
+#define BAMTOOLS_TYPES\r
+ #ifdef _MSC_VER\r
+ typedef char int8_t;\r
+ typedef unsigned char uint8_t;\r
+ typedef short int16_t;\r
+ typedef unsigned short uint16_t;\r
+ typedef int int32_t;\r
+ typedef unsigned int uint32_t;\r
+ typedef long long int64_t;\r
+ typedef unsigned long long uint64_t;\r
+ #else \r
+ #include <stdint.h>\r
+ #endif\r
+#endif // BAMTOOLS_TYPES\r
+\r
+namespace BamTools {\r
+\r
+// zlib constants\r
+// Values written into (and checked against) the gzip/BGZF block header by\r
+// DeflateBlock() and CheckBlockHeader().\r
+// first two header bytes\r
+const int GZIP_ID1 = 31;\r
+const int GZIP_ID2 = 139;\r
+// compression method byte\r
+const int CM_DEFLATE = 8;\r
+// flag bit tested in header byte 3 (extra field present)\r
+const int FLG_FEXTRA = 4;\r
+// operating-system byte\r
+const int OS_UNKNOWN = 255;\r
+// length of the extra field, its two identifier bytes (ASCII 'B' and 'C')\r
+// and the length of its payload\r
+const int BGZF_XLEN = 6;\r
+const int BGZF_ID1 = 66;\r
+const int BGZF_ID2 = 67;\r
+const int BGZF_LEN = 2;\r
+// windowBits / memLevel arguments passed to deflateInit2() and inflateInit2()\r
+const int GZIP_WINDOW_BITS = -15;\r
+const int Z_DEFAULT_MEM_LEVEL = 8;\r
+\r
+// BGZF constants\r
+// sizes in bytes of the block header and footer, and the block buffer sizes\r
+const int BLOCK_HEADER_LENGTH = 18;\r
+const int BLOCK_FOOTER_LENGTH = 8;\r
+const int MAX_BLOCK_SIZE = 65536;\r
+const int DEFAULT_BLOCK_SIZE = 65536;\r
+\r
+// Holds the state needed to read or write one BGZF file: the stream, the\r
+// compressed/uncompressed block buffers and the current position in them.\r
+// NOTE(review): the struct owns two heap buffers and declares a destructor but\r
+// no copy constructor / assignment operator - copying an instance looks unsafe; confirm it is never copied.\r
+struct BgzfData {\r
+\r
+ // data members\r
+ public:\r
+ // sizes (in bytes) of the two block buffers\r
+ unsigned int UncompressedBlockSize;\r
+ unsigned int CompressedBlockSize;\r
+ // number of valid bytes in UncompressedBlock (reading) and current offset into it\r
+ unsigned int BlockLength;\r
+ unsigned int BlockOffset;\r
+ // stream position associated with the current block\r
+ uint64_t BlockAddress;\r
+ // state flags set by Open() and cleared by Close()\r
+ bool IsOpen;\r
+ bool IsWriteOnly;\r
+ bool IsWriteUncompressed;\r
+ // underlying C stream\r
+ FILE* Stream;\r
+ // block buffers\r
+ char* UncompressedBlock;\r
+ char* CompressedBlock;\r
+\r
+ // constructor & destructor\r
+ public:\r
+ BgzfData(void);\r
+ ~BgzfData(void);\r
+\r
+ // main interface methods\r
+ public: \r
+ // closes BGZF file\r
+ void Close(void);\r
+ // opens the BGZF file (mode is either "rb" for reading, or "wb" for writing)\r
+ bool Open(const std::string& filename, const char* mode, bool isWriteUncompressed = false);\r
+ // reads BGZF data into a byte buffer\r
+ int Read(char* data, const unsigned int dataLength);\r
+ // seek to position in BGZF file\r
+ bool Seek(int64_t position);\r
+ // get file position in BGZF file\r
+ int64_t Tell(void);\r
+ // writes the supplied data into the BGZF buffer\r
+ unsigned int Write(const char* data, const unsigned int dataLen);\r
+\r
+ // internal methods\r
+ private:\r
+ // compresses the current block\r
+ int DeflateBlock(void);\r
+ // flushes the data in the BGZF block\r
+ void FlushBlock(void);\r
+ // de-compresses the current block\r
+ int InflateBlock(const int& blockLength);\r
+ // reads a BGZF block\r
+ bool ReadBlock(void);\r
+ \r
+ // static 'utility' methods\r
+ public:\r
+ // checks BGZF block header\r
+ static inline bool CheckBlockHeader(char* header);\r
+ // packs an unsigned integer into the specified buffer\r
+ static inline void PackUnsignedInt(char* buffer, unsigned int value);\r
+ // packs an unsigned short into the specified buffer\r
+ static inline void PackUnsignedShort(char* buffer, unsigned short value);\r
+ // unpacks a buffer into a double\r
+ static inline double UnpackDouble(char* buffer);\r
+ static inline double UnpackDouble(const char* buffer);\r
+ // unpacks a buffer into a float\r
+ static inline float UnpackFloat(char* buffer);\r
+ static inline float UnpackFloat(const char* buffer);\r
+ // unpacks a buffer into a signed int\r
+ static inline signed int UnpackSignedInt(char* buffer);\r
+ static inline signed int UnpackSignedInt(const char* buffer);\r
+ // unpacks a buffer into a signed short\r
+ static inline signed short UnpackSignedShort(char* buffer);\r
+ static inline signed short UnpackSignedShort(const char* buffer);\r
+ // unpacks a buffer into an unsigned int\r
+ static inline unsigned int UnpackUnsignedInt(char* buffer);\r
+ static inline unsigned int UnpackUnsignedInt(const char* buffer);\r
+ // unpacks a buffer into an unsigned short\r
+ static inline unsigned short UnpackUnsignedShort(char* buffer);\r
+ static inline unsigned short UnpackUnsignedShort(const char* buffer);\r
+};\r
+\r
+// -------------------------------------------------------------\r
+// static 'utility' method implementations\r
+\r
+// Checks a BGZF block header.\r
+// Returns true only if the two gzip id bytes, the compression method, the\r
+// FEXTRA flag and the BGZF extra-field values (XLEN, both id bytes, LEN) all\r
+// match the expected constants. Reads header[0] through header[15].\r
+inline\r
+bool BgzfData::CheckBlockHeader(char* header) {\r
+    if ( header[0] != GZIP_ID1 ) return false;\r
+    if ( header[1] != (char)GZIP_ID2 ) return false;\r
+    if ( header[2] != Z_DEFLATED ) return false;\r
+    if ( (header[3] & FLG_FEXTRA) == 0 ) return false;\r
+    if ( BgzfData::UnpackUnsignedShort(&header[10]) != BGZF_XLEN ) return false;\r
+    if ( header[12] != BGZF_ID1 ) return false;\r
+    if ( header[13] != BGZF_ID2 ) return false;\r
+    return ( BgzfData::UnpackUnsignedShort(&header[14]) == BGZF_LEN );\r
+}\r
+\r
+// 'packs' an unsigned integer into the first 4 bytes of the buffer,\r
+// least-significant byte first\r
+inline\r
+void BgzfData::PackUnsignedInt(char* buffer, unsigned int value) {\r
+    for ( int i = 0; i < 4; ++i )\r
+        buffer[i] = (char)(value >> (8 * i));\r
+}\r
+\r
+// 'packs' an unsigned short into the first 2 bytes of the buffer,\r
+// least-significant byte first\r
+inline\r
+void BgzfData::PackUnsignedShort(char* buffer, unsigned short value) {\r
+    for ( int i = 0; i < 2; ++i )\r
+        buffer[i] = (char)(value >> (8 * i));\r
+}\r
+\r
+// 'unpacks' the first 8 bytes of a buffer into a double (includes both non-const & const char* flavors)\r
+// The bytes are copied in memory order, i.e. no byte-order conversion is done here.\r
+inline\r
+double BgzfData::UnpackDouble(char* buffer) {\r
+    // same operation as the const flavor\r
+    return BgzfData::UnpackDouble( (const char*)buffer );\r
+}\r
+\r
+inline\r
+double BgzfData::UnpackDouble(const char* buffer) {\r
+    // memcpy is the well-defined way to reinterpret raw bytes as another type\r
+    // (reading the inactive member of a union is undefined behavior in C++)\r
+    double value = 0;\r
+    memcpy(&value, buffer, 8);\r
+    return value;\r
+}\r
+\r
+// 'unpacks' the first 4 bytes of a buffer into a float (includes both non-const & const char* flavors)\r
+// The bytes are copied in memory order, i.e. no byte-order conversion is done here.\r
+inline\r
+float BgzfData::UnpackFloat(char* buffer) {\r
+    // same operation as the const flavor\r
+    return BgzfData::UnpackFloat( (const char*)buffer );\r
+}\r
+\r
+inline\r
+float BgzfData::UnpackFloat(const char* buffer) {\r
+    // memcpy is the well-defined way to reinterpret raw bytes as another type\r
+    // (reading the inactive member of a union is undefined behavior in C++)\r
+    float value = 0;\r
+    memcpy(&value, buffer, 4);\r
+    return value;\r
+}\r
+\r
+// 'unpacks' the first 4 bytes of a buffer into a signed int (includes both non-const & const char* flavors)\r
+// The bytes are copied in memory order, i.e. no byte-order conversion is done here.\r
+inline\r
+signed int BgzfData::UnpackSignedInt(char* buffer) {\r
+    // same operation as the const flavor\r
+    return BgzfData::UnpackSignedInt( (const char*)buffer );\r
+}\r
+\r
+inline\r
+signed int BgzfData::UnpackSignedInt(const char* buffer) {\r
+    // memcpy is the well-defined way to reinterpret raw bytes as another type\r
+    // (reading the inactive member of a union is undefined behavior in C++)\r
+    signed int value = 0;\r
+    memcpy(&value, buffer, 4);\r
+    return value;\r
+}\r
+\r
+// 'unpacks' the first 2 bytes of a buffer into a signed short (includes both non-const & const char* flavors)\r
+// The bytes are copied in memory order, i.e. no byte-order conversion is done here.\r
+inline\r
+signed short BgzfData::UnpackSignedShort(char* buffer) {\r
+    // same operation as the const flavor\r
+    return BgzfData::UnpackSignedShort( (const char*)buffer );\r
+}\r
+\r
+inline\r
+signed short BgzfData::UnpackSignedShort(const char* buffer) {\r
+    // memcpy is the well-defined way to reinterpret raw bytes as another type\r
+    // (reading the inactive member of a union is undefined behavior in C++)\r
+    signed short value = 0;\r
+    memcpy(&value, buffer, 2);\r
+    return value;\r
+}\r
+\r
+// 'unpacks' the first 4 bytes of a buffer into an unsigned int (includes both non-const & const char* flavors)\r
+// The bytes are copied in memory order, i.e. no byte-order conversion is done here.\r
+inline\r
+unsigned int BgzfData::UnpackUnsignedInt(char* buffer) {\r
+    // same operation as the const flavor\r
+    return BgzfData::UnpackUnsignedInt( (const char*)buffer );\r
+}\r
+\r
+inline\r
+unsigned int BgzfData::UnpackUnsignedInt(const char* buffer) {\r
+    // memcpy is the well-defined way to reinterpret raw bytes as another type\r
+    // (reading the inactive member of a union is undefined behavior in C++)\r
+    unsigned int value = 0;\r
+    memcpy(&value, buffer, 4);\r
+    return value;\r
+}\r
+\r
+// 'unpacks' the first 2 bytes of a buffer into an unsigned short (includes both non-const & const char* flavors)\r
+// The bytes are copied in memory order, i.e. no byte-order conversion is done here.\r
+inline\r
+unsigned short BgzfData::UnpackUnsignedShort(char* buffer) {\r
+    // same operation as the const flavor\r
+    return BgzfData::UnpackUnsignedShort( (const char*)buffer );\r
+}\r
+\r
+inline\r
+unsigned short BgzfData::UnpackUnsignedShort(const char* buffer) {\r
+    // memcpy is the well-defined way to reinterpret raw bytes as another type\r
+    // (reading the inactive member of a union is undefined behavior in C++)\r
+    unsigned short value = 0;\r
+    memcpy(&value, buffer, 2);\r
+    return value;\r
+}\r
+\r
+} // namespace BamTools\r
+\r
+#endif // BGZF_H\r
--- /dev/null
+// ***************************************************************************\r
+// BamAux.h (c) 2009 Derek Barnett, Michael Str�mberg\r
+// Marth Lab, Department of Biology, Boston College\r
+// All rights reserved.\r
+// ---------------------------------------------------------------------------\r
+// Last modified: 27 July 2010 (DB)\r
+// ---------------------------------------------------------------------------\r
+// Provides the basic constants, data structures, etc. for using BAM files\r
+// ***************************************************************************\r
+\r
+#ifndef BAMAUX_H\r
+#define BAMAUX_H\r
+\r
+// C includes\r
+#include <cctype>\r
+#include <cstdio>\r
+#include <cstdlib>\r
+#include <cstring>\r
+\r
+// C++ includes\r
+#include <exception>\r
+#include <map>\r
+#include <string>\r
+#include <utility>\r
+#include <vector>\r
+\r
+// Platform-specific type definitions\r
+// Provides the fixed-width integer type names: hand-written typedefs when\r
+// _MSC_VER is defined, <stdint.h> otherwise. The BAMTOOLS_TYPES guard is\r
+// shared with BGZF.h so the typedefs are only introduced once.\r
+// NOTE(review): int8_t is mapped to plain 'char', whose signedness depends on\r
+// compiler settings - confirm this is intended.\r
+#ifndef BAMTOOLS_TYPES\r
+#define BAMTOOLS_TYPES\r
+ #ifdef _MSC_VER\r
+ typedef char int8_t;\r
+ typedef unsigned char uint8_t;\r
+ typedef short int16_t;\r
+ typedef unsigned short uint16_t;\r
+ typedef int int32_t;\r
+ typedef unsigned int uint32_t;\r
+ typedef long long int64_t;\r
+ typedef unsigned long long uint64_t;\r
+ #else\r
+ #include <stdint.h>\r
+ #endif\r
+#endif // BAMTOOLS_TYPES\r
+\r
+namespace BamTools {\r
+\r
+// BAM constants\r
+// BAM_CORE_SIZE: presumably the size in bytes of the fixed-length part of an alignment record - TODO confirm\r
+const int BAM_CORE_SIZE = 32;\r
+// numeric codes 0-6; presumably these correspond, in order, to the CIGAR\r
+// operation letters MIDNSHP used by CigarOp::Type - TODO confirm\r
+const int BAM_CMATCH = 0;\r
+const int BAM_CINS = 1;\r
+const int BAM_CDEL = 2;\r
+const int BAM_CREF_SKIP = 3;\r
+const int BAM_CSOFT_CLIP = 4;\r
+const int BAM_CHARD_CLIP = 5;\r
+const int BAM_CPAD = 6;\r
+// BAM_CIGAR_MASK selects the low BAM_CIGAR_SHIFT bits (value 15)\r
+const int BAM_CIGAR_SHIFT = 4;\r
+const int BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1);\r
+\r
+// BAM index constants\r
+const int MAX_BIN = 37450; // =(8^6-1)/7+1\r
+const int BAM_MIN_CHUNK_GAP = 32768;\r
+const int BAM_LIDX_SHIFT = 14;\r
+\r
+// Explicit variable sizes\r
+const int BT_SIZEOF_INT = 4;\r
+\r
+struct CigarOp;\r
+\r
+// Represents a single BAM alignment: the public data fields of the record,\r
+// queries and setters for the bits of AlignmentFlag, and accessors for tag data.\r
+struct BamAlignment {\r
+\r
+ // constructors & destructor\r
+ public:\r
+ BamAlignment(void);\r
+ BamAlignment(const BamAlignment& other);\r
+ ~BamAlignment(void);\r
+\r
+ // Queries against alignment flags\r
+ public: \r
+ bool IsDuplicate(void) const; // Returns true if this read is a PCR duplicate\r
+ bool IsFailedQC(void) const; // Returns true if this read failed quality control\r
+ bool IsFirstMate(void) const; // Returns true if alignment is first mate on read\r
+ bool IsMapped(void) const; // Returns true if alignment is mapped\r
+ bool IsMateMapped(void) const; // Returns true if alignment's mate is mapped\r
+ bool IsMateReverseStrand(void) const; // Returns true if alignment's mate mapped to reverse strand\r
+ bool IsPaired(void) const; // Returns true if alignment part of paired-end read\r
+ bool IsPrimaryAlignment(void) const; // Returns true if reported position is primary alignment\r
+ bool IsProperPair(void) const; // Returns true if alignment is part of read that satisfied paired-end resolution\r
+ bool IsReverseStrand(void) const; // Returns true if alignment mapped to reverse strand\r
+ bool IsSecondMate(void) const; // Returns true if alignment is second mate on read\r
+\r
+ // Manipulate alignment flags\r
+ public: \r
+ void SetIsDuplicate(bool ok); // Sets "PCR duplicate" flag\r
+ void SetIsFailedQC(bool ok); // Sets "failed quality control" flag\r
+ void SetIsFirstMate(bool ok); // Sets "alignment is first mate" flag\r
+ void SetIsMateUnmapped(bool ok); // Sets "alignment's mate is unmapped" flag\r
+ void SetIsMateReverseStrand(bool ok); // Sets "alignment's mate mapped to reverse strand" flag\r
+ void SetIsPaired(bool ok); // Sets "alignment part of paired-end read" flag\r
+ void SetIsProperPair(bool ok); // Sets "alignment is part of read that satisfied paired-end resolution" flag\r
+ void SetIsReverseStrand(bool ok); // Sets "alignment mapped to reverse strand" flag\r
+ void SetIsSecondaryAlignment(bool ok); // Sets "alignment is secondary (not primary)" flag\r
+ void SetIsSecondMate(bool ok); // Sets "alignment is second mate on read" flag\r
+ void SetIsUnmapped(bool ok); // Sets "alignment is unmapped" flag\r
+\r
+ // Tag data access methods\r
+ public:\r
+ // -------------------------------------------------------------------------------------\r
+ // N.B. - The following tag-modifying methods may not be used on BamAlignments fetched\r
+ // using BamReader::GetNextAlignmentCore(). Attempting to use them will not result in an\r
+ // error message (to keep output clean) but will ALWAYS return false. Only user-\r
+ // generated BamAlignments or those retrieved using BamReader::GetNextAlignment() are valid.\r
+\r
+ // add tag data (create new TAG entry with TYPE and VALUE)\r
+ // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details\r
+ // returns true if new data added, false if error or TAG already exists\r
+ // N.B. - will NOT modify existing tag. Use EditTag() instead\r
+ bool AddTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H\r
+ bool AddTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i\r
+ bool AddTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i\r
+ bool AddTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f\r
+ \r
+ // edit tag data (sets existing TAG with TYPE to VALUE or adds new TAG if not already present)\r
+ // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details\r
+ // returns true if edit was successful, false if error\r
+ bool EditTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H\r
+ bool EditTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i\r
+ bool EditTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i\r
+ bool EditTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f\r
+\r
+ // specific tag data access methods - these only remain for legacy support\r
+ bool GetEditDistance(uint32_t& editDistance) const; // get "NM" tag data (implemented as GetTag("NM", editDistance))\r
+ bool GetReadGroup(std::string& readGroup) const; // get "RG" tag data (implemented as GetTag("RG", readGroup))\r
+ \r
+ // generic tag data access methods \r
+ bool GetTag(const std::string& tag, std::string& destination) const; // access variable-length char or hex strings\r
+ bool GetTag(const std::string& tag, uint32_t& destination) const; // access unsigned integer data\r
+ bool GetTag(const std::string& tag, int32_t& destination) const; // access signed integer data\r
+ bool GetTag(const std::string& tag, float& destination) const; // access floating point data\r
+ \r
+ // remove tag data\r
+ // returns true if removal was successful, false if error\r
+ // N.B. - returns false if TAG does not exist (no removal can occur)\r
+ bool RemoveTag(const std::string& tag);\r
+\r
+ // Additional data access methods\r
+ public:\r
+ int GetEndPosition(bool usePadded = false) const; // calculates alignment end position, based on starting position and CIGAR operations\r
+\r
+ // 'internal' utility methods \r
+ private:\r
+ static bool FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed);\r
+ static bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed);\r
+\r
+ // Data members\r
+ public:\r
+ std::string Name; // Read name\r
+ int32_t Length; // Query length\r
+ std::string QueryBases; // 'Original' sequence (as reported from sequencing machine)\r
+ std::string AlignedBases; // 'Aligned' sequence (includes any indels, padding, clipping)\r
+ std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values)\r
+ std::string TagData; // Tag data (accessor methods will pull the requested information out)\r
+ int32_t RefID; // ID number for reference sequence\r
+ int32_t Position; // Position (0-based) where alignment starts\r
+ uint16_t Bin; // Bin in BAM file where this alignment resides\r
+ uint16_t MapQuality; // Mapping quality score\r
+ uint32_t AlignmentFlag; // Alignment bit-flag - see Is<something>() methods to query this value, SetIs<something>() methods to manipulate\r
+ std::vector<CigarOp> CigarData; // CIGAR operations for this alignment\r
+ int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned\r
+ int32_t MatePosition; // Position (0-based) where alignment's mate starts\r
+ int32_t InsertSize; // Mate-pair insert size\r
+ \r
+ // internal data\r
+ private:\r
+ // raw character data and lengths kept alongside the parsed fields;\r
+ // every numeric member starts at 0 and HasCoreOnly at false\r
+ struct BamAlignmentSupportData {\r
+ \r
+ // data members\r
+ std::string AllCharData;\r
+ uint32_t BlockLength;\r
+ uint32_t NumCigarOperations;\r
+ uint32_t QueryNameLength;\r
+ uint32_t QuerySequenceLength;\r
+ bool HasCoreOnly;\r
+ \r
+ // constructor\r
+ BamAlignmentSupportData(void)\r
+ : BlockLength(0)\r
+ , NumCigarOperations(0)\r
+ , QueryNameLength(0)\r
+ , QuerySequenceLength(0)\r
+ , HasCoreOnly(false)\r
+ { }\r
+ };\r
+ \r
+ // contains raw character data & lengths\r
+ BamAlignmentSupportData SupportData; \r
+ \r
+ // allow these classes access to BamAlignment private members (SupportData)\r
+ // but client code should not need to touch this data\r
+ friend class BamReader;\r
+ friend class BamWriter;\r
+\r
+ // Alignment flag query constants (bit masks tested against AlignmentFlag)\r
+ // Use the get/set methods above instead\r
+ private:\r
+ enum { PAIRED = 1\r
+ , PROPER_PAIR = 2\r
+ , UNMAPPED = 4\r
+ , MATE_UNMAPPED = 8\r
+ , REVERSE = 16\r
+ , MATE_REVERSE = 32\r
+ , READ_1 = 64\r
+ , READ_2 = 128\r
+ , SECONDARY = 256\r
+ , QC_FAILED = 512\r
+ , DUPLICATE = 1024 \r
+ };\r
+};\r
+\r
+// ----------------------------------------------------------------\r
+// Auxiliary data structs & typedefs\r
+\r
+// A single CIGAR operation: an operation type and the number of bases it covers.\r
+struct CigarOp {\r
+\r
+    // data members\r
+    char Type; // Operation type (MIDNSHP)\r
+    uint32_t Length; // Operation length (number of bases)\r
+\r
+    // constructor - both arguments are optional and default to '\0' and 0\r
+    CigarOp(const char type = '\0', const uint32_t length = 0)\r
+    {\r
+        Type = type;\r
+        Length = length;\r
+    }\r
+};\r
+\r
+// Describes one reference sequence: its name, its length and whether any\r
+// alignments are mapped to it.\r
+struct RefData {\r
+\r
+    // data members\r
+    std::string RefName; // Name of reference sequence\r
+    int32_t RefLength; // Length of reference sequence\r
+    bool RefHasAlignments; // True if BAM file contains alignments mapped to reference sequence\r
+\r
+    // constructor - RefName starts out empty; length defaults to 0 and ok to false\r
+    RefData(const int32_t& length = 0, bool ok = false)\r
+    {\r
+        RefLength = length;\r
+        RefHasAlignments = ok;\r
+    }\r
+};\r
+\r
+typedef std::vector<RefData> RefVector;\r
+typedef std::vector<BamAlignment> BamAlignmentVector;\r
+\r
+// A region given by a left and a right boundary, each consisting of a\r
+// reference ID and a position. All four values default to -1.\r
+struct BamRegion {\r
+\r
+    // data members\r
+    int LeftRefID;\r
+    int LeftPosition;\r
+    int RightRefID;\r
+    int RightPosition;\r
+\r
+    // constructor\r
+    BamRegion(const int& leftID = -1,\r
+              const int& leftPos = -1,\r
+              const int& rightID = -1,\r
+              const int& rightPos = -1)\r
+    {\r
+        LeftRefID = leftID;\r
+        LeftPosition = leftPos;\r
+        RightRefID = rightID;\r
+        RightPosition = rightPos;\r
+    }\r
+};\r
+\r
+// ----------------------------------------------------------------\r
+// Added: 3-35-2010 DWB\r
+// Fixed: Routines to provide endian-correctness\r
+// ----------------------------------------------------------------\r
+\r
+// returns true if system is big endian, i.e. if the first byte in memory of\r
+// the 16-bit value 0x0001 is the zero (most significant) byte\r
+inline bool SystemIsBigEndian(void) {\r
+    const uint16_t probe = 0x0001;\r
+    const char* firstByte = (const char*)&probe;\r
+    return ( firstByte[0] == 0 );\r
+}\r
+\r
+// swaps endianness of 16-bit value 'in place'\r
+// The shifts are done on an unsigned copy: right-shifting a negative signed\r
+// value would copy the sign bit into the upper byte of the result.\r
+inline void SwapEndian_16(int16_t& x) {\r
+    const uint16_t u = static_cast<uint16_t>(x);\r
+    x = static_cast<int16_t>( static_cast<uint16_t>((u >> 8) | (u << 8)) );\r
+}\r
+\r
+// swaps the two bytes of an unsigned 16-bit value 'in place'\r
+inline void SwapEndian_16(uint16_t& x) {\r
+    const uint16_t lowByte = x & 0x00FF;\r
+    const uint16_t highByte = x >> 8;\r
+    x = (lowByte << 8) | highByte;\r
+}\r
+\r
+// swaps endianness of 32-bit value 'in-place'\r
+// The shifts are done on an unsigned copy: right-shifting a negative signed\r
+// value would copy the sign bit into the upper bytes of the result.\r
+inline void SwapEndian_32(int32_t& x) {\r
+    const uint32_t u = static_cast<uint32_t>(x);\r
+    x = static_cast<int32_t>( (u >> 24) |\r
+                              ((u << 8) & 0x00FF0000) |\r
+                              ((u >> 8) & 0x0000FF00) |\r
+                              (u << 24) );\r
+}\r
+\r
+// reverses the order of the four bytes of an unsigned 32-bit value 'in-place'\r
+inline void SwapEndian_32(uint32_t& x) {\r
+    const uint32_t byte0 = x & 0xFF;\r
+    const uint32_t byte1 = (x >> 8) & 0xFF;\r
+    const uint32_t byte2 = (x >> 16) & 0xFF;\r
+    const uint32_t byte3 = (x >> 24) & 0xFF;\r
+    x = (byte0 << 24) | (byte1 << 16) | (byte2 << 8) | byte3;\r
+}\r
+\r
+// swaps endianness of 64-bit value 'in-place'\r
+// The shifts are done on an unsigned copy: right-shifting a negative signed\r
+// value would copy the sign bit into the upper bytes of the result.\r
+inline void SwapEndian_64(int64_t& x) {\r
+    const uint64_t u = static_cast<uint64_t>(x);\r
+    x = static_cast<int64_t>( (u >> 56) |\r
+                              ((u << 40) & 0x00FF000000000000ull) |\r
+                              ((u << 24) & 0x0000FF0000000000ull) |\r
+                              ((u << 8) & 0x000000FF00000000ull) |\r
+                              ((u >> 8) & 0x00000000FF000000ull) |\r
+                              ((u >> 24) & 0x0000000000FF0000ull) |\r
+                              ((u >> 40) & 0x000000000000FF00ull) |\r
+                              (u << 56) );\r
+}\r
+\r
+// reverses the order of the eight bytes of an unsigned 64-bit value 'in-place'\r
+inline void SwapEndian_64(uint64_t& x) {\r
+    uint64_t source = x;\r
+    uint64_t reversed = 0;\r
+    // peel bytes off the low end of the source and push them onto the result,\r
+    // so the lowest source byte ends up as the highest result byte\r
+    for ( int i = 0; i < 8; ++i ) {\r
+        reversed = (reversed << 8) | (source & 0xFF);\r
+        source >>= 8;\r
+    }\r
+    x = reversed;\r
+}\r
+\r
+// swaps endianness of 'next 2 bytes' in a char buffer (in-place)\r
+// The bytes are copied into a properly typed local and copied back: 'data' may\r
+// point at any offset of a byte buffer, so viewing it through a uint16_t\r
+// reference is an unaligned access and breaks the aliasing rules.\r
+inline void SwapEndian_16p(char* data) {\r
+    uint16_t value;\r
+    memcpy(&value, data, sizeof(value));\r
+    SwapEndian_16(value);\r
+    memcpy(data, &value, sizeof(value));\r
+}\r
+\r
+// swaps endianness of 'next 4 bytes' in a char buffer (in-place)\r
+// The bytes are copied into a properly typed local and copied back: 'data' may\r
+// point at any offset of a byte buffer, so viewing it through a uint32_t\r
+// reference is an unaligned access and breaks the aliasing rules.\r
+inline void SwapEndian_32p(char* data) {\r
+    uint32_t value;\r
+    memcpy(&value, data, sizeof(value));\r
+    SwapEndian_32(value);\r
+    memcpy(data, &value, sizeof(value));\r
+}\r
+\r
+// swaps endianness of 'next 8 bytes' in a char buffer (in-place)\r
+// The bytes are copied into a properly typed local and copied back: 'data' may\r
+// point at any offset of a byte buffer, so viewing it through a uint64_t\r
+// reference is an unaligned access and breaks the aliasing rules.\r
+inline void SwapEndian_64p(char* data) {\r
+    uint64_t value;\r
+    memcpy(&value, data, sizeof(value));\r
+    SwapEndian_64(value);\r
+    memcpy(data, &value, sizeof(value));\r
+}\r
+\r
+// ----------------------------------------------------------------\r
+// BamAlignment member methods\r
+\r
+// constructors & destructor\r
+// default constructor: names no member initializers and has an empty body\r
+// NOTE(review): members of built-in type presumably start out uninitialized\r
+// unless the class declaration provides defaults - confirm against the declaration\r
+inline BamAlignment::BamAlignment(void) { }\r
+\r
+// copy constructor: copies each listed member from 'other' one-to-one\r
+inline BamAlignment::BamAlignment(const BamAlignment& other)\r
+ : Name(other.Name)\r
+ , Length(other.Length)\r
+ , QueryBases(other.QueryBases)\r
+ , AlignedBases(other.AlignedBases)\r
+ , Qualities(other.Qualities)\r
+ , TagData(other.TagData)\r
+ , RefID(other.RefID)\r
+ , Position(other.Position)\r
+ , Bin(other.Bin)\r
+ , MapQuality(other.MapQuality)\r
+ , AlignmentFlag(other.AlignmentFlag)\r
+ , CigarData(other.CigarData)\r
+ , MateRefID(other.MateRefID)\r
+ , MatePosition(other.MatePosition)\r
+ , InsertSize(other.InsertSize)\r
+ , SupportData(other.SupportData)\r
+{ }\r
+\r
+// destructor: nothing to release explicitly\r
+inline BamAlignment::~BamAlignment(void) { }\r
+\r
+// Queries against alignment flags\r
+// Each query tests one bit of AlignmentFlag. IsMapped, IsMateMapped and\r
+// IsPrimaryAlignment report true when their bit is CLEAR; all others when it is SET.\r
+inline bool BamAlignment::IsDuplicate(void) const         { return (AlignmentFlag & DUPLICATE)     ? true  : false; }\r
+inline bool BamAlignment::IsFailedQC(void) const          { return (AlignmentFlag & QC_FAILED)     ? true  : false; }\r
+inline bool BamAlignment::IsFirstMate(void) const         { return (AlignmentFlag & READ_1)        ? true  : false; }\r
+inline bool BamAlignment::IsMapped(void) const            { return (AlignmentFlag & UNMAPPED)      ? false : true;  }\r
+inline bool BamAlignment::IsMateMapped(void) const        { return (AlignmentFlag & MATE_UNMAPPED) ? false : true;  }\r
+inline bool BamAlignment::IsMateReverseStrand(void) const { return (AlignmentFlag & MATE_REVERSE)  ? true  : false; }\r
+inline bool BamAlignment::IsPaired(void) const            { return (AlignmentFlag & PAIRED)        ? true  : false; }\r
+inline bool BamAlignment::IsPrimaryAlignment(void) const  { return (AlignmentFlag & SECONDARY)     ? false : true;  }\r
+inline bool BamAlignment::IsProperPair(void) const        { return (AlignmentFlag & PROPER_PAIR)   ? true  : false; }\r
+inline bool BamAlignment::IsReverseStrand(void) const     { return (AlignmentFlag & REVERSE)       ? true  : false; }\r
+inline bool BamAlignment::IsSecondMate(void) const        { return (AlignmentFlag & READ_2)        ? true  : false; }\r
+\r
+// Manipulate alignment flags\r
+// Each setter turns one bit of AlignmentFlag on (ok == true) or off (ok == false)\r
+// and leaves every other bit untouched.\r
+inline void BamAlignment::SetIsDuplicate(bool ok)          { AlignmentFlag = ok ? (AlignmentFlag | DUPLICATE)     : (AlignmentFlag & ~DUPLICATE);     }\r
+inline void BamAlignment::SetIsFailedQC(bool ok)           { AlignmentFlag = ok ? (AlignmentFlag | QC_FAILED)     : (AlignmentFlag & ~QC_FAILED);     }\r
+inline void BamAlignment::SetIsFirstMate(bool ok)          { AlignmentFlag = ok ? (AlignmentFlag | READ_1)        : (AlignmentFlag & ~READ_1);        }\r
+inline void BamAlignment::SetIsMateUnmapped(bool ok)       { AlignmentFlag = ok ? (AlignmentFlag | MATE_UNMAPPED) : (AlignmentFlag & ~MATE_UNMAPPED); }\r
+inline void BamAlignment::SetIsMateReverseStrand(bool ok)  { AlignmentFlag = ok ? (AlignmentFlag | MATE_REVERSE)  : (AlignmentFlag & ~MATE_REVERSE);  }\r
+inline void BamAlignment::SetIsPaired(bool ok)             { AlignmentFlag = ok ? (AlignmentFlag | PAIRED)        : (AlignmentFlag & ~PAIRED);        }\r
+inline void BamAlignment::SetIsProperPair(bool ok)         { AlignmentFlag = ok ? (AlignmentFlag | PROPER_PAIR)   : (AlignmentFlag & ~PROPER_PAIR);   }\r
+inline void BamAlignment::SetIsReverseStrand(bool ok)      { AlignmentFlag = ok ? (AlignmentFlag | REVERSE)       : (AlignmentFlag & ~REVERSE);       }\r
+inline void BamAlignment::SetIsSecondaryAlignment(bool ok) { AlignmentFlag = ok ? (AlignmentFlag | SECONDARY)     : (AlignmentFlag & ~SECONDARY);     }\r
+inline void BamAlignment::SetIsSecondMate(bool ok)         { AlignmentFlag = ok ? (AlignmentFlag | READ_2)        : (AlignmentFlag & ~READ_2);        }\r
+inline void BamAlignment::SetIsUnmapped(bool ok)           { AlignmentFlag = ok ? (AlignmentFlag | UNMAPPED)      : (AlignmentFlag & ~UNMAPPED);      }\r
+\r
+// calculates alignment end position, based on starting position and CIGAR operations\r
+// usePadded: when true, inserted bases ('I') are added to the end position as well\r
+// returns Position plus the summed lengths of all counted operations\r
+inline \r
+int BamAlignment::GetEndPosition(bool usePadded) const {\r
+\r
+    // initialize alignment end to starting position\r
+    int alignEnd = Position;\r
+\r
+    // iterate over cigar operations\r
+    std::vector<CigarOp>::const_iterator cigarIter = CigarData.begin();\r
+    std::vector<CigarOp>::const_iterator cigarEnd  = CigarData.end();\r
+    for ( ; cigarIter != cigarEnd; ++cigarIter ) {\r
+        switch ( cigarIter->Type ) {\r
+\r
+            // operations that consume reference bases; '=' (sequence match) and\r
+            // 'X' (sequence mismatch) advance the reference exactly like 'M'\r
+            case 'M':\r
+            case 'D':\r
+            case 'N':\r
+            case '=':\r
+            case 'X':\r
+                alignEnd += cigarIter->Length;\r
+                break;\r
+\r
+            // insertions only count when the padded end is requested\r
+            case 'I':\r
+                if ( usePadded ) alignEnd += cigarIter->Length;\r
+                break;\r
+\r
+            // every other operation leaves the end position unchanged\r
+            default:\r
+                break;\r
+        }\r
+    }\r
+    return alignEnd;\r
+}\r
+\r
+// Appends a new string-valued tag to TagData.\r
+//   tag   : 2-character tag name\r
+//   type  : 1-character storage type, must be "Z" or "H"\r
+//   value : text to store; it is written as a C string, so it ends at the first\r
+//           NUL character it contains and is followed by a terminating NUL\r
+// Returns false (TagData untouched) if the alignment holds core data only, if\r
+// 'tag'/'type' are malformed, or if the tag is already present (use EditTag).\r
+inline\r
+bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const std::string& value) {\r
+\r
+    if ( SupportData.HasCoreOnly ) return false;\r
+    if ( tag.size() != 2 || type.size() != 1 ) return false;\r
+    if ( type != "Z" && type != "H" ) return false;\r
+\r
+    // localize the tag data\r
+    char* pTagData = (char*)TagData.data();\r
+    const unsigned int tagDataLength = TagData.size();\r
+    unsigned int numBytesParsed = 0;\r
+\r
+    // if tag already exists, return false\r
+    // use EditTag explicitly instead\r
+    if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;\r
+\r
+    // append header + value + terminator straight onto TagData; this replaces a\r
+    // variable-length stack array + strcat, which is not standard C++ and left\r
+    // uninitialized bytes in TagData whenever 'value' contained a NUL\r
+    TagData.reserve(tagDataLength + tag.size() + type.size() + value.size() + 1);\r
+    TagData.append(tag);\r
+    TagData.append(type);\r
+    TagData.append(value.c_str());\r
+    TagData.push_back('\0');\r
+\r
+    // return success\r
+    return true;\r
+}\r
+\r
+// Appends a new integer-valued tag to TagData.\r
+//   tag   : 2-character tag name\r
+//   type  : 1-character storage type; "f", "Z" and "H" are rejected\r
+//   value : stored as its 4 raw bytes in host byte order\r
+// Returns false (TagData untouched) if the alignment holds core data only, if\r
+// 'tag'/'type' are malformed, or if the tag is already present (use EditTag).\r
+// NOTE(review): 4 bytes are written whatever 'type' is; a 1- or 2-byte type code\r
+// would not match the amount of data stored - confirm callers only pass 4-byte types.\r
+inline\r
+bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const uint32_t& value) {\r
+\r
+    if ( SupportData.HasCoreOnly ) return false;\r
+    if ( tag.size() != 2 || type.size() != 1 ) return false;\r
+    if ( type == "f" || type == "Z" || type == "H" ) return false;\r
+\r
+    // localize the tag data\r
+    char* pTagData = (char*)TagData.data();\r
+    const unsigned int tagDataLength = TagData.size();\r
+    unsigned int numBytesParsed = 0;\r
+\r
+    // if tag already exists, return false\r
+    // use EditTag explicitly instead\r
+    if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;\r
+\r
+    // append header + raw value bytes straight onto TagData (no variable-length\r
+    // stack array / strcat scratch buffer, which is not standard C++)\r
+    const char* rawValue = reinterpret_cast<const char*>(&value);\r
+    TagData.reserve(tagDataLength + tag.size() + type.size() + sizeof(uint32_t));\r
+    TagData.append(tag);\r
+    TagData.append(type);\r
+    TagData.append(rawValue, sizeof(uint32_t));\r
+\r
+    // return success\r
+    return true;\r
+}\r
+\r
+// signed overload: hands the same 4 bytes, viewed as unsigned, to the uint32_t overload\r
+inline\r
+bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const int32_t& value) {\r
+    const uint32_t& valueBits = reinterpret_cast<const uint32_t&>(value);\r
+    return AddTag(tag, type, valueBits);\r
+}\r
+\r
+// Appends a new float-valued tag to TagData.\r
+//   tag   : 2-character tag name\r
+//   type  : 1-character storage type; "Z" and "H" are rejected\r
+//   value : stored as its raw bytes (sizeof(float)) in host byte order\r
+// Returns false (TagData untouched) if the alignment holds core data only, if\r
+// 'tag'/'type' are malformed, or if the tag is already present (use EditTag).\r
+// NOTE(review): integer type codes are not rejected here although float bytes\r
+// are written - confirm callers pass "f".\r
+inline\r
+bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const float& value) {\r
+\r
+    if ( SupportData.HasCoreOnly ) return false;\r
+    if ( tag.size() != 2 || type.size() != 1 ) return false;\r
+    if ( type == "Z" || type == "H" ) return false;\r
+\r
+    // localize the tag data\r
+    char* pTagData = (char*)TagData.data();\r
+    const unsigned int tagDataLength = TagData.size();\r
+    unsigned int numBytesParsed = 0;\r
+\r
+    // if tag already exists, return false\r
+    // use EditTag explicitly instead\r
+    if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false;\r
+\r
+    // append header + raw value bytes straight onto TagData (no variable-length\r
+    // stack array / strcat scratch buffer, which is not standard C++)\r
+    const char* rawValue = reinterpret_cast<const char*>(&value);\r
+    TagData.reserve(tagDataLength + tag.size() + type.size() + sizeof(float));\r
+    TagData.append(tag);\r
+    TagData.append(type);\r
+    TagData.append(rawValue, sizeof(float));\r
+\r
+    // return success\r
+    return true;\r
+}\r
+\r
+// Replaces the value of an existing tag with a string, or adds the tag if absent.\r
+//   tag   : 2-character tag name\r
+//   type  : 1-character storage type, must be "Z" or "H"\r
+//   value : text to store; written as a C string followed by a terminating NUL\r
+// Returns false if the alignment holds core data only or 'tag'/'type' are\r
+// malformed; otherwise true, or the result of AddTag when the tag was not found.\r
+inline\r
+bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const std::string& value) {\r
+\r
+    if ( SupportData.HasCoreOnly ) return false;\r
+    if ( tag.size() != 2 || type.size() != 1 ) return false;\r
+    if ( type != "Z" && type != "H" ) return false;\r
+\r
+    // localize the tag data\r
+    char* pOriginalTagData = (char*)TagData.data();\r
+    char* pTagData = pOriginalTagData;\r
+    const unsigned int originalTagDataLength = TagData.size();\r
+    unsigned int numBytesParsed = 0;\r
+\r
+    // tag not found, attempt AddTag\r
+    if ( !FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) )\r
+        return AddTag(tag, type, value);\r
+\r
+    // FindTag stops just past the 3-byte header (2-char name + 1-char type);\r
+    // a header that runs past the end of the data cannot be edited\r
+    if ( numBytesParsed > originalTagDataLength ) return false;\r
+    const unsigned int valueOffset = numBytesParsed;\r
+    const char storedType = *(pTagData - 1);\r
+\r
+    // step over the old value\r
+    // NOTE(review): as before, an unknown stored type leaves TagData unchanged\r
+    // but still reports success - confirm this is intended\r
+    if ( !SkipToNextTag(storedType, pTagData, numBytesParsed) ) return true;\r
+\r
+    // rebuild in a separate string (pTagData still points into TagData):\r
+    // [everything up to and including the header][new value + NUL][remaining tags]\r
+    // The old code used a fixed-size stack array and wrote a terminator one slot\r
+    // past the new length, which could land outside the array.\r
+    std::string newTagData(pOriginalTagData, valueOffset);\r
+    newTagData[valueOffset - 1] = type[0];   // keep the stored type in step with the new string value\r
+    newTagData.append(value.c_str());\r
+    newTagData.push_back('\0');\r
+    if ( numBytesParsed < originalTagDataLength )\r
+        newTagData.append(pTagData, originalTagDataLength - numBytesParsed);\r
+\r
+    // save new tag data\r
+    TagData.swap(newTagData);\r
+    return true;\r
+}\r
+\r
+// Replaces the value of an existing tag with a 4-byte integer, or adds the tag if absent.\r
+//   tag   : 2-character tag name\r
+//   type  : 1-character storage type; "f", "Z" and "H" are rejected\r
+//   value : stored as its 4 raw bytes in host byte order\r
+// Returns false if the alignment holds core data only or 'tag'/'type' are\r
+// malformed; otherwise true, or the result of AddTag when the tag was not found.\r
+// NOTE(review): the type code already stored with the tag is kept while 4 bytes\r
+// are always written; if the existing tag used a 1- or 2-byte type the data\r
+// would no longer match its type code - confirm intended usage.\r
+inline\r
+bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const uint32_t& value) {\r
+\r
+    if ( SupportData.HasCoreOnly ) return false;\r
+    if ( tag.size() != 2 || type.size() != 1 ) return false;\r
+    if ( type == "f" || type == "Z" || type == "H" ) return false;\r
+\r
+    // localize the tag data\r
+    char* pOriginalTagData = (char*)TagData.data();\r
+    char* pTagData = pOriginalTagData;\r
+    const unsigned int originalTagDataLength = TagData.size();\r
+    unsigned int numBytesParsed = 0;\r
+\r
+    // tag not found, attempt AddTag\r
+    if ( !FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) )\r
+        return AddTag(tag, type, value);\r
+\r
+    // FindTag stops just past the 3-byte header (2-char name + 1-char type);\r
+    // a header that runs past the end of the data cannot be edited\r
+    if ( numBytesParsed > originalTagDataLength ) return false;\r
+    const unsigned int valueOffset = numBytesParsed;\r
+    const char storedType = *(pTagData - 1);\r
+\r
+    // step over the old value\r
+    // NOTE(review): as before, an unknown stored type leaves TagData unchanged\r
+    // but still reports success - confirm this is intended\r
+    if ( !SkipToNextTag(storedType, pTagData, numBytesParsed) ) return true;\r
+\r
+    // rebuild in a separate string (pTagData still points into TagData):\r
+    // [everything up to and including the header][new value bytes][remaining tags]\r
+    // The old code used a fixed-size stack array and wrote a terminator one slot\r
+    // past the new length, which could land outside the array.\r
+    std::string newTagData(pOriginalTagData, valueOffset);\r
+    newTagData.append(reinterpret_cast<const char*>(&value), sizeof(uint32_t));\r
+    if ( numBytesParsed < originalTagDataLength )\r
+        newTagData.append(pTagData, originalTagDataLength - numBytesParsed);\r
+\r
+    // save new tag data\r
+    TagData.swap(newTagData);\r
+    return true;\r
+}\r
+\r
+// signed overload: hands the same 4 bytes, viewed as unsigned, to the uint32_t overload\r
+inline\r
+bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const int32_t& value) {\r
+    const uint32_t& valueBits = reinterpret_cast<const uint32_t&>(value);\r
+    return EditTag(tag, type, valueBits);\r
+}\r
+\r
+// Replaces the value of an existing tag with a float, or adds the tag if absent.\r
+//   tag   : 2-character tag name\r
+//   type  : 1-character storage type; "Z" and "H" are rejected\r
+//   value : stored as its raw bytes (sizeof(float)) in host byte order\r
+// Returns false if the alignment holds core data only or 'tag'/'type' are\r
+// malformed; otherwise true, or the result of AddTag when the tag was not found.\r
+// NOTE(review): the type code already stored with the tag is kept while float\r
+// bytes are written; if the existing tag was not a 4-byte type the data would\r
+// no longer match its type code - confirm intended usage.\r
+inline\r
+bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const float& value) {\r
+\r
+    if ( SupportData.HasCoreOnly ) return false;\r
+    if ( tag.size() != 2 || type.size() != 1 ) return false;\r
+    if ( type == "Z" || type == "H" ) return false;\r
+\r
+    // localize the tag data\r
+    char* pOriginalTagData = (char*)TagData.data();\r
+    char* pTagData = pOriginalTagData;\r
+    const unsigned int originalTagDataLength = TagData.size();\r
+    unsigned int numBytesParsed = 0;\r
+\r
+    // tag not found, attempt AddTag\r
+    if ( !FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) )\r
+        return AddTag(tag, type, value);\r
+\r
+    // FindTag stops just past the 3-byte header (2-char name + 1-char type);\r
+    // a header that runs past the end of the data cannot be edited\r
+    if ( numBytesParsed > originalTagDataLength ) return false;\r
+    const unsigned int valueOffset = numBytesParsed;\r
+    const char storedType = *(pTagData - 1);\r
+\r
+    // step over the old value\r
+    // NOTE(review): as before, an unknown stored type leaves TagData unchanged\r
+    // but still reports success - confirm this is intended\r
+    if ( !SkipToNextTag(storedType, pTagData, numBytesParsed) ) return true;\r
+\r
+    // rebuild in a separate string (pTagData still points into TagData):\r
+    // [everything up to and including the header][new value bytes][remaining tags]\r
+    // The old code used a fixed-size stack array and wrote a terminator one slot\r
+    // past the new length, which could land outside the array.\r
+    std::string newTagData(pOriginalTagData, valueOffset);\r
+    newTagData.append(reinterpret_cast<const char*>(&value), sizeof(float));\r
+    if ( numBytesParsed < originalTagDataLength )\r
+        newTagData.append(pTagData, originalTagDataLength - numBytesParsed);\r
+\r
+    // save new tag data\r
+    TagData.swap(newTagData);\r
+    return true;\r
+}\r
+\r
+// get "NM" tag data - originally contributed by Aaron Quinlan\r
+// stores data in 'editDistance', returns success/fail\r
+inline \r
+bool BamAlignment::GetEditDistance(uint32_t& editDistance) const { \r
+    // simple forwarder to the integer GetTag overload\r
+    const bool found = GetTag("NM", editDistance);\r
+    return found;\r
+}\r
+\r
+// get "RG" tag data\r
+// stores data in 'readGroup', returns success/fail\r
+inline \r
+bool BamAlignment::GetReadGroup(std::string& readGroup) const {\r
+    // simple forwarder to the string GetTag overload\r
+    const bool found = GetTag("RG", readGroup);\r
+    return found;\r
+}\r
+\r
+// Looks up 'tag' and copies its value, read as a NUL-terminated string, into 'destination'.\r
+// Returns false if the alignment holds core data only, TagData is empty, the\r
+// tag is absent, or the tag header runs past the end of TagData.\r
+// NOTE(review): the stored type code is not checked here; a non-string tag is\r
+// copied byte-wise up to the first NUL - confirm callers only ask for Z/H tags.\r
+inline\r
+bool BamAlignment::GetTag(const std::string& tag, std::string& destination) const {\r
+\r
+    // make sure tag data exists\r
+    if ( SupportData.HasCoreOnly || TagData.empty() )\r
+        return false;\r
+\r
+    // localize the tag data\r
+    char* pTagData = (char*)TagData.data();\r
+    const unsigned int tagDataLength = TagData.size();\r
+    unsigned int numBytesParsed = 0;\r
+\r
+    // tag not found, return failure\r
+    if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) )\r
+        return false;\r
+\r
+    // header ran past the end of the data - there is no value to read\r
+    if ( numBytesParsed > tagDataLength )\r
+        return false;\r
+\r
+    // look for the terminator only inside the bytes TagData actually holds\r
+    // (an unbounded strlen could walk off the buffer on unterminated data)\r
+    const unsigned int remaining = tagDataLength - numBytesParsed;\r
+    const void* terminator = memchr(pTagData, '\0', remaining);\r
+    const unsigned int dataLength = ( terminator != 0 )\r
+                                  ? (unsigned int)((const char*)terminator - pTagData)\r
+                                  : remaining;\r
+\r
+    // assign() copies into storage owned by 'destination', instead of writing\r
+    // through the pointer returned by its data()\r
+    destination.assign(pTagData, dataLength);\r
+    return true;\r
+}\r
+\r
+// Looks up 'tag' and copies its 1-, 2- or 4-byte value into 'destination'.\r
+// 'destination' is zeroed first, then the value bytes are copied over its\r
+// leading bytes. Returns false if the alignment holds core data only, TagData\r
+// is empty, the tag is absent, or the stored type is float/string/unknown.\r
+inline\r
+bool BamAlignment::GetTag(const std::string& tag, uint32_t& destination) const {\r
+\r
+    // make sure tag data exists\r
+    if ( SupportData.HasCoreOnly || TagData.empty() )\r
+        return false;\r
+\r
+    // localize the tag data\r
+    char* pTagData = (char*)TagData.data();\r
+    const unsigned int tagDataLength = TagData.size();\r
+    unsigned int numBytesParsed = 0;\r
+\r
+    // tag not found, return failure\r
+    if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) )\r
+        return false;\r
+\r
+    // the storage type is the byte just before the value\r
+    const char type = *(pTagData - 1);\r
+    int valueBytes = 0;\r
+\r
+    if ( type == 'A' || type == 'c' || type == 'C' ) {\r
+        valueBytes = 1;\r
+    }\r
+    else if ( type == 's' || type == 'S' ) {\r
+        valueBytes = 2;\r
+    }\r
+    else if ( type == 'i' || type == 'I' ) {\r
+        valueBytes = 4;\r
+    }\r
+    else if ( type == 'f' || type == 'Z' || type == 'H' ) {\r
+        // unsupported type for integer destination (float or var-length strings)\r
+        printf("ERROR: Cannot store tag of type %c in integer destination\n", type);\r
+        return false;\r
+    }\r
+    else {\r
+        // unknown tag type\r
+        printf("ERROR: Unknown tag storage class encountered: [%c]\n", type);\r
+        return false;\r
+    }\r
+\r
+    // store in destination\r
+    destination = 0;\r
+    memcpy(&destination, pTagData, valueBytes);\r
+    return true;\r
+}\r
+\r
+// signed overload: lets the uint32_t overload fill the same 4 bytes\r
+// NOTE(review): 1- and 2-byte values are not sign-extended by that overload - confirm\r
+// whether negative 'c'/'s' tags are expected to read back as negative here\r
+inline\r
+bool BamAlignment::GetTag(const std::string& tag, int32_t& destination) const {\r
+    uint32_t& destinationBits = reinterpret_cast<uint32_t&>(destination);\r
+    return GetTag(tag, destinationBits);\r
+}\r
+\r
+inline\r
+bool BamAlignment::GetTag(const std::string& tag, float& destination) const {\r
+ \r
+ // make sure tag data exists\r
+ if ( SupportData.HasCoreOnly || TagData.empty() ) \r
+ return false;\r
+\r
+ // localize the tag data\r
+ char* pTagData = (char*)TagData.data();\r
+ const unsigned int tagDataLength = TagData.size();\r
+ unsigned int numBytesParsed = 0;\r
+ \r
+ // if tag found, determine data byte-length, store data in readGroup, return success\r
+ if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {\r
+ //pTagData += numBytesParsed;\r
+ \r
+ // determine data byte-length\r
+ const char type = *(pTagData - 1);\r
+ int destinationLength = 0;\r
+ switch(type) {\r
+\r
+ // 1 byte data\r
+ case 'A':\r
+ case 'c':\r
+ case 'C':\r
+ destinationLength = 1;\r
+ break;\r
+\r
+ // 2 byte data\r
+ case 's':\r
+ case 'S':\r
+ destinationLength = 2;\r
+ break;\r
+\r
+ // 4 byte data\r
+ case 'f':\r
+ case 'i':\r
+ case 'I':\r
+ destinationLength = 4;\r
+ break;\r
+ \r
+ // unsupported type (var-length strings)\r
+ case 'Z':\r
+ case 'H':\r
+ printf("ERROR: Cannot store tag of type %c in integer destination\n", type);\r
+ return false;\r
+\r
+ // unknown tag type\r
+ default:\r
+ printf("ERROR: Unknown tag storage class encountered: [%c]\n", type);\r
+ return false;\r
+ }\r
+ \r
+ // store in destination\r
+ destination = 0.0;\r
+ memcpy(&destination, pTagData, destinationLength);\r
+ return true;\r
+ }\r
+ \r
+ // tag not found, return failure\r
+ return false;\r
+}\r
+\r
+// Removes 'tag' (its 3-byte header and its value) from TagData.\r
+// Returns false if the alignment holds core data only, TagData is empty, or\r
+// the tag is not present; true otherwise.\r
+inline\r
+bool BamAlignment::RemoveTag(const std::string& tag) {\r
+\r
+    // BamAlignments fetched using BamReader::GetNextAlignmentCore() are not allowed\r
+    // also, return false if no data present to remove\r
+    if ( SupportData.HasCoreOnly || TagData.empty() ) return false;\r
+\r
+    // localize the tag data\r
+    char* pTagData = (char*)TagData.data();\r
+    const unsigned int originalTagDataLength = TagData.size();\r
+    unsigned int numBytesParsed = 0;\r
+\r
+    // tag not found, no removal - return failure\r
+    if ( !FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) )\r
+        return false;\r
+\r
+    // FindTag stops just past the 3-byte header, so the entry begins 3 bytes earlier\r
+    const unsigned int tagStart = numBytesParsed - 3;\r
+    const char storedType = *(pTagData - 1);\r
+\r
+    // step over the value\r
+    // NOTE(review): as before, an unknown stored type leaves TagData unchanged\r
+    // but still reports success - confirm this is intended\r
+    if ( !SkipToNextTag(storedType, pTagData, numBytesParsed) ) return true;\r
+\r
+    // never cut beyond the end of the data (guards the unsigned length math\r
+    // against truncated tag data)\r
+    const unsigned int tagStop = ( numBytesParsed < originalTagDataLength )\r
+                               ? numBytesParsed\r
+                               : originalTagDataLength;\r
+\r
+    // drop [tagStart, tagStop) in place; replaces the copy through a\r
+    // variable-length stack array, which is not standard C++\r
+    TagData.erase(tagStart, tagStop - tagStart);\r
+    return true;\r
+}\r
+\r
+// Scans tag data for 'tag'.\r
+//   pTagData       : in - current read position; out - on success, first byte of the\r
+//                    matching tag's value (the storage type is at pTagData - 1)\r
+//   tagDataLength  : total number of tag data bytes\r
+//   numBytesParsed : in/out - bytes consumed so far; on success it includes the\r
+//                    3-byte header of the matching tag\r
+// Returns true on a match; false when the data is exhausted, a NUL is met where\r
+// a storage type or the next tag should be, or a storage type is unknown.\r
+inline\r
+bool BamAlignment::FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) {\r
+\r
+    // every entry starts with a 3-byte header (2-char name + 1-char storage type);\r
+    // stop once fewer than 3 bytes remain so a truncated header is never read\r
+    while ( numBytesParsed < tagDataLength && (tagDataLength - numBytesParsed) >= 3 ) {\r
+\r
+        const char* pTagType        = pTagData;\r
+        const char* pTagStorageType = pTagData + 2;\r
+        pTagData       += 3;\r
+        numBytesParsed += 3;\r
+\r
+        // check the current tag, return true on match\r
+        if ( std::strncmp(pTagType, tag.c_str(), 2) == 0 )\r
+            return true;\r
+\r
+        // get the storage class and find the next tag\r
+        if ( *pTagStorageType == '\0' ) return false;\r
+        if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false;\r
+        if ( *pTagData == '\0' ) return false;\r
+    }\r
+\r
+    // checked all tags, none match\r
+    return false;\r
+}\r
+\r
+// Advances 'pTagData' and 'numBytesParsed' past one tag value of the given storage type.\r
+// Fixed-size types move by 1, 2 or 4 bytes; 'Z'/'H' values move past the text\r
+// and its terminating NUL. Returns false (nothing advanced) for an unknown type.\r
+inline\r
+bool BamAlignment::SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) {\r
+\r
+    unsigned int valueBytes = 0;\r
+\r
+    if ( storageType == 'A' || storageType == 'c' || storageType == 'C' ) {\r
+        valueBytes = 1;\r
+    }\r
+    else if ( storageType == 's' || storageType == 'S' ) {\r
+        valueBytes = 2;\r
+    }\r
+    else if ( storageType == 'f' || storageType == 'i' || storageType == 'I' ) {\r
+        valueBytes = 4;\r
+    }\r
+    else if ( storageType == 'Z' || storageType == 'H' ) {\r
+        // text length, plus one for the null-terminator\r
+        valueBytes = strlen(pTagData) + 1;\r
+    }\r
+    else {\r
+        // error case\r
+        printf("ERROR: Unknown tag storage class encountered: [%c]\n", storageType);\r
+        return false;\r
+    }\r
+\r
+    numBytesParsed += valueBytes;\r
+    pTagData       += valueBytes;\r
+\r
+    // return success\r
+    return true;\r
+}\r
+\r
+} // namespace BamTools\r
+\r
+#endif // BAMAUX_H\r
--- /dev/null
+// ***************************************************************************
+// BamIndex.cpp (c) 2009 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 17 August 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides index functionality - both for the default (standardized) BAM
+// index format (.bai) as well as a BamTools-specific (nonstandard) index
+// format (.bti).
+// ***************************************************************************
+
+#include <cstdio>
+#include <cstdlib>
+#include <algorithm>
+// #include <iostream>
+#include <map>
+#include "BamIndex.h"
+#include "BamReader.h"
+#include "BGZF.h"
+using namespace std;
+using namespace BamTools;
+
+// -------------------------------
+// BamIndex implementation
+
+// Stores the BGZF handle, the reader and the endianness flag; when a reader is\r
+// supplied and reports itself open, its reference data is copied as well.\r
+BamIndex::BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader, bool isBigEndian)
+    : m_BGZF(bgzf)
+    , m_reader(reader)
+    , m_isBigEndian(isBigEndian)
+{
+    const bool readerIsUsable = ( m_reader != 0 ) && m_reader->IsOpen();
+    if ( readerIsUsable ) {
+        m_references = m_reader->GetReferenceData();
+    }
+}
+
+// Reports the RefHasAlignments status stored for 'referenceID';
+// IDs outside [0, number of references) yield false.
+bool BamIndex::HasAlignments(const int& referenceID) {
+
+    const int referenceCount = (int)m_references.size();
+    const bool isValidID = ( referenceID >= 0 ) && ( referenceID < referenceCount );
+
+    // invalid ID -> false, otherwise the stored status of that reference
+    return isValidID ? m_references.at(referenceID).RefHasAlignments : false;
+}
+
+// #########################################################################################
+// #########################################################################################
+
+// -------------------------------
+// BamDefaultIndex structs & typedefs
+
+namespace BamTools {
+
+// --------------------------------------------------
+// BamDefaultIndex data structures & typedefs
+// A pair of 64-bit offsets marking where a chunk starts and stops.
+struct Chunk {
+
+    // data members
+    uint64_t Start;
+    uint64_t Stop;
+
+    // constructor - both offsets default to 0
+    Chunk(const uint64_t& start = 0,
+          const uint64_t& stop  = 0)
+    {
+        Start = start;
+        Stop  = stop;
+    }
+};
+
+// ordering predicate: a chunk sorts first when its Start offset is smaller (Stop is ignored)
+bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) {
+    const bool lhsStartsFirst = ( lhs.Start < rhs.Start );
+    return lhsStartsFirst;
+}
+
+// list of chunks
+typedef vector<Chunk> ChunkVector;
+// bin ID -> chunks stored for that bin
+typedef map<uint32_t, ChunkVector> BamBinMap;
+// list of 64-bit linear offsets
+typedef vector<uint64_t> LinearOffsetVector;
+
+// Index data kept for one reference: its bin map and its linear offsets.
+struct ReferenceIndex {
+
+    // data members
+    BamBinMap          Bins;
+    LinearOffsetVector Offsets;
+
+    // constructor - both containers default to empty
+    ReferenceIndex(const BamBinMap&          binMap  = BamBinMap(),
+                   const LinearOffsetVector& offsets = LinearOffsetVector())
+        : Bins(binMap),
+          Offsets(offsets)
+    {
+    }
+};
+
+// complete index data: one ReferenceIndex per entry (accessed by reference ID in this file)
+typedef vector<ReferenceIndex> BamDefaultIndexData;
+
+} // namespace BamTools
+
+// -------------------------------
+// BamDefaultIndex implementation
+
+// Private implementation data and helpers for BamDefaultIndex.
+// Holds the index contents plus a back-pointer to the owning BamDefaultIndex.
+struct BamDefaultIndex::BamDefaultIndexPrivate {
+ 
+ // -------------------------
+ // data members
+ 
+ // index contents, one entry per reference
+ BamDefaultIndexData m_indexData;
+ // owning index object (not deleted by this struct's destructor)
+ BamDefaultIndex* m_parent;
+ 
+ // -------------------------
+ // ctor & dtor
+ 
+ BamDefaultIndexPrivate(BamDefaultIndex* parent) : m_parent(parent) { }
+ ~BamDefaultIndexPrivate(void) { }
+ 
+ // -------------------------
+ // internal methods
+ 
+ // calculate bins that overlap region
+ // (fills 'bins' and returns how many entries were written)
+ int BinsFromRegion(const BamTools::BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[BamTools::MAX_BIN]);
+ // saves BAM bin entry for index
+ void InsertBinEntry(BamBinMap& binMap, const uint32_t& saveBin, const uint64_t& saveOffset, const uint64_t& lastOffset);
+ // saves linear offset entry for index
+ void InsertLinearOffset(LinearOffsetVector& offsets, const BamAlignment& bAlignment, const uint64_t& lastOffset);
+ // simplifies index by merging 'chunks'
+ void MergeChunks(void);
+ 
+};
+
+// Forwards all arguments to the BamIndex base and creates the private
+// implementation object, handing it a pointer back to this index.
+BamDefaultIndex::BamDefaultIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian)
+    : BamIndex(bgzf, reader, isBigEndian)
+{
+    BamDefaultIndexPrivate* impl = new BamDefaultIndexPrivate(this);
+    d = impl;
+}
+
+// Empties the index data, destroys the private implementation and resets the pointer.
+BamDefaultIndex::~BamDefaultIndex(void) {
+    BamDefaultIndexPrivate* impl = d;
+    impl->m_indexData.clear();
+    delete impl;
+    d = 0;
+}
+
+// calculate bins that overlap region
+//   region                : left/right boundaries; the left reference ID selects the reference
+//   isRightBoundSpecified : when true AND both boundaries lie on the same reference,
+//                           RightPosition is used as the end; otherwise the end is the
+//                           last position of the left reference (RefLength - 1)
+//   bins                  : output array, filled from index 0
+// returns the number of bin IDs written to 'bins'
+int BamDefaultIndex::BamDefaultIndexPrivate::BinsFromRegion(const BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[MAX_BIN]) {
+
+    // largest coordinate the 5-level binning below can address (2^29 - 1)
+    // NOTE(review): assumes MAX_BIN is sized for this standard scheme
+    // (37449 bin IDs at most) - confirm against its definition
+    const uint32_t maxCoordinate = (1u << 29) - 1;
+
+    // get region boundaries
+    uint32_t begin = (unsigned int)region.LeftPosition;
+    uint32_t end;
+
+    // if right bound specified AND left&right bounds are on same reference
+    // OK to use right bound position
+    if ( isRightBoundSpecified && ( region.LeftRefID == region.RightRefID ) )
+        end = (unsigned int)region.RightPosition;
+
+    // otherwise, use end of left bound reference as cutoff
+    else
+        end = (unsigned int)m_parent->m_references.at(region.LeftRefID).RefLength - 1;
+
+    // clamp the end: an oversized or negative bound (which wraps to a huge
+    // unsigned value) would otherwise make the loops below write past 'bins'
+    if ( end > maxCoordinate ) end = maxCoordinate;
+
+    // initialize list, bin '0' always a valid bin
+    int i = 0;
+    bins[i++] = 0;
+
+    // get rest of bins that contain this region
+    unsigned int k;
+    for (k =    1 + (begin>>26); k <=    1 + (end>>26); ++k) { bins[i++] = k; }
+    for (k =    9 + (begin>>23); k <=    9 + (end>>23); ++k) { bins[i++] = k; }
+    for (k =   73 + (begin>>20); k <=   73 + (end>>20); ++k) { bins[i++] = k; }
+    for (k =  585 + (begin>>17); k <=  585 + (end>>17); ++k) { bins[i++] = k; }
+    for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { bins[i++] = k; }
+
+    // return number of bins stored
+    return i;
+}
+
+// Builds the index by reading every alignment (core data) from the reader.
+// For each alignment it records linear offsets and, whenever the bin changes,
+// a bin entry covering the file offsets seen since the previous change.
+// Afterwards chunks are merged, each reference's RefHasAlignments flag is set
+// and its linear offsets are sorted.
+// Returns false if the reader or BGZF file is missing/not open; otherwise the
+// result of the final Rewind(). Calls exit(1) if positions decrease within a
+// reference or the BGZF offset fails to advance.
+bool BamDefaultIndex::Build(void) {
+ 
+ // be sure reader & BGZF file are valid & open for reading
+ if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) 
+ return false;
+
+ // move file pointer to beginning of alignments
+ m_reader->Rewind();
+
+ // get reference count, reserve index space
+ int numReferences = (int)m_references.size();
+ for ( int i = 0; i < numReferences; ++i ) {
+ d->m_indexData.push_back(ReferenceIndex());
+ }
+ 
+ // sets default constant for bin, ID, offset, coordinate variables
+ // (all bits set; stored in the signed variables below it reads as -1)
+ const uint32_t defaultValue = 0xffffffffu;
+
+ // bin data
+ // saveBin: bin of the entry currently being accumulated; lastBin: bin of the previous alignment
+ uint32_t saveBin(defaultValue);
+ uint32_t lastBin(defaultValue);
+
+ // reference ID data
+ int32_t saveRefID(defaultValue);
+ int32_t lastRefID(defaultValue);
+
+ // offset data
+ // saveOffset: where the current bin entry started; lastOffset: offset before the current alignment
+ uint64_t saveOffset = m_BGZF->Tell();
+ uint64_t lastOffset = saveOffset;
+
+ // coordinate data
+ int32_t lastCoordinate = defaultValue;
+
+ BamAlignment bAlignment;
+ while ( m_reader->GetNextAlignmentCore(bAlignment) ) {
+
+ // change of chromosome, save ID, reset bin
+ if ( lastRefID != bAlignment.RefID ) {
+ lastRefID = bAlignment.RefID;
+ lastBin = defaultValue;
+ }
+
+ // if lastCoordinate greater than BAM position - file not sorted properly
+ else if ( lastCoordinate > bAlignment.Position ) {
+ printf("BAM file not properly sorted:\n");
+ printf("Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(), lastCoordinate, bAlignment.Position, bAlignment.RefID);
+ exit(1);
+ }
+
+ // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions)
+ if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) {
+ 
+ // save linear offset entry (matched to BAM entry refID)
+ ReferenceIndex& refIndex = d->m_indexData.at(bAlignment.RefID);
+ LinearOffsetVector& offsets = refIndex.Offsets;
+ d->InsertLinearOffset(offsets, bAlignment, lastOffset);
+ }
+
+ // if current BamAlignment bin != lastBin, "then possibly write the binning index"
+ if ( bAlignment.Bin != lastBin ) {
+
+ // if not first time through
+ if ( saveBin != defaultValue ) {
+
+ // save Bam bin entry
+ ReferenceIndex& refIndex = d->m_indexData.at(saveRefID);
+ BamBinMap& binMap = refIndex.Bins;
+ d->InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);
+ }
+
+ // update saveOffset
+ saveOffset = lastOffset;
+
+ // update bin values
+ saveBin = bAlignment.Bin;
+ lastBin = bAlignment.Bin;
+
+ // update saveRefID
+ saveRefID = bAlignment.RefID;
+
+ // if invalid RefID, break out (why?)
+ // NOTE(review): presumably alignments without a reference come last in a sorted file - confirm
+ if ( saveRefID < 0 ) { break; }
+ }
+
+ // make sure that current file pointer is beyond lastOffset
+ if ( m_BGZF->Tell() <= (int64_t)lastOffset ) {
+ printf("Error in BGZF offsets.\n");
+ exit(1);
+ }
+
+ // update lastOffset
+ lastOffset = m_BGZF->Tell();
+
+ // update lastCoordinate
+ lastCoordinate = bAlignment.Position;
+ }
+
+ // save any leftover BAM data (as long as refID is valid)
+ if ( saveRefID >= 0 ) {
+ // save Bam bin entry
+ ReferenceIndex& refIndex = d->m_indexData.at(saveRefID);
+ BamBinMap& binMap = refIndex.Bins;
+ d->InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);
+ }
+
+ // simplify index by merging chunks
+ d->MergeChunks();
+ 
+ // iterate through references in index
+ // store whether reference has data &
+ // sort offsets in linear offset vector
+ BamDefaultIndexData::iterator indexIter = d->m_indexData.begin();
+ BamDefaultIndexData::iterator indexEnd = d->m_indexData.end();
+ for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) {
+
+ // get reference index data
+ ReferenceIndex& refIndex = (*indexIter);
+ BamBinMap& binMap = refIndex.Bins;
+ LinearOffsetVector& offsets = refIndex.Offsets;
+
+ // store whether reference has alignments or no
+ m_references[i].RefHasAlignments = ( binMap.size() > 0 );
+
+ // sort linear offsets
+ sort(offsets.begin(), offsets.end());
+ }
+
+ // rewind file pointer to beginning of alignments, return success/fail
+ return m_reader->Rewind();
+}
+
+// Collects candidate file offsets for alignments that may overlap 'region'.
+//
+// Every bin reported by BinsFromRegion() is looked up in the index of the
+// region's left reference. For each chunk in those bins whose Stop lies beyond
+// the minimum offset taken from the linear index, the chunk's Start is appended
+// to 'offsets'. Existing contents of 'offsets' are kept, and the whole vector
+// is sorted before returning.
+//
+// Returns true if 'offsets' is non-empty on exit.
+// N.B. - m_indexData.at() throws std::out_of_range if region.LeftRefID has no index entry
+bool BamDefaultIndex::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets) {
+
+    // calculate which bins overlap this region
+    // (vector-owned, zero-initialized buffer: unlike a raw calloc() block it can
+    //  never be NULL and is released even if a lookup below throws)
+    vector<uint16_t> bins(MAX_BIN);
+    const int numBins = d->BinsFromRegion(region, isRightBoundSpecified, &bins[0]);
+
+    // get bins for this reference
+    const ReferenceIndex& refIndex = d->m_indexData.at(region.LeftRefID);
+    const BamBinMap& binMap = refIndex.Bins;
+
+    // get minimum offset to consider (0 if the linear index does not reach this position)
+    const LinearOffsetVector& linearOffsets = refIndex.Offsets;
+    const uint64_t minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() ) ? 0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT);
+
+    // store all alignment 'chunk' starts (file offsets) for bins in this region
+    for ( int i = 0; i < numBins; ++i ) {
+
+        // skip bins that hold no data for this reference
+        BamBinMap::const_iterator binIter = binMap.find(bins[i]);
+        if ( binIter == binMap.end() ) continue;
+
+        const ChunkVector& chunks = (*binIter).second;
+        ChunkVector::const_iterator chunksIter = chunks.begin();
+        ChunkVector::const_iterator chunksEnd  = chunks.end();
+        for ( ; chunksIter != chunksEnd; ++chunksIter ) {
+
+            // if valid chunk found, store its file offset
+            const Chunk& chunk = (*chunksIter);
+            if ( chunk.Stop > minOffset )
+                offsets.push_back( chunk.Start );
+        }
+    }
+
+    // sort the offsets before returning
+    sort(offsets.begin(), offsets.end());
+
+    // return whether any offsets were found
+    return ( offsets.size() != 0 );
+}
+
+// adds a chunk built from (saveOffset, lastOffset) to the chunk list stored under
+// bin 'saveBin', creating that bin's list first if the bin is not yet in 'binMap'
+void BamDefaultIndex::BamDefaultIndexPrivate::InsertBinEntry(BamBinMap& binMap,
+                                                             const uint32_t& saveBin,
+                                                             const uint64_t& saveOffset,
+                                                             const uint64_t& lastOffset)
+{
+    // operator[] hands back the bin's existing chunk list, or inserts an
+    // empty one when this bin is being seen for the first time
+    ChunkVector& chunksForBin = binMap[saveBin];
+
+    // append the new chunk
+    chunksForBin.push_back( Chunk(saveOffset, lastOffset) );
+}
+
+// records 'lastOffset' in the linear index for the windows an alignment reaches into
+// (a window index is a coordinate shifted right by BAM_LIDX_SHIFT)
+// N.B. - only windows AFTER the one containing the alignment start are written,
+//        and only if they still hold 0
+void BamDefaultIndex::BamDefaultIndexPrivate::InsertLinearOffset(LinearOffsetVector& offsets,
+                                                                 const BamAlignment& bAlignment,
+                                                                 const uint64_t& lastOffset)
+{
+    // window indices of the alignment's first and last covered position
+    const int firstWindow = bAlignment.Position >> BAM_LIDX_SHIFT;
+    const int lastWindow  = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT;
+
+    // grow the vector (zero-filled) so that lastWindow is a valid index
+    const int currentSize  = offsets.size();
+    const int requiredSize = lastWindow + 1;
+    if ( requiredSize > currentSize )
+        offsets.resize(requiredSize, 0);
+
+    // fill every still-unset slot after the first window
+    int window = firstWindow + 1;
+    while ( window <= lastWindow ) {
+        if ( offsets[window] == 0 )
+            offsets[window] = lastOffset;
+        ++window;
+    }
+}
+
+// reads one fixed-size element of a .bai file; returns false on a short read
+static bool ReadBaiElement(void* destination, const size_t elementSize, FILE* stream) {
+    return ( fread(destination, elementSize, 1, stream) == 1 );
+}
+
+// shared failure path for BamDefaultIndex::Load: report, close the stream, signal failure
+static bool AbortBaiLoad(FILE* stream) {
+    printf("Problem with index file - unexpected end of data.\n");
+    fclose(stream);
+    return false;
+}
+
+// Loads a default-format (.bai) index from 'filename' into memory.
+//
+// Parsed per-reference data (bin map + linear offsets) is appended to
+// d->m_indexData, and RefHasAlignments is set on the matching m_references
+// entry for every reference that has at least one bin.
+//
+// Returns false if the file cannot be opened, does not start with the "BAI\1"
+// magic number, or ends before all announced data has been read. In the last
+// case, references parsed before the truncation point remain in d->m_indexData.
+bool BamDefaultIndex::Load(const string& filename) {
+
+    // open index file, abort on error
+    FILE* indexStream = fopen(filename.c_str(), "rb");
+    if( !indexStream ) {
+        printf("ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str());
+        return false;
+    }
+
+    // see if index is valid BAM index (a short read of the magic number is also invalid)
+    char magic[4];
+    if ( (fread(magic, 1, 4, indexStream) != 4) || strncmp(magic, "BAI\1", 4) ) {
+        printf("Problem with index file - invalid format.\n");
+        fclose(indexStream);
+        return false;
+    }
+
+    // get number of reference sequences
+    uint32_t numRefSeqs;
+    if ( !ReadBaiElement(&numRefSeqs, 4, indexStream) ) return AbortBaiLoad(indexStream);
+    if ( m_isBigEndian ) { SwapEndian_32(numRefSeqs); }
+
+    // initialize space for BamDefaultIndexData data structure
+    d->m_indexData.reserve(numRefSeqs);
+
+    // iterate over reference sequences
+    for ( unsigned int i = 0; i < numRefSeqs; ++i ) {
+
+        // get number of bins for this reference sequence
+        int32_t numBins;
+        if ( !ReadBaiElement(&numBins, 4, indexStream) ) return AbortBaiLoad(indexStream);
+        if ( m_isBigEndian ) { SwapEndian_32(numBins); }
+
+        // flag reference as having data; if the index lists more references than
+        // m_references holds, skip the flag rather than write out of bounds
+        if ( (numBins > 0) && (i < m_references.size()) )
+            m_references[i].RefHasAlignments = true;
+
+        // initialize bin map
+        BamBinMap binMap;
+
+        // iterate over bins for that reference sequence
+        for ( int j = 0; j < numBins; ++j ) {
+
+            // get binID
+            uint32_t binID;
+            if ( !ReadBaiElement(&binID, 4, indexStream) ) return AbortBaiLoad(indexStream);
+
+            // get number of regionChunks in this bin
+            uint32_t numChunks;
+            if ( !ReadBaiElement(&numChunks, 4, indexStream) ) return AbortBaiLoad(indexStream);
+
+            if ( m_isBigEndian ) {
+                SwapEndian_32(binID);
+                SwapEndian_32(numChunks);
+            }
+
+            // initialize ChunkVector
+            ChunkVector regionChunks;
+            regionChunks.reserve(numChunks);
+
+            // iterate over regionChunks in this bin
+            for ( unsigned int k = 0; k < numChunks; ++k ) {
+
+                // get chunk boundaries (left, right)
+                uint64_t left;
+                uint64_t right;
+                if ( !ReadBaiElement(&left,  8, indexStream) ) return AbortBaiLoad(indexStream);
+                if ( !ReadBaiElement(&right, 8, indexStream) ) return AbortBaiLoad(indexStream);
+
+                if ( m_isBigEndian ) {
+                    SwapEndian_64(left);
+                    SwapEndian_64(right);
+                }
+
+                // save ChunkPair
+                regionChunks.push_back( Chunk(left, right) );
+            }
+
+            // sort chunks for this bin
+            sort( regionChunks.begin(), regionChunks.end(), ChunkLessThan );
+
+            // save binID, chunkVector for this bin
+            binMap.insert( pair<uint32_t, ChunkVector>(binID, regionChunks) );
+        }
+
+        // load linear index for this reference sequence
+
+        // get number of linear offsets
+        int32_t numLinearOffsets;
+        if ( !ReadBaiElement(&numLinearOffsets, 4, indexStream) ) return AbortBaiLoad(indexStream);
+        if ( m_isBigEndian ) { SwapEndian_32(numLinearOffsets); }
+
+        // initialize LinearOffsetVector
+        // (a negative count must not reach reserve(), where it would become a huge size)
+        LinearOffsetVector offsets;
+        if ( numLinearOffsets > 0 )
+            offsets.reserve(numLinearOffsets);
+
+        // iterate over linear offsets for this reference sequence
+        uint64_t linearOffset;
+        for ( int j = 0; j < numLinearOffsets; ++j ) {
+            // read a linear offset & store
+            if ( !ReadBaiElement(&linearOffset, 8, indexStream) ) return AbortBaiLoad(indexStream);
+            if ( m_isBigEndian ) { SwapEndian_64(linearOffset); }
+            offsets.push_back(linearOffset);
+        }
+
+        // sort linear offsets
+        sort( offsets.begin(), offsets.end() );
+
+        // store index data for that reference sequence
+        d->m_indexData.push_back( ReferenceIndex(binMap, offsets) );
+    }
+
+    // close index file (.bai) and return
+    fclose(indexStream);
+    return true;
+}
+
+// collapses runs of neighboring chunks within each BAM bin (used for index building)
+//
+// Two consecutive chunks are joined when the previous chunk's Stop and the next
+// chunk's Start agree after both are shifted right by 16 bits; the joined chunk
+// keeps the earlier Start and takes the later Stop.
+void BamDefaultIndex::BamDefaultIndexPrivate::MergeChunks(void) {
+
+    // visit every reference entry
+    for ( BamDefaultIndexData::iterator refIter = m_indexData.begin(); refIter != m_indexData.end(); ++refIter ) {
+
+        // visit every bin of this reference
+        BamBinMap& bins = refIter->Bins;
+        for ( BamBinMap::iterator binIter = bins.begin(); binIter != bins.end(); ++binIter ) {
+
+            // nothing to merge in an empty bin
+            ChunkVector& unmerged = binIter->second;
+            if ( unmerged.empty() ) { continue; }
+
+            // seed the result with the first chunk
+            ChunkVector merged;
+            merged.push_back( unmerged.front() );
+
+            // fold each remaining chunk into the tail of 'merged', or start a new tail
+            for ( ChunkVector::size_type k = 1; k < unmerged.size(); ++k ) {
+
+                const Chunk& next = unmerged[k];
+                Chunk& tail = merged.back();
+
+                if ( (tail.Stop >> 16) == (next.Start >> 16) )
+                    tail.Stop = next.Stop;      // extend the current tail
+                else
+                    merged.push_back(next);     // begin a new chunk
+            }
+
+            // replace the bin's chunk list with the merged one
+            binIter->second = merged;
+        }
+    }
+}
+
+// writes in-memory index data out to file
+// N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
+//
+// Output is bamFilename + ".bai": the "BAI\1" magic number, the reference count,
+// then for each reference its bins (key, chunk count, chunk start/stop pairs)
+// followed by its linear offsets. Values are byte-swapped first when
+// m_isBigEndian is set.
+//
+// Returns false if the file cannot be opened, or if any write, the final flush
+// or the close reports an error.
+bool BamDefaultIndex::Write(const std::string& bamFilename) {
+
+    string indexFilename = bamFilename + ".bai";
+    FILE* indexStream = fopen(indexFilename.c_str(), "wb");
+    if ( indexStream == 0 ) {
+        printf("ERROR: Could not open file to save index.\n");
+        return false;
+    }
+
+    // write BAM index header
+    fwrite("BAI\1", 1, 4, indexStream);
+
+    // write number of reference sequences
+    int32_t numReferenceSeqs = d->m_indexData.size();
+    if ( m_isBigEndian ) { SwapEndian_32(numReferenceSeqs); }
+    fwrite(&numReferenceSeqs, 4, 1, indexStream);
+
+    // iterate over reference sequences
+    BamDefaultIndexData::const_iterator indexIter = d->m_indexData.begin();
+    BamDefaultIndexData::const_iterator indexEnd  = d->m_indexData.end();
+    for ( ; indexIter != indexEnd; ++ indexIter ) {
+
+        // get reference index data
+        const ReferenceIndex& refIndex = (*indexIter);
+        const BamBinMap& binMap = refIndex.Bins;
+        const LinearOffsetVector& offsets = refIndex.Offsets;
+
+        // write number of bins
+        int32_t binCount = binMap.size();
+        if ( m_isBigEndian ) { SwapEndian_32(binCount); }
+        fwrite(&binCount, 4, 1, indexStream);
+
+        // iterate over bins
+        BamBinMap::const_iterator binIter = binMap.begin();
+        BamBinMap::const_iterator binEnd  = binMap.end();
+        for ( ; binIter != binEnd; ++binIter ) {
+
+            // get bin data (key and chunk vector)
+            uint32_t binKey = (*binIter).first;
+            const ChunkVector& binChunks = (*binIter).second;
+
+            // save BAM bin key
+            if ( m_isBigEndian ) { SwapEndian_32(binKey); }
+            fwrite(&binKey, 4, 1, indexStream);
+
+            // save chunk count
+            int32_t chunkCount = binChunks.size();
+            if ( m_isBigEndian ) { SwapEndian_32(chunkCount); }
+            fwrite(&chunkCount, 4, 1, indexStream);
+
+            // iterate over chunks
+            ChunkVector::const_iterator chunkIter = binChunks.begin();
+            ChunkVector::const_iterator chunkEnd  = binChunks.end();
+            for ( ; chunkIter != chunkEnd; ++chunkIter ) {
+
+                // get current chunk data
+                const Chunk& chunk = (*chunkIter);
+                uint64_t start = chunk.Start;
+                uint64_t stop  = chunk.Stop;
+
+                if ( m_isBigEndian ) {
+                    SwapEndian_64(start);
+                    SwapEndian_64(stop);
+                }
+
+                // save chunk offsets
+                fwrite(&start, 8, 1, indexStream);
+                fwrite(&stop,  8, 1, indexStream);
+            }
+        }
+
+        // write linear offsets size
+        int32_t offsetSize = offsets.size();
+        if ( m_isBigEndian ) { SwapEndian_32(offsetSize); }
+        fwrite(&offsetSize, 4, 1, indexStream);
+
+        // iterate over linear offsets
+        LinearOffsetVector::const_iterator offsetIter = offsets.begin();
+        LinearOffsetVector::const_iterator offsetEnd  = offsets.end();
+        for ( ; offsetIter != offsetEnd; ++offsetIter ) {
+
+            // write linear offset value
+            uint64_t linearOffset = (*offsetIter);
+            if ( m_isBigEndian ) { SwapEndian_64(linearOffset); }
+            fwrite(&linearOffset, 8, 1, indexStream);
+        }
+    }
+
+    // flush buffer and close file; the stream's error flag covers every fwrite
+    // above, so a full disk or similar failure is reported instead of ignored
+    bool writeOk = ( fflush(indexStream) == 0 );
+    writeOk = ( ferror(indexStream) == 0 ) && writeOk;
+    writeOk = ( fclose(indexStream) == 0 ) && writeOk;
+    if ( !writeOk ) {
+        printf("ERROR: Could not write index data to file.\n");
+        return false;
+    }
+    return true;
+}
+
+// #########################################################################################
+// #########################################################################################
+
+// -------------------------------------
+// BamToolsIndex implementation
+
+namespace BamTools {
+
+// one record of the BamTools (.bti) index
+// BamToolsIndex::Build fills it with the file offset at which a block of
+// alignments starts, plus the RefID and Position of that block's first alignment
+struct BamToolsIndexEntry {
+
+    // data members
+    int64_t Offset;     // file offset of the block start
+    int RefID;          // reference ID of the block's first alignment
+    int Position;       // position of the block's first alignment
+
+    // ctor - defaults give offset 0 with RefID and Position both -1
+    BamToolsIndexEntry(const uint64_t& offset = 0, const int& id = -1, const int& position = -1)
+        : Offset(offset), RefID(id), Position(position)
+    { }
+};
+
+// full index: entries in the order they were appended
+typedef vector<BamToolsIndexEntry> BamToolsIndexData;
+
+} // namespace BamTools
+
+// private implementation data for BamToolsIndex
+struct BamToolsIndex::BamToolsIndexPrivate {
+
+    // -------------------------
+    // data members
+
+    BamToolsIndexData m_indexData;  // index entries, in file order
+    BamToolsIndex* m_parent;        // the BamToolsIndex this data belongs to
+    int32_t m_blockSize;            // number of alignments summarized by one index entry
+
+    // -------------------------
+    // ctor & dtor
+
+    // block size starts at 1000; BamToolsIndex::Load overwrites it from the index file
+    BamToolsIndexPrivate(BamToolsIndex* parent) : m_parent(parent), m_blockSize(1000) { }
+
+    ~BamToolsIndexPrivate(void) { }
+
+    // -------------------------
+    // internal methods
+};
+
+// ctor - forwards file/reader/endianness to the BamIndex base and creates the private data
+BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian)
+    : BamIndex(bgzf, reader, isBigEndian)
+    , d(new BamToolsIndexPrivate(this))
+{ }
+
+// dtor - releases the private data created in the ctor
+BamToolsIndex::~BamToolsIndex(void)
+{
+    delete d;
+    d = 0;  // leave no dangling pointer behind
+}
+
+// Builds the in-memory BamTools index by scanning every alignment once.
+//
+// Alignments are grouped into consecutive blocks of d->m_blockSize; for each
+// completed block one entry is stored holding the file offset at which the
+// block starts and the RefID/Position of its first alignment. A trailing,
+// incomplete block produces no entry.
+//
+// Returns false if the reader or BGZF file is missing or not open; otherwise
+// returns the result of rewinding the reader after the scan.
+bool BamToolsIndex::Build(void) {
+
+    // be sure reader & BGZF file are valid & open for reading
+    if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen )
+        return false;
+
+    // move file pointer to beginning of alignments
+    m_reader->Rewind();
+
+    // plow through alignments, store block offsets
+    int32_t currentBlockCount  = 0;
+    int64_t blockStartOffset   = m_BGZF->Tell();
+    int     blockStartId       = -1;
+    int     blockStartPosition = -1;
+    BamAlignment al;
+    while ( m_reader->GetNextAlignmentCore(al) ) {
+
+        // set reference flag
+        // (a negative RefID - e.g. an alignment with no reference - or one past
+        //  the end of m_references must not be used as an index)
+        if ( (al.RefID >= 0) && (al.RefID < (int)m_references.size()) )
+            m_references[al.RefID].RefHasAlignments = true;
+
+        // if beginning of block, save first alignment's refID & position
+        if ( currentBlockCount == 0 ) {
+            blockStartId       = al.RefID;
+            blockStartPosition = al.Position;
+        }
+
+        // increment block counter
+        ++currentBlockCount;
+
+        // if block is full, get offset for next block, reset currentBlockCount
+        if ( currentBlockCount == d->m_blockSize ) {
+
+            d->m_indexData.push_back( BamToolsIndexEntry(blockStartOffset, blockStartId, blockStartPosition) );
+            blockStartOffset  = m_BGZF->Tell();
+            currentBlockCount = 0;
+        }
+    }
+
+    return m_reader->Rewind();
+}
+
+// finds the single file offset from which to start looking for 'region'
+// N.B. - ignores isRightBoundSpecified
+//
+// 'offsets' is cleared and, on success, receives exactly one value: the offset
+// of the last index entry that does not lie past the region's left bound.
+// Returns false if the index is empty or no usable entry was found.
+bool BamToolsIndex::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets) {
+
+    // return false if no index data present
+    const BamToolsIndexData& entries = d->m_indexData;
+    if ( entries.empty() ) return false;
+
+    // clear any prior data
+    offsets.clear();
+
+    // walk the entries in order, remembering the latest offset that is
+    // still at or before the start of the desired region
+    int64_t bestOffset = -1;
+    for ( size_t i = 0; i < entries.size(); ++i ) {
+
+        const BamToolsIndexEntry& entry = entries[i];
+
+        // stop as soon as an entry lies 'past' the beginning of the region
+        const bool onLaterReference     = ( entry.RefID > region.LeftRefID );
+        const bool laterOnSameReference = ( entry.RefID == region.LeftRefID ) && ( entry.Position > region.LeftPosition );
+        if ( onLaterReference || laterOnSameReference ) break;
+
+        bestOffset = entry.Offset;
+    }
+
+    // store offset & return success, unless no index entry qualified
+    if ( bestOffset != -1 ) {
+        offsets.push_back(bestOffset);
+        return true;
+    }
+    return false;
+}
+
+// reads one fixed-size element of a .bti file; returns false on a short read
+static bool ReadBtiElement(void* destination, const size_t elementSize, FILE* stream) {
+    return ( fread(destination, elementSize, 1, stream) == 1 );
+}
+
+// shared failure path for BamToolsIndex::Load: report, close the stream, signal failure
+static bool AbortBtiLoad(FILE* stream) {
+    printf("Problem with index file - unexpected end of data.\n");
+    fclose(stream);
+    return false;
+}
+
+// Loads a BamTools-format (.bti) index from 'filename' into memory.
+//
+// Reads the block size, then every (offset, RefID, position) entry, appending
+// each to d->m_indexData and flagging the entry's reference as having alignments.
+//
+// Returns false if the file cannot be opened, does not start with the "BTI\1"
+// magic number, or ends before all announced entries have been read. In the
+// last case, entries read before the truncation point remain in d->m_indexData.
+bool BamToolsIndex::Load(const string& filename) {
+
+    // open index file, abort on error
+    FILE* indexStream = fopen(filename.c_str(), "rb");
+    if( !indexStream ) {
+        printf("ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str());
+        return false;
+    }
+
+    // see if index is valid BAM index (a short read of the magic number is also invalid)
+    char magic[4];
+    if ( (fread(magic, 1, 4, indexStream) != 4) || strncmp(magic, "BTI\1", 4) ) {
+        printf("Problem with index file - invalid format.\n");
+        fclose(indexStream);
+        return false;
+    }
+
+    // read in block size
+    if ( !ReadBtiElement(&d->m_blockSize, sizeof(d->m_blockSize), indexStream) ) return AbortBtiLoad(indexStream);
+    if ( m_isBigEndian ) { SwapEndian_32(d->m_blockSize); }
+
+    // read in number of offsets
+    uint32_t numOffsets;
+    if ( !ReadBtiElement(&numOffsets, sizeof(numOffsets), indexStream) ) return AbortBtiLoad(indexStream);
+    if ( m_isBigEndian ) { SwapEndian_32(numOffsets); }
+
+    // reserve space for index data
+    d->m_indexData.reserve(numOffsets);
+
+    // iterate over index entries
+    for ( unsigned int i = 0; i < numOffsets; ++i ) {
+
+        uint64_t offset;
+        int id;
+        int position;
+
+        // read in data
+        if ( !ReadBtiElement(&offset,   sizeof(offset),   indexStream) ) return AbortBtiLoad(indexStream);
+        if ( !ReadBtiElement(&id,       sizeof(id),       indexStream) ) return AbortBtiLoad(indexStream);
+        if ( !ReadBtiElement(&position, sizeof(position), indexStream) ) return AbortBtiLoad(indexStream);
+
+        // swap endian-ness if necessary
+        if ( m_isBigEndian ) {
+            SwapEndian_64(offset);
+            SwapEndian_32(id);
+            SwapEndian_32(position);
+        }
+
+        // save reference index entry
+        d->m_indexData.push_back( BamToolsIndexEntry(offset, id, position) );
+
+        // set reference flag
+        // ('id' comes straight from the file: only use it as an index if it
+        //  actually names an entry of m_references)
+        if ( (id >= 0) && (id < (int)m_references.size()) )
+            m_references[id].RefHasAlignments = true;   // what about sparse references? wont be able to set flag?
+    }
+
+    // close index file and return
+    fclose(indexStream);
+    return true;
+}
+
+// writes in-memory index data out to file
+// N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
+//
+// Output is bamFilename + ".bti": the "BTI\1" magic number, the block size, the
+// entry count, then each entry as (offset, RefID, position). Values are
+// byte-swapped first when m_isBigEndian is set.
+//
+// Returns false if the file cannot be opened, or if any write, the final flush
+// or the close reports an error.
+bool BamToolsIndex::Write(const std::string& bamFilename) {
+
+    string indexFilename = bamFilename + ".bti";
+    FILE* indexStream = fopen(indexFilename.c_str(), "wb");
+    if ( indexStream == 0 ) {
+        printf("ERROR: Could not open file to save index.\n");
+        return false;
+    }
+
+    // write BAM index header
+    fwrite("BTI\1", 1, 4, indexStream);
+
+    // write block size
+    int32_t blockSize = d->m_blockSize;
+    if ( m_isBigEndian ) { SwapEndian_32(blockSize); }
+    fwrite(&blockSize, sizeof(blockSize), 1, indexStream);
+
+    // write number of offset entries
+    uint32_t numOffsets = d->m_indexData.size();
+    if ( m_isBigEndian ) { SwapEndian_32(numOffsets); }
+    fwrite(&numOffsets, sizeof(numOffsets), 1, indexStream);
+
+    // iterate over offset entries
+    BamToolsIndexData::const_iterator indexIter = d->m_indexData.begin();
+    BamToolsIndexData::const_iterator indexEnd  = d->m_indexData.end();
+    for ( ; indexIter != indexEnd; ++ indexIter ) {
+
+        // get reference index data
+        const BamToolsIndexEntry& entry = (*indexIter);
+
+        // copy entry data (local copies, so swapping never touches the in-memory index)
+        uint64_t offset = entry.Offset;
+        int id          = entry.RefID;
+        int position    = entry.Position;
+
+        // swap endian-ness if necessary
+        if ( m_isBigEndian ) {
+            SwapEndian_64(offset);
+            SwapEndian_32(id);
+            SwapEndian_32(position);
+        }
+
+        // write the reference index entry
+        fwrite(&offset,   sizeof(offset),   1, indexStream);
+        fwrite(&id,       sizeof(id),       1, indexStream);
+        fwrite(&position, sizeof(position), 1, indexStream);
+    }
+
+    // flush file buffer and close file; the stream's error flag covers every
+    // fwrite above, so a full disk or similar failure is reported instead of ignored
+    bool writeOk = ( fflush(indexStream) == 0 );
+    writeOk = ( ferror(indexStream) == 0 ) && writeOk;
+    writeOk = ( fclose(indexStream) == 0 ) && writeOk;
+    if ( !writeOk ) {
+        printf("ERROR: Could not write index data to file.\n");
+        return false;
+    }
+    return true;
+}
--- /dev/null
+// ***************************************************************************
+// BamIndex.h (c) 2009 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 17 August 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides index functionality - both for the default (standardized) BAM
+// index format (.bai) as well as a BamTools-specific (nonstandard) index
+// format (.bti).
+// ***************************************************************************
+
+#ifndef BAM_INDEX_H
+#define BAM_INDEX_H
+
+#include <string>
+#include <vector>
+#include "BamAux.h"
+
+namespace BamTools {
+
+class BamReader;
+class BgzfData;
+
+// --------------------------------------------------
+// BamIndex base class
+//
+// Abstract interface for BAM index implementations: Build, GetOffsets, Load
+// and Write are pure virtual and must be supplied by a derived class, while
+// HasAlignments is virtual but not pure, so the base class defines it.
+class BamIndex {
+
+ public:
+ // stores the BGZF file, the reader and the endianness flag used by the index
+ // NOTE(review): the pointers are presumably non-owning - confirm in BamIndex.cpp
+ BamIndex(BamTools::BgzfData* bgzf,
+ BamTools::BamReader* reader,
+ bool isBigEndian);
+ // virtual, so that a derived index can be destroyed through a BamIndex pointer
+ virtual ~BamIndex(void) { }
+
+ public:
+ // creates index data (in-memory) from current reader data
+ virtual bool Build(void) =0;
+ // calculates offset(s) for a given region
+ // results are delivered through 'offsets'; the bool return reports success/fail
+ virtual bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector<int64_t>& offsets) =0;
+ // loads existing data from file into memory
+ virtual bool Load(const std::string& filename) =0;
+ // returns whether reference has alignments or no
+ virtual bool HasAlignments(const int& referenceID);
+ // writes in-memory index data out to file
+ // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
+ virtual bool Write(const std::string& bamFilename) =0;
+
+ // state shared with derived index classes
+ protected:
+ BamTools::BgzfData* m_BGZF; // BGZF file handed to the ctor
+ BamTools::BamReader* m_reader; // reader handed to the ctor
+ BamTools::RefVector m_references; // per-reference data (derived classes set RefHasAlignments on it)
+ bool m_isBigEndian; // when true, derived classes byte-swap values read from / written to index files
+};
+
+// --------------------------------------------------
+// BamDefaultIndex class
+//
+// implements default (per SAM/BAM spec) index file ops
+// (index file extension: .bai)
+class BamDefaultIndex : public BamIndex {
+
+
+ // ctor & dtor
+ public:
+ // arguments are the same as for the BamIndex ctor
+ BamDefaultIndex(BamTools::BgzfData* bgzf,
+ BamTools::BamReader* reader,
+ bool isBigEndian);
+ ~BamDefaultIndex(void);
+
+ // interface (implements BamIndex virtual methods)
+ public:
+ // creates index data (in-memory) from current reader data
+ bool Build(void);
+ // calculates offset(s) for a given region
+ bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector<int64_t>& offsets);
+ // loads existing data from file into memory
+ bool Load(const std::string& filename);
+ // writes in-memory index data out to file
+ // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
+ bool Write(const std::string& bamFilename);
+
+ // internal implementation
+ // (the struct is only forward-declared here; its definition is not part of this header)
+ private:
+ struct BamDefaultIndexPrivate;
+ BamDefaultIndexPrivate* d;
+};
+
+// --------------------------------------------------
+// BamToolsIndex class
+//
+// implements BamTools-specific index file ops
+// (nonstandard format, index file extension: .bti)
+class BamToolsIndex : public BamIndex {
+
+ // ctor & dtor
+ public:
+ // arguments are the same as for the BamIndex ctor
+ BamToolsIndex(BamTools::BgzfData* bgzf,
+ BamTools::BamReader* reader,
+ bool isBigEndian);
+ ~BamToolsIndex(void);
+
+ // interface (implements BamIndex virtual methods)
+ public:
+ // creates index data (in-memory) from current reader data
+ bool Build(void);
+ // calculates offset(s) for a given region
+ bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector<int64_t>& offsets);
+ // loads existing data from file into memory
+ bool Load(const std::string& filename);
+ // writes in-memory index data out to file
+ // N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
+ bool Write(const std::string& bamFilename);
+
+ // internal implementation
+ // (the struct is only forward-declared here; its definition is not part of this header)
+ private:
+ struct BamToolsIndexPrivate;
+ BamToolsIndexPrivate* d;
+};
+
+} // namespace BamTools
+
+#endif // BAM_INDEX_H
\ No newline at end of file
--- /dev/null
+// ***************************************************************************
+// BamMultiReader.cpp (c) 2010 Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 20 July 2010 (DB)
+// ---------------------------------------------------------------------------
+// Uses BGZF routines that were adapted from the bgzf.c code developed at the Broad
+// Institute.
+// ---------------------------------------------------------------------------
+// Functionality for simultaneously reading multiple BAM files.
+//
+// This functionality allows applications to work on very large sets of files
+// without requiring intermediate merge, sort, and index steps for each file
+// subset. It also improves the performance of our merge system as it
+// precludes the need to sort merged files.
+// ***************************************************************************
+
+// C++ includes
+#include <algorithm>
+#include <iterator>
+#include <string>
+#include <vector>
+#include <iostream>
+#include <sstream>
+
+// BamTools includes
+#include "BGZF.h"
+#include "BamMultiReader.h"
+using namespace BamTools;
+using namespace std;
+
+// -----------------------------------------------------
+// BamMultiReader implementation
+// -----------------------------------------------------
+
+// constructor - no readers yet; current reference ID and left position both start at 0
+BamMultiReader::BamMultiReader(void)
+    : CurrentRefID(0), CurrentLeft(0)
+{
+}
+
+// destructor - closes all BAM files, then frees each reader and its alignment buffer
+BamMultiReader::~BamMultiReader(void) {
+
+    // close the bam files
+    Close();
+
+    // clean up reader objects and the alignment object paired with each one
+    for ( size_t i = 0; i < readers.size(); ++i ) {
+        delete readers[i].first;
+        delete readers[i].second;
+    }
+}
+
+// closes every BAM file held by this multireader (the reader objects themselves are kept)
+void BamMultiReader::Close(void) {
+    for ( size_t i = 0; i < readers.size(); ++i )
+        readers[i].first->Close();
+}
+
+// updates the reference id stored in the BamMultiReader
+// to reflect the current state of the readers
+//
+// The alignment index is ordered by (RefID, Position), so its first entry
+// carries the lowest reference ID among all pending alignments; that ID becomes
+// CurrentRefID. Does nothing when no alignments are pending.
+void BamMultiReader::UpdateReferenceID(void) {
+
+    // nothing to track once every reader is exhausted
+    if ( alignments.empty() ) return;
+
+    // take the ID directly instead of counting up to it: counting never
+    // terminates properly when the front ID is lower than CurrentRefID
+    // (e.g. a negative RefID), as it would have to overflow to reach it
+    CurrentRefID = alignments.begin()->second.second->RefID;
+}
+
+// reports whether at least one reader still has a pending alignment in the alignment index
+bool BamMultiReader::HasOpenReaders() {
+    return !alignments.empty();
+}
+
+// retrieves the next alignment, in (RefID, Position) order, across all files
+// returns false once no reader has any alignment left
+bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) {
+
+    // all files at EOF - no more alignments to process
+    if ( !HasOpenReaders() ) return false;
+
+    // keep our current reference sequence id in step with the readers
+    UpdateReferenceID();
+
+    // the front of the alignment index holds the lowest alignment and its reader
+    AlignmentIndex::iterator lowest = alignments.begin();
+    BamReader*    sourceReader = lowest->second.first;
+    BamAlignment* buffered     = lowest->second.second;
+
+    // hand a copy of that alignment to the caller
+    nextAlignment = *buffered;
+
+    // drop the consumed entry from the alignment index
+    alignments.erase(lowest);
+
+    // refill the buffer from the same reader; re-index it unless that reader is exhausted
+    if ( sourceReader->GetNextAlignment(*buffered) ) {
+        alignments.insert( make_pair( make_pair(buffered->RefID, buffered->Position),
+                                      make_pair(sourceReader, buffered) ) );
+    }
+
+    return true;
+}
+
+// retrieves the next alignment, in (RefID, Position) order, across all files,
+// without parsing character data from alignments
+// returns false once no reader has any alignment left
+bool BamMultiReader::GetNextAlignmentCore(BamAlignment& nextAlignment) {
+
+    // all files at EOF - no more alignments to process
+    if ( !HasOpenReaders() ) return false;
+
+    // keep our current reference sequence id in step with the readers
+    UpdateReferenceID();
+
+    // the front of the alignment index holds the lowest alignment and its reader
+    AlignmentIndex::iterator lowest = alignments.begin();
+    BamReader*    sourceReader = lowest->second.first;
+    BamAlignment* buffered     = lowest->second.second;
+
+    // hand a copy of that alignment to the caller
+    nextAlignment = *buffered;
+
+    // drop the consumed entry from the alignment index
+    alignments.erase(lowest);
+
+    // refill the buffer from the same reader; re-index it unless that reader is exhausted
+    if ( sourceReader->GetNextAlignmentCore(*buffered) ) {
+        alignments.insert( make_pair( make_pair(buffered->RefID, buffered->Position),
+                                      make_pair(sourceReader, buffered) ) );
+    }
+
+    return true;
+}
+
+// jumps to specified region(refID, leftBound) in BAM files, returns success/fail
+// N.B. - a failed jump in any single file prints an error and terminates the program,
+//        so a normal return is always 'true'
+bool BamMultiReader::Jump(int refID, int position) {
+
+    // remember where we are headed
+    CurrentRefID = refID;
+    CurrentLeft  = position;
+
+    // move every reader; give up entirely on the first one that cannot jump
+    for ( size_t i = 0; i < readers.size(); ++i ) {
+        BamReader* reader = readers[i].first;
+        if ( !reader->Jump(refID, position) ) {
+            cerr << "ERROR: could not jump " << reader->GetFilename() << " to " << refID << ":" << position << endl;
+            exit(1);
+        }
+    }
+
+    // all readers repositioned - reload the first alignment from each
+    UpdateAlignments();
+    return true;
+}
+
+// convenience overload: packs the four bounds into a BamRegion and forwards to SetRegion(const BamRegion&)
+bool BamMultiReader::SetRegion(const int& leftRefID, const int& leftPosition, const int& rightRefID, const int& rightPosition) {
+    return SetRegion( BamRegion(leftRefID, leftPosition, rightRefID, rightPosition) );
+}
+
+// restricts every reader to 'region' and reloads the alignment index; always returns true
+bool BamMultiReader::SetRegion(const BamRegion& region) {
+
+    Region = region;
+
+    // NB: the result of each reader's SetRegion is deliberately ignored.
+    // In practice a failure there means "no alignments here", so we simply
+    // accept it, call UpdateAlignments(), and continue.
+    for ( size_t i = 0; i < readers.size(); ++i )
+        readers[i].first->SetRegion(region);
+
+    UpdateAlignments();
+    return true;
+}
+
+// rebuilds the alignment index from scratch: reads one alignment from each
+// reader and indexes it by (RefID, Position)
+void BamMultiReader::UpdateAlignments(void) {
+
+    // discard whatever was indexed before
+    alignments.clear();
+
+    for ( size_t i = 0; i < readers.size(); ++i ) {
+        BamReader*    reader = readers[i].first;
+        BamAlignment* buffer = readers[i].second;
+
+        // a reader that yields nothing is assumed to be at end of region / EOF
+        if ( !reader->GetNextAlignment(*buffer) ) continue;
+
+        alignments.insert( make_pair( make_pair(buffer->RefID, buffer->Position),
+                                      make_pair(reader, buffer) ) );
+    }
+}
+
+// opens BAM files
+//
+// filenames       - BAM files to open; the list is also saved in 'fileNames'
+// openIndexes     - if true, each file is opened together with an index file
+// coreMode        - if true, the first alignment of each file is read with
+//                   GetNextAlignmentCore() instead of GetNextAlignment()
+// useDefaultIndex - selects the index file name: filename + ".bai" when true,
+//                   filename + ".bti" otherwise (only used if openIndexes is set)
+//
+// Returns false as soon as any file fails to open, or if the only file given
+// yields no first alignment. A file that opens but yields no alignment is
+// otherwise skipped with a warning. Readers stored before a failure stay in
+// 'readers' and are released by the destructor.
+bool BamMultiReader::Open(const vector<string> filenames, bool openIndexes, bool coreMode, bool useDefaultIndex) {
+
+    // save filenames in our multireader
+    fileNames = filenames;
+
+    // for filename in filenames
+    for (vector<string>::const_iterator it = filenames.begin(); it != filenames.end(); ++it) {
+        const string& filename = *it;
+        BamReader* reader = new BamReader;
+
+        bool openedOK = true;
+        if (openIndexes) {
+            if (useDefaultIndex)
+                openedOK = reader->Open(filename, filename + ".bai");
+            else
+                openedOK = reader->Open(filename, filename + ".bti");
+        } else {
+            openedOK = reader->Open(filename); // for merging, jumping is disallowed
+        }
+
+        // TODO; any more error handling on openedOK ??
+        if ( !openedOK ) {
+            // this reader was never stored in 'readers', so free it here
+            delete reader;
+            return false;
+        }
+
+        // file opened ok, check that it can be read
+        BamAlignment* alignment = new BamAlignment;
+        const bool fileOK = coreMode ? reader->GetNextAlignmentCore(*alignment)
+                                     : reader->GetNextAlignment(*alignment);
+
+        if (fileOK) {
+            readers.push_back(make_pair(reader, alignment)); // store pointers to our readers for cleanup
+            alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position),
+                                        make_pair(reader, alignment)));
+        } else {
+            cerr << "WARNING: could not read first alignment in " << filename << ", ignoring file" << endl;
+
+            // the ignored file's reader and buffer are not tracked anywhere else:
+            // close and free them now instead of leaking them
+            reader->Close();
+            delete alignment;
+            delete reader;
+
+            // if only file available & could not be read, return failure
+            if ( filenames.size() == 1 ) return false;
+        }
+    }
+
+    // files opened ok, at least one alignment could be read,
+    // now need to check that all files use same reference data
+    ValidateReaders();
+    return true;
+}
+
+void BamMultiReader::PrintFilenames(void) {
+    // one filename per line on stdout, in the order the readers were stored
+    vector<pair<BamReader*, BamAlignment*> >::iterator entry = readers.begin();
+    while ( entry != readers.end() ) {
+        cout << entry->first->GetFilename() << endl;
+        ++entry;
+    }
+}
+
+// for debugging: writes one "refID:position filename" line to stderr per cached alignment
+void BamMultiReader::DumpAlignmentIndex(void) {
+    AlignmentIndex::const_iterator entry = alignments.begin();
+    for ( ; entry != alignments.end(); ++entry ) {
+        const pair<int, int>& sortKey = entry->first;
+        BamReader* reader = entry->second.first;
+        cerr << sortKey.first << ":" << sortKey.second << " " << reader->GetFilename() << endl;
+    }
+}
+
+// returns BAM file pointers to beginning of alignment data
+// (every reader is rewound even if an earlier one fails; true only when all succeeded)
+bool BamMultiReader::Rewind(void) {
+    bool allRewound = true;
+    vector<pair<BamReader*, BamAlignment*> >::iterator entry = readers.begin();
+    for ( ; entry != readers.end(); ++entry ) {
+        if ( !entry->first->Rewind() ) allRewound = false;
+    }
+    return allRewound;
+}
+
+// builds and saves an index (".bai" when useDefaultIndex, otherwise ".bti") for every open reader
+// (all readers are attempted even if one fails; true only when all succeeded)
+bool BamMultiReader::CreateIndexes(bool useDefaultIndex) {
+    bool allCreated = true;
+    vector<pair<BamReader*, BamAlignment*> >::iterator entry = readers.begin();
+    for ( ; entry != readers.end(); ++entry ) {
+        if ( !entry->first->CreateIndex(useDefaultIndex) ) allCreated = false;
+    }
+    return allCreated;
+}
+
+// makes a virtual, unified header for all the bam files in the multireader
+//
+// The merged text consists of:
+//   - the @HD and @SQ lines of the first reader only, and
+//   - the @RG lines of every reader, each read-group ID emitted once.
+// All other header lines are dropped. A warning is printed to stderr when a
+// single file repeats an @RG ID; repeats across different files are silent,
+// since merging files with identical read groups is a normal use case.
+const string BamMultiReader::GetHeaderText(void) const {
+
+    string mergedHeader = "";
+    map<string, bool> readGroups;   // read-group IDs already emitted
+
+    // foreach extraction entry (each BAM file)
+    for (vector<pair<BamReader*, BamAlignment*> >::const_iterator rs = readers.begin(); rs != readers.end(); ++rs) {
+
+        map<string, bool> currentFileReadGroups;   // IDs first seen in this file
+
+        BamReader* reader = rs->first;
+
+        stringstream header(reader->GetHeaderText());
+        string headerLine;
+        while ( getline(header, headerLine) ) {
+
+            // skip empty lines
+            if ( headerLine.empty() ) { continue; }
+
+            // if first file, save HD & SQ entries
+            if ( rs == readers.begin() ) {
+                if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) {
+                    mergedHeader.append(headerLine.c_str());
+                    mergedHeader.append(1, '\n');
+                }
+            }
+
+            // (for all files) append RG entries if they are unique
+            if ( headerLine.find("@RG") == 0 ) {
+
+                // locate the tab-separated "ID:<value>" field; the value is everything
+                // after the first ':' so that IDs which themselves contain ':' stay intact
+                string readGroup;
+                stringstream headerLineSs(headerLine);
+                string part;
+                while ( std::getline(headerLineSs, part, '\t') ) {
+                    if ( part.compare(0, 3, "ID:") == 0 ) {
+                        readGroup = part.substr(3);
+                        break;
+                    }
+                }
+
+                if (readGroups.find(readGroup) == readGroups.end()) { // prevents duplicate @RG entries
+                    mergedHeader.append(headerLine.c_str() );
+                    mergedHeader.append(1, '\n');
+                    readGroups[readGroup] = true;
+                    currentFileReadGroups[readGroup] = true;
+                } else {
+                    // warn iff we are reading one file and discover duplicated @RG tags in the header
+                    // otherwise, we emit no warning, as we might be merging multiple BAM files with identical @RG tags
+                    if (currentFileReadGroups.find(readGroup) != currentFileReadGroups.end()) {
+                        cerr << "WARNING: duplicate @RG tag " << readGroup
+                             << " entry in header of " << reader->GetFilename() << endl;
+                    }
+                }
+            }
+        }
+    }
+
+    // return merged header text
+    return mergedHeader;
+}
+
+// ValidateReaders checks that all the readers point to BAM files representing
+// alignments against the same set of reference sequences, and that the
+// sequences are identically ordered. If these checks fail the operation of
+// the multireader is undefined, so we force program exit.
+// With no readers there is nothing to compare and the function returns quietly.
+void BamMultiReader::ValidateReaders(void) const {
+
+    // readers.front() is undefined on an empty vector
+    if ( readers.empty() ) return;
+
+    // the first reader is the yardstick every reader is compared against
+    const int firstRefCount = readers.front().first->GetReferenceCount();
+    const BamTools::RefVector& firstRefData = readers.front().first->GetReferenceData();
+
+    for (vector<pair<BamReader*, BamAlignment*> >::const_iterator it = readers.begin(); it != readers.end(); ++it) {
+        BamReader* reader = it->first;
+        const BamTools::RefVector& currentRefData = reader->GetReferenceData();
+        BamTools::RefVector::const_iterator f = firstRefData.begin();
+        BamTools::RefVector::const_iterator c = currentRefData.begin();
+        if (reader->GetReferenceCount() != firstRefCount || firstRefData.size() != currentRefData.size()) {
+            cerr << "ERROR: mismatched number of references in " << reader->GetFilename()
+                 << " expected " << firstRefCount
+                 << " reference sequences but only found " << reader->GetReferenceCount() << endl;
+            exit(1);
+        }
+        // this will be ok; we just checked above that we have identically-sized sets of references
+        // here we simply check if they are all, in fact, equal in content
+        while (f != firstRefData.end()) {
+            if (f->RefName != c->RefName || f->RefLength != c->RefLength) {
+                cerr << "ERROR: mismatched references found in " << reader->GetFilename()
+                     << " expected: " << endl;
+                for (BamTools::RefVector::const_iterator a = firstRefData.begin(); a != firstRefData.end(); ++a)
+                    cerr << a->RefName << " " << a->RefLength << endl;
+                cerr << "but found: " << endl;
+                for (BamTools::RefVector::const_iterator a = currentRefData.begin(); a != currentRefData.end(); ++a)
+                    cerr << a->RefName << " " << a->RefLength << endl;
+                exit(1);
+            }
+            ++f; ++c;
+        }
+    }
+}
+
+// NB: The following functions assume that we have identical references for all
+// BAM files. We enforce this by invoking the above validation function
+// (ValidateReaders) to verify that our reference data is the same across all
+// files on Open, so we will not encounter a situation in which there is a
+// mismatch and we are still live.
+
+// returns the number of reference sequences, as reported by the first reader
+// (0 when no reader is open, rather than dereferencing an empty reader list)
+const int BamMultiReader::GetReferenceCount(void) const {
+    if ( readers.empty() ) return 0;
+    return readers.front().first->GetReferenceCount();
+}
+
+// returns vector of reference objects, as reported by the first reader
+// (an empty vector when no reader is open, rather than dereferencing an empty reader list)
+const BamTools::RefVector BamMultiReader::GetReferenceData(void) const {
+    if ( readers.empty() ) return BamTools::RefVector();
+    return readers.front().first->GetReferenceData();
+}
+
+// returns the reference id for the given reference name, as resolved by the first reader
+// (BamReader reports "not found" as the number of references; with no reader open
+//  that count is 0, so 0 is returned instead of dereferencing an empty reader list)
+const int BamMultiReader::GetReferenceID(const string& refName) const {
+    if ( readers.empty() ) return 0;
+    return readers.front().first->GetReferenceID(refName);
+}
--- /dev/null
+// ***************************************************************************\r
+// BamMultiReader.h (c) 2010 Erik Garrison\r
+// Marth Lab, Department of Biology, Boston College\r
+// All rights reserved.\r
+// ---------------------------------------------------------------------------\r
+// Last modified: 20 July 2010 (DB)\r
+// ---------------------------------------------------------------------------\r
+// Functionality for simultaneously reading multiple BAM files\r
+// ***************************************************************************\r
+\r
+#ifndef BAMMULTIREADER_H\r
+#define BAMMULTIREADER_H\r
+\r
+// C++ includes\r
+#include <string>\r
+#include <map>\r
+#include <utility> // for pair\r
+#include <sstream>\r
+\r
+using namespace std;\r
+\r
+// BamTools includes\r
+#include "BamAux.h"\r
+#include "BamReader.h"\r
+\r
+namespace BamTools {\r
+\r
+// index mapping reference/position pairings to bamreaders and their alignments\r
+typedef multimap<pair<int, int>, pair<BamReader*, BamAlignment*> > AlignmentIndex;\r
+\r
+\r
+class BamMultiReader {\r
+\r
+ // constructor / destructor\r
+ public:\r
+ BamMultiReader(void);\r
+ ~BamMultiReader(void);\r
+\r
+ // public interface\r
+ public:\r
+\r
+ // positioning (public state; NOTE(review): meaning not established by the code visible here - confirm before relying on it)\r
+ int CurrentRefID;\r
+ int CurrentLeft;\r
+\r
+ // region under analysis, specified using SetRegion\r
+ BamRegion Region;\r
+\r
+ // ----------------------\r
+ // BAM file operations\r
+ // ----------------------\r
+\r
+ // close BAM files\r
+ void Close(void);\r
+\r
+ // opens BAM files (and, when openIndexes is set, the index file of each: filename + ".bai" or filename + ".bti")\r
+ // @openIndexes - triggers index opening, useful for suppressing\r
+ // error messages during merging of files in which we may not have\r
+ // indexes.\r
+ // @coreMode - setup our first alignments using GetNextAlignmentCore();\r
+ // also useful for merging. @useDefaultIndex - true selects the ".bai" index, false the BamTools ".bti" index\r
+ bool Open(const vector<string> filenames, bool openIndexes = true, bool coreMode = false, bool useDefaultIndex = true);\r
+\r
+ // performs random-access jump to reference, position\r
+ bool Jump(int refID, int position = 0);\r
+\r
+ // sets the target region on every reader, then refreshes the alignment cache\r
+ bool SetRegion(const BamRegion& region);\r
+ bool SetRegion(const int&, const int&, const int&, const int&); // convenience function to above\r
+\r
+ // returns file pointers to beginning of alignments (true only if every reader rewound successfully)\r
+ bool Rewind(void);\r
+\r
+ // ----------------------\r
+ // access alignment data\r
+ // ----------------------\r
+ // updates the reference id marker to match the lower limit of our readers\r
+ void UpdateReferenceID(void);\r
+\r
+ // retrieves next available alignment (returns success/fail) from all files\r
+ bool GetNextAlignment(BamAlignment&);\r
+ // retrieves next available alignment (returns success/fail) from all files\r
+ // and populates the support data with information about the alignment\r
+ // *** BUT DOES NOT PARSE CHARACTER DATA FROM THE ALIGNMENT\r
+ bool GetNextAlignmentCore(BamAlignment&);\r
+ // ... should this be private?\r
+ bool HasOpenReaders(void);\r
+\r
+ // ----------------------\r
+ // access auxiliary data\r
+ // ----------------------\r
+\r
+ // returns unified SAM header text for all files (@HD/@SQ from the first file, unique @RG lines from all files)\r
+ const string GetHeaderText(void) const;\r
+ // returns number of reference sequences (taken from the first reader)\r
+ const int GetReferenceCount(void) const;\r
+ // returns vector of reference objects (taken from the first reader)\r
+ const BamTools::RefVector GetReferenceData(void) const;\r
+ // returns reference id (used for BamMultiReader::Jump()) for the given reference name (resolved by the first reader)\r
+ const int GetReferenceID(const std::string& refName) const;\r
+ // validates that we have a congruent set of BAM files that are aligned against the same reference sequences; exits the program on mismatch\r
+ void ValidateReaders() const;\r
+\r
+ // ----------------------\r
+ // BAM index operations\r
+ // ----------------------\r
+\r
+ // creates an index for every open BAM file (no check for an existing one) and saves it to file (default = bamFilename + ".bai")\r
+ bool CreateIndexes(bool useDefaultIndex = true);\r
+\r
+ //const int GetReferenceID(const string& refName) const;\r
+\r
+ // utility\r
+ void PrintFilenames(void);\r
+ void DumpAlignmentIndex(void);\r
+ void UpdateAlignments(void); // rebuilds our alignment cache by reading the next alignment from every reader\r
+\r
+ // private implementation\r
+ private:\r
+\r
+ // the set of readers and alignments which we operate on, maintained throughout the life of this class (raw pointers, stored for cleanup)\r
+ vector<pair<BamReader*, BamAlignment*> > readers;\r
+\r
+ // readers and alignments sorted by reference id and position, to keep track of the lowest (next) alignment\r
+ // when a reader reaches EOF, its entry is removed from this index\r
+ AlignmentIndex alignments;\r
+\r
+ vector<string> fileNames;\r
+};\r
+\r
+} // namespace BamTools\r
+\r
+#endif // BAMMULTIREADER_H\r
--- /dev/null
+// ***************************************************************************\r
+// BamReader.cpp (c) 2009 Derek Barnett, Michael Str�mberg\r
+// Marth Lab, Department of Biology, Boston College\r
+// All rights reserved.\r
+// ---------------------------------------------------------------------------\r
+// Last modified: 15 July 2010 (DB)\r
+// ---------------------------------------------------------------------------\r
+// Uses BGZF routines that were adapted from the bgzf.c code developed at the Broad\r
+// Institute.\r
+// ---------------------------------------------------------------------------\r
+// Provides the basic functionality for reading BAM files\r
+// ***************************************************************************\r
+\r
+// C++ includes\r
+#include <algorithm>\r
+#include <iterator>\r
+#include <string>\r
+#include <vector>\r
+#include <iostream>\r
+\r
+// BamTools includes\r
+#include "BGZF.h"\r
+#include "BamReader.h"\r
+#include "BamIndex.h"\r
+using namespace BamTools;\r
+using namespace std;\r
+\r
+struct BamReader::BamReaderPrivate {\r
+\r
+ // -------------------------------\r
+ // structs, enums, typedefs\r
+ // -------------------------------\r
+ enum RegionState { BEFORE_REGION = 0\r
+ , WITHIN_REGION\r
+ , AFTER_REGION\r
+ };\r
+\r
+ // -------------------------------\r
+ // data members\r
+ // -------------------------------\r
+\r
+ // general file data (NewIndex is allocated by LoadIndex/CreateIndex and released by ClearIndex; 0 means no index)\r
+ BgzfData mBGZF;\r
+ string HeaderText;\r
+ //BamIndex Index;\r
+ BamIndex* NewIndex;\r
+ RefVector References;\r
+ bool IsIndexLoaded;\r
+ int64_t AlignmentsBeginOffset;\r
+ string Filename;\r
+ string IndexFilename;\r
+ \r
+ // system data (set once in the constructor from SystemIsBigEndian())\r
+ bool IsBigEndian;\r
+\r
+ // user-specified region values (the bound flags are set by BamReader::Jump and reset by Close)\r
+ BamRegion Region;\r
+ bool IsLeftBoundSpecified;\r
+ bool IsRightBoundSpecified;\r
+ \r
+ bool IsRegionSpecified;\r
+ int CurrentRefID;\r
+ int CurrentLeft;\r
+\r
+ // parent BamReader (passed on to the index objects this struct creates)\r
+ BamReader* Parent;\r
+ \r
+ // BAM character constants: lookup tables from 4-bit base code and from CIGAR op code to their characters\r
+ const char* DNA_LOOKUP;\r
+ const char* CIGAR_LOOKUP;\r
+\r
+ // -------------------------------\r
+ // constructor & destructor\r
+ // -------------------------------\r
+ BamReaderPrivate(BamReader* parent);\r
+ ~BamReaderPrivate(void);\r
+\r
+ // -------------------------------\r
+ // "public" interface\r
+ // -------------------------------\r
+\r
+ // file operations\r
+ void Close(void);\r
+ bool Jump(int refID, int position = 0);\r
+ bool Open(const string& filename, const string& indexFilename = "");\r
+ bool Rewind(void);\r
+ bool SetRegion(const BamRegion& region);\r
+\r
+ // access alignment data\r
+ bool GetNextAlignment(BamAlignment& bAlignment);\r
+ bool GetNextAlignmentCore(BamAlignment& bAlignment);\r
+\r
+ // access auxiliary data (GetReferenceID returns References.size() when the name is not found)\r
+ int GetReferenceID(const string& refName) const;\r
+\r
+ // index operations\r
+ bool CreateIndex(bool useDefaultIndex);\r
+\r
+ // -------------------------------\r
+ // internal methods\r
+ // -------------------------------\r
+\r
+ // *** reading alignments and auxiliary data *** //\r
+\r
+ // fills out character data (name, CIGAR, bases, qualities, tags) for BamAlignment data\r
+ bool BuildCharData(BamAlignment& bAlignment);\r
+ // classifies alignment as before, within, or after the current region\r
+ RegionState IsOverlap(BamAlignment& bAlignment);\r
+ // retrieves header text from BAM file\r
+ void LoadHeaderData(void);\r
+ // retrieves BAM alignment under file pointer\r
+ bool LoadNextAlignment(BamAlignment& bAlignment);\r
+ // builds reference data structure from BAM file\r
+ void LoadReferenceData(void);\r
+\r
+ // *** index file handling *** //\r
+\r
+ // clear out internal index data structure\r
+ void ClearIndex(void);\r
+ // loads index from BAM index file (index type chosen from IndexFilename: ".bai" or ".bti")\r
+ bool LoadIndex(void);\r
+};\r
+\r
+// -----------------------------------------------------\r
+// BamReader implementation (wrapper around BRPrivate)\r
+// -----------------------------------------------------\r
+// constructor: allocates the private implementation, handing it a back-pointer to this reader\r
+BamReader::BamReader(void)\r
+    : d(new BamReaderPrivate(this))\r
+{ }\r
+\r
+// destructor: releases the private implementation (whose own destructor calls Close())\r
+BamReader::~BamReader(void) {\r
+    delete d;\r
+    d = 0;\r
+}\r
+\r
+// file operations (thin forwarders to the private implementation)\r
+void BamReader::Close(void) {\r
+    d->Close();\r
+}\r
+\r
+bool BamReader::IsOpen(void) const {\r
+    return d->mBGZF.IsOpen;\r
+}\r
+\r
+// records (refID, position) as the left bound of an open-ended region, then seeks to it\r
+bool BamReader::Jump(int refID, int position) {\r
+    d->IsRightBoundSpecified = false;\r
+    d->IsLeftBoundSpecified  = true;\r
+    d->Region.LeftPosition   = position;\r
+    d->Region.LeftRefID      = refID;\r
+    return d->Jump(refID, position);\r
+}\r
+\r
+bool BamReader::Open(const string& filename, const string& indexFilename) {\r
+    return d->Open(filename, indexFilename);\r
+}\r
+\r
+bool BamReader::Rewind(void) {\r
+    return d->Rewind();\r
+}\r
+\r
+bool BamReader::SetRegion(const BamRegion& region) {\r
+    return d->SetRegion(region);\r
+}\r
+\r
+// convenience overload: packs the four bounds into a BamRegion\r
+bool BamReader::SetRegion(const int& leftRefID, const int& leftBound, const int& rightRefID, const int& rightBound) {\r
+    const BamRegion region(leftRefID, leftBound, rightRefID, rightBound);\r
+    return d->SetRegion(region);\r
+}\r
+\r
+// access alignment data\r
+bool BamReader::GetNextAlignment(BamAlignment& bAlignment) {\r
+    return d->GetNextAlignment(bAlignment);\r
+}\r
+\r
+bool BamReader::GetNextAlignmentCore(BamAlignment& bAlignment) {\r
+    return d->GetNextAlignmentCore(bAlignment);\r
+}\r
+\r
+// access auxiliary data\r
+const string BamReader::GetHeaderText(void) const {\r
+    return d->HeaderText;\r
+}\r
+\r
+int BamReader::GetReferenceCount(void) const {\r
+    return d->References.size();\r
+}\r
+\r
+const RefVector& BamReader::GetReferenceData(void) const {\r
+    return d->References;\r
+}\r
+\r
+int BamReader::GetReferenceID(const string& refName) const {\r
+    return d->GetReferenceID(refName);\r
+}\r
+\r
+const std::string BamReader::GetFilename(void) const {\r
+    return d->Filename;\r
+}\r
+\r
+// index operations\r
+bool BamReader::CreateIndex(bool useDefaultIndex) {\r
+    return d->CreateIndex(useDefaultIndex);\r
+}\r
+\r
+// -----------------------------------------------------\r
+// BamReaderPrivate implementation\r
+// -----------------------------------------------------\r
+\r
+// constructor: starts with no index, no region, and the host byte order detected once up front\r
+BamReader::BamReaderPrivate::BamReaderPrivate(BamReader* parent)\r
+    : NewIndex(0)\r
+    , IsIndexLoaded(false)\r
+    , AlignmentsBeginOffset(0)\r
+    , IsBigEndian(SystemIsBigEndian())\r
+    , IsLeftBoundSpecified(false)\r
+    , IsRightBoundSpecified(false)\r
+    , IsRegionSpecified(false)\r
+    , CurrentRefID(0)\r
+    , CurrentLeft(0)\r
+    , Parent(parent)\r
+    , DNA_LOOKUP("=ACMGRSVTWYHKDBN")\r
+    , CIGAR_LOOKUP("MIDNSHP")\r
+{ }\r
+\r
+// destructor: closing the reader also releases any index that is still held\r
+BamReader::BamReaderPrivate::~BamReaderPrivate(void) {\r
+    Close();\r
+}\r
+\r
+// Parses the raw character data held in bAlignment.SupportData.AllCharData into\r
+// the alignment's Name, CigarData, QueryBases, Qualities, AlignedBases and TagData,\r
+// then clears the core-only flag.\r
+// Returns false (leaving the alignment only partially filled) when the sizes recorded\r
+// in SupportData do not fit inside the loaded data; returns true otherwise.\r
+// An unrecognised CIGAR op or (on big-endian hosts) tag value type terminates the program.\r
+bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) {\r
+\r
+    // a block shorter than the fixed-size core would make the unsigned arithmetic below wrap around\r
+    if ( bAlignment.SupportData.BlockLength < (unsigned int)BAM_CORE_SIZE ) return false;\r
+\r
+    // calculate character lengths/offsets\r
+    const unsigned int dataLength      = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;\r
+    const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength;\r
+    const unsigned int seqDataOffset   = bAlignment.SupportData.QueryNameLength + (bAlignment.SupportData.NumCigarOperations * 4);\r
+    const unsigned int qualDataOffset  = seqDataOffset + (bAlignment.SupportData.QuerySequenceLength+1)/2;\r
+    const unsigned int tagDataOffset   = qualDataOffset + bAlignment.SupportData.QuerySequenceLength;\r
+\r
+    // reject records whose declared field sizes do not fit in the data actually loaded;\r
+    // otherwise tagDataLength underflows and the reads below run past the buffer\r
+    if ( dataLength > bAlignment.SupportData.AllCharData.size() ) return false;\r
+    if ( bAlignment.SupportData.QuerySequenceLength > dataLength ) return false;\r
+    if ( tagDataOffset > dataLength ) return false;\r
+    const unsigned int tagDataLength = dataLength - tagDataOffset;\r
+\r
+    // set up char buffers\r
+    const char* allCharData = bAlignment.SupportData.AllCharData.data();\r
+    uint32_t*   cigarData   = (uint32_t*)(allCharData + cigarDataOffset);\r
+    const char* seqData     = ((const char*)allCharData) + seqDataOffset;\r
+    const char* qualData    = ((const char*)allCharData) + qualDataOffset;\r
+    char*       tagData     = ((char*)allCharData) + tagDataOffset;\r
+\r
+    // store alignment name (depends on null char as terminator)\r
+    bAlignment.Name.assign((const char*)(allCharData));\r
+\r
+    // save CigarOps\r
+    // NOTE(review): BAM_CIGAR_MASK is defined outside this file; the bounds test keeps the\r
+    // lookup inside CIGAR_LOOKUP even if the mask admits codes beyond the table. Unknown\r
+    // codes map to '\0', which the AlignedBases switch below reports as invalid.\r
+    const unsigned int numCigarOpTypes = strlen(CIGAR_LOOKUP);\r
+    CigarOp op;\r
+    bAlignment.CigarData.clear();\r
+    bAlignment.CigarData.reserve(bAlignment.SupportData.NumCigarOperations);\r
+    for (unsigned int i = 0; i < bAlignment.SupportData.NumCigarOperations; ++i) {\r
+\r
+        // swap if necessary\r
+        if ( IsBigEndian ) { SwapEndian_32(cigarData[i]); }\r
+\r
+        // build CigarOp structure\r
+        const unsigned int opCode = (cigarData[i] & BAM_CIGAR_MASK);\r
+        op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);\r
+        op.Type   = ( opCode < numCigarOpTypes ) ? CIGAR_LOOKUP[opCode] : '\0';\r
+\r
+        // save CigarOp\r
+        bAlignment.CigarData.push_back(op);\r
+    }\r
+\r
+    // save query sequence (two 4-bit base codes per byte, high nibble first)\r
+    bAlignment.QueryBases.clear();\r
+    bAlignment.QueryBases.reserve(bAlignment.SupportData.QuerySequenceLength);\r
+    for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) {\r
+        char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];\r
+        bAlignment.QueryBases.append(1, singleBase);\r
+    }\r
+\r
+    // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character\r
+    bAlignment.Qualities.clear();\r
+    bAlignment.Qualities.reserve(bAlignment.SupportData.QuerySequenceLength);\r
+    for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) {\r
+        char singleQuality = (char)(qualData[i]+33);\r
+        bAlignment.Qualities.append(1, singleQuality);\r
+    }\r
+\r
+    // if QueryBases is empty (and this is a allowed case)\r
+    if ( bAlignment.QueryBases.empty() )\r
+        bAlignment.AlignedBases = bAlignment.QueryBases;\r
+\r
+    // if QueryBases contains data, then build AlignedBases using CIGAR data\r
+    else {\r
+\r
+        // resize AlignedBases\r
+        bAlignment.AlignedBases.clear();\r
+        bAlignment.AlignedBases.reserve(bAlignment.SupportData.QuerySequenceLength);\r
+\r
+        // iterate over CigarOps; k is the read offset into QueryBases\r
+        int k = 0;\r
+        vector<CigarOp>::const_iterator cigarIter = bAlignment.CigarData.begin();\r
+        vector<CigarOp>::const_iterator cigarEnd  = bAlignment.CigarData.end();\r
+        for ( ; cigarIter != cigarEnd; ++cigarIter ) {\r
+\r
+            const CigarOp& op = (*cigarIter);\r
+            switch(op.Type) {\r
+\r
+                case ('M') :\r
+                case ('I') :\r
+                    bAlignment.AlignedBases.append(bAlignment.QueryBases.substr(k, op.Length)); // for 'M', 'I' - write bases\r
+                    // fall through\r
+\r
+                case ('S') :\r
+                    k += op.Length;                                     // for 'S' - soft clip, skip over query bases\r
+                    break;\r
+\r
+                case ('D') :\r
+                    bAlignment.AlignedBases.append(op.Length, '-');     // for 'D' - write gap character\r
+                    break;\r
+\r
+                case ('P') :\r
+                    bAlignment.AlignedBases.append( op.Length, '*' );   // for 'P' - write padding character\r
+                    break;\r
+\r
+                case ('N') :\r
+                    bAlignment.AlignedBases.append( op.Length, 'N' );   // for 'N' - write N's, skip bases in original query sequence\r
+                    break;\r
+\r
+                case ('H') :\r
+                    break;  // for 'H' - hard clip, do nothing to AlignedBases, move to next op\r
+\r
+                default:\r
+                    printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here\r
+                    exit(1);\r
+            }\r
+        }\r
+    }\r
+\r
+    // -----------------------\r
+    // Added: 3-25-2010 DB\r
+    // Fixed: endian-correctness for tag data\r
+    // -----------------------\r
+    if ( IsBigEndian ) {\r
+        int i = 0;\r
+        while ( (unsigned int)i < tagDataLength ) {\r
+\r
+            i += 2; // skip tag type (e.g. "RG", "NM", etc)\r
+            uint8_t type = toupper(tagData[i]);     // lower & upper case letters have same meaning\r
+            ++i;    // skip value type\r
+\r
+            switch (type) {\r
+\r
+                case('A') :\r
+                case('C') :\r
+                    ++i;\r
+                    break;\r
+\r
+                case('S') :\r
+                    SwapEndian_16p(&tagData[i]);\r
+                    i += sizeof(uint16_t);\r
+                    break;\r
+\r
+                case('F') :\r
+                case('I') :\r
+                    SwapEndian_32p(&tagData[i]);\r
+                    i += sizeof(uint32_t);\r
+                    break;\r
+\r
+                case('D') :\r
+                    SwapEndian_64p(&tagData[i]);\r
+                    i += sizeof(uint64_t);\r
+                    break;\r
+\r
+                case('H') :\r
+                case('Z') :\r
+                    while (tagData[i]) { ++i; }\r
+                    ++i; // increment one more for null terminator\r
+                    break;\r
+\r
+                default :\r
+                    printf("ERROR: Invalid tag value type\n"); // shouldn't get here\r
+                    exit(1);\r
+            }\r
+        }\r
+    }\r
+\r
+    // store TagData (assign copies the bytes without writing through string::data())\r
+    bAlignment.TagData.assign(tagData, tagDataLength);\r
+\r
+    // clear the core-only flag\r
+    bAlignment.SupportData.HasCoreOnly = false;\r
+\r
+    // return success\r
+    return true;\r
+}\r
+\r
+// clear index data structure: deletes the current index object (if any) and resets the pointer to 0, so Jump() sees "no index"\r
+void BamReader::BamReaderPrivate::ClearIndex(void) {\r
+ delete NewIndex;\r
+ NewIndex = 0;\r
+}\r
+\r
+// closes the BAM file and drops the per-file state held by the reader\r
+void BamReader::BamReaderPrivate::Close(void) {\r
+\r
+    // the BGZF stream goes first, then the index that was built on top of it\r
+    mBGZF.Close();\r
+    ClearIndex();\r
+\r
+    // forget the header text\r
+    HeaderText.clear();\r
+\r
+    // no region is active any more\r
+    IsLeftBoundSpecified = IsRightBoundSpecified = IsRegionSpecified = false;\r
+}\r
+\r
+// create BAM index from BAM file (keep structure in memory) and write to default index output file\r
+// @useDefaultIndex - true builds the standard BAM index, false the BamTools custom index\r
+// returns true only if the index was both built and written successfully;\r
+// nothing is written when the build fails, so no unusable index file is left on disk\r
+bool BamReader::BamReaderPrivate::CreateIndex(bool useDefaultIndex) {\r
+\r
+    // clear out prior index data\r
+    ClearIndex();\r
+\r
+    // create default index\r
+    if ( useDefaultIndex )\r
+        NewIndex = new BamDefaultIndex(&mBGZF, Parent, IsBigEndian);\r
+    // create BamTools 'custom' index\r
+    else\r
+        NewIndex = new BamToolsIndex(&mBGZF, Parent, IsBigEndian);\r
+\r
+    // build first; only a successfully built index is worth saving\r
+    if ( !NewIndex->Build() ) return false;\r
+\r
+    // return success/fail of saving the index\r
+    return NewIndex->Write(Filename);\r
+}\r
+\r
+// get next alignment (from specified region, if given)\r
+// returns false when no alignment is available or its character data cannot be parsed\r
+bool BamReader::BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) {\r
+\r
+    // short-circuit: character data is parsed only after a core record was actually loaded\r
+    return GetNextAlignmentCore(bAlignment) && BuildCharData(bAlignment);\r
+}\r
+\r
+// retrieves next available alignment core data (returns success/fail)\r
+// ** DOES NOT parse any character data (read name, bases, qualities, tag data)\r
+//    these can be accessed, if necessary, from the supportData\r
+// useful for operations requiring ONLY positional or other alignment-related information\r
+// when a left bound is set, records lying before the region are skipped and\r
+// the first record past the region ends the search with failure\r
+bool BamReader::BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) {\r
+\r
+    // nothing left to read\r
+    if ( !LoadNextAlignment(bAlignment) ) return false;\r
+\r
+    // only core data has been populated at this point\r
+    bAlignment.SupportData.HasCoreOnly = true;\r
+\r
+    // without a region every record is acceptable\r
+    if ( !IsLeftBoundSpecified ) return true;\r
+\r
+    // advance until a record falls inside the region, or we run past it / out of data\r
+    for ( RegionState state = IsOverlap(bAlignment); ; state = IsOverlap(bAlignment) ) {\r
+\r
+        // past the region: no further record can match\r
+        if ( state == AFTER_REGION ) return false;\r
+\r
+        // found a record overlapping the region\r
+        if ( state == WITHIN_REGION ) return true;\r
+\r
+        // still before the region: fetch the next record (failure here is most likely EOF)\r
+        if ( !LoadNextAlignment(bAlignment) ) return false;\r
+    }\r
+}\r
+\r
+// returns RefID for given RefName (returns References.size() if not found)\r
+// the reference list is scanned in place; no temporary copy of the names is made\r
+int BamReader::BamReaderPrivate::GetReferenceID(const string& refName) const {\r
+\r
+    // walk the references, counting positions until the name matches\r
+    int refID = 0;\r
+    RefVector::const_iterator refIter = References.begin();\r
+    RefVector::const_iterator refEnd  = References.end();\r
+    for ( ; refIter != refEnd; ++refIter, ++refID ) {\r
+        if ( refIter->RefName == refName ) break;\r
+    }\r
+\r
+    // index of the match, or the number of references when nothing matched\r
+    return refID;\r
+}\r
+\r
+// returns region state - whether alignment ends before, overlaps, or starts after currently specified region\r
+// this *internal* method should ONLY be called when (at least) IsLeftBoundSpecified == true\r
+BamReader::BamReaderPrivate::RegionState BamReader::BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) {\r
+\r
+    // --------------------------------------------------\r
+    // check alignment start against right bound cutoff\r
+\r
+    // if full region of interest was given\r
+    if ( IsRightBoundSpecified ) {\r
+\r
+        // read starts on a reference AFTER the right bound reference\r
+        if ( bAlignment.RefID > Region.RightRefID )\r
+            return AFTER_REGION;\r
+\r
+        // read starts on right bound reference, but AFTER right bound position\r
+        if ( bAlignment.RefID == Region.RightRefID && bAlignment.Position > Region.RightPosition )\r
+            return AFTER_REGION;\r
+    }\r
+\r
+    // --------------------------------------------------------\r
+    // no right bound given OR read starts before right bound\r
+    // so, check where it lies relative to the left bound\r
+\r
+    // read is on a reference sequence before the left bound reference\r
+    if ( bAlignment.RefID < Region.LeftRefID )\r
+        return BEFORE_REGION;\r
+\r
+    // read is on a reference sequence past the left bound reference (and was not ruled out\r
+    // by the right bound above), so it lies inside the region whatever its position is;\r
+    // positions on different references must not be compared with each other\r
+    if ( bAlignment.RefID > Region.LeftRefID )\r
+        return WITHIN_REGION;\r
+\r
+    // --------------------------------------------------------\r
+    // read is on the left bound reference\r
+\r
+    // starts at or after the left boundary\r
+    if ( bAlignment.Position >= Region.LeftPosition )\r
+        return WITHIN_REGION;\r
+\r
+    // starts before the left boundary: it is within the region only if it reaches the boundary\r
+    if ( bAlignment.GetEndPosition() >= Region.LeftPosition )\r
+        return WITHIN_REGION;\r
+    else\r
+        return BEFORE_REGION;\r
+}\r
+\r
+// jumps to specified region(refID, leftBound) in BAM file, returns success/fail\r
+// fails without seeking when no index is held, refID is not a valid reference id,\r
+// the reference has no alignments, or position lies beyond the reference length\r
+bool BamReader::BamReaderPrivate::Jump(int refID, int position) {\r
+\r
+    // -----------------------------------------------------------------------\r
+    // check for existing index\r
+    if ( NewIndex == 0 ) return false;\r
+    // make sure refID names an existing reference before it is used as an index anywhere\r
+    if ( refID < 0 || refID >= (int)References.size() ) return false;\r
+    // see if reference has alignments\r
+    if ( !NewIndex->HasAlignments(refID) ) return false;\r
+    // make sure position is valid\r
+    if ( position > References.at(refID).RefLength ) return false;\r
+\r
+    // determine possible offsets\r
+    vector<int64_t> offsets;\r
+    if ( !NewIndex->GetOffsets(Region, IsRightBoundSpecified, offsets) ) {\r
+        printf("ERROR: Could not jump: unable to calculate offset for specified region.\n");\r
+        return false;\r
+    }\r
+\r
+    // iterate through offsets\r
+    BamAlignment bAlignment;\r
+    bool result = true;\r
+    for ( vector<int64_t>::const_iterator o = offsets.begin(); o != offsets.end(); ++o) {\r
+\r
+        // attempt seek & load first available alignment\r
+        // NOTE(review): the result of LoadNextAlignment is not checked, so after a failed\r
+        // load the test below looks at whatever bAlignment already held - confirm intent\r
+        result &= mBGZF.Seek(*o);\r
+        LoadNextAlignment(bAlignment);\r
+\r
+        // if this alignment corresponds to desired position\r
+        // return success of seeking back to the offset before the 'current offset'\r
+        if ( (bAlignment.RefID == refID && bAlignment.Position + bAlignment.Length > position) || (bAlignment.RefID > refID) ) {\r
+            if ( o != offsets.begin() ) --o;\r
+            return mBGZF.Seek(*o);\r
+        }\r
+    }\r
+\r
+    return result;\r
+}\r
+\r
+// load BAM header data\r
+// reads the 4-byte magic ("BAM\1"), the header text length and the header text\r
+// itself from the current stream position and stores the text in HeaderText;\r
+// any read or allocation failure is reported and terminates the program,\r
+// matching how a bad magic number is handled\r
+void BamReader::BamReaderPrivate::LoadHeaderData(void) {\r
+\r
+    // check to see if proper BAM header\r
+    char buffer[4];\r
+    if (mBGZF.Read(buffer, 4) != 4) {\r
+        printf("Could not read header type\n");\r
+        exit(1);\r
+    }\r
+\r
+    if (strncmp(buffer, "BAM\001", 4)) {\r
+        printf("wrong header type!\n");\r
+        exit(1);\r
+    }\r
+\r
+    // get BAM header text length (a short read would leave stale magic bytes in 'buffer')\r
+    if (mBGZF.Read(buffer, 4) != 4) {\r
+        printf("Could not read header text length\n");\r
+        exit(1);\r
+    }\r
+    unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer);\r
+    if ( IsBigEndian ) { SwapEndian_32(headerTextLength); }\r
+\r
+    // get BAM header text (zero-filled, one extra byte, so the text is always null-terminated)\r
+    char* headerText = (char*)calloc(headerTextLength + 1, 1);\r
+    if ( headerText == 0 ) {\r
+        printf("Could not allocate memory for header text\n");\r
+        exit(1);\r
+    }\r
+    if ( mBGZF.Read(headerText, headerTextLength) != (signed int)headerTextLength ) {\r
+        free(headerText);\r
+        printf("Could not read header text\n");\r
+        exit(1);\r
+    }\r
+    HeaderText = (string)((const char*)headerText);\r
+\r
+    // clean up calloc-ed temp variable\r
+    free(headerText);\r
+}\r
+\r
+// load existing index data from BAM index file (".bai" or ".bti"), return success/fail\r
+// the index type is chosen from IndexFilename; on any failure no index object is kept,\r
+// so Jump() keeps reporting "no index" instead of using one that was never loaded\r
+bool BamReader::BamReaderPrivate::LoadIndex(void) {\r
+\r
+    // clear out any existing index data\r
+    ClearIndex();\r
+\r
+    // skip if index file empty\r
+    if ( IndexFilename.empty() )\r
+        return false;\r
+\r
+    // check supplied filename for index type\r
+    size_t defaultExtensionFound = IndexFilename.find(".bai");\r
+    size_t customExtensionFound  = IndexFilename.find(".bti");\r
+\r
+    // if SAM/BAM default (".bai")\r
+    if ( defaultExtensionFound != string::npos )\r
+        NewIndex = new BamDefaultIndex(&mBGZF, Parent, IsBigEndian);\r
+\r
+    // if BamTools custom index (".bti")\r
+    else if ( customExtensionFound != string::npos )\r
+        NewIndex = new BamToolsIndex(&mBGZF, Parent, IsBigEndian);\r
+\r
+    // else unknown\r
+    else {\r
+        printf("ERROR: Unknown index file extension.\n");\r
+        return false;\r
+    }\r
+\r
+    // load index data; discard the index object again if that fails\r
+    if ( !NewIndex->Load(IndexFilename) ) {\r
+        ClearIndex();\r
+        return false;\r
+    }\r
+    return true;\r
+}\r
+\r
+// populates BamAlignment with alignment data under file pointer, returns success/fail\r
+// fills the core fields and SupportData (including the raw, unparsed character data);\r
+// returns false on a short read, a zero or too-small block length, or allocation failure\r
+bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) {\r
+\r
+    // read in the 'block length' value; a short read (e.g. at EOF) would leave 'buffer' uninitialised\r
+    char buffer[4];\r
+    if ( mBGZF.Read(buffer, 4) != 4 ) { return false; }\r
+    bAlignment.SupportData.BlockLength = BgzfData::UnpackUnsignedInt(buffer);\r
+    if ( IsBigEndian ) { SwapEndian_32(bAlignment.SupportData.BlockLength); }\r
+\r
+    // make sure it's not zero, and not smaller than the fixed-size core it must contain\r
+    // (otherwise the unsigned dataLength computed below would wrap around)\r
+    if ( bAlignment.SupportData.BlockLength == 0 ) { return false; }\r
+    if ( bAlignment.SupportData.BlockLength < (unsigned int)BAM_CORE_SIZE ) { return false; }\r
+\r
+    // read in core alignment data, make sure the right size of data was read\r
+    char x[BAM_CORE_SIZE];\r
+    if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }\r
+\r
+    if ( IsBigEndian ) {\r
+        for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) ) {\r
+            SwapEndian_32p(&x[i]);\r
+        }\r
+    }\r
+\r
+    // set BamAlignment 'core' and 'support' data\r
+    bAlignment.RefID    = BgzfData::UnpackSignedInt(&x[0]);\r
+    bAlignment.Position = BgzfData::UnpackSignedInt(&x[4]);\r
+\r
+    // bytes 8-11 pack bin (high 16 bits), mapping quality (next 8) and name length (low 8)\r
+    unsigned int tempValue = BgzfData::UnpackUnsignedInt(&x[8]);\r
+    bAlignment.Bin        = tempValue >> 16;\r
+    bAlignment.MapQuality = tempValue >> 8 & 0xff;\r
+    bAlignment.SupportData.QueryNameLength = tempValue & 0xff;\r
+\r
+    // bytes 12-15 pack flag (high 16 bits) and number of CIGAR operations (low 16)\r
+    tempValue = BgzfData::UnpackUnsignedInt(&x[12]);\r
+    bAlignment.AlignmentFlag = tempValue >> 16;\r
+    bAlignment.SupportData.NumCigarOperations = tempValue & 0xffff;\r
+\r
+    bAlignment.SupportData.QuerySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]);\r
+    bAlignment.MateRefID    = BgzfData::UnpackSignedInt(&x[20]);\r
+    bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]);\r
+    bAlignment.InsertSize   = BgzfData::UnpackSignedInt(&x[28]);\r
+\r
+    // set BamAlignment length\r
+    bAlignment.Length = bAlignment.SupportData.QuerySequenceLength;\r
+\r
+    // read in character data - make sure proper data size was read\r
+    bool readCharDataOK = false;\r
+    const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;\r
+\r
+    // at least one byte is requested so that a null result always means allocation failure\r
+    char* allCharData = (char*)calloc(dataLength > 0 ? dataLength : 1, sizeof(char));\r
+    if ( allCharData == 0 ) { return false; }\r
+\r
+    if ( mBGZF.Read(allCharData, dataLength) == (signed int)dataLength) {\r
+\r
+        // store 'allCharData' in supportData structure\r
+        bAlignment.SupportData.AllCharData.assign((const char*)allCharData, dataLength);\r
+\r
+        // set success flag\r
+        readCharDataOK = true;\r
+    }\r
+\r
+    free(allCharData);\r
+    return readCharDataOK;\r
+}\r
+\r
+// Loads the reference sequence entries (name + length) that follow the header\r
+// text in the BAM file and appends them to References.\r
+// Exits the program if memory for a reference name cannot be allocated.\r
+// NOTE(review): the return values of the Read() calls are not checked here.\r
+void BamReader::BamReaderPrivate::LoadReferenceData(void) {\r
+\r
+    // get number of reference sequences\r
+    char buffer[4];\r
+    mBGZF.Read(buffer, 4);\r
+    unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer);\r
+    if ( IsBigEndian ) { SwapEndian_32(numberRefSeqs); }\r
+    if (numberRefSeqs == 0) { return; }\r
+    References.reserve((int)numberRefSeqs);\r
+\r
+    // iterate over all references in header\r
+    for (unsigned int i = 0; i != numberRefSeqs; ++i) {\r
+\r
+        // get length of reference name\r
+        mBGZF.Read(buffer, 4);\r
+        unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer);\r
+        if ( IsBigEndian ) { SwapEndian_32(refNameLength); }\r
+\r
+        // one extra zeroed byte guarantees the buffer is NUL-terminated when it\r
+        // is converted to a string below, even if the stored name is not\r
+        // (same approach as the header text buffer)\r
+        char* refName = (char*)calloc((size_t)refNameLength + 1, 1);\r
+        if ( !refName ) {\r
+            printf("ERROR: unable to allocate memory for reference name.\n");\r
+            exit(1);\r
+        }\r
+\r
+        // get reference name and reference sequence length\r
+        mBGZF.Read(refName, refNameLength);\r
+        mBGZF.Read(buffer, 4);\r
+        int refLength = BgzfData::UnpackSignedInt(buffer);\r
+        if ( IsBigEndian ) { SwapEndian_32(refLength); }\r
+\r
+        // store data for reference\r
+        RefData aReference;\r
+        aReference.RefName   = (string)((const char*)refName);\r
+        aReference.RefLength = refLength;\r
+        References.push_back(aReference);\r
+\r
+        // clean up calloc-ed temp variable\r
+        free(refName);\r
+    }\r
+}\r
+\r
+// Opens the BAM file for reading, loads its header text and reference data,\r
+// records where the alignments begin, then loads the index if an index\r
+// filename was supplied. Returns false only when the BGZF file cannot be\r
+// opened; the outcome of loading the index does not affect the return value.\r
+bool BamReader::BamReaderPrivate::Open(const string& filename, const string& indexFilename) {\r
+\r
+    // remember both filenames\r
+    Filename      = filename;\r
+    IndexFilename = indexFilename;\r
+\r
+    // open the BGZF stream in binary-read mode\r
+    const bool isStreamOpen = mBGZF.Open(filename, "rb");\r
+    if ( !isStreamOpen ) { return false; }\r
+\r
+    // header text comes first, reference data follows it\r
+    LoadHeaderData();\r
+    LoadReferenceData();\r
+\r
+    // the stream now sits at the first alignment - keep that offset\r
+    AlignmentsBeginOffset = mBGZF.Tell();\r
+\r
+    // load index data when an index filename is available\r
+    if ( IndexFilename.empty() == false ) { LoadIndex(); }\r
+\r
+    return true;\r
+}\r
+\r
+// Moves the BAM file pointer back to the first alignment and resets the\r
+// region of interest: left bound = reference/position of the first alignment,\r
+// right bound = -1/-1, both 'bound specified' flags cleared.\r
+// Returns false if a seek fails or the first alignment cannot be loaded.\r
+bool BamReader::BamReaderPrivate::Rewind(void) {\r
+\r
+    // go to the first alignment\r
+    if ( !mBGZF.Seek(AlignmentsBeginOffset) ) { return false; }\r
+\r
+    // peek at the first alignment\r
+    BamAlignment firstAlignment;\r
+    const bool isLoaded = LoadNextAlignment(firstAlignment);\r
+    if ( !isLoaded ) { return false; }\r
+\r
+    // no bounds are in effect after a rewind\r
+    IsLeftBoundSpecified  = false;\r
+    IsRightBoundSpecified = false;\r
+\r
+    // default region starts at the first alignment and has no right bound\r
+    Region.LeftRefID     = firstAlignment.RefID;\r
+    Region.LeftPosition  = firstAlignment.Position;\r
+    Region.RightRefID    = -1;\r
+    Region.RightPosition = -1;\r
+\r
+    // the peek moved the file pointer - seek back, report success/fail of seek\r
+    return mBGZF.Seek(AlignmentsBeginOffset);\r
+}\r
+\r
+// Sets a region of interest (with left & right bound reference/position)\r
+// and attempts a Jump() to the left bound.\r
+// A bound counts as 'specified' only when both its reference ID and its\r
+// position are non-negative. Both flags are recomputed on every call, so a\r
+// flag left over from a previously set region cannot stay in effect.\r
+// Returns success/failure of Jump().\r
+bool BamReader::BamReaderPrivate::SetRegion(const BamRegion& region) {\r
+\r
+    // save region of interest\r
+    Region = region;\r
+\r
+    // set flags (assigned outright, so they are also cleared when a bound is absent)\r
+    IsLeftBoundSpecified  = ( region.LeftRefID  >= 0 && region.LeftPosition  >= 0 );\r
+    IsRightBoundSpecified = ( region.RightRefID >= 0 && region.RightPosition >= 0 );\r
+\r
+    // attempt jump to beginning of region, return success/fail of Jump()\r
+    return Jump( Region.LeftRefID, Region.LeftPosition );\r
+}\r
--- /dev/null
+// ***************************************************************************\r
+// BamReader.h (c) 2009 Derek Barnett, Michael Strömberg\r
+// Marth Lab, Department of Biology, Boston College\r
+// All rights reserved.\r
+// ---------------------------------------------------------------------------\r
+// Last modified: 9 July 2010 (DB)\r
+// ---------------------------------------------------------------------------\r
+// Uses BGZF routines that were adapted from the bgzf.c code developed at the Broad\r
+// Institute.\r
+// ---------------------------------------------------------------------------\r
+// Provides the basic functionality for reading BAM files\r
+// ***************************************************************************\r
+\r
+#ifndef BAMREADER_H\r
+#define BAMREADER_H\r
+\r
+// C++ includes\r
+#include <string>\r
+\r
+// BamTools includes\r
+#include "BamAux.h"\r
+\r
+namespace BamTools {\r
+ \r
+// Public interface for reading BAM files.\r
+// Only the interface is declared here; all state and behaviour sit behind the\r
+// opaque BamReaderPrivate pointer 'd' declared at the bottom of the class.\r
+// NOTE(review): no copy constructor or assignment operator is declared while\r
+// the class holds the raw pointer 'd' - confirm whether copying a BamReader\r
+// is safe given how the destructor treats 'd'.\r
+class BamReader {\r
+\r
+ // constructor / destructor\r
+ public:\r
+ BamReader(void);\r
+ ~BamReader(void);\r
+\r
+ // public interface\r
+ public:\r
+\r
+ // ----------------------\r
+ // BAM file operations\r
+ // ----------------------\r
+\r
+ // close BAM file\r
+ void Close(void);\r
+ // returns whether reader is open for reading or not\r
+ bool IsOpen(void) const;\r
+ // performs random-access jump to reference, position\r
+ bool Jump(int refID, int position = 0);\r
+ // opens BAM file (and optional BAM index file, if provided)\r
+ bool Open(const std::string& filename, const std::string& indexFilename = "");\r
+ // returns file pointer to beginning of alignments\r
+ bool Rewind(void);\r
+ // sets a region of interest (with left & right bound reference/position)\r
+ // attempts a Jump() to left bound as well\r
+ // returns success/failure of Jump()\r
+ bool SetRegion(const BamRegion& region);\r
+ // overload taking the left/right reference IDs and bounds as separate values\r
+ bool SetRegion(const int& leftRefID, const int& leftBound, const int& rightRefID, const int& rightBound);\r
+\r
+ // ----------------------\r
+ // access alignment data\r
+ // ----------------------\r
+\r
+ // retrieves next available alignment (returns success/fail)\r
+ bool GetNextAlignment(BamAlignment& bAlignment);\r
+ \r
+ // retrieves next available alignment core data (returns success/fail)\r
+ // ** DOES NOT parse any character data (read name, bases, qualities, tag data)\r
+ // these can be accessed, if necessary, from the supportData \r
+ // useful for operations requiring ONLY positional or other alignment-related information\r
+ bool GetNextAlignmentCore(BamAlignment& bAlignment);\r
+\r
+ // ----------------------\r
+ // access auxiliary data\r
+ // ----------------------\r
+\r
+ // returns SAM header text\r
+ const std::string GetHeaderText(void) const;\r
+ // returns number of reference sequences\r
+ int GetReferenceCount(void) const;\r
+ // returns vector of reference objects\r
+ const BamTools::RefVector& GetReferenceData(void) const;\r
+ // returns reference id (used for BamReader::Jump()) for the given reference name\r
+ int GetReferenceID(const std::string& refName) const;\r
+ // returns the name of the file associated with this BamReader\r
+ const std::string GetFilename(void) const;\r
+\r
+ // ----------------------\r
+ // BAM index operations\r
+ // ----------------------\r
+\r
+ // creates index for BAM file, saves to file (default = bamFilename + ".bai")\r
+ bool CreateIndex(bool useDefaultIndex = true);\r
+ \r
+ // private implementation\r
+ // (BamReaderPrivate is only forward-declared here; it is defined elsewhere)\r
+ private:\r
+ struct BamReaderPrivate;\r
+ BamReaderPrivate* d;\r
+};\r
+\r
+} // namespace BamTools\r
+\r
+#endif // BAMREADER_H\r
--- /dev/null
+// ***************************************************************************\r
+// BamWriter.cpp (c) 2009 Michael Strömberg, Derek Barnett\r
+// Marth Lab, Department of Biology, Boston College\r
+// All rights reserved.\r
+// ---------------------------------------------------------------------------\r
+// Last modified: 17 August 2010 (DB)\r
+// ---------------------------------------------------------------------------\r
+// Uses BGZF routines that were adapted from the bgzf.c code developed at the Broad\r
+// Institute.\r
+// ---------------------------------------------------------------------------\r
+// Provides the basic functionality for producing BAM files\r
+// ***************************************************************************\r
+\r
+#include <iostream>\r
+\r
+#include "BGZF.h"\r
+#include "BamWriter.h"\r
+using namespace BamTools;\r
+using namespace std;\r
+\r
+// Private implementation of BamWriter: owns the BGZF output stream and holds\r
+// the routines that encode alignments before they are written to it.\r
+struct BamWriter::BamWriterPrivate {\r
+\r
+ // data members\r
+ BgzfData mBGZF; // BGZF stream that all output is written to\r
+ bool IsBigEndian; // result of SystemIsBigEndian(), cached at construction\r
+ \r
+ // constructor / destructor\r
+ BamWriterPrivate(void) { \r
+ IsBigEndian = SystemIsBigEndian(); \r
+ }\r
+ \r
+ // closes the BGZF stream when the object is destroyed\r
+ ~BamWriterPrivate(void) {\r
+ mBGZF.Close();\r
+ }\r
+\r
+ // "public" interface\r
+ // (one counterpart for each BamWriter method that forwards here)\r
+ void Close(void);\r
+ bool Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed);\r
+ void SaveAlignment(const BamAlignment& al);\r
+\r
+ // internal methods\r
+ // calculates the minimum bin for the interval [begin, end)\r
+ // (the 'const' on the by-value return type has no effect)\r
+ const unsigned int CalculateMinimumBin(const int begin, int end) const;\r
+ // packs the cigar operations into 'packedCigar'\r
+ void CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar);\r
+ // encodes 'query' into 4-bit notation, result in 'encodedQuery'\r
+ void EncodeQuerySequence(const string& query, string& encodedQuery);\r
+};\r
+\r
+// -----------------------------------------------------\r
+// BamWriter implementation\r
+// -----------------------------------------------------\r
+\r
+// constructor - allocates the private implementation object\r
+BamWriter::BamWriter(void)\r
+    : d(new BamWriterPrivate)\r
+{ }\r
+\r
+// destructor - releases the private implementation object and clears 'd'\r
+BamWriter::~BamWriter(void) {\r
+    BamWriterPrivate* impl = d;\r
+    d = 0;\r
+    delete impl;\r
+}\r
+\r
+// closes the alignment archive (forwards to the private implementation)\r
+void BamWriter::Close(void) {\r
+    d->Close();\r
+}\r
+\r
+// opens the alignment archive (forwards to the private implementation)\r
+// returns the success/fail result reported by the implementation\r
+bool BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed) {\r
+    const bool isOpen = d->Open(filename, samHeader, referenceSequences, isWriteUncompressed);\r
+    return isOpen;\r
+}\r
+\r
+// saves the alignment to the alignment archive (forwards to the private implementation)\r
+void BamWriter::SaveAlignment(const BamAlignment& al) {\r
+    d->SaveAlignment(al);\r
+}\r
+\r
+// -----------------------------------------------------\r
+// BamWriterPrivate implementation\r
+// -----------------------------------------------------\r
+\r
+// closes the alignment archive by closing the underlying BGZF stream\r
+void BamWriter::BamWriterPrivate::Close(void) {\r
+    mBGZF.Close();\r
+}\r
+\r
+// Calculates the minimum bin for a BAM alignment interval.\r
+// 'end' is treated as exclusive (it is decremented before use). Bin levels\r
+// are tried from the finest shift (14) to the coarsest (26); the first level\r
+// at which begin and end-1 land in the same bin supplies the result,\r
+// otherwise bin 0 is returned.\r
+const unsigned int BamWriter::BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const {\r
+\r
+    // per-level shift and the bin number offset that belongs to it\r
+    const int levelShifts[5]  = {   14,  17, 20, 23, 26 };\r
+    const int levelOffsets[5] = { 4681, 585, 73,  9,  1 };\r
+\r
+    const int last = end - 1;\r
+    for ( int level = 0; level < 5; ++level ) {\r
+        const int beginBin = begin >> levelShifts[level];\r
+        const int lastBin  = last  >> levelShifts[level];\r
+        if ( beginBin == lastBin ) { return levelOffsets[level] + beginBin; }\r
+    }\r
+\r
+    // interval spans more than one bin on every level\r
+    return 0;\r
+}\r
+\r
+// Packs the supplied cigar operations into 'packedCigar', one unsigned int per\r
+// operation: (length << BAM_CIGAR_SHIFT) | operation code, in host byte order.\r
+// Any previous content of 'packedCigar' is replaced.\r
+// Exits the program if an unknown operation character is found.\r
+void BamWriter::BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {\r
+\r
+    // initialize\r
+    const unsigned int numCigarOperations = cigarOperations.size();\r
+    packedCigar.resize(numCigarOperations * BT_SIZEOF_INT);\r
+\r
+    // pack the cigar data into the string\r
+    unsigned int writeOffset = 0;\r
+    vector<CigarOp>::const_iterator coIter;\r
+    for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) {\r
+\r
+        // translate the operation character to its BAM code\r
+        unsigned int cigarOp;\r
+        switch(coIter->Type) {\r
+            case 'M': cigarOp = BAM_CMATCH;     break;\r
+            case 'I': cigarOp = BAM_CINS;       break;\r
+            case 'D': cigarOp = BAM_CDEL;       break;\r
+            case 'N': cigarOp = BAM_CREF_SKIP;  break;\r
+            case 'S': cigarOp = BAM_CSOFT_CLIP; break;\r
+            case 'H': cigarOp = BAM_CHARD_CLIP; break;\r
+            case 'P': cigarOp = BAM_CPAD;       break;\r
+            default:\r
+                printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type);\r
+                exit(1);\r
+        }\r
+\r
+        // copy the bytes into place instead of storing through a cast of\r
+        // string::data(): that pointer is const and need not be int-aligned\r
+        // (the stride is sizeof(unsigned int), as with the pointer increment used before)\r
+        const unsigned int packedOp = (coIter->Length << BAM_CIGAR_SHIFT) | cigarOp;\r
+        memcpy(&packedCigar[writeOffset], &packedOp, sizeof(packedOp));\r
+        writeOffset += sizeof(packedOp);\r
+    }\r
+}\r
+\r
+// Encodes the supplied query sequence into 4-bit notation.\r
+// Two bases are packed per byte, the first one in the high nibble; for an\r
+// odd-length query the low nibble of the last byte stays zero.\r
+// All 16 base codes of the BAM format are accepted ("=ACMGRSVTWYHKDBN" maps\r
+// to 0..15, which keeps = A C G T N at 0 1 2 4 8 15). Any other character\r
+// makes the program exit with an error message.\r
+// The whole of 'query' is encoded (its size() is used, not a NUL sentinel).\r
+void BamWriter::BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {\r
+\r
+    // BAM base alphabet: position in this table == 4-bit code\r
+    const char bamBases[] = "=ACMGRSVTWYHKDBN";\r
+    const unsigned int numBamBases = 16;\r
+\r
+    // prepare the encoded query string (one byte per pair of bases, rounded up)\r
+    const unsigned int queryLen        = query.size();\r
+    const unsigned int encodedQueryLen = (queryLen + 1) / 2;\r
+    encodedQuery.assign(encodedQueryLen, '\0');\r
+\r
+    for (unsigned int i = 0; i < queryLen; ++i) {\r
+\r
+        // look up the code for this base\r
+        const char base = query[i];\r
+        unsigned int nucleotideCode = numBamBases;\r
+        for (unsigned int code = 0; code < numBamBases; ++code) {\r
+            if ( bamBases[code] == base ) {\r
+                nucleotideCode = code;\r
+                break;\r
+            }\r
+        }\r
+\r
+        if ( nucleotideCode == numBamBases ) {\r
+            printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, M, G, R, S, V, T, W, Y, H, K, D, B, N}. Found [%c]\n", base);\r
+            exit(1);\r
+        }\r
+\r
+        // pack the nucleotide code: even positions fill the high nibble,\r
+        // odd positions the low nibble of the same byte\r
+        if ( (i & 1) == 0 )\r
+            encodedQuery[i / 2] = (char)(nucleotideCode << 4);\r
+        else\r
+            encodedQuery[i / 2] |= (char)nucleotideCode;\r
+    }\r
+}\r
+\r
+// Opens the alignment archive for writing and writes the BAM header:\r
+// signature, SAM header text (length + text), number of reference sequences,\r
+// then one entry (name length, NUL-terminated name, sequence length) per\r
+// reference sequence. Returns false if the BGZF file cannot be opened.\r
+//\r
+// Length fields are byte-swapped on big-endian hosts before being written.\r
+// The swap is done on a separate copy, so the host-order value remains\r
+// available as the byte count for the Write() call that follows.\r
+bool BamWriter::BamWriterPrivate::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed) {\r
+\r
+    // open the BGZF file for writing, return failure if error\r
+    if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) )\r
+        return false;\r
+\r
+    // ================\r
+    // write the header\r
+    // ================\r
+\r
+    // write the BAM signature\r
+    const unsigned char SIGNATURE_LENGTH = 4;\r
+    const char* BAM_SIGNATURE = "BAM\1";\r
+    mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH);\r
+\r
+    // write the SAM header text length\r
+    const uint32_t samHeaderLen = samHeader.size();\r
+    uint32_t samHeaderLenField  = samHeaderLen;\r
+    if (IsBigEndian) SwapEndian_32(samHeaderLenField);\r
+    mBGZF.Write((char*)&samHeaderLenField, BT_SIZEOF_INT);\r
+\r
+    // write the SAM header text (byte count in host order)\r
+    if(samHeaderLen > 0)\r
+        mBGZF.Write(samHeader.data(), samHeaderLen);\r
+\r
+    // write the number of reference sequences\r
+    uint32_t numReferenceSequences = referenceSequences.size();\r
+    if (IsBigEndian) SwapEndian_32(numReferenceSequences);\r
+    mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);\r
+\r
+    // =============================\r
+    // write the sequence dictionary\r
+    // =============================\r
+\r
+    RefVector::const_iterator rsIter;\r
+    for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {\r
+\r
+        // write the reference sequence name length (includes the NUL terminator)\r
+        const uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;\r
+        uint32_t referenceSequenceNameLenField  = referenceSequenceNameLen;\r
+        if (IsBigEndian) SwapEndian_32(referenceSequenceNameLenField);\r
+        mBGZF.Write((char*)&referenceSequenceNameLenField, BT_SIZEOF_INT);\r
+\r
+        // write the reference sequence name (byte count in host order)\r
+        mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);\r
+\r
+        // write the reference sequence length\r
+        int32_t referenceLength = rsIter->RefLength;\r
+        if (IsBigEndian) SwapEndian_32(referenceLength);\r
+        mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);\r
+    }\r
+\r
+    // return success\r
+    return true;\r
+}\r
+\r
+// Saves the alignment to the alignment archive.\r
+//\r
+// Two kinds of BamAlignment are handled:\r
+//  * SupportData.HasCoreOnly set (as a result of BamReader::GetNextAlignmentCore()):\r
+//    the core fields and the raw character data in SupportData.AllCharData\r
+//    are written back as they are\r
+//  * otherwise: name, cigar, bases, qualities and tag data are taken from the\r
+//    standard fields and encoded here; the bin is always recalculated\r
+//\r
+// Multi-byte values are byte-swapped on big-endian hosts before writing.\r
+// 'al' itself is never modified: qualities are converted in a local copy.\r
+void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) {\r
+\r
+    // if BamAlignment contains only the core data and a raw char data buffer\r
+    // (as a result of BamReader::GetNextAlignmentCore())\r
+    if ( al.SupportData.HasCoreOnly ) {\r
+\r
+        // write the block size\r
+        unsigned int blockSize = al.SupportData.BlockLength;\r
+        if (IsBigEndian) SwapEndian_32(blockSize);\r
+        mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
+\r
+        // assign the BAM core data\r
+        uint32_t buffer[8];\r
+        buffer[0] = al.RefID;\r
+        buffer[1] = al.Position;\r
+        buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;\r
+        buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;\r
+        buffer[4] = al.SupportData.QuerySequenceLength;\r
+        buffer[5] = al.MateRefID;\r
+        buffer[6] = al.MatePosition;\r
+        buffer[7] = al.InsertSize;\r
+\r
+        // swap BAM core endian-ness, if necessary\r
+        if ( IsBigEndian ) {\r
+            for ( int i = 0; i < 8; ++i )\r
+                SwapEndian_32(buffer[i]);\r
+        }\r
+\r
+        // write the BAM core\r
+        mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
+\r
+        // write the raw char data\r
+        mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE);\r
+    }\r
+\r
+    // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc\r
+    // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code )\r
+    else {\r
+\r
+        // calculate char lengths\r
+        const unsigned int nameLength         = al.Name.size() + 1;\r
+        const unsigned int numCigarOperations = al.CigarData.size();\r
+        const unsigned int queryLength        = al.QueryBases.size();\r
+        const unsigned int tagDataLength      = al.TagData.size();\r
+\r
+        // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)\r
+        // force calculation of Bin before storing\r
+        const int endPosition = al.GetEndPosition();\r
+        const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);\r
+\r
+        // create our packed cigar string\r
+        string packedCigar;\r
+        CreatePackedCigar(al.CigarData, packedCigar);\r
+        const unsigned int packedCigarLength = packedCigar.size();\r
+\r
+        // encode the query\r
+        string encodedQuery;\r
+        EncodeQuerySequence(al.QueryBases, encodedQuery);\r
+        const unsigned int encodedQueryLength = encodedQuery.size();\r
+\r
+        // write the block size\r
+        const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength;\r
+        unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;\r
+        if (IsBigEndian) SwapEndian_32(blockSize);\r
+        mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
+\r
+        // assign the BAM core data\r
+        uint32_t buffer[8];\r
+        buffer[0] = al.RefID;\r
+        buffer[1] = al.Position;\r
+        buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;\r
+        buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;\r
+        buffer[4] = queryLength;\r
+        buffer[5] = al.MateRefID;\r
+        buffer[6] = al.MatePosition;\r
+        buffer[7] = al.InsertSize;\r
+\r
+        // swap BAM core endian-ness, if necessary\r
+        if ( IsBigEndian ) {\r
+            for ( int i = 0; i < 8; ++i )\r
+                SwapEndian_32(buffer[i]);\r
+        }\r
+\r
+        // write the BAM core\r
+        mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
+\r
+        // write the query name\r
+        mBGZF.Write(al.Name.c_str(), nameLength);\r
+\r
+        // write the packed cigar\r
+        if ( IsBigEndian && packedCigarLength > 0 ) {\r
+\r
+            char* cigarData = (char*)calloc(sizeof(char), packedCigarLength);\r
+            memcpy(cigarData, packedCigar.data(), packedCigarLength);\r
+\r
+            // swap one whole 32-bit cigar value per step; stepping a byte at a\r
+            // time would re-swap overlapping words and run past the buffer end\r
+            for (unsigned int i = 0; i + sizeof(uint32_t) <= packedCigarLength; i += sizeof(uint32_t))\r
+                SwapEndian_32p(&cigarData[i]);\r
+\r
+            mBGZF.Write(cigarData, packedCigarLength);\r
+            free(cigarData);\r
+        }\r
+        else\r
+            mBGZF.Write(packedCigar.data(), packedCigarLength);\r
+\r
+        // write the encoded query sequence\r
+        mBGZF.Write(encodedQuery.data(), encodedQueryLength);\r
+\r
+        // write the base qualities\r
+        // the offset of 33 is removed in a local copy, so the caller's alignment\r
+        // stays intact; exactly 'queryLength' bytes are written (the block size\r
+        // above counts on that), with 0xFF standing in for any quality value\r
+        // the alignment does not provide\r
+        string baseQualities(queryLength, (char)0xFF);\r
+        const unsigned int numQualities = ( al.Qualities.size() < queryLength ? (unsigned int)al.Qualities.size() : queryLength );\r
+        for(unsigned int i = 0; i < numQualities; i++) {\r
+            baseQualities[i] = al.Qualities[i] - 33;\r
+        }\r
+        mBGZF.Write(baseQualities.data(), queryLength);\r
+\r
+        // write the tag data\r
+        if ( IsBigEndian && tagDataLength > 0 ) {\r
+\r
+            // work on a copy so that multi-byte tag values can be swapped in place\r
+            char* tagData = (char*)calloc(sizeof(char), tagDataLength);\r
+            memcpy(tagData, al.TagData.data(), tagDataLength);\r
+\r
+            // NOTE(review): this walk trusts the tag data to be well-formed;\r
+            // truncated data would be read past its end\r
+            int i = 0;\r
+            while ( (unsigned int)i < tagDataLength ) {\r
+\r
+                i += 2;                              // skip tag type (e.g. "RG", "NM", etc)\r
+                uint8_t type = toupper(tagData[i]);  // lower & upper case letters have same meaning\r
+                ++i;                                 // skip value type\r
+\r
+                switch (type) {\r
+\r
+                    case('A') :\r
+                    case('C') :\r
+                        ++i;\r
+                        break;\r
+\r
+                    case('S') :\r
+                        SwapEndian_16p(&tagData[i]);\r
+                        i+=2; // sizeof(uint16_t)\r
+                        break;\r
+\r
+                    case('F') :\r
+                    case('I') :\r
+                        SwapEndian_32p(&tagData[i]);\r
+                        i+=4; // sizeof(uint32_t)\r
+                        break;\r
+\r
+                    case('D') :\r
+                        SwapEndian_64p(&tagData[i]);\r
+                        i+=8; // sizeof(uint64_t)\r
+                        break;\r
+\r
+                    case('H') :\r
+                    case('Z') :\r
+                        while (tagData[i]) { ++i; }\r
+                        ++i; // increment one more for null terminator\r
+                        break;\r
+\r
+                    default :\r
+                        printf("ERROR: Invalid tag value type\n"); // shouldn't get here\r
+                        free(tagData);\r
+                        exit(1);\r
+                }\r
+            }\r
+\r
+            mBGZF.Write(tagData, tagDataLength);\r
+            free(tagData);\r
+        }\r
+        else\r
+            mBGZF.Write(al.TagData.data(), tagDataLength);\r
+    }\r
+}\r
--- /dev/null
+// ***************************************************************************\r
+// BamWriter.h (c) 2009 Michael Strömberg, Derek Barnett\r
+// Marth Lab, Department of Biology, Boston College\r
+// All rights reserved.\r
+// ---------------------------------------------------------------------------\r
+// Last modified: 17 August 2010 (DB)\r
+// ---------------------------------------------------------------------------\r
+// Uses BGZF routines that were adapted from the bgzf.c code developed at the Broad\r
+// Institute.\r
+// ---------------------------------------------------------------------------\r
+// Provides the basic functionality for producing BAM files\r
+// ***************************************************************************\r
+\r
+#ifndef BAMWRITER_H\r
+#define BAMWRITER_H\r
+\r
+// C++ includes\r
+#include <string>\r
+\r
+// BamTools includes\r
+#include "BamAux.h"\r
+\r
+namespace BamTools {\r
+\r
+// Public interface for producing BAM files.\r
+// Only the interface is declared here; the work is done by the\r
+// BamWriterPrivate object reached through the pointer 'd'.\r
+// NOTE(review): no copy constructor or assignment operator is declared while\r
+// the class holds the raw pointer 'd' - confirm whether copying a BamWriter\r
+// is safe given how the destructor treats 'd'.\r
+class BamWriter {\r
+\r
+ // constructor/destructor\r
+ public:\r
+ BamWriter(void);\r
+ ~BamWriter(void);\r
+\r
+ // public interface\r
+ public:\r
+ // closes the alignment archive\r
+ void Close(void);\r
+ // opens the alignment archive\r
+ // (takes the SAM header text and the reference sequences for the new file;\r
+ // 'writeUncompressed' defaults to false)\r
+ bool Open(const std::string& filename, \r
+ const std::string& samHeader, \r
+ const BamTools::RefVector& referenceSequences, \r
+ bool writeUncompressed = false);\r
+ // saves the alignment to the alignment archive\r
+ void SaveAlignment(const BamTools::BamAlignment& al);\r
+\r
+ // private implementation\r
+ // (BamWriterPrivate is only forward-declared here; it is defined elsewhere)\r
+ private:\r
+ struct BamWriterPrivate;\r
+ BamWriterPrivate* d;\r
+};\r
+\r
+} // namespace BamTools\r
+\r
+#endif // BAMWRITER_H\r
--- /dev/null
+# Builds the BamTools API object files into $(OBJ_DIR).
+# CXX, CXXFLAGS, LDFLAGS and INCLUDES are not set here; they come from the
+# environment / invoking make, or from make's built-in defaults.
+OBJ_DIR = ../../obj
+BIN_DIR = ../../bin
+
+# ----------------------------------
+# define our source and object files
+# ----------------------------------
+SOURCES = BGZF.cpp \
+          BamIndex.cpp \
+          BamReader.cpp \
+          BamMultiReader.cpp \
+          BamWriter.cpp
+OBJECTS= $(SOURCES:.cpp=.o)
+BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
+
+all: $(BUILT_OBJECTS)
+
+# static pattern rule: each object depends on its own source file only,
+# so touching one source no longer forces every object to be rebuilt
+$(BUILT_OBJECTS): $(OBJ_DIR)/%.o: %.cpp
+	@echo " * compiling" $<
+	@$(CXX) -c -o $@ $< $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
+
+.PHONY: all
+
+# removes everything in the object AND binary directories
+clean:
+	@echo "Cleaning up."
+	@rm -f $(OBJ_DIR)/* $(BIN_DIR)/*
+
+.PHONY: clean
--- /dev/null
+# Builds the bamtools toolkit objects and links the 'bamtools' executable
+# from them plus every object file found in $(OBJ_DIR).
+# CXX, CXXFLAGS, LDFLAGS and LIBS are not set here; they come from the
+# environment / invoking make, or from make's built-in defaults.
+API_DIR = ../api
+UTILS_DIR = ../utils
+OBJ_DIR = ../../obj
+BIN_DIR = ../../bin
+
+INCLUDES = -I$(API_DIR)/ -I$(UTILS_DIR)
+
+# ----------------------------------
+# define our source and object files
+# ----------------------------------
+SOURCES = bamtools_convert.cpp \
+          bamtools_count.cpp \
+          bamtools_coverage.cpp \
+          bamtools_filter.cpp \
+          bamtools_header.cpp \
+          bamtools_index.cpp \
+          bamtools_merge.cpp \
+          bamtools_random.cpp \
+          bamtools_sort.cpp \
+          bamtools_stats.cpp \
+          bamtools.cpp
+OBJECTS= $(SOURCES:.cpp=.o)
+BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
+EXT_OBJECTS = $(OBJ_DIR)/*.o
+PROGRAM = bamtools
+
+all: $(PROGRAM)
+
+.PHONY: all
+
+$(PROGRAM): $(BUILT_OBJECTS) $(EXT_OBJECTS)
+	@echo " * linking $(PROGRAM)"
+	@$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$@ $^ $(LIBS)
+
+# static pattern rule: each object depends on its own source file only,
+# so touching one source no longer forces every object to be rebuilt
+$(BUILT_OBJECTS): $(OBJ_DIR)/%.o: %.cpp
+	@echo " * compiling" $<
+	@$(CXX) -c -o $@ $< $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
+
+# NOTE(review): this runs ONE sub-make in $(API_DIR) and hands it
+# $(UTILS_DIR) as a goal name; if the intent is to build both directories,
+# it needs one '$(MAKE) -C <dir>' line per directory - confirm.
+$(EXT_OBJECTS):
+	@$(MAKE) --no-print-directory -C $(API_DIR) $(UTILS_DIR)
+
+# removes everything in the object AND binary directories
+clean:
+	@echo "Cleaning up."
+	@rm -f $(OBJ_DIR)/* $(BIN_DIR)/*
+
+.PHONY: clean
--- /dev/null
+// ***************************************************************************
+// bamtools.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 22 July 2010
+// ---------------------------------------------------------------------------
+// Integrates a number of BamTools functionalities into a single executable.
+// ***************************************************************************
+
+// Std C/C++ includes
+#include <iostream>
+
+// BamTools includes
+#include "bamtools_convert.h"
+#include "bamtools_count.h"
+#include "bamtools_coverage.h"
+#include "bamtools_filter.h"
+#include "bamtools_header.h"
+#include "bamtools_index.h"
+#include "bamtools_merge.h"
+#include "bamtools_random.h"
+#include "bamtools_sort.h"
+#include "bamtools_stats.h"
+
+using namespace std;
+using namespace BamTools;
+
+// ------------------------------------------
+// sub-tool names, as given on the command line
+static const string CONVERT("convert");
+static const string COUNT("count");
+static const string COVERAGE("coverage");
+static const string FILTER("filter");
+static const string HEADER("header");
+static const string INDEX("index");
+static const string MERGE("merge");
+static const string RANDOM("random");
+static const string SORT("sort");
+static const string STATS("stats");
+
+// ------------------------------------------
+// spellings that request the help screen
+static const string HELP("help");
+static const string LONG_HELP("--help");
+static const string SHORT_HELP("-h");
+
+// spellings that request the version info
+static const string VERSION("version");
+static const string LONG_VERSION("--version");
+static const string SHORT_VERSION("-v");
+
+// ------------------------------------------
+// Print help info
+// For 'bamtools help COMMAND' with a known COMMAND, that tool's own help
+// screen is shown and its result returned. In every other case the general
+// usage summary goes to stderr and 0 is returned.
+int Help(int argc, char* argv[]) {
+
+    // 'bamtools help COMMAND'
+    if ( argc > 2 ) {
+
+        const string command(argv[2]);
+
+        AbstractTool* tool(0);
+        if      ( command == CONVERT  ) tool = new ConvertTool;
+        else if ( command == COUNT    ) tool = new CountTool;
+        else if ( command == COVERAGE ) tool = new CoverageTool;
+        else if ( command == FILTER   ) tool = new FilterTool;
+        else if ( command == HEADER   ) tool = new HeaderTool;
+        else if ( command == INDEX    ) tool = new IndexTool;
+        else if ( command == MERGE    ) tool = new MergeTool;
+        else if ( command == RANDOM   ) tool = new RandomTool;
+        else if ( command == SORT     ) tool = new SortTool;
+        else if ( command == STATS    ) tool = new StatsTool;
+
+        // if tool known, print its help screen
+        if ( tool ) return tool->Help();
+    }
+
+    // either 'bamtools help' or unrecognized argument after 'help'
+    cerr << endl
+         << "usage: bamtools [--help] COMMAND [ARGS]" << endl
+         << endl
+         << "Available bamtools commands:" << endl
+         << "\tconvert Converts between BAM and a number of other formats" << endl
+         << "\tcount Prints number of alignments in BAM file" << endl
+         << "\tcoverage Prints coverage statistics from the input BAM file" << endl
+         << "\tfilter Filters BAM file(s) by user-specified criteria" << endl
+         << "\theader Prints BAM header information" << endl
+         << "\tindex Generates index for BAM file" << endl
+         << "\tmerge Merge multiple BAM files into single file" << endl
+         << "\trandom Grab a random subset of alignments" << endl
+         << "\tsort Sorts the BAM file according to some criteria" << endl
+         << "\tstats Prints general alignment statistics" << endl
+         << endl
+         << "See 'bamtools help COMMAND' for more information on a specific command." << endl
+         << endl;
+    return 0;
+}
+
+// ------------------------------------------
+// Print version info (to stdout), always returns 0
+int Version(void) {
+    cout << endl
+         << "bamtools v0.8.xx" << endl
+         << "Part of BamTools API and toolkit" << endl
+         << "Primary authors: Derek Barnett, Erik Garrison, Michael Stromberg" << endl
+         << "(c) 2009-2010 Marth Lab, Biology Dept., Boston College" << endl
+         << endl;
+    return 0;
+}
+
+// ------------------------------------------
+// toolkit entry point
+// Dispatches on the first command-line argument: help and version requests
+// are answered directly, a known sub-tool name runs that tool, and anything
+// else (including no argument at all) shows the help screen.
+int main(int argc, char* argv[]) {
+
+    // just 'bamtools'
+    if ( argc == 1 ) return Help(argc, argv);
+
+    const string command(argv[1]);
+
+    // 'bamtools help', 'bamtools --help', or 'bamtools -h'
+    const bool wantsHelp = ( command == HELP || command == LONG_HELP || command == SHORT_HELP );
+    if ( wantsHelp ) return Help(argc, argv);
+
+    // 'bamtools version', 'bamtools --version', or 'bamtools -v'
+    const bool wantsVersion = ( command == VERSION || command == LONG_VERSION || command == SHORT_VERSION );
+    if ( wantsVersion ) return Version();
+
+    // determine desired sub-tool
+    AbstractTool* tool(0);
+    if      ( command == CONVERT  ) tool = new ConvertTool;
+    else if ( command == COUNT    ) tool = new CountTool;
+    else if ( command == COVERAGE ) tool = new CoverageTool;
+    else if ( command == FILTER   ) tool = new FilterTool;
+    else if ( command == HEADER   ) tool = new HeaderTool;
+    else if ( command == INDEX    ) tool = new IndexTool;
+    else if ( command == MERGE    ) tool = new MergeTool;
+    else if ( command == RANDOM   ) tool = new RandomTool;
+    else if ( command == SORT     ) tool = new SortTool;
+    else if ( command == STATS    ) tool = new StatsTool;
+
+    // no match found, show help
+    if ( !tool ) return Help(argc, argv);
+
+    // otherwise run the tool
+    return tool->Run(argc, argv);
+}
--- /dev/null
+// ***************************************************************************
+// bamtools_convert.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 22 July 2010
+// ---------------------------------------------------------------------------
+// Converts between BAM and a number of other formats
+// ***************************************************************************
+
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "bamtools_convert.h"
+#include "bamtools_options.h"
+#include "bamtools_pileup.h"
+#include "bamtools_utilities.h"
+#include "BGZF.h"
+#include "BamReader.h"
+#include "BamMultiReader.h"
+
+using namespace std;
+using namespace BamTools;
+
+namespace BamTools {
+
+    // names of the formats the convert tool knows about
+    static const string FORMAT_BED("bed");
+    static const string FORMAT_BEDGRAPH("bedgraph");
+    static const string FORMAT_FASTA("fasta");
+    static const string FORMAT_FASTQ("fastq");
+    static const string FORMAT_JSON("json");
+    static const string FORMAT_SAM("sam");
+    static const string FORMAT_PILEUP("pileup");
+    static const string FORMAT_WIGGLE("wig");
+
+    // other constants
+    static const unsigned int FASTA_LINE_MAX(50);
+
+} // namespace BamTools
+
+// Private implementation of ConvertTool: owns the output stream and the
+// per-format alignment printers.
+struct ConvertTool::ConvertToolPrivate {
+
+    // construction, destruction & entry point
+    public:
+        ConvertToolPrivate(ConvertTool::ConvertSettings* settings);
+        ~ConvertToolPrivate(void);
+
+        // runs the whole conversion; returns true on success
+        bool Run(void);
+
+    // one printer per 'simple' output format, each writes a single alignment to m_out
+    private:
+        void PrintBed(const BamAlignment& a);
+        void PrintBedGraph(const BamAlignment& a);
+        void PrintFasta(const BamAlignment& a);
+        void PrintFastq(const BamAlignment& a);
+        void PrintJson(const BamAlignment& a);
+        void PrintSam(const BamAlignment& a);
+        void PrintWiggle(const BamAlignment& a);
+
+    // state
+    private:
+        ConvertTool::ConvertSettings* m_settings;   // not deleted here; ConvertTool's destructor deletes it
+        RefVector m_references;                     // filled by Run() from the reader's reference data
+        ostream m_out;                              // starts on cout's buffer; Run() may point it at a file
+};
+
+// ---------------------------------------------
+// ConvertSettings implementation
+
+// Settings for 'bamtools convert'. The fields are bound to command-line
+// options in ConvertTool's constructor.
+struct ConvertTool::ConvertSettings {
+
+    // general flags (presumably set by Options::Parse when the matching option is seen - confirm)
+    bool HasInput;
+    bool HasOutput;
+    bool HasFormat;
+    bool HasRegion;
+
+    // pileup- and SAM-specific flags
+    bool HasFastaFilename;
+    bool IsOmittingSamHeader;
+    bool IsPrintingPileupMapQualities;
+
+    // general option values
+    vector<string> InputFiles;
+    string OutputFilename;
+    string Format;
+    string Region;
+
+    // pileup option values
+    string FastaFilename;
+
+    // all flags start cleared; output defaults to Options::StandardOut()
+    ConvertSettings(void) :
+        HasInput(false),
+        HasOutput(false),
+        HasFormat(false),
+        HasRegion(false),
+        HasFastaFilename(false),
+        IsOmittingSamHeader(false),
+        IsPrintingPileupMapQualities(false),
+        OutputFilename(Options::StandardOut())
+    { }
+};
+
+// ---------------------------------------------
+// ConvertTool implementation
+
+// Creates the settings object and registers the program description and every
+// command-line option of 'bamtools convert'. Each option is bound to a field
+// of *m_settings. The implementation object (m_impl) is created later, in Run().
+ConvertTool::ConvertTool(void)
+    : AbstractTool()
+    , m_settings(new ConvertSettings)
+    , m_impl(0)
+{
+    // set program details
+    Options::SetProgramInfo("bamtools convert", "converts BAM to a number of other formats", "-format <FORMAT> [-in <filename> -in <filename> ...] [-out <filename>] [other options]");
+
+    // input, output & format
+    // (the output of this tool is never BAM, so '-out' is described as a plain filename)
+    OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
+    Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn());
+    Options::AddValueOption("-out", "filename", "the output filename", "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
+    Options::AddValueOption("-format", "FORMAT", "the output file format - see README for recognized formats", "", m_settings->HasFormat, m_settings->Format, IO_Opts);
+
+    // region filter
+    OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters");
+    Options::AddValueOption("-region", "REGION", "genomic region. Index file is recommended for better performance, and is read automatically if it exists as <filename>.bai. See \'bamtools help index\' for more details on creating one", "", m_settings->HasRegion, m_settings->Region, FilterOpts);
+
+    // options that only matter for '-format pileup'
+    OptionGroup* PileupOpts = Options::CreateOptionGroup("Pileup Options");
+    Options::AddValueOption("-fasta", "FASTA filename", "FASTA reference file", "", m_settings->HasFastaFilename, m_settings->FastaFilename, PileupOpts, "");
+    Options::AddOption("-mapqual", "print the mapping qualities", m_settings->IsPrintingPileupMapQualities, PileupOpts);
+
+    // options that only matter for '-format sam'
+    OptionGroup* SamOpts = Options::CreateOptionGroup("SAM Options");
+    Options::AddOption("-noheader", "omit the SAM header from output", m_settings->IsOmittingSamHeader, SamOpts);
+}
+
+// Releases the implementation object and the settings object.
+ConvertTool::~ConvertTool(void) {
+
+    // the implementation only holds a pointer to the settings and its
+    // destructor is empty, so the order of the two deletes does not matter
+    delete m_impl;
+    m_impl = 0;
+
+    delete m_settings;
+    m_settings = 0;
+}
+
+// Delegates to Options::DisplayHelp() and always returns 0.
+int ConvertTool::Help(void) {
+ Options::DisplayHelp();
+ return 0;
+}
+
+// Parses the command line, then runs the private implementation.
+// Returns 0 if the conversion succeeded, 1 otherwise.
+int ConvertTool::Run(int argc, char* argv[]) {
+
+    // parse command line arguments
+    // (presumably fills *m_settings through the options registered in the constructor - confirm)
+    Options::Parse(argc, argv, 1);
+
+    // drop any implementation left over from an earlier call so that
+    // calling Run() more than once does not leak the previous object
+    delete m_impl;
+    m_impl = new ConvertToolPrivate(m_settings);
+
+    // translate success/failure into a process exit code
+    return ( m_impl->Run() ? 0 : 1 );
+}
+
+// ---------------------------------------------
+// ConvertToolPrivate implementation
+
+// Stores the settings pointer and points the output stream at cout's buffer,
+// so output goes to stdout unless Run() redirects it to a file.
+ConvertTool::ConvertToolPrivate::ConvertToolPrivate(ConvertTool::ConvertSettings* settings)
+    : m_settings(settings)
+    , m_references()
+    , m_out(cout.rdbuf())
+{ }
+
+// Nothing to release here: m_settings is deleted by ConvertTool's destructor.
+ConvertTool::ConvertToolPrivate::~ConvertToolPrivate(void) { }
+
+// Performs the conversion described by m_settings.
+//
+// Opens the input BAM file(s), optionally restricts the reader to a region,
+// redirects m_out to the output file if one was given, and then either runs
+// the pileup engine or pushes every alignment through the printer that
+// matches the requested format.
+//
+// Returns true on success. Returns false if the input or output could not be
+// opened, the region string could not be parsed, the format is unrecognized,
+// or the pileup run reports failure.
+bool ConvertTool::ConvertToolPrivate::Run(void) {
+
+    bool convertedOk = true;
+
+    // ------------------------------------
+    // initialize conversion input/output
+
+    // set to default input if none provided
+    if ( !m_settings->HasInput )
+        m_settings->InputFiles.push_back(Options::StandardIn());
+
+    // open input files; stop here rather than "converting" from a reader that failed to open
+    BamMultiReader reader;
+    if ( !reader.Open(m_settings->InputFiles, false) ) {
+        cerr << "Could not open input BAM file(s)" << endl;
+        reader.Close();
+        return false;
+    }
+    m_references = reader.GetReferenceData();
+
+    // set region if specified
+    BamRegion region;
+    if ( m_settings->HasRegion ) {
+
+        // an unparsable region is an error: silently ignoring it would convert the whole file
+        if ( !Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
+            cerr << "Could not parse REGION: " << m_settings->Region << endl;
+            reader.Close();
+            return false;
+        }
+
+        if ( !reader.SetRegion(region) )
+            cerr << "Could not set BamReader region to REGION: " << m_settings->Region << endl;
+    }
+
+    // if output file given, open it and send m_out to it
+    ofstream outFile;
+    if ( m_settings->HasOutput ) {
+
+        outFile.open(m_settings->OutputFilename.c_str());
+        if ( !outFile ) {
+            cerr << "Could not open " << m_settings->OutputFilename << " for output" << endl;
+            reader.Close();
+            return false;
+        }
+
+        // set m_out to file's streambuf
+        m_out.rdbuf(outFile.rdbuf());
+    }
+
+    // ------------------------
+    // pileup is special case
+    if ( m_settings->Format == FORMAT_PILEUP ) {
+
+        // initialize pileup input/output
+        Pileup pileup(&reader, &m_out);
+
+        // configure pileup settings
+        if ( m_settings->HasRegion )
+            pileup.SetRegion(region);
+
+        if ( m_settings->HasFastaFilename )
+            pileup.SetFastaFilename(m_settings->FastaFilename);
+
+        pileup.SetIsPrintingMapQualities( m_settings->IsPrintingPileupMapQualities );
+
+        // run pileup
+        convertedOk = pileup.Run();
+    }
+
+    // -------------------------------------
+    // else determine 'simpler' format type
+    else {
+
+        // pick the per-alignment printer; stays null for an unknown format
+        typedef void (ConvertToolPrivate::*PrintFunction)(const BamAlignment&);
+        PrintFunction pFunction = 0;
+
+        const string& format = m_settings->Format;
+        if      ( format == FORMAT_BED )      pFunction = &ConvertToolPrivate::PrintBed;
+        else if ( format == FORMAT_BEDGRAPH ) pFunction = &ConvertToolPrivate::PrintBedGraph;
+        else if ( format == FORMAT_FASTA )    pFunction = &ConvertToolPrivate::PrintFasta;
+        else if ( format == FORMAT_FASTQ )    pFunction = &ConvertToolPrivate::PrintFastq;
+        else if ( format == FORMAT_JSON )     pFunction = &ConvertToolPrivate::PrintJson;
+        else if ( format == FORMAT_SAM )      pFunction = &ConvertToolPrivate::PrintSam;
+        else if ( format == FORMAT_WIGGLE )   pFunction = &ConvertToolPrivate::PrintWiggle;
+
+        if ( pFunction == 0 ) {
+            cerr << "Unrecognized format: " << m_settings->Format << endl;
+            cerr << "Please see help|README (?) for details on supported formats " << endl;
+            convertedOk = false;
+        }
+        else {
+
+            // if SAM format & not omitting header, print SAM header
+            if ( (format == FORMAT_SAM) && !m_settings->IsOmittingSamHeader )
+                m_out << reader.GetHeaderText();
+
+            // do conversion
+            BamAlignment a;
+            while ( reader.GetNextAlignment(a) )
+                (this->*pFunction)(a);
+        }
+    }
+
+    // ------------------------
+    // clean up & exit
+    reader.Close();
+    if ( m_settings->HasOutput ) {
+        // outFile dies with this function; do not leave m_out pointing at its buffer
+        m_out.rdbuf(cout.rdbuf());
+        outFile.close();
+    }
+    return convertedOk;
+}
+
+// ----------------------------------------------------------
+// Conversion/output methods
+// ----------------------------------------------------------
+
+// print BamAlignment in BED format
+//
+// tab-delimited, 0-based half-open
+// (e.g. a 50-base read aligned to pos 10 could have BED coordinates (10, 60) instead of BAM coordinates (10, 59) )
+// <chromName> <chromStart> <chromEnd> <readName> <score> <strand>
+//
+// Alignments whose RefID does not index m_references (e.g. -1) are skipped:
+// a BED record needs a chromosome name, and the previous unchecked
+// m_references.at(a.RefID) threw std::out_of_range for them.
+void ConvertTool::ConvertToolPrivate::PrintBed(const BamAlignment& a) {
+
+    // same validity test the JSON & SAM printers use
+    if ( (a.RefID < 0) || (a.RefID >= (int)m_references.size()) )
+        return;
+
+    m_out << m_references[a.RefID].RefName << "\t"
+          << a.Position << "\t"
+          << a.GetEndPosition() + 1 << "\t"
+          << a.Name << "\t"
+          << a.MapQuality << "\t"
+          << (a.IsReverseStrand() ? "-" : "+") << endl;
+}
+
+// BEDGRAPH printer - placeholder only: the body is empty, so selecting the
+// 'bedgraph' format currently produces no output for any alignment.
+void ConvertTool::ConvertToolPrivate::PrintBedGraph(const BamAlignment& a) {
+    // intentionally empty
+}
+
+// print BamAlignment in FASTA format
+// N.B. - uses QueryBases NOT AlignedBases
+// print BamAlignment in FASTA format
+// N.B. - uses QueryBases NOT AlignedBases
+//
+// >BamAlignment.Name
+// BamAlignment.QueryBases (up to FASTA_LINE_MAX bases per line)
+// ...
+void ConvertTool::ConvertToolPrivate::PrintFasta(const BamAlignment& a) {
+
+    // print header; the name follows '>' directly (no space), as in the
+    // format sketch above - a space here leaves the record without an
+    // identifier for parsers that read the ID up to the first whitespace
+    m_out << ">" << a.Name << endl;
+
+    // print sequence in chunks of at most FASTA_LINE_MAX bases;
+    // do/while so that an empty sequence still yields one (empty) line
+    const size_t seqLength = a.QueryBases.length();
+    size_t position = 0;
+    do {
+        m_out << a.QueryBases.substr(position, FASTA_LINE_MAX) << endl;
+        position += FASTA_LINE_MAX;
+    } while ( position < seqLength );
+}
+
+// print BamAlignment in FASTQ format
+// N.B. - uses QueryBases NOT AlignedBases
+// print BamAlignment in FASTQ format
+// N.B. - uses QueryBases NOT AlignedBases
+//
+// four lines per alignment:
+//   @<name>
+//   <query bases>
+//   +
+//   <qualities>
+void ConvertTool::ConvertToolPrivate::PrintFastq(const BamAlignment& a) {
+
+    // record header
+    m_out << "@" << a.Name << endl;
+
+    // sequence
+    m_out << a.QueryBases << endl;
+
+    // separator
+    m_out << "+" << endl;
+
+    // quality string, written exactly as stored in the alignment
+    m_out << a.Qualities << endl;
+}
+
+// print BamAlignment in JSON format
+// print BamAlignment in JSON format (one object per line)
+//
+// Members, in order: name, alignmentFlag, [reference], position (Position+1),
+// mapQuality, [cigar], [mate], [queryBases], [qualities], [tags].
+// Bracketed members are written only when the alignment has the data.
+//
+// "position" and "mapQuality" are always present, so every later member writes
+// its own leading comma; the object can therefore never end in ",}".
+//
+// NOTE(review): string values (names, Z/H tags) are written without JSON
+// escaping, as before.
+void ConvertTool::ConvertToolPrivate::PrintJson(const BamAlignment& a) {
+
+    // write name & alignment flag
+    m_out << "{\"name\":\"" << a.Name << "\",\"alignmentFlag\":\"" << a.AlignmentFlag << "\",";
+
+    // write reference name
+    if ( (a.RefID >= 0) && (a.RefID < (int)m_references.size()) )
+        m_out << "\"reference\":\"" << m_references[a.RefID].RefName << "\",";
+
+    // write position & map quality (no trailing comma, see above)
+    m_out << "\"position\":" << a.Position+1 << ",\"mapQuality\":" << a.MapQuality;
+
+    // write CIGAR
+    const vector<CigarOp>& cigarData = a.CigarData;
+    if ( !cigarData.empty() ) {
+        m_out << ",\"cigar\":[";
+        vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
+        vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
+        for ( ; cigarIter != cigarEnd; ++cigarIter ) {
+            if ( cigarIter != cigarData.begin() ) m_out << ",";
+            m_out << "\"" << cigarIter->Length << cigarIter->Type << "\"";
+        }
+        m_out << "]";
+    }
+
+    // write mate reference name, mate position, & insert size
+    if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size()) ) {
+        m_out << ",\"mate\":{"
+              << "\"reference\":\"" << m_references[a.MateRefID].RefName << "\","
+              << "\"position\":" << a.MatePosition+1
+              << ",\"insertSize\":" << a.InsertSize << "}";
+    }
+
+    // write sequence
+    if ( !a.QueryBases.empty() )
+        m_out << ",\"queryBases\":\"" << a.QueryBases << "\"";
+
+    // write qualities as numbers (stored character minus 33)
+    if ( !a.Qualities.empty() ) {
+        string::const_iterator s = a.Qualities.begin();
+        m_out << ",\"qualities\":[" << static_cast<short>(*s) - 33;
+        ++s;
+        for ( ; s != a.Qualities.end(); ++s )
+            m_out << "," << static_cast<short>(*s) - 33;
+        m_out << "]";
+    }
+
+    // write tag data
+    const char* tagData = a.TagData.c_str();
+    const size_t tagDataLength = a.TagData.length();
+    size_t index = 0;
+    if ( index < tagDataLength ) {
+
+        m_out << ",\"tags\":{";
+
+        bool isFirstTag = true;
+
+        // each tag needs at least a 2-char name and a 1-char type
+        while ( index + 3 <= tagDataLength ) {
+
+            const string tagName = a.TagData.substr(index, 2);
+            const char type = tagData[index + 2];
+
+            // size of the value for fixed-width types (0 for NUL-terminated strings)
+            size_t valueBytes = 0;
+            bool isKnownType = true;
+            switch (type) {
+                case('A') : case('C') : case('c') : valueBytes = 1; break;
+                case('S') : case('s') :             valueBytes = 2; break;
+                case('I') : case('i') : case('f') : valueBytes = 4; break;
+                case('d') :                         valueBytes = 8; break;
+                case('Z') : case('H') :             valueBytes = 0; break;
+                default   :                         isKnownType = false; break;
+            }
+
+            // unknown type (value size unknowable) or truncated value: stop before
+            // writing anything for this tag, so the object stays well-formed
+            if ( !isKnownType || (index + 3 + valueBytes > tagDataLength) )
+                break;
+            index += 3;
+
+            // write separator & tag name
+            if ( !isFirstTag ) m_out << ",";
+            isFirstTag = false;
+            m_out << "\"" << tagName << "\":";
+
+            // write tag value
+            switch (type) {
+                case('A') :
+                    m_out << "\"" << tagData[index] << "\"";
+                    break;
+
+                case('C') :
+                    // unsigned 8-bit: go through unsigned char so values above 127 stay positive
+                    m_out << static_cast<int>(static_cast<unsigned char>(tagData[index]));
+                    break;
+
+                case('c') :
+                    // signed 8-bit: go through signed char so the result does not depend on plain char's signedness
+                    m_out << static_cast<int>(static_cast<signed char>(tagData[index]));
+                    break;
+
+                case('S') :
+                    m_out << BgzfData::UnpackUnsignedShort(&tagData[index]);
+                    break;
+
+                case('s') :
+                    m_out << BgzfData::UnpackSignedShort(&tagData[index]);
+                    break;
+
+                case('I') :
+                    m_out << BgzfData::UnpackUnsignedInt(&tagData[index]);
+                    break;
+
+                case('i') :
+                    m_out << BgzfData::UnpackSignedInt(&tagData[index]);
+                    break;
+
+                case('f') :
+                    m_out << BgzfData::UnpackFloat(&tagData[index]);
+                    break;
+
+                case('d') :
+                    m_out << BgzfData::UnpackDouble(&tagData[index]);
+                    break;
+
+                case('Z') :
+                case('H') :
+                    // NUL-terminated; c_str() guarantees a terminator at tagDataLength
+                    m_out << "\"";
+                    while ( tagData[index] ) {
+                        m_out << tagData[index];
+                        ++index;
+                    }
+                    m_out << "\"";
+                    ++index;    // step over the terminator
+                    break;
+            }
+            index += valueBytes;
+
+            // a NUL where the next tag name would start ends the tag list
+            if ( (index >= tagDataLength) || (tagData[index] == '\0') )
+                break;
+        }
+
+        m_out << "}";
+    }
+
+    m_out << "}" << endl;
+}
+
+// print BamAlignment in SAM format
+// print BamAlignment in SAM format
+//
+// tab-delimited
+// <QNAME> <FLAG> <RNAME> <POS> <MAPQ> <CIGAR> <MRNM> <MPOS> <ISIZE> <SEQ> <QUAL> [ <TAG>:<VTYPE>:<VALUE> [...] ]
+//
+// Missing data is written as "*" (and "*\t0\t0" for the mate fields).
+// All integer tag types are written with the SAM value type 'i'.
+void ConvertTool::ConvertToolPrivate::PrintSam(const BamAlignment& a) {
+
+    // write name & alignment flag
+    m_out << a.Name << "\t" << a.AlignmentFlag << "\t";
+
+    // write reference name
+    if ( (a.RefID >= 0) && (a.RefID < (int)m_references.size()) )
+        m_out << m_references[a.RefID].RefName << "\t";
+    else
+        m_out << "*\t";
+
+    // write position (Position+1) & map quality
+    m_out << a.Position+1 << "\t" << a.MapQuality << "\t";
+
+    // write CIGAR
+    const vector<CigarOp>& cigarData = a.CigarData;
+    if ( cigarData.empty() ) m_out << "*\t";
+    else {
+        vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
+        vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
+        for ( ; cigarIter != cigarEnd; ++cigarIter )
+            m_out << cigarIter->Length << cigarIter->Type;
+        m_out << "\t";
+    }
+
+    // write mate reference name, mate position, & insert size
+    if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size()) ) {
+        if ( a.MateRefID == a.RefID ) m_out << "=\t";
+        else m_out << m_references[a.MateRefID].RefName << "\t";
+        m_out << a.MatePosition+1 << "\t" << a.InsertSize << "\t";
+    }
+    else m_out << "*\t0\t0\t";
+
+    // write sequence
+    if ( a.QueryBases.empty() ) m_out << "*\t";
+    else m_out << a.QueryBases << "\t";
+
+    // write qualities
+    if ( a.Qualities.empty() ) m_out << "*";
+    else m_out << a.Qualities;
+
+    // write tag data
+    const char* tagData = a.TagData.c_str();
+    const size_t tagDataLength = a.TagData.length();
+    size_t index = 0;
+
+    // each tag needs at least a 2-char name and a 1-char type
+    while ( index + 3 <= tagDataLength ) {
+
+        const string tagName = a.TagData.substr(index, 2);
+        const char type = tagData[index + 2];
+
+        // size of the value for fixed-width types (0 for NUL-terminated strings)
+        size_t valueBytes = 0;
+        bool isKnownType = true;
+        switch (type) {
+            case('A') : case('C') : case('c') : valueBytes = 1; break;
+            case('S') : case('s') :             valueBytes = 2; break;
+            case('I') : case('i') : case('f') : valueBytes = 4; break;
+            case('d') :                         valueBytes = 8; break;
+            case('Z') : case('H') :             valueBytes = 0; break;
+            default   :                         isKnownType = false; break;
+        }
+
+        // unknown type (value size unknowable) or truncated value: stop before
+        // writing a half-finished field
+        if ( !isKnownType || (index + 3 + valueBytes > tagDataLength) )
+            break;
+        index += 3;
+
+        // write tag name
+        m_out << "\t" << tagName << ":";
+
+        // write value type & value
+        switch (type) {
+            case('A') :
+                m_out << "A:" << tagData[index];
+                break;
+
+            case('C') :
+                // unsigned 8-bit: go through unsigned char so values above 127 stay positive
+                m_out << "i:" << static_cast<int>(static_cast<unsigned char>(tagData[index]));
+                break;
+
+            case('c') :
+                // signed 8-bit: go through signed char so the result does not depend on plain char's signedness
+                m_out << "i:" << static_cast<int>(static_cast<signed char>(tagData[index]));
+                break;
+
+            case('S') :
+                m_out << "i:" << BgzfData::UnpackUnsignedShort(&tagData[index]);
+                break;
+
+            case('s') :
+                m_out << "i:" << BgzfData::UnpackSignedShort(&tagData[index]);
+                break;
+
+            case('I') :
+                m_out << "i:" << BgzfData::UnpackUnsignedInt(&tagData[index]);
+                break;
+
+            case('i') :
+                m_out << "i:" << BgzfData::UnpackSignedInt(&tagData[index]);
+                break;
+
+            case('f') :
+                m_out << "f:" << BgzfData::UnpackFloat(&tagData[index]);
+                break;
+
+            case('d') :
+                m_out << "d:" << BgzfData::UnpackDouble(&tagData[index]);
+                break;
+
+            case('Z') :
+            case('H') :
+                // NUL-terminated; c_str() guarantees a terminator at tagDataLength
+                m_out << type << ":";
+                while ( tagData[index] ) {
+                    m_out << tagData[index];
+                    ++index;
+                }
+                ++index;    // step over the terminator
+                break;
+        }
+        index += valueBytes;
+
+        // a NUL where the next tag name would start ends the tag list
+        if ( (index >= tagDataLength) || (tagData[index] == '\0') )
+            break;
+    }
+
+    m_out << endl;
+}
+
+// WIGGLE printer - placeholder only: the body is empty, so selecting the
+// 'wig' format currently produces no output for any alignment.
+void ConvertTool::ConvertToolPrivate::PrintWiggle(const BamAlignment& a) {
+    // intentionally empty
+}
--- /dev/null
+// ***************************************************************************
+// bamtools_convert.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 9 July 2010
+// ---------------------------------------------------------------------------
+// Converts between BAM and a number of other formats
+// ***************************************************************************
+
+#ifndef BAMTOOLS_CONVERT_H
+#define BAMTOOLS_CONVERT_H
+
+#include "bamtools_tool.h"
+
+namespace BamTools {
+
+// 'bamtools convert' sub-tool.
+// Settings and implementation are hidden behind forward-declared structs
+// that are defined in bamtools_convert.cpp.
+class ConvertTool : public AbstractTool {
+
+    public:
+        ConvertTool(void);
+        ~ConvertTool(void);
+
+        // shows the tool's help text; returns 0
+        int Help(void);
+
+        // parses the arguments and runs the conversion; returns 0 on success, 1 on failure
+        int Run(int argc, char* argv[]);
+
+    private:
+        struct ConvertSettings;
+        struct ConvertToolPrivate;
+
+        ConvertSettings* m_settings;    // created in the constructor, deleted in the destructor
+        ConvertToolPrivate* m_impl;     // created in Run(), deleted in the destructor
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_CONVERT_H
\ No newline at end of file
--- /dev/null
+// ***************************************************************************
+// bamtools_count.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 2 June 2010
+// ---------------------------------------------------------------------------
+// Prints alignment count for BAM file
+//
+// ** Expand to multiple??
+//
+// ***************************************************************************
+
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "bamtools_count.h"
+#include "bamtools_options.h"
+#include "bamtools_utilities.h"
+#include "BamReader.h"
+#include "BamMultiReader.h"
+
+using namespace std;
+using namespace BamTools;
+
+// ---------------------------------------------
+// CountSettings implementation
+
+// Settings for 'bamtools count'. The fields are bound to command-line
+// options in CountTool's constructor.
+struct CountTool::CountSettings {
+
+    // flags (presumably set by Options::Parse when the matching option is seen - confirm)
+    bool HasInput;
+    bool HasRegion;
+
+    // option values
+    vector<string> InputFiles;
+    string Region;
+
+    // both flags start cleared
+    CountSettings(void) :
+        HasInput(false),
+        HasRegion(false)
+    { }
+};
+
+// ---------------------------------------------
+// CountTool implementation
+
+// Creates the settings object and registers the program description and the
+// '-in' / '-region' options, each bound to a field of *m_settings.
+CountTool::CountTool(void)
+    : AbstractTool()
+    , m_settings(new CountSettings)
+{
+    // program description & usage line
+    Options::SetProgramInfo("bamtools count",
+                            "prints alignment counts for a BAM file",
+                            "-in <filename> [-region <REGION>]");
+
+    // input
+    OptionGroup* ioGroup = Options::CreateOptionGroup("Input & Output");
+    Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "",
+                            m_settings->HasInput, m_settings->InputFiles, ioGroup);
+    //Options::AddValueOption("-index", "BAM index filename", "the BAM index file", "", m_settings->HasBamIndexFilename, m_settings->BamIndexFilename, IO_Opts);
+
+    // region filter
+    OptionGroup* filterGroup = Options::CreateOptionGroup("Filters");
+    Options::AddValueOption("-region", "REGION",
+                            "genomic region. Index file is recommended for better performance, and is read automatically if it exists as <filename>.bai or <filename>.bti. See \'bamtools help index\' for more details on creating one",
+                            "", m_settings->HasRegion, m_settings->Region, filterGroup);
+}
+
+// Releases the settings object allocated in the constructor.
+CountTool::~CountTool(void) {
+ delete m_settings;
+ m_settings = 0;
+}
+
+// Delegates to Options::DisplayHelp() and always returns 0.
+int CountTool::Help(void) {
+ Options::DisplayHelp();
+ return 0;
+}
+
+// Counts alignments in the input BAM file(s) and prints the count to stdout.
+//
+// Without '-region' every alignment is counted. With '-region':
+//   - if every input file has an index and all indexes are of the same kind
+//     (*.bai or *.bti), a second, index-enabled reader jumps to the region;
+//   - otherwise all files are read sequentially and overlapping alignments
+//     are counted by comparing coordinates.
+//
+// Returns 0 on success, 1 if the input could not be opened or the region
+// could not be parsed/set (the error text goes to stderr).
+int CountTool::Run(int argc, char* argv[]) {
+
+    // parse command line arguments
+    Options::Parse(argc, argv, 1);
+
+    if ( !m_settings->HasInput )
+        m_settings->InputFiles.push_back(Options::StandardIn());
+
+    // open input without index support; report failure instead of printing a count of 0
+    BamMultiReader reader;
+    if ( !reader.Open(m_settings->InputFiles, false, true) ) {
+        cerr << "Could not open input BAM file(s)" << endl;
+        reader.Close();
+        return 1;
+    }
+
+    // alignment counter
+    int alignmentCount(0);
+
+    // set up error handling
+    ostringstream errorStream("");
+    bool foundError(false);
+
+    // if no region specified, count entire file
+    if ( !m_settings->HasRegion ) {
+        BamAlignment al;
+        while ( reader.GetNextAlignmentCore(al) )
+            ++alignmentCount;
+    }
+
+    // more complicated - region specified
+    else {
+
+        BamRegion region;
+        if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
+
+            // check, per input file, whether an index file *.bai/*.bti exists
+            bool hasNoIndex = false;
+            int defaultIndexCount = 0;
+            int bamtoolsIndexCount = 0;
+            for (vector<string>::const_iterator f = m_settings->InputFiles.begin(); f != m_settings->InputFiles.end(); ++f) {
+
+                // these must be per-file: flags accumulated over earlier files
+                // would hide a later file that has no index at all
+                const bool fileHasDefaultIndex  = Utilities::FileExists(*f + ".bai");
+                const bool fileHasBamtoolsIndex = Utilities::FileExists(*f + ".bti");
+
+                if ( fileHasDefaultIndex )  ++defaultIndexCount;
+                if ( fileHasBamtoolsIndex ) ++bamtoolsIndexCount;
+
+                if ( !fileHasDefaultIndex && !fileHasBamtoolsIndex ) {
+                    hasNoIndex = true;
+                    cerr << "*WARNING - could not find index file for " << *f
+                         << ", parsing whole file(s) to get alignment counts for target region"
+                         << " (could be slow)" << endl;
+                    break;
+                }
+            }
+
+            // determine if index file types are heterogeneous
+            bool hasDifferentIndexTypes = false;
+            if ( defaultIndexCount > 0 && bamtoolsIndexCount > 0 ) {
+                hasDifferentIndexTypes = true;
+                cerr << "*WARNING - different index file formats found"
+                     << ", parsing whole file(s) to get alignment counts for target region"
+                     << " (could be slow)" << endl;
+            }
+
+            // if any input file has no index, or if input files use different index formats
+            // can't use BamMultiReader to jump directly (**for now**)
+            if ( hasNoIndex || hasDifferentIndexTypes ) {
+
+                // read through sequentially, counting all overlapping reads;
+                // positions are only compared on the boundary references, so a
+                // region spanning several references counts everything in between
+                BamAlignment al;
+                while ( reader.GetNextAlignmentCore(al) ) {
+
+                    const bool reachesLeftBound =
+                        (al.RefID > region.LeftRefID) ||
+                        ( (al.RefID == region.LeftRefID) && ((al.Position + al.Length) >= region.LeftPosition) );
+
+                    const bool startsBeforeRightBound =
+                        (al.RefID < region.RightRefID) ||
+                        ( (al.RefID == region.RightRefID) && (al.Position <= region.RightPosition) );
+
+                    if ( reachesLeftBound && startsBeforeRightBound )
+                        ++alignmentCount;
+                }
+            }
+
+            // has index file for each input file (and same format)
+            else {
+
+                // the first reader was opened without indexes, so use a second,
+                // index-enabled one; here all indexes are of one kind, so
+                // (defaultIndexCount > 0) tells which kind that is
+                BamMultiReader indexedReader;
+                indexedReader.Open(m_settings->InputFiles, true, true, (defaultIndexCount > 0) );
+
+                if ( !indexedReader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) {
+                    foundError = true;
+                    errorStream << "Could not set BamReader region to REGION: " << m_settings->Region << endl;
+                } else {
+                    BamAlignment al;
+                    while ( indexedReader.GetNextAlignmentCore(al) )
+                        ++alignmentCount;
+                }
+                indexedReader.Close();
+            }
+
+        } else {
+            foundError = true;
+            errorStream << "Could not parse REGION: " << m_settings->Region << endl;
+            errorStream << "Be sure REGION is in valid format (see README) and that coordinates are valid for selected references" << endl;
+        }
+    }
+
+    // print errors OR results
+    if ( foundError )
+        cerr << errorStream.str() << endl;
+    else
+        cout << alignmentCount << endl;
+
+    // clean & exit
+    reader.Close();
+    return ( foundError ? 1 : 0 );
+}
--- /dev/null
+// ***************************************************************************
+// bamtools_count.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 1 June 2010
+// ---------------------------------------------------------------------------
+// Prints alignment count for BAM file
+//
+// ** Expand to multiple??
+//
+// ***************************************************************************
+
+#ifndef BAMTOOLS_COUNT_H
+#define BAMTOOLS_COUNT_H
+
+#include "bamtools_tool.h"
+
+namespace BamTools {
+
+// 'bamtools count' sub-tool.
+// The settings struct is only forward-declared here; it is defined in
+// bamtools_count.cpp.
+class CountTool : public AbstractTool {
+
+    public:
+        CountTool(void);
+        ~CountTool(void);
+
+        // shows the tool's help text; returns 0
+        int Help(void);
+
+        // parses the arguments and prints the alignment count; returns 0 on success
+        int Run(int argc, char* argv[]);
+
+    private:
+        struct CountSettings;
+        CountSettings* m_settings;  // created in the constructor, deleted in the destructor
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_COUNT_H
--- /dev/null
+// ***************************************************************************
+// bamtools_coverage.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 1 June 2010
+// ---------------------------------------------------------------------------
+// Prints coverage statistics for a single BAM file
+//
+// ** Expand to multiple??
+//
+// ***************************************************************************
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "bamtools_coverage.h"
+#include "bamtools_options.h"
+#include "BamReader.h"
+
+using namespace std;
+using namespace BamTools;
+
+// ---------------------------------------------
+// CoverageSettings implementation
+
+// Settings for 'bamtools coverage'. The fields are bound to the '-in' option
+// in CoverageTool's constructor.
+struct CoverageTool::CoverageSettings {
+
+    // flag (presumably set by Options::Parse when '-in' is seen - confirm)
+    bool HasInputBamFilename;
+
+    // input filename
+    std::string InputBamFilename;
+
+    // flag starts cleared; input defaults to Options::StandardIn()
+    CoverageSettings(void) :
+        HasInputBamFilename(false),
+        InputBamFilename(Options::StandardIn())
+    { }
+};
+
+// ---------------------------------------------
+// CoverageTool implementation
+
+// Creates the settings object and registers the program description and the
+// '-in' option, bound to fields of *m_settings.
+CoverageTool::CoverageTool(void)
+    : AbstractTool()
+    , m_settings(new CoverageSettings)
+{
+    // program description & usage line
+    Options::SetProgramInfo("bamtools coverage",
+                            "prints coverage stats for a BAM file",
+                            "-in <filename> ");
+
+    // input
+    OptionGroup* ioGroup = Options::CreateOptionGroup("Input & Output");
+    Options::AddValueOption("-in", "BAM filename", "the input BAM file", "",
+                            m_settings->HasInputBamFilename, m_settings->InputBamFilename,
+                            ioGroup, Options::StandardIn());
+}
+
+// Releases the settings object allocated in the constructor.
+CoverageTool::~CoverageTool(void) {
+ delete m_settings;
+ m_settings = 0;
+}
+
+// Delegates to Options::DisplayHelp() and always returns 0.
+int CoverageTool::Help(void) {
+ Options::DisplayHelp();
+ return 0;
+}
+
+// Entry point for 'bamtools coverage'.
+// Currently only opens the input file, reports on stderr that the feature is
+// not implemented, closes the file and returns 0.
+int CoverageTool::Run(int argc, char* argv[]) {
+
+    // parse command line arguments
+    Options::Parse(argc, argv, 1);
+
+    // open the input BAM file
+    const string& inputFilename = m_settings->InputBamFilename;
+    BamReader reader;
+    reader.Open(inputFilename);
+
+    // placeholder for the coverage computation
+    cerr << "Generating coverage stats for " << inputFilename << endl
+         << "FEATURE NOT YET IMPLEMENTED!" << endl;
+
+    // clean & exit
+    reader.Close();
+    return 0;
+}
\ No newline at end of file
--- /dev/null
+// ***************************************************************************
+// bamtools_coverage.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 1 June 2010
+// ---------------------------------------------------------------------------
+// Prints coverage statistics for a single BAM file
+//
+// ** Expand to multiple??
+//
+// ***************************************************************************
+
+#ifndef BAMTOOLS_COVERAGE_H
+#define BAMTOOLS_COVERAGE_H
+
+#include "bamtools_tool.h"
+
+namespace BamTools {
+
+// 'bamtools coverage' sub-tool.
+// The settings struct is only forward-declared here; it is defined in
+// bamtools_coverage.cpp.
+class CoverageTool : public AbstractTool {
+
+    public:
+        CoverageTool(void);
+        ~CoverageTool(void);
+
+        // shows the tool's help text; returns 0
+        int Help(void);
+
+        // parses the arguments and runs the tool; returns 0
+        int Run(int argc, char* argv[]);
+
+    private:
+        struct CoverageSettings;
+        CoverageSettings* m_settings;   // created in the constructor, deleted in the destructor
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_COVERAGE_H
--- /dev/null
+// ***************************************************************************
+// bamtools_filter.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 1 June 2010
+// ---------------------------------------------------------------------------
+// Filters a single BAM file (or filters multiple BAM files and merges)
+// according to some user-specified criteria.
+// ***************************************************************************
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "bamtools_filter.h"
+#include "bamtools_options.h"
+#include "BamReader.h"
+#include "BamMultiReader.h"
+
+using namespace std;
+using namespace BamTools;
+
+// ---------------------------------------------
+// FilterSettings implementation
+
+// Settings for 'bamtools filter'. The fields are bound to command-line
+// options in FilterTool's constructor.
+struct FilterTool::FilterSettings {
+
+    // flags (presumably set by Options::Parse when the matching option is seen - confirm)
+    bool HasInputBamFilename;
+    bool HasOutputBamFilename;
+
+    // filenames
+    vector<string> InputFiles;
+    string OutputFilename;
+
+    // flags start cleared; output defaults to Options::StandardOut()
+    FilterSettings(void) :
+        HasInputBamFilename(false),
+        HasOutputBamFilename(false),
+        OutputFilename(Options::StandardOut())
+    { }
+};
+
+// ---------------------------------------------
+// FilterTool implementation
+
+// Creates the settings object and registers the program description and the
+// '-in' / '-out' options, each bound to a field of *m_settings.
+FilterTool::FilterTool(void)
+    : AbstractTool()
+    , m_settings(new FilterSettings)
+{
+    // program description & usage line
+    Options::SetProgramInfo("bamtools filter",
+                            "filters BAM file(s)",
+                            "-in <filename> [-in <filename> ... ] -out <filename> ");
+
+    // input & output
+    OptionGroup* ioGroup = Options::CreateOptionGroup("Input & Output");
+    Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "",
+                            m_settings->HasInputBamFilename, m_settings->InputFiles,
+                            ioGroup, Options::StandardIn());
+    Options::AddValueOption("-out", "BAM filename", "the output BAM file", "",
+                            m_settings->HasOutputBamFilename, m_settings->OutputFilename,
+                            ioGroup, Options::StandardOut());
+}
+
+// Releases the settings object allocated in the constructor.
+FilterTool::~FilterTool(void) {
+ delete m_settings;
+ m_settings = 0;
+}
+
+// Delegates to Options::DisplayHelp() and always returns 0.
+int FilterTool::Help(void) {
+ Options::DisplayHelp();
+ return 0;
+}
+
+// Entry point for 'bamtools filter'.
+// Currently only opens and closes the input file(s); no filtering is
+// performed and nothing is written. Always returns 0.
+int FilterTool::Run(int argc, char* argv[]) {
+
+    // parse command line arguments
+    Options::Parse(argc, argv, 1);
+
+    // fall back to the default input when no '-in' was given
+    vector<string>& inputFiles = m_settings->InputFiles;
+    if ( !m_settings->HasInputBamFilename )
+        inputFiles.push_back(Options::StandardIn());
+
+    // open files
+    BamMultiReader reader;
+    reader.Open(inputFiles, false);
+
+    // filtering would go here
+
+    // clean up & exit
+    reader.Close();
+    return 0;
+}
\ No newline at end of file
--- /dev/null
+// ***************************************************************************
+// bamtools_filter.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 2 June 2010
+// ---------------------------------------------------------------------------
+// Filters a single BAM file (or filters multiple BAM files and merges)
+// according to some user-specified criteria.
+// ***************************************************************************
+
+#ifndef BAMTOOLS_FILTER_H
+#define BAMTOOLS_FILTER_H
+
+#include "bamtools_tool.h"
+
+namespace BamTools {
+
+// 'bamtools filter' sub-tool.
+// The settings struct is only forward-declared here; it is defined in
+// bamtools_filter.cpp.
+class FilterTool : public AbstractTool {
+
+    public:
+        FilterTool(void);
+        ~FilterTool(void);
+
+        // shows the tool's help text; returns 0
+        int Help(void);
+
+        // parses the arguments and runs the tool; returns 0
+        int Run(int argc, char* argv[]);
+
+    private:
+        struct FilterSettings;
+        FilterSettings* m_settings;     // created in the constructor, deleted in the destructor
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_FILTER_H
--- /dev/null
+// ***************************************************************************
+// bamtools_header.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 1 June 2010
+// ---------------------------------------------------------------------------
+// Prints the SAM-style header from a single BAM file ( or merged header from
+// multiple BAM files) to stdout
+// ***************************************************************************
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "bamtools_header.h"
+#include "bamtools_options.h"
+#include "BamReader.h"
+#include "BamMultiReader.h"
+
+using namespace std;
+using namespace BamTools;
+
+// ---------------------------------------------
+// HeaderSettings implementation
+
+// Settings for 'bamtools header'. The fields are bound to the '-in' option
+// in HeaderTool's constructor.
+struct HeaderTool::HeaderSettings {
+
+    // flag (presumably set by Options::Parse when '-in' is seen - confirm)
+    bool HasInputBamFilename;
+
+    // input filenames
+    vector<string> InputFiles;
+
+    // flag starts cleared, no input files
+    HeaderSettings(void) :
+        HasInputBamFilename(false)
+    { }
+};
+
+// ---------------------------------------------
+// HeaderTool implementation
+
+// Creates the settings object and registers the program description and the
+// '-in' option, bound to fields of *m_settings.
+HeaderTool::HeaderTool(void)
+    : AbstractTool()
+    , m_settings(new HeaderSettings)
+{
+    // program description & usage line
+    Options::SetProgramInfo("bamtools header",
+                            "prints header from BAM file(s)",
+                            "-in <filename> [-in <filename> ... ] ");
+
+    // input
+    OptionGroup* ioGroup = Options::CreateOptionGroup("Input & Output");
+    Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "",
+                            m_settings->HasInputBamFilename, m_settings->InputFiles,
+                            ioGroup, Options::StandardIn());
+}
+
+// Frees the settings object allocated in the constructor.
+HeaderTool::~HeaderTool(void) {
+    delete m_settings;
+    m_settings = 0;   // leave no dangling pointer behind
+}
+
+// Delegates to Options::DisplayHelp() and returns 0.
+int HeaderTool::Help(void) {
+    Options::DisplayHelp();
+    return 0;
+}
+
+// Parses the command line, opens the requested BAM file(s) and prints the
+// reader's header text to stdout.
+//
+// Returns 0 on success, 1 if the input file(s) could not be opened
+// (previously an open failure was silently reported as success).
+int HeaderTool::Run(int argc, char* argv[]) {
+
+    // parse command line arguments
+    Options::Parse(argc, argv, 1);
+
+    // set to default input if none provided
+    if ( !m_settings->HasInputBamFilename )
+        m_settings->InputFiles.push_back(Options::StandardIn());
+
+    // open files
+    BamMultiReader reader;
+    if ( !reader.Open(m_settings->InputFiles, false) ) {
+        cerr << "bamtools header ERROR: could not open input BAM file(s)... quitting." << endl;
+        reader.Close();
+        return 1;
+    }
+
+    // dump header contents to stdout
+    cout << reader.GetHeaderText() << endl;
+
+    // clean up & exit
+    reader.Close();
+    return 0;
+}
\ No newline at end of file
--- /dev/null
+// ***************************************************************************
+// bamtools_header.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 1 June 2010
+// ---------------------------------------------------------------------------
+// Prints the SAM-style header from a single BAM file ( or merged header from
+// multiple BAM files) to stdout
+// ***************************************************************************
+
+#ifndef BAMTOOLS_HEADER_H
+#define BAMTOOLS_HEADER_H
+
+#include "bamtools_tool.h"
+
+namespace BamTools {
+
+// Tool that prints the SAM-style header from a single BAM file (or the merged
+// header from multiple BAM files) to stdout.
+class HeaderTool : public AbstractTool {
+
+    public:
+        HeaderTool(void);
+        ~HeaderTool(void);
+
+    public:
+        int Help(void);
+        int Run(int argc, char* argv[]);
+
+    // copying is disabled (declared private, intentionally not defined):
+    // m_settings is deleted in the destructor, so a compiler-generated copy
+    // would delete the same object twice
+    private:
+        HeaderTool(const HeaderTool&);
+        HeaderTool& operator=(const HeaderTool&);
+
+    // settings type is defined in the implementation file
+    private:
+        struct HeaderSettings;
+        HeaderSettings* m_settings;
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_HEADER_H
--- /dev/null
+// ***************************************************************************
+// bamtools_index.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 7 July 2010
+// ---------------------------------------------------------------------------
+// Creates a BAM index (".bai") file for the provided BAM file.
+// ***************************************************************************
+
+#include <iostream>
+#include <string>
+
+#include "bamtools_index.h"
+#include "bamtools_options.h"
+#include "BamReader.h"
+
+using namespace std;
+using namespace BamTools;
+
+// ---------------------------------------------
+// IndexSettings implementation
+
+// Command-line state for IndexTool; bound to the "-in" and "-bti" options in
+// the IndexTool constructor.
+struct IndexTool::IndexSettings {
+
+    // option flags
+    bool HasInputBamFilename;
+    bool IsUsingBamtoolsIndex;
+
+    // input BAM filename
+    string InputBamFilename;
+
+    // both flags start cleared; the input defaults to Options::StandardIn()
+    IndexSettings(void) :
+        HasInputBamFilename(false),
+        IsUsingBamtoolsIndex(false),
+        InputBamFilename(Options::StandardIn())
+    { }
+};
+
+// ---------------------------------------------
+// IndexTool implementation
+
+// Creates the settings object and registers the program description and the
+// "-in" / "-bti" options with Options.
+IndexTool::IndexTool(void)
+    : AbstractTool()
+    , m_settings(new IndexSettings)
+{
+    // program name, description & usage string
+    Options::SetProgramInfo(
+        "bamtools index",
+        "creates index for BAM file",
+        "[-in <filename>] [-bti]");
+
+    // input/output option group
+    OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
+    Options::AddValueOption(
+        "-in", "BAM filename", "the input BAM file", "",
+        m_settings->HasInputBamFilename,
+        m_settings->InputBamFilename,
+        IO_Opts,
+        Options::StandardIn());
+    Options::AddOption(
+        "-bti", "use (non-standard) BamTools indexing scheme",
+        m_settings->IsUsingBamtoolsIndex,
+        IO_Opts);
+}
+
+// Frees the settings object allocated in the constructor.
+IndexTool::~IndexTool(void) {
+    delete m_settings;
+    m_settings = 0;   // leave no dangling pointer behind
+}
+
+// Delegates to Options::DisplayHelp() and returns 0.
+int IndexTool::Help(void) {
+    Options::DisplayHelp();
+    return 0;
+}
+
+// Parses the command line, opens the input BAM file and asks the reader to
+// create an index for it. The standard index is requested unless "-bti" was
+// given.
+//
+// Returns 0 on success, 1 if the input file could not be opened.
+// NOTE(review): relies on BamReader::Open() returning a bool success flag;
+// confirm against BamReader.h.
+int IndexTool::Run(int argc, char* argv[]) {
+
+    // parse command line arguments
+    Options::Parse(argc, argv, 1);
+
+    // open our BAM reader; there is nothing to index if this fails
+    BamReader reader;
+    if ( !reader.Open(m_settings->InputBamFilename) ) {
+        cerr << "bamtools index ERROR: could not open " << m_settings->InputBamFilename << " for reading... quitting." << endl;
+        return 1;
+    }
+
+    // create index for BAM file
+    bool useDefaultIndex = !m_settings->IsUsingBamtoolsIndex;
+    reader.CreateIndex(useDefaultIndex);
+
+    // clean & exit
+    reader.Close();
+    return 0;
+}
--- /dev/null
+// ***************************************************************************
+// bamtools_index.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 1 June 2010
+// ---------------------------------------------------------------------------
+// Creates a BAM index (".bai") file for the provided BAM file
+// ***************************************************************************
+
+#ifndef BAMTOOLS_INDEX_H
+#define BAMTOOLS_INDEX_H
+
+#include "bamtools_tool.h"
+
+namespace BamTools {
+
+// Tool that creates a BAM index (".bai") file for the provided BAM file.
+class IndexTool : public AbstractTool {
+
+    public:
+        IndexTool(void);
+        ~IndexTool(void);
+
+    public:
+        int Help(void);
+        int Run(int argc, char* argv[]);
+
+    // copying is disabled (declared private, intentionally not defined):
+    // m_settings is deleted in the destructor, so a compiler-generated copy
+    // would delete the same object twice
+    private:
+        IndexTool(const IndexTool&);
+        IndexTool& operator=(const IndexTool&);
+
+    // settings type is defined in the implementation file
+    private:
+        struct IndexSettings;
+        IndexSettings* m_settings;
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_INDEX_H
--- /dev/null
+// ***************************************************************************
+// bamtools_merge.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 2 June 2010
+// ---------------------------------------------------------------------------
+// Merges multiple BAM files into one.
+//
+// ** Provide selectable region? eg chr2:10000..20000
+//
+// ***************************************************************************
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "bamtools_merge.h"
+#include "bamtools_options.h"
+#include "bamtools_utilities.h"
+#include "BamMultiReader.h"
+#include "BamWriter.h"
+
+using namespace std;
+using namespace BamTools;
+
+// ---------------------------------------------
+// MergeSettings implementation
+
+// Command-line state for MergeTool; bound to the "-in" and "-out" options in
+// the MergeTool constructor.
+struct MergeTool::MergeSettings {
+
+    // option flags
+    bool HasInputBamFilename;
+    bool HasOutputBamFilename;
+//    bool HasRegion;
+
+    // input BAM filenames
+    vector<string> InputFiles;
+
+    // output BAM filename (region support is currently disabled)
+    string OutputFilename;
+//    string Region;
+
+    // flags start cleared; the output defaults to Options::StandardOut()
+    MergeSettings(void) :
+        HasInputBamFilename(false),
+        HasOutputBamFilename(false),
+//        HasRegion(false),
+        OutputFilename(Options::StandardOut())
+    { }
+};
+
+// ---------------------------------------------
+// MergeTool implementation
+
+// Creates the settings object and registers the program description and the
+// "-in" / "-out" options with Options.
+MergeTool::MergeTool(void)
+    : AbstractTool()
+    , m_settings(new MergeSettings)
+{
+    // program name, description & usage string
+    Options::SetProgramInfo(
+        "bamtools merge",
+        "merges multiple BAM files into one",
+        "[-in <filename> -in <filename> ...] [-out <filename>]");
+
+    // input/output option group
+    OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
+    Options::AddValueOption(
+        "-in", "BAM filename", "the input BAM file(s)", "",
+        m_settings->HasInputBamFilename,
+        m_settings->InputFiles,
+        IO_Opts);
+    Options::AddValueOption(
+        "-out", "BAM filename", "the output BAM file", "",
+        m_settings->HasOutputBamFilename,
+        m_settings->OutputFilename,
+        IO_Opts);
+
+    // region filtering is not enabled yet
+//    OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters");
+//    Options::AddValueOption("-region", "REGION", "genomic region. See README for more details", "", m_settings->HasRegion, m_settings->Region, FilterOpts);
+}
+
+// Frees the settings object allocated in the constructor.
+MergeTool::~MergeTool(void) {
+    delete m_settings;
+    m_settings = 0;   // leave no dangling pointer behind
+}
+
+// Delegates to Options::DisplayHelp() and returns 0.
+int MergeTool::Help(void) {
+    Options::DisplayHelp();
+    return 0;
+}
+
+// Parses the command line, opens all input BAM files through a
+// BamMultiReader and copies every alignment it yields into a single output
+// BAM file that carries the reader's header text and reference data.
+//
+// Returns 0 on success, 1 if the input file(s) could not be opened
+// (previously an open failure was ignored and an output file was still
+// written).
+int MergeTool::Run(int argc, char* argv[]) {
+
+    // parse command line arguments
+    Options::Parse(argc, argv, 1);
+
+    // set to default input if none provided
+    if ( !m_settings->HasInputBamFilename )
+        m_settings->InputFiles.push_back(Options::StandardIn());
+
+    // opens the BAM files without checking for indexes
+    BamMultiReader reader;
+    if ( !reader.Open(m_settings->InputFiles, false, true) ) {
+        cerr << "bamtools merge ERROR: could not open input BAM file(s)... quitting." << endl;
+        reader.Close();
+        return 1;
+    }
+
+    // retrieve header & reference dictionary info
+    std::string mergedHeader = reader.GetHeaderText();
+    RefVector references = reader.GetReferenceData();
+
+    // open BamWriter
+    BamWriter writer;
+    writer.Open(m_settings->OutputFilename, mergedHeader, references);
+
+    // store alignments to output file
+    BamAlignment bAlignment;
+    while ( reader.GetNextAlignmentCore(bAlignment) )
+        writer.SaveAlignment(bAlignment);
+
+    // clean & exit
+    reader.Close();
+    writer.Close();
+    return 0;
+}
--- /dev/null
+// ***************************************************************************
+// bamtools_merge.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 1 June 2010
+// ---------------------------------------------------------------------------
+// Merges multiple BAM files into one
+// ***************************************************************************
+
+#ifndef BAMTOOLS_MERGE_H
+#define BAMTOOLS_MERGE_H
+
+#include "bamtools_tool.h"
+
+namespace BamTools {
+
+// Tool that merges multiple BAM files into one.
+class MergeTool : public AbstractTool {
+
+    public:
+        MergeTool(void);
+        ~MergeTool(void);
+
+    public:
+        int Help(void);
+        int Run(int argc, char* argv[]);
+
+    // copying is disabled (declared private, intentionally not defined):
+    // m_settings is deleted in the destructor, so a compiler-generated copy
+    // would delete the same object twice
+    private:
+        MergeTool(const MergeTool&);
+        MergeTool& operator=(const MergeTool&);
+
+    // settings type is defined in the implementation file
+    private:
+        struct MergeSettings;
+        MergeSettings* m_settings;
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_MERGE_H
--- /dev/null
+// ***************************************************************************
+// bamtools_random.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 20 July 2010 (DB)
+// ---------------------------------------------------------------------------
+// Grab a random subset of alignments.
+// ***************************************************************************
+
+#include <ctime>
+#include <cstdlib>
+#include <iostream>
+#include <string>
+#include <vector>
+#include "bamtools_random.h"
+#include "bamtools_options.h"
+#include "bamtools_utilities.h"
+#include "BamMultiReader.h"
+#include "BamWriter.h"
+using namespace std;
+using namespace BamTools;
+
+namespace BamTools {
+ 
+ // define constants
+ // Initial value of RandomSettings::AlignmentCount; also passed as the
+ // default value when the "-n" option is registered.
+ const unsigned int RANDOM_MAX_ALIGNMENT_COUNT = 10000;
+ 
+} // namespace BamTools
+
+// ---------------------------------------------
+// RandomSettings implementation
+
+// Command-line state for RandomTool; bound to the "-in", "-out", "-n" and
+// "-region" options in the RandomTool constructor.
+struct RandomTool::RandomSettings {
+
+    // flags
+    bool HasAlignmentCount;
+    bool HasInput;
+    bool HasOutput;
+    bool HasRegion;
+
+    // parameters
+    unsigned int AlignmentCount;   // number of alignments to grab
+    vector<string> InputFiles;     // input BAM filenames
+    string OutputFilename;         // output BAM filename
+    string Region;                 // REGION string as given by the user
+
+    // constructor
+    //
+    // OutputFilename now starts out as Options::StandardOut(), matching the
+    // other tools' settings (MergeSettings, SortSettings); it used to be left
+    // empty when "-out" was not given.
+    RandomSettings(void)
+        : HasAlignmentCount(false)
+        , HasInput(false)
+        , HasOutput(false)
+        , HasRegion(false)
+        , AlignmentCount(RANDOM_MAX_ALIGNMENT_COUNT)
+        , OutputFilename(Options::StandardOut())
+    { }
+};
+
+// ---------------------------------------------
+// RandomTool implementation
+
+// Creates the settings object and registers the program description plus the
+// input/output and filter options with Options.
+RandomTool::RandomTool(void)
+    : AbstractTool()
+    , m_settings(new RandomSettings)
+{
+    // program name, description & usage string
+    Options::SetProgramInfo(
+        "bamtools random",
+        "grab a random subset of alignments",
+        "[-in <filename> -in <filename> ...] [-out <filename>] [-region <REGION>]");
+
+    // input/output option group
+    OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
+    Options::AddValueOption(
+        "-in", "BAM filename", "the input BAM file", "",
+        m_settings->HasInput,
+        m_settings->InputFiles,
+        IO_Opts,
+        Options::StandardIn());
+    Options::AddValueOption(
+        "-out", "BAM filename", "the output BAM file", "",
+        m_settings->HasOutput,
+        m_settings->OutputFilename,
+        IO_Opts,
+        Options::StandardOut());
+
+    // filter option group
+    OptionGroup* FilterOpts = Options::CreateOptionGroup("Filters");
+    Options::AddValueOption(
+        "-n", "count", "number of alignments to grab. Note - no duplicate checking is performed (currently)", "",
+        m_settings->HasAlignmentCount,
+        m_settings->AlignmentCount,
+        FilterOpts,
+        RANDOM_MAX_ALIGNMENT_COUNT);
+    Options::AddValueOption(
+        "-region", "REGION", "limit source of random alignment subset to a particular genomic region. Index file is recommended for better performance, and is read automatically if it exists as <filename>.bai or <filename>.bti. See \'bamtools help index\' for more details on creating one", "",
+        m_settings->HasRegion,
+        m_settings->Region,
+        FilterOpts);
+}
+
+// Frees the settings object allocated in the constructor.
+RandomTool::~RandomTool(void) {
+    delete m_settings;
+    m_settings = 0;   // leave no dangling pointer behind
+}
+
+// Delegates to Options::DisplayHelp() and returns 0.
+int RandomTool::Help(void) {
+    Options::DisplayHelp();
+    return 0;
+}
+
+// Returns a pseudo-random integer in the closed interval [lowest, highest],
+// drawn with rand(); the caller seeds the generator.
+static int RandomIntInRange(const int lowest, const int highest) {
+    const int range = (highest - lowest) + 1;
+    return lowest + (int)(range * (double)(rand()/((double)RAND_MAX + 1)));
+}
+
+// Parses the command line, then repeatedly jumps to a random reference &
+// position (optionally constrained to the user's REGION) and saves the first
+// alignment found at or after that position, until AlignmentCount alignments
+// have been written.
+//
+// Returns 0 on success. Returns 1 if the input could not be opened, no
+// reference data is available, the REGION string could not be parsed, or no
+// alignment could be found after many consecutive attempts (which previously
+// made this loop spin forever, e.g. on input without alignments).
+int RandomTool::Run(int argc, char* argv[]) {
+
+    // TODO: Handle BAM input WITHOUT index files.
+
+    // give up after this many attempts in a row that saved nothing
+    const unsigned int maxConsecutiveMisses = 10000;
+
+    // parse command line arguments
+    Options::Parse(argc, argv, 1);
+
+    // set to default input if none provided
+    if ( !m_settings->HasInput )
+        m_settings->InputFiles.push_back(Options::StandardIn());
+
+    // open our BAM reader
+    BamMultiReader reader;
+    if ( !reader.Open(m_settings->InputFiles) ) {
+        cerr << "bamtools random ERROR: could not open input BAM file(s)... quitting." << endl;
+        reader.Close();
+        return 1;
+    }
+    string headerText = reader.GetHeaderText();
+    RefVector references = reader.GetReferenceData();
+
+    // check that reference data is available, used for generating random jumps
+    if ( references.empty() ) {
+        cerr << "No reference data available... quitting." << endl;
+        reader.Close();
+        return 1;
+    }
+
+    // see if user specified a REGION; an unparsable REGION is an error, since
+    // the region bounds are used below to generate the random jumps
+    BamRegion region;
+    if ( m_settings->HasRegion ) {
+        if ( !Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
+            cerr << "bamtools random ERROR: could not parse REGION: " << m_settings->Region << endl;
+            reader.Close();
+            return 1;
+        }
+        reader.SetRegion(region);
+    }
+
+    // open our BAM writer
+    BamWriter writer;
+    writer.Open(m_settings->OutputFilename, headerText, references);
+
+    // seed our random number generator
+    srand( (unsigned int)time(NULL) );
+
+    // grab random alignments
+    BamAlignment al;
+    unsigned int savedCount = 0;
+    unsigned int consecutiveMisses = 0;
+    while ( savedCount < m_settings->AlignmentCount ) {
+
+        int randomRefId = 0;
+        int randomPosition = 0;
+
+        // use REGION constraints to generate random refId & position
+        if ( m_settings->HasRegion ) {
+
+            randomRefId = RandomIntInRange(region.LeftRefID, region.RightRefID);
+
+            // the region's position bounds only apply on its first/last reference
+            int lowestPosition = ( (randomRefId == region.LeftRefID) ? region.LeftPosition : 0 );
+            int highestPosition = ( (randomRefId == region.RightRefID) ? region.RightPosition : references.at(randomRefId).RefLength - 1 );
+            randomPosition = RandomIntInRange(lowestPosition, highestPosition);
+        }
+
+        // otherwise generate 'normal' random refId & position
+        else {
+            randomRefId = RandomIntInRange(0, references.size() - 1);
+            randomPosition = RandomIntInRange(0, references.at(randomRefId).RefLength - 1);
+        }
+
+        // if jump & read successful, save alignment
+        bool saved = false;
+        if ( reader.Jump(randomRefId, randomPosition) ) {
+            while ( reader.GetNextAlignmentCore(al) ) {
+                if ( al.RefID == randomRefId && al.Position >= randomPosition ) {
+                    writer.SaveAlignment(al);
+                    saved = true;
+                    break;
+                }
+            }
+        }
+
+        // track progress so that unreachable targets cannot loop forever
+        if ( saved ) {
+            ++savedCount;
+            consecutiveMisses = 0;
+        }
+        else if ( ++consecutiveMisses >= maxConsecutiveMisses ) {
+            cerr << "bamtools random ERROR: unable to find more alignments after "
+                 << maxConsecutiveMisses << " consecutive attempts (saved "
+                 << savedCount << " of " << m_settings->AlignmentCount << ")... quitting." << endl;
+            reader.Close();
+            writer.Close();
+            return 1;
+        }
+    }
+
+    // close reader & writer
+    reader.Close();
+    writer.Close();
+    return 0;
+}
\ No newline at end of file
--- /dev/null
+// ***************************************************************************
+// bamtools_random.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 20 July 2010 (DB)
+// ---------------------------------------------------------------------------
+// Grab a random subset of alignments.
+// ***************************************************************************
+
+#ifndef BAMTOOLS_RANDOM_H
+#define BAMTOOLS_RANDOM_H
+
+#include "bamtools_tool.h"
+
+namespace BamTools {
+
+// Tool that grabs a random subset of alignments.
+class RandomTool : public AbstractTool {
+
+    public:
+        RandomTool(void);
+        ~RandomTool(void);
+
+    public:
+        int Help(void);
+        int Run(int argc, char* argv[]);
+
+    // copying is disabled (declared private, intentionally not defined):
+    // m_settings is deleted in the destructor, so a compiler-generated copy
+    // would delete the same object twice
+    private:
+        RandomTool(const RandomTool&);
+        RandomTool& operator=(const RandomTool&);
+
+    // settings type is defined in the implementation file
+    private:
+        struct RandomSettings;
+        RandomSettings* m_settings;
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_RANDOM_H
--- /dev/null
+// ***************************************************************************
+// bamtools_sort.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 21 June 2010 (DB)
+// ---------------------------------------------------------------------------
+// Sorts an input BAM file (default by position) and stores in a new BAM file.
+// ***************************************************************************
+
+#include <cstdio>
+#include <algorithm>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "bamtools_sort.h"
+#include "bamtools_options.h"
+#include "BamReader.h"
+#include "BamMultiReader.h"
+#include "BamWriter.h"
+
+using namespace std;
+using namespace BamTools;
+
+namespace BamTools {
+
+    // defaults
+    //
+    // ** These defaults should be tweaked & 'optimized' per testing ** //
+    //    I say 'optimized' because each system will naturally perform
+    //    differently.  We will attempt to determine a sensible
+    //    compromise that should perform well on average.
+    const unsigned int SORT_DEFAULT_MAX_BUFFER_COUNT  = 10000;  // max numberOfAlignments for buffer
+    const unsigned int SORT_DEFAULT_MAX_BUFFER_MEMORY = 1024;   // Mb
+
+    // -----------------------------------
+    // comparison objects (for sorting)
+    //
+    // operator() is const in both functors: they hold no state, and a const
+    // call operator lets them be used through const references/objects too.
+
+    // orders alignments by reference ID first, then by position
+    struct SortLessThanPosition {
+        bool operator() (const BamAlignment& lhs, const BamAlignment& rhs) const {
+            if ( lhs.RefID != rhs.RefID )
+                return lhs.RefID < rhs.RefID;
+            else
+                return lhs.Position < rhs.Position;
+        }
+    };
+
+    // orders alignments by their Name field
+    struct SortLessThanName {
+        bool operator() (const BamAlignment& lhs, const BamAlignment& rhs) const {
+            return lhs.Name < rhs.Name;
+        }
+    };
+
+} // namespace BamTools
+
+// ---------------------------------------------
+// SortToolPrivate declaration
+// Implementation object behind SortTool: splits the input into sorted temp
+// files ("runs") and merges them into the final output.
+//
+// Declared with 'struct' to match the forward declaration in bamtools_sort.h
+// ('struct SortToolPrivate;'); it was previously defined with 'class', a
+// class-key mismatch that compilers warn about. Every section below has an
+// explicit access specifier, so member access is unchanged.
+struct SortTool::SortToolPrivate {
+
+    // ctor & dtor
+    public:
+        SortToolPrivate(SortTool::SortSettings* settings);
+        ~SortToolPrivate(void);
+
+    // 'public' interface
+    public:
+        bool Run(void);
+
+    // internal methods
+    private:
+        void ClearBuffer(vector<BamAlignment>& buffer);
+        bool GenerateSortedRuns(void);
+        bool HandleBufferContents(vector<BamAlignment>& buffer);
+        bool MergeSortedRuns(void);
+        bool WriteTempFile(const vector<BamAlignment>& buffer, const string& tempFilename);
+        void SortBuffer(vector<BamAlignment>& buffer);
+
+    // data members
+    private:
+        SortTool::SortSettings* m_settings;   // not deleted here; SortTool deletes it
+        string m_tempFilenameStub;            // prefix of every temp filename
+        int m_numberOfRuns;                   // number of temp files written so far
+        string m_headerText;                  // header text read from the input file
+        RefVector m_references;               // reference data read from the input file
+        vector<string> m_tempFilenames;       // temp files to merge (and delete) later
+};
+
+// ---------------------------------------------
+// SortSettings implementation
+
+// Command-line state for SortTool; bound to the tool's options in the
+// SortTool constructor.
+struct SortTool::SortSettings {
+
+    // option flags
+    bool HasInputBamFilename;
+    bool HasMaxBufferCount;
+    bool HasMaxBufferMemory;
+    bool HasOutputBamFilename;
+    bool IsSortingByName;
+
+    // input & output BAM filenames
+    string InputBamFilename;
+    string OutputBamFilename;
+
+    // buffer limits
+    unsigned int MaxBufferCount;
+    unsigned int MaxBufferMemory;
+
+    // flags start cleared; filenames default to Options::StandardIn() /
+    // Options::StandardOut() and the limits to the SORT_DEFAULT_* constants
+    SortSettings(void) :
+        HasInputBamFilename(false),
+        HasMaxBufferCount(false),
+        HasMaxBufferMemory(false),
+        HasOutputBamFilename(false),
+        IsSortingByName(false),
+        InputBamFilename(Options::StandardIn()),
+        OutputBamFilename(Options::StandardOut()),
+        MaxBufferCount(SORT_DEFAULT_MAX_BUFFER_COUNT),
+        MaxBufferMemory(SORT_DEFAULT_MAX_BUFFER_MEMORY)
+    { }
+};
+
+// ---------------------------------------------
+// SortTool implementation
+
+// Creates the settings object (the implementation object is created later,
+// in Run) and registers the program description and options with Options.
+SortTool::SortTool(void)
+    : AbstractTool()
+    , m_settings(new SortSettings)
+    , m_impl(0)
+{
+    // program name, description & usage string
+    Options::SetProgramInfo(
+        "bamtools sort",
+        "sorts a BAM file",
+        "[-in <filename>] [-out <filename>]");
+
+    // input/output option group
+    OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
+    Options::AddValueOption(
+        "-in", "BAM filename", "the input BAM file", "",
+        m_settings->HasInputBamFilename,
+        m_settings->InputBamFilename,
+        IO_Opts,
+        Options::StandardIn());
+    Options::AddValueOption(
+        "-out", "BAM filename", "the output BAM file", "",
+        m_settings->HasOutputBamFilename,
+        m_settings->OutputBamFilename,
+        IO_Opts,
+        Options::StandardOut());
+
+    // sorting method option group
+    OptionGroup* SortOpts = Options::CreateOptionGroup("Sorting Methods");
+    Options::AddOption(
+        "-byname", "sort by alignment name",
+        m_settings->IsSortingByName,
+        SortOpts);
+
+    // memory settings option group
+    OptionGroup* MemOpts = Options::CreateOptionGroup("Memory Settings");
+    Options::AddValueOption(
+        "-n", "count", "max number of alignments per tempfile", "",
+        m_settings->HasMaxBufferCount,
+        m_settings->MaxBufferCount,
+        MemOpts,
+        SORT_DEFAULT_MAX_BUFFER_COUNT);
+    Options::AddValueOption(
+        "-mem", "Mb", "max memory to use", "",
+        m_settings->HasMaxBufferMemory,
+        m_settings->MaxBufferMemory,
+        MemOpts,
+        SORT_DEFAULT_MAX_BUFFER_MEMORY);
+}
+
+// Frees the settings object and the implementation object (if one was created).
+SortTool::~SortTool(void) {
+
+    // settings first, then the implementation, as before
+    delete m_settings;
+    m_settings = 0;
+
+    delete m_impl;
+    m_impl = 0;
+}
+
+// Delegates to Options::DisplayHelp() and returns 0.
+int SortTool::Help(void) {
+    Options::DisplayHelp();
+    return 0;
+}
+
+// Parses the command line and runs the internal sort implementation.
+// Returns 0 if the implementation reports success, 1 otherwise.
+int SortTool::Run(int argc, char* argv[]) {
+
+    // parse command line arguments
+    Options::Parse(argc, argv, 1);
+
+    // release any implementation object left over from an earlier call, so
+    // that calling Run() more than once does not leak (deleting 0 is a no-op)
+    delete m_impl;
+
+    // run internal SortTool implementation, return success/fail
+    m_impl = new SortToolPrivate(m_settings);
+
+    if ( m_impl->Run() ) return 0;
+    else return 1;
+}
+
+// ---------------------------------------------
+// SortToolPrivate implementation
+
+// constructor
+// Stores the settings pointer and derives the temp filename stub from the
+// input filename, so that sort runs on different inputs don't trip on each
+// other's temp files.
+//
+// The stub is the input filename up to its LAST ".bam" (rfind, so that a
+// ".bam" appearing earlier in the path is not mistaken for the extension),
+// followed by ".sort.temp.". If the filename contains no ".bam" at all, the
+// whole filename is used as the prefix instead of leaving it empty.
+SortTool::SortToolPrivate::SortToolPrivate(SortTool::SortSettings* settings)
+    : m_settings(settings)
+    , m_numberOfRuns(0)
+{
+    if ( m_settings ) {
+        const string& inputFilename = m_settings->InputBamFilename;
+        const size_t extensionFound = inputFilename.rfind(".bam");
+        if ( extensionFound != string::npos )
+            m_tempFilenameStub = inputFilename.substr(0, extensionFound);
+        else
+            m_tempFilenameStub = inputFilename;
+        m_tempFilenameStub.append(".sort.temp.");
+    }
+}
+
+// destructor
+SortTool::SortToolPrivate::~SortToolPrivate(void) {
+    // nothing to release here: this destructor deletes none of the members
+}
+
+// generates multiple sorted temp BAM files from single unsorted BAM file
+//
+// Reads the input in chunks of at most MaxBufferCount alignments; each chunk
+// is sorted and written to its own temp file by HandleBufferContents().
+// Returns false if the input could not be opened or a chunk could not be
+// handled (both were previously ignored), true otherwise.
+// NOTE(review): relies on BamReader::Open() returning a bool success flag;
+// confirm against BamReader.h.
+bool SortTool::SortToolPrivate::GenerateSortedRuns(void) {
+
+    // open input BAM file
+    BamReader inputReader;
+    if ( !inputReader.Open(m_settings->InputBamFilename) ) {
+        cerr << "bamtools sort ERROR: could not open " << m_settings->InputBamFilename << " for reading... quitting." << endl;
+        return false;
+    }
+
+    // get basic data that will be shared by all temp/output files
+    m_headerText = inputReader.GetHeaderText();
+    m_references = inputReader.GetReferenceData();
+
+    // set up alignments buffer
+    vector<BamAlignment> buffer;
+    buffer.reserve(m_settings->MaxBufferCount);
+
+    // while data available
+    bool allRunsWritten = true;
+    BamAlignment al;
+    while ( allRunsWritten && inputReader.GetNextAlignmentCore(al) ) {
+
+        // store alignments in buffer
+        buffer.push_back(al);
+
+        // if buffer is full, handle contents (sort & write to temp file)
+        if ( buffer.size() == m_settings->MaxBufferCount )
+            allRunsWritten = HandleBufferContents(buffer);
+    }
+
+    // handle any remaining buffer contents
+    if ( allRunsWritten && buffer.size() > 0 )
+        allRunsWritten = HandleBufferContents(buffer);
+
+    // close reader & report whether every run was written
+    inputReader.Close();
+    return allRunsWritten;
+}
+
+// Sorts the buffered alignments, writes them to the next temp file, records
+// that file for the merge step and empties the buffer.
+// Returns the success/fail result of writing the temp file.
+bool SortTool::SortToolPrivate::HandleBufferContents(vector<BamAlignment>& buffer ) {
+
+    SortBuffer(buffer);
+
+    // temp filename = stub + index of this run
+    stringstream nameBuilder;
+    nameBuilder << m_tempFilenameStub << m_numberOfRuns;
+    const string runFilename = nameBuilder.str();
+
+    const bool written = WriteTempFile( buffer, runFilename );
+
+    // remember the file for merging later, whether or not the write succeeded
+    m_tempFilenames.push_back(runFilename);
+
+    // reset for the next run
+    buffer.clear();
+    ++m_numberOfRuns;
+
+    return written;
+}
+
+// merges sorted temp BAM files into single sorted output BAM file
+//
+// Every alignment yielded by a BamMultiReader over the temp files is copied
+// to the output file; the temp files are deleted afterwards in all cases.
+// Returns false (without writing any output) if there were temp files but
+// they could not be opened, which was previously ignored; true otherwise.
+// NOTE(review): the ordering of the merged output is whatever
+// BamMultiReader yields; confirm that this is correct when sorting by name.
+bool SortTool::SortToolPrivate::MergeSortedRuns(void) {
+
+    // open up multi reader for all of our temp files
+    // this might get broken up if we do a multi-pass system later ??
+    BamMultiReader multiReader;
+    const bool readersOpened = multiReader.Open(m_tempFilenames, false, true);
+
+    // only treat a failed open as an error when there was something to open;
+    // with no temp files (no input alignments) proceed exactly as before
+    const bool mergePossible = ( readersOpened || m_tempFilenames.empty() );
+
+    if ( mergePossible ) {
+
+        // open writer for our completely sorted output BAM file
+        BamWriter mergedWriter;
+        mergedWriter.Open(m_settings->OutputBamFilename, m_headerText, m_references);
+
+        // while data available in temp files
+        BamAlignment al;
+        while ( multiReader.GetNextAlignmentCore(al) )
+            mergedWriter.SaveAlignment(al);
+
+        mergedWriter.Close();
+    }
+    else
+        cerr << "bamtools sort ERROR: could not open temp files for merging... quitting." << endl;
+
+    // close readers
+    multiReader.Close();
+
+    // delete all temp files
+    vector<string>::const_iterator tempIter = m_tempFilenames.begin();
+    vector<string>::const_iterator tempEnd  = m_tempFilenames.end();
+    for ( ; tempIter != tempEnd; ++tempIter ) {
+        const string& tempFilename = (*tempIter);
+        remove(tempFilename.c_str());
+    }
+
+    return mergePossible;
+}
+
+// Single-pass sort: chunk the input file into smaller sorted temp files, then
+// merge them into the output. The merge step only runs if the chunking step
+// reported success; returns true only if both steps succeed.
+bool SortTool::SortToolPrivate::Run(void) {
+
+    // && short-circuits, so MergeSortedRuns() is skipped after a failure
+    return GenerateSortedRuns() && MergeSortedRuns();
+}
+
+// Sorts the buffer in place: by alignment name when IsSortingByName is set,
+// otherwise by position.
+void SortTool::SortToolPrivate::SortBuffer(vector<BamAlignment>& buffer) {
+
+    // ** add further custom sort options later ?? **
+
+    // default method: by position
+    if ( !m_settings->IsSortingByName ) {
+        sort( buffer.begin(), buffer.end(), SortLessThanPosition() );
+        return;
+    }
+
+    // requested method: by name
+    sort( buffer.begin(), buffer.end(), SortLessThanName() );
+}
+
+
+// Writes every alignment in 'buffer', in order, to a new BAM file named
+// 'tempFilename' that carries the stored header text and reference data.
+// Always returns true.
+bool SortTool::SortToolPrivate::WriteTempFile(const vector<BamAlignment>& buffer, const string& tempFilename) {
+
+    // open temp file for writing
+    BamWriter tempWriter;
+    tempWriter.Open(tempFilename, m_headerText, m_references);
+
+    // write data
+    const size_t alignmentCount = buffer.size();
+    for ( size_t index = 0; index < alignmentCount; ++index )
+        tempWriter.SaveAlignment( buffer[index] );
+
+    // close temp file & return success
+    tempWriter.Close();
+    return true;
+}
\ No newline at end of file
--- /dev/null
+// ***************************************************************************
+// bamtools_sort.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 21 June 2010 (DB)
+// ---------------------------------------------------------------------------
+// Sorts a BAM file.
+// ***************************************************************************
+
+#ifndef BAMTOOLS_SORT_H
+#define BAMTOOLS_SORT_H
+
+#include "bamtools_tool.h"
+
+namespace BamTools {
+
+// Tool that sorts a BAM file.
+class SortTool : public AbstractTool {
+
+    public:
+        SortTool(void);
+        ~SortTool(void);
+
+    public:
+        int Help(void);
+        int Run(int argc, char* argv[]);
+
+    // copying is disabled (declared private, intentionally not defined):
+    // m_settings and m_impl are deleted in the destructor, so a
+    // compiler-generated copy would delete the same objects twice
+    private:
+        SortTool(const SortTool&);
+        SortTool& operator=(const SortTool&);
+
+    // settings & implementation types are defined in the implementation file
+    private:
+        struct SortSettings;
+        SortSettings* m_settings;
+
+        struct SortToolPrivate;
+        SortToolPrivate* m_impl;
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_SORT_H
--- /dev/null
+// ***************************************************************************
+// bamtools_stats.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 22 July 2010
+// ---------------------------------------------------------------------------
+// Prints general alignment statistics for BAM file(s).
+// ***************************************************************************
+
+#include <cmath>
+#include <algorithm>
+#include <functional>
+#include <iostream>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#include "bamtools_stats.h"
+#include "bamtools_options.h"
+#include "BamMultiReader.h"
+using namespace std;
+using namespace BamTools;
+
+// ---------------------------------------------
+// StatsSettings implementation
+
+// Command-line state for StatsTool.
+struct StatsTool::StatsSettings {
+
+    // option flags
+    bool HasInput;
+    bool IsShowingInsertSizeSummary;
+
+    // input BAM filenames
+    vector<string> InputFiles;
+
+    // both flags start cleared, with no input files
+    StatsSettings(void) :
+        HasInput(false),
+        IsShowingInsertSizeSummary(false)
+    { }
+};
+
+// ---------------------------------------------
+// StatsToolPrivate implementation
+
+// Implementation object behind StatsTool: keeps the running counters that
+// ProcessAlignment() updates and PrintStats() reports.
+struct StatsTool::StatsToolPrivate {
+
+    // ctor, dtor & 'public' interface
+    public:
+        StatsToolPrivate(StatsTool::StatsSettings* _settings);
+        ~StatsToolPrivate(void);
+
+        bool Run(void);
+
+    // internal methods
+    private:
+        bool CalculateMedian(vector<int>& data, double& median);
+        void PrintStats(void);
+        void ProcessAlignment(const BamAlignment& al);
+
+    // data members
+    private:
+        // tool settings (pointer passed in at construction)
+        StatsTool::StatsSettings* settings;
+
+        // alignment counters, all starting at zero
+        unsigned int numReads;
+        unsigned int numPaired;
+        unsigned int numProperPair;
+        unsigned int numMapped;
+        unsigned int numBothMatesMapped;
+        unsigned int numForwardStrand;
+        unsigned int numReverseStrand;
+        unsigned int numFirstMate;
+        unsigned int numSecondMate;
+        unsigned int numSingletons;
+        unsigned int numFailedQC;
+        unsigned int numDuplicates;
+
+        // insert sizes feeding the average/median summary in PrintStats()
+        vector<int> insertSizes;
+};
+
+// Stores the settings pointer, zeroes every counter and pre-allocates room
+// for 100000 insert sizes.
+StatsTool::StatsToolPrivate::StatsToolPrivate(StatsTool::StatsSettings* _settings) :
+    settings(_settings),
+    numReads(0),
+    numPaired(0),
+    numProperPair(0),
+    numMapped(0),
+    numBothMatesMapped(0),
+    numForwardStrand(0),
+    numReverseStrand(0),
+    numFirstMate(0),
+    numSecondMate(0),
+    numSingletons(0),
+    numFailedQC(0),
+    numDuplicates(0)
+{
+    // reserve up front so that collecting insert sizes does not reallocate early
+    insertSizes.reserve(100000);
+}
+
+StatsTool::StatsToolPrivate::~StatsToolPrivate(void) {
+    // nothing to release here: this destructor deletes none of the members
+}
+
+// Computes the median of 'data' and stores it in 'median'.
+//
+// 'median' is a double because, for an even number of elements, the result
+// is the average of the two middle elements. Returns false (leaving 'median'
+// untouched) when 'data' is empty, true otherwise.
+// Note: the elements of 'data' are reordered by this call.
+bool StatsTool::StatsToolPrivate::CalculateMedian(vector<int>& data, double& median) {
+
+    // check that data exists
+    if ( data.empty() ) return false;
+
+    const size_t dataSize = data.size();
+
+    // place the upper-middle element at its sorted position
+    vector<int>::iterator target = data.begin() + (dataSize / 2);
+    nth_element(data.begin(), target, data.end());
+
+    // odd number of elements: the middle element is the median
+    if ( (dataSize % 2) != 0 ) {
+        median = (double)(*target);
+        return true;
+    }
+
+    // even number of elements: after nth_element every element before 'target'
+    // is <= *target, so the lower-middle element is simply the largest of
+    // those; a second selection over the whole range is unnecessary.
+    // [begin, target) is non-empty here since dataSize >= 2.
+    const double upperMiddle = (double)(*target);
+    const double lowerMiddle = (double)(*max_element(data.begin(), target));
+    median = (upperMiddle + lowerMiddle) / 2.0;
+    return true;
+}
+
+// Returns 'count' as a percentage of 'total', or 0 when 'total' is zero (so
+// that input without any reads no longer prints the result of 0/0).
+static float StatsPercentage(const unsigned int count, const unsigned int total) {
+    if ( total == 0 ) return 0.0f;
+    return ((float)count/total)*100;
+}
+
+// print BAM file alignment stats
+//
+// Writes the collected counters to stdout, each with its percentage of the
+// total reads. The paired-end details (percentages relative to the number of
+// paired reads) are shown only when paired reads were seen, and the insert
+// size summary only when IsShowingInsertSizeSummary is set.
+void StatsTool::StatsToolPrivate::PrintStats(void) {
+
+    cout << endl;
+    cout << "**********************************************" << endl;
+    cout << "Stats for BAM file(s): " << endl;
+    cout << "**********************************************" << endl;
+    cout << endl;
+    cout << "Total reads: " << numReads << endl;
+    cout << "Mapped reads: " << numMapped << "\t(" << StatsPercentage(numMapped, numReads) << "%)" << endl;
+    cout << "Forward strand: " << numForwardStrand << "\t(" << StatsPercentage(numForwardStrand, numReads) << "%)" << endl;
+    cout << "Reverse strand: " << numReverseStrand << "\t(" << StatsPercentage(numReverseStrand, numReads) << "%)" << endl;
+    cout << "Failed QC: " << numFailedQC << "\t(" << StatsPercentage(numFailedQC, numReads) << "%)" << endl;
+    cout << "Duplicates: " << numDuplicates << "\t(" << StatsPercentage(numDuplicates, numReads) << "%)" << endl;
+    cout << "Paired-end reads: " << numPaired << "\t(" << StatsPercentage(numPaired, numReads) << "%)" << endl;
+
+    if ( numPaired != 0 ) {
+        cout << "'Proper-pairs': " << numProperPair << "\t(" << StatsPercentage(numProperPair, numPaired) << "%)" << endl;
+        cout << "Both pairs mapped: " << numBothMatesMapped << "\t(" << StatsPercentage(numBothMatesMapped, numPaired) << "%)" << endl;
+        cout << "Read 1: " << numFirstMate << endl;
+        cout << "Read 2: " << numSecondMate << endl;
+        cout << "Singletons: " << numSingletons << "\t(" << StatsPercentage(numSingletons, numPaired) << "%)" << endl;
+    }
+
+    if ( settings->IsShowingInsertSizeSummary ) {
+
+        // the average is only printed when insert sizes were collected
+        double avgInsertSize = 0.0;
+        if ( !insertSizes.empty() ) {
+            avgInsertSize = ( accumulate(insertSizes.begin(), insertSizes.end(), 0.0) / (double)insertSizes.size() );
+            cout << "Average insert size (absolute value): " << avgInsertSize << endl;
+        }
+
+        // CalculateMedian() returns false for empty data, so nothing is printed then
+        double medianInsertSize = 0.0;
+        if ( CalculateMedian(insertSizes, medianInsertSize) )
+            cout << "Median insert size (absolute value): " << medianInsertSize << endl;
+    }
+    cout << endl;
+}
+
+// folds the flags of one alignment into the running BAM statistics
+void StatsTool::StatsToolPrivate::ProcessAlignment(const BamAlignment& al) {
+
+    // every alignment counts toward the total
+    ++numReads;
+
+    // flags that do not depend on pairing
+    if ( al.IsDuplicate() ) { ++numDuplicates; }
+    if ( al.IsFailedQC() )  { ++numFailedQC; }
+    if ( al.IsMapped() )    { ++numMapped; }
+
+    // strand orientation
+    if ( al.IsReverseStrand() ) { ++numReverseStrand; }
+    else                        { ++numForwardStrand; }
+
+    // everything below applies to paired-end alignments only
+    if ( !al.IsPaired() )
+        return;
+
+    ++numPaired;
+
+    // first mate / second mate counters
+    if ( al.IsFirstMate() )  { ++numFirstMate; }
+    if ( al.IsSecondMate() ) { ++numSecondMate; }
+
+    // a mapped read either has a mapped mate or is a singleton
+    if ( al.IsMapped() ) {
+        if ( al.IsMateMapped() ) { ++numBothMatesMapped; }
+        else                     { ++numSingletons; }
+    }
+
+    // explicit proper pair flag
+    if ( al.IsProperPair() ) { ++numProperPair; }
+
+    // record the absolute insert size, once per pair (first mate only),
+    // and only when the summary was requested and the size is non-zero
+    const bool wantInsertSize = settings->IsShowingInsertSizeSummary && al.IsFirstMate() && (al.InsertSize != 0);
+    if ( wantInsertSize ) {
+        insertSizes.push_back( abs(al.InsertSize) );
+    }
+}
+
+// Opens the input BAM file(s), accumulates statistics over every alignment and
+// prints the summary. Returns false if the input could not be opened.
+bool StatsTool::StatsToolPrivate::Run() {
+
+    // original author's note: opens the BAM files without checking for indexes
+    BamMultiReader reader;
+    const bool opened = reader.Open(settings->InputFiles, false, true);
+    if ( !opened ) {
+        cerr << "Could not open input BAM file(s)... quitting." << endl;
+        reader.Close();
+        return false;
+    }
+
+    // walk the whole input, updating the counters per alignment
+    BamAlignment al;
+    while ( reader.GetNextAlignmentCore(al) )
+        ProcessAlignment(al);
+
+    // report, then release the reader
+    PrintStats();
+    reader.Close();
+    return true;
+}
+
+// ---------------------------------------------
+// StatsTool implementation
+
+// allocates the settings object and registers the program description and
+// command-line options of 'bamtools stats'; the implementation object is
+// created later, in Run()
+StatsTool::StatsTool(void)
+    : AbstractTool()
+    , m_settings(new StatsSettings)
+    , m_impl(0)
+{
+    // program details shown in the help menu
+    Options::SetProgramInfo("bamtools stats",
+                            "prints general alignment statistics",
+                            "[-in <filename> -in <filename> ... ]");
+
+    // input option (Options::StandardIn() is passed as its default value)
+    OptionGroup* ioGroup = Options::CreateOptionGroup("Input & Output");
+    Options::AddValueOption("-in", "BAM filename", "the input BAM file", "",
+                            m_settings->HasInput, m_settings->InputFiles,
+                            ioGroup, Options::StandardIn());
+
+    // optional extra statistics
+    OptionGroup* extraGroup = Options::CreateOptionGroup("Additional Stats");
+    Options::AddOption("-insert", "summarize insert size data",
+                       m_settings->IsShowingInsertSizeSummary, extraGroup);
+}
+
+// releases the implementation and settings objects owned by this tool
+StatsTool::~StatsTool(void) {
+
+    // settings first, then the implementation (same order as before);
+    // both pointers are zeroed after deletion
+    delete m_settings;
+    m_settings = 0;
+    delete m_impl;
+    m_impl = 0;
+}
+
+// shows the help menu for this tool; the 0 return value is only reached
+// if Options::DisplayHelp() returns
+int StatsTool::Help(void) {
+    const int status = 0;
+    Options::DisplayHelp();
+    return status;
+}
+
+// Parses the command line, falls back to stdin when no input was given, then
+// runs the stats implementation.
+// Returns 0 on success, 1 if the implementation reports failure.
+int StatsTool::Run(int argc, char* argv[]) {
+
+    // parse command line arguments
+    Options::Parse(argc, argv, 1);
+
+    // set to default input if none provided
+    if ( !m_settings->HasInput )
+        m_settings->InputFiles.push_back(Options::StandardIn());
+
+    // release any implementation left over from a previous call so that
+    // running the tool more than once does not leak (delete on 0 is a no-op)
+    delete m_impl;
+
+    // run internal StatsTool implementation, return success/fail
+    m_impl = new StatsToolPrivate(m_settings);
+    return ( m_impl->Run() ? 0 : 1 );
+}
--- /dev/null
+// ***************************************************************************
+// bamtools_stats.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 1 June 2010
+// ---------------------------------------------------------------------------
+// Prints general statistics for a single BAM file.
+//
+// ** Expand to multiple? **
+//
+// ***************************************************************************
+
+#ifndef BAMTOOLS_STATS_H
+#define BAMTOOLS_STATS_H
+
+#include "bamtools_tool.h"
+
+namespace BamTools {
+
+class StatsTool : public AbstractTool { // the 'bamtools stats' sub-tool: prints general alignment statistics for its input BAM file(s)
+
+ public:
+ StatsTool(void); // allocates the settings and registers the -in and -insert command-line options
+ ~StatsTool(void); // deletes the settings and implementation objects
+
+ public:
+ int Help(void); // displays the help menu
+ int Run(int argc, char* argv[]); // parses the arguments and runs the tool; returns 0 on success, 1 on failure
+
+ private:
+ struct StatsSettings; // command-line settings, defined in the .cpp
+ StatsSettings* m_settings;
+
+ struct StatsToolPrivate; // implementation, defined in the .cpp
+ StatsToolPrivate* m_impl; // stays 0 until Run() creates it
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_STATS_H
--- /dev/null
+// ***************************************************************************
+// bamtools_tool.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 2 June 2010
+// ---------------------------------------------------------------------------
+// Base class for all other BamTools sub-tools
+// All derived classes must provide Help() and Run() methods
+// ***************************************************************************
+
+#ifndef BAMTOOLS_ABSTRACTTOOL_H
+#define BAMTOOLS_ABSTRACTTOOL_H
+
+#include <string>
+
+namespace BamTools {
+
+class AbstractTool { // abstract interface implemented by every BamTools sub-tool
+
+ public:
+ AbstractTool(void) { }
+ virtual ~AbstractTool(void) { } // virtual so derived tools can be destroyed through a base pointer
+
+ public:
+ virtual int Help(void) =0; // derived classes must implement this; StatsTool::Help() displays its help menu and returns 0
+ virtual int Run(int argc, char* argv[]) =0; // derived classes must implement this; StatsTool::Run() returns 0 on success and 1 on failure
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_ABSTRACTTOOL_H
\ No newline at end of file
--- /dev/null
+API_DIR = ../api
+OBJ_DIR = ../../obj
+BIN_DIR = ../../bin
+
+INCLUDES = -I$(API_DIR)/
+
+# ----------------------------------
+# define our source and object files
+# ----------------------------------
+SOURCES = bamtools_fasta.cpp \
+          bamtools_options.cpp \
+          bamtools_pileup.cpp \
+          bamtools_utilities.cpp
+OBJECTS = $(SOURCES:.cpp=.o)
+BUILT_OBJECTS = $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
+
+.PHONY: all clean
+
+all: $(BUILT_OBJECTS)
+
+# Static pattern rule: each object depends only on its own source file, so
+# touching one .cpp no longer forces every object in the list to be rebuilt.
+$(BUILT_OBJECTS): $(OBJ_DIR)/%.o: %.cpp
+	@echo " * compiling" $<
+	@$(CXX) -c -o $@ $< $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
+
+# NOTE: removes everything in the shared object and binary directories,
+# not only the files produced by this Makefile.
+clean:
+	@echo "Cleaning up."
+	@rm -f $(OBJ_DIR)/* $(BIN_DIR)/*
--- /dev/null
+// ***************************************************************************
+// bamtools_fasta.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 13 July 2010
+// ---------------------------------------------------------------------------
+// Provides FASTA reading/indexing functionality.
+// ***************************************************************************
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include "bamtools_fasta.h"
+using namespace std;
+using namespace BamTools;
+
+struct Fasta::FastaPrivate { // private implementation behind the public Fasta class
+
+ struct FastaIndexData { // one index entry per FASTA record
+ string Name; // first whitespace-delimited token of the header line (without the leading '>')
+ int32_t Length; // length of the sequence with line endings removed
+ int64_t Offset; // file offset where the sequence data begins (right after the header line)
+ int32_t LineLength; // printable characters on a sequence line; CreateIndex() measures this on the first sequence line only
+ int32_t ByteLength; // LineLength + newline character(s) - varies on OS where file was generated
+ };
+
+ // data members
+ FILE* Stream; // FASTA file, opened for reading by Open()
+ bool IsOpen; // true while Stream is open
+
+ FILE* IndexStream; // index file, opened by Open() (reading) or CreateIndex() (writing)
+ bool HasIndex; // true once index data has been loaded or written successfully
+ bool IsIndexOpen; // true while IndexStream is open
+
+ vector<FastaIndexData> Index; // index entries in file order; refId is the position in this vector
+
+ // ctor & dtor
+ FastaPrivate(void);
+ ~FastaPrivate(void);
+
+ // 'public' API methods - each one backs the Fasta method of the same name
+ bool Close(void);
+ bool CreateIndex(const string& indexFilename);
+ bool GetBase(const int& refId, const int& position, char& base);
+ bool GetSequence(const int& refId, const int& start, const int& stop, string& sequence);
+ bool Open(const string& filename, const string& indexFilename);
+
+ // internal methods
+ private:
+ void Chomp(char* sequence); // strips trailing CR/LF characters in place
+ bool GetNameFromHeader(const string& header, string& name); // extracts the sequence name from a '>' header line
+ bool GetNextHeader(string& header); // reads one header line from Stream
+ bool GetNextSequence(string& sequence); // reads sequence lines up to the next header or end of file
+ bool LoadIndexData(void); // fills Index from IndexStream
+ bool Rewind(void); // seeks Stream back to the start of the file
+ bool WriteIndexData(void); // writes Index to IndexStream, one tab-separated line per entry
+};
+
+// ctor: starts with no open streams. The FILE pointers are nulled as well so
+// they never hold indeterminate values before Open()/CreateIndex() assign them.
+// (initializers are listed in member declaration order)
+Fasta::FastaPrivate::FastaPrivate(void)
+    : Stream(NULL)
+    , IsOpen(false)
+    , IndexStream(NULL)
+    , HasIndex(false)
+    , IsIndexOpen(false)
+{ }
+
+// dtor: closes whatever streams are still open; Close()'s result is not needed here
+Fasta::FastaPrivate::~FastaPrivate(void)
+{
+    (void)Close();
+}
+
+// Removes any trailing newline characters (LF and/or CR) from a C string, in place.
+// An empty string is left untouched.
+void Fasta::FastaPrivate::Chomp(char* sequence) {
+
+    static const int CHAR_LF = 10;
+    static const int CHAR_CR = 13;
+
+    // 'end' is the length of the text kept so far. It is tested before every
+    // access, so the scan stays inside the buffer even when the string consists
+    // of nothing but newline characters (an unsigned index can never be "< 0",
+    // so it must not be decremented past zero).
+    size_t end = strlen(sequence);
+    while ( (end > 0) &&
+            ( (sequence[end-1] == CHAR_LF) || (sequence[end-1] == CHAR_CR) ) )
+    {
+        sequence[--end] = 0;
+    }
+}
+
+// Closes the FASTA stream and the index stream, whichever are open.
+// Always returns true.
+bool Fasta::FastaPrivate::Close(void) {
+
+    // close fasta file
+    if ( IsOpen ) {
+        fclose(Stream);
+        Stream = NULL;
+        IsOpen = false;
+    }
+
+    // close index file whenever it is open - this includes the case where the
+    // file was opened but its data failed to load (HasIndex == false), which
+    // previously left the stream open
+    if ( IsIndexOpen ) {
+        fclose(IndexStream);
+        IndexStream = NULL;
+        IsIndexOpen = false;
+        HasIndex = false;
+    }
+
+    // return success
+    return true;
+}
+
+// Builds the in-memory index by scanning the whole (already open) FASTA file,
+// then writes it to 'indexFilename'.
+// Returns true only if the scan succeeded and the index file was written and
+// closed without error. On success HasIndex is set.
+// The line/byte lengths are measured on the first sequence line of the file and
+// stored unchanged in every entry.
+bool Fasta::FastaPrivate::CreateIndex(const string& indexFilename) {
+
+    // check that file is open
+    if ( !IsOpen ) {
+        cerr << "FASTA error : cannot create index, FASTA file not open" << endl;
+        return false;
+    }
+
+    // rewind FASTA file
+    if ( !Rewind() ) {
+        cerr << "FASTA error : could not rewind FASTA file" << endl;
+        return false;
+    }
+
+    // clear out prior index data
+    Index.clear();
+
+    // -------------------------------------------
+    // calculate lineLength & byteLength
+
+    int lineLength = 0;
+    int byteLength = 0;
+
+    // skip over header
+    char buffer[1024];
+    if ( fgets(buffer, 1024, Stream) == 0 ) {
+        cerr << "FASTA error : could not read from file" << endl;
+        return false;
+    }
+    if ( feof(Stream) ) return false;
+    if ( buffer[0] != '>' ) {
+        cerr << "FASTA error : expected header ('>'), instead : " << buffer[0] << endl;
+        return false;
+    }
+
+    // read in first line of sequence
+    // (fgetc() returns an int: keeping it as an int lets EOF be told apart from
+    //  data bytes on every platform and keeps the isgraph() argument valid)
+    int c = fgetc(Stream);
+    while ( (c != EOF) && (c != '\n') ) {
+        ++byteLength;
+        if ( isgraph(c) ) ++lineLength;
+        c = fgetc(Stream);
+    }
+    ++byteLength; // store newline
+
+    // rewind FASTA file
+    if ( !Rewind() ) {
+        cerr << "FASTA error : could not rewind FASTA file" << endl;
+        return false;
+    }
+
+    // iterate through fasta entries
+    string header = "";
+    string sequence = "";
+    while ( GetNextHeader(header) ) {
+
+        // ---------------------------
+        // build index entry data
+        FastaIndexData data;
+
+        // store file offset of beginning of DNA sequence (after header)
+        data.Offset = ftello(Stream);
+
+        // parse header, store sequence name in data.Name
+        if ( !GetNameFromHeader(header, data.Name) ) {
+            cerr << "FASTA error : could not parse read name from FASTA header" << endl;
+            return false;
+        }
+
+        // retrieve FASTA sequence
+        if ( !GetNextSequence(sequence) ) {
+            cerr << "FASTA error : could not read in next sequence from FASTA file" << endl;
+            return false;
+        }
+
+        // store sequence length & line/byte lengths
+        data.Length = sequence.length();
+        data.LineLength = lineLength;
+        data.ByteLength = byteLength;
+
+        // store index entry
+        Index.push_back(data);
+    }
+
+    // open index file
+    if ( !indexFilename.empty() ) {
+
+        // an index stream left open by an earlier call would otherwise be leaked
+        if ( IsIndexOpen ) {
+            fclose(IndexStream);
+            IsIndexOpen = false;
+        }
+
+        IndexStream = fopen(indexFilename.c_str(), "wb");
+        if ( !IndexStream ) {
+            cerr << "FASTA error : Could not open " << indexFilename << " for writing." << endl;
+            return false;
+        }
+        IsIndexOpen = true;
+    }
+
+    // write index data (fails if no index stream is open)
+    bool written = WriteIndexData();
+
+    // close index file on success AND on failure; buffered data is flushed by
+    // fclose(), so its result is part of the write status
+    if ( IsIndexOpen ) {
+        if ( fclose(IndexStream) != 0 ) written = false;
+        IndexStream = NULL;
+        IsIndexOpen = false;
+    }
+
+    if ( !written ) return false;
+    HasIndex = true;
+
+    // return success status
+    return true;
+}
+
+// Retrieves the single base at index 'position' of the sequence with id 'refId'
+// (ids follow file order, starting at 0) and stores it in 'base'.
+// Uses the index to seek directly when one is available, otherwise scans the
+// file from the start. Returns false - with a message on stderr - when the file
+// is not open, an argument is out of range, or the file cannot be read.
+bool Fasta::FastaPrivate::GetBase(const int& refId, const int& position, char& base) {
+
+    // make sure FASTA file is open
+    if ( !IsOpen ) {
+        cerr << "FASTA error : file not open for reading" << endl;
+        return false;
+    }
+
+    // a negative id can never match an entry (and would make the sequential
+    // scan below run without a sensible end)
+    if ( refId < 0 ) {
+        cerr << "FASTA error: invalid refId specified: " << refId << endl;
+        return false;
+    }
+
+    string sequence = "";
+
+    // use index if available
+    if ( HasIndex && !Index.empty() ) {
+
+        // validate reference id
+        if ( refId >= (int)Index.size() ) {
+            cerr << "FASTA error: invalid refId specified: " << refId << endl;
+            return false;
+        }
+
+        // retrieve reference index data
+        const FastaIndexData& referenceData = Index.at(refId);
+
+        // validate position (valid indexes are 0 .. Length-1)
+        if ( (position < 0) || (position >= referenceData.Length) ) {
+            cerr << "FASTA error: invalid position specified: " << position << endl;
+            return false;
+        }
+
+        // seek to beginning of sequence data
+        if ( fseeko(Stream, referenceData.Offset, SEEK_SET) != 0 ) {
+            cerr << "FASTA error : could not sek in file" << endl;
+            return false;
+        }
+
+        // retrieve sequence
+        if ( !GetNextSequence(sequence) ) {
+            cerr << "FASTA error : could not retrieve base from FASTA file" << endl;
+            return false;
+        }
+    }
+
+    // else plow through sequentially
+    else {
+
+        if ( position < 0 ) {
+            cerr << "FASTA error: invalid position specified: " << position << endl;
+            return false;
+        }
+
+        // rewind FASTA file
+        if ( !Rewind() ) {
+            cerr << "FASTA error : could not rewind FASTA file" << endl;
+            return false;
+        }
+
+        // read entries 0..refId; stop as soon as the file runs out instead of
+        // carrying on with stale data
+        string header = "";
+        for ( int currentId = 0; currentId <= refId; ++currentId ) {
+            if ( !GetNextHeader(header) || !GetNextSequence(sequence) ) {
+                cerr << "FASTA error : could not retrieve base from FASTA file" << endl;
+                return false;
+            }
+        }
+    }
+
+    // the sequence actually read decides what is in range (it may be shorter
+    // than a stale index claims); this keeps the lookup from throwing
+    if ( (size_t)position >= sequence.length() ) {
+        cerr << "FASTA error: invalid position specified: " << position << endl;
+        return false;
+    }
+
+    // set base & return success
+    base = sequence[position];
+    return true;
+}
+
+// Extracts the sequence name from a FASTA header line: the first run of
+// non-whitespace characters after the leading '>' (the first character of
+// 'header' is skipped without being inspected).
+// Returns false, with a message on stderr, when no name is present - this
+// includes an empty 'header', which is now rejected instead of throwing.
+bool Fasta::FastaPrivate::GetNameFromHeader(const string& header, string& name) {
+
+    // space, tab, line feed, carriage return
+    static const char* const WHITESPACE = " \t\n\r";
+
+    // skip the leading greater than sign and any whitespace that follows it
+    const string::size_type start = ( header.empty() ? string::npos
+                                                     : header.find_first_not_of(WHITESPACE, 1) );
+    if ( start == string::npos ) {
+        cerr << "FASTA error : could not parse read name from FASTA header" << endl;
+        return false;
+    }
+
+    // the name runs up to the next whitespace character (or the end of the header)
+    const string::size_type stop = header.find_first_of(WHITESPACE, start);
+    const string::size_type length = ( stop == string::npos ? string::npos : stop - start );
+
+    name = header.substr(start, length);
+    return true;
+}
+
+// Reads the next line from the FASTA stream into 'header' and checks that it is
+// a header line (starts with '>'). The line ending, if any, is kept.
+// Returns false if the stream is closed or at end of file, if nothing could be
+// read, or if the line is not a header.
+bool Fasta::FastaPrivate::GetNextHeader(string& header) {
+
+    // nothing to read from a closed or exhausted stream
+    if ( !IsOpen )       return false;
+    if ( feof(Stream) )  return false;
+
+    // fetch one line
+    char line[1024];
+    if ( fgets(line, 1024, Stream) == 0 ) {
+        cerr << "FASTA error : could not read from file" << endl;
+        return false;
+    }
+
+    // it has to be a FASTA header
+    const char firstChar = line[0];
+    if ( firstChar != '>' ) {
+        cerr << "FASTA error : expected header ('>'), instead : " << firstChar << endl;
+        return false;
+    }
+
+    // hand the line back to the caller
+    header = line;
+    return true;
+}
+
+// Reads sequence lines from the current stream position up to (not including)
+// the next header line or the end of the file, and stores them - with line
+// endings removed - in 'sequence'.
+// Returns false if the stream is closed, already at end of file, or a read
+// fails; 'sequence' is only modified on success.
+bool Fasta::FastaPrivate::GetNextSequence(string& sequence) {
+
+    // validate input stream
+    if ( !IsOpen || feof(Stream) )
+        return false;
+
+    // read in sequence
+    char buffer[1024];
+    string collected;
+    while ( true ) {
+
+        // peek at the next character. fgetc() returns an int: testing that int
+        // against EOF *before* pushing it back keeps end-of-file detection
+        // correct where plain char is unsigned, and stops a 0xFF data byte from
+        // being mistaken for EOF
+        const int next = fgetc(Stream);
+        if ( next == EOF ) {
+            if ( ferror(Stream) ) {
+                cerr << "FASTA error : could not read from file" << endl;
+                return false;
+            }
+            break; // end of file ends the sequence
+        }
+        ungetc(next, Stream);
+
+        // the next header ends the sequence
+        if ( next == '>' )
+            break;
+
+        if ( fgets(buffer, 1024, Stream) == 0 ) {
+            cerr << "FASTA error : could not read from file" << endl;
+            return false;
+        }
+
+        Chomp(buffer);
+        collected += buffer;
+    }
+
+    // import collected contents to sequence string
+    sequence = collected;
+
+    // return success
+    return true;
+}
+
+// Retrieves the sub-sequence [start, stop] (both inclusive, 0-based) of the
+// sequence with id 'refId' (ids follow file order, starting at 0) and stores it
+// in 'sequence'.
+// Uses the index to seek directly when one is available, otherwise scans the
+// file from the start. Returns false when the file is not open, an argument is
+// out of range, or the file cannot be read.
+// As before, 'stop' may equal the sequence length; the result is then simply
+// cut off at the end of the sequence.
+bool Fasta::FastaPrivate::GetSequence(const int& refId, const int& start, const int& stop, string& sequence) {
+
+    // make sure FASTA file is open
+    if ( !IsOpen ) {
+        cerr << "FASTA error : file not open for reading" << endl;
+        return false;
+    }
+
+    // a negative id can never match an entry (and would make the sequential
+    // scan below run without a sensible end)
+    if ( refId < 0 ) {
+        cerr << "FASTA error: invalid refId specified: " << refId << endl;
+        return false;
+    }
+
+    string fullSequence = "";
+
+    // use index if available
+    if ( HasIndex && !Index.empty() ) {
+
+        // validate reference id
+        if ( refId >= (int)Index.size() ) {
+            cerr << "FASTA error: invalid refId specified: " << refId << endl;
+            return false;
+        }
+
+        // retrieve reference index data
+        const FastaIndexData& referenceData = Index.at(refId);
+
+        // validate start/stop positions
+        if ( (start < 0) || (start > stop) || (stop > referenceData.Length) ) {
+            cerr << "FASTA error: invalid start/stop positions specified: " << start << ", " << stop << endl;
+            return false;
+        }
+
+        // seek to beginning of sequence data
+        if ( fseeko(Stream, referenceData.Offset, SEEK_SET) != 0 ) {
+            cerr << "FASTA error : could not sek in file" << endl;
+            return false;
+        }
+
+        // retrieve full sequence
+        if ( !GetNextSequence(fullSequence) ) {
+            cerr << "FASTA error : could not retrieve sequence from FASTA file" << endl;
+            return false;
+        }
+    }
+
+    // else plow through sequentially
+    else {
+
+        // same argument rules as the indexed path
+        if ( (start < 0) || (start > stop) ) {
+            cerr << "FASTA error: invalid start/stop positions specified: " << start << ", " << stop << endl;
+            return false;
+        }
+
+        // rewind FASTA file
+        if ( !Rewind() ) {
+            cerr << "FASTA error : could not rewind FASTA file" << endl;
+            return false;
+        }
+
+        // read entries 0..refId; stop as soon as the file runs out instead of
+        // carrying on with stale data
+        string header = "";
+        for ( int currentId = 0; currentId <= refId; ++currentId ) {
+            if ( !GetNextHeader(header) || !GetNextSequence(fullSequence) ) {
+                cerr << "FASTA error : could not retrieve sequence from FASTA file" << endl;
+                return false;
+            }
+        }
+
+        // the requested range has to end inside (or right at the end of) the sequence
+        if ( fullSequence.length() < (size_t)stop ) {
+            cerr << "FASTA error: invalid start/stop positions specified: " << start << ", " << stop << endl;
+            return false;
+        }
+    }
+
+    // the sequence actually read decides what is in range (it may be shorter
+    // than a stale index claims); this keeps substr() from throwing
+    if ( (size_t)start > fullSequence.length() ) {
+        cerr << "FASTA error: invalid start/stop positions specified: " << start << ", " << stop << endl;
+        return false;
+    }
+
+    // set sub-sequence & return success
+    const int seqLength = (stop - start) + 1;
+    sequence = fullSequence.substr(start, seqLength);
+    return true;
+}
+
+// Fills 'Index' from the open index stream. Each line holds five
+// whitespace-separated fields: name, length, offset, line length, byte length.
+// Reading stops at the first blank line or at end of file.
+// Returns false if no index stream is open, a read fails, or a line does not
+// contain all five fields (HasIndex is cleared in the latter two cases).
+bool Fasta::FastaPrivate::LoadIndexData(void) {
+
+    // skip if no index file available
+    if ( !IsIndexOpen ) return false;
+
+    // clear any prior index data
+    Index.clear();
+
+    char buffer[1024];
+    while ( true ) {
+
+        // peek at the next character (kept as an int so EOF is unambiguous)
+        const int c = fgetc(IndexStream);
+        if ( c == EOF ) {
+            if ( ferror(IndexStream) ) {
+                cerr << "FASTA LoadIndexData() error : could not read from index file" << endl;
+                HasIndex = false;
+                return false;
+            }
+            break; // end of file
+        }
+        if ( c == '\n' ) break; // blank line
+        ungetc(c, IndexStream);
+
+        // read line from index file
+        if ( fgets(buffer, 1024, IndexStream) == 0 ) {
+            cerr << "FASTA LoadIndexData() error : could not read from index file" << endl;
+            HasIndex = false;
+            return false;
+        }
+
+        // parse the line with a fresh stream, so an error state left by one
+        // line cannot silently break the parsing of the lines after it
+        istringstream fields(buffer);
+
+        // retrieve fasta index data from line; a line with missing or
+        // non-numeric fields is rejected rather than stored half-initialized
+        FastaIndexData data;
+        fields >> data.Name;
+        fields >> data.Length;
+        fields >> data.Offset;
+        fields >> data.LineLength;
+        fields >> data.ByteLength;
+        if ( fields.fail() ) {
+            cerr << "FASTA LoadIndexData() error : malformed line in index file" << endl;
+            HasIndex = false;
+            return false;
+        }
+
+        // store index entry
+        Index.push_back(data);
+    }
+
+    return true;
+}
+
+// Opens the FASTA file for reading and, when 'indexFilename' is not empty, the
+// index file as well, loading its data.
+// Returns false if either file cannot be opened or the index data cannot be
+// loaded. A FASTA stream opened before a later failure stays open.
+bool Fasta::FastaPrivate::Open(const string& filename, const string& indexFilename) {
+
+    // the FASTA file itself
+    Stream = fopen(filename.c_str(), "rb");
+    if ( Stream == 0 ) {
+        cerr << "FASTA error: Could not open " << filename << " for reading" << endl;
+        return false;
+    }
+    IsOpen = true;
+
+    // without an index filename there is nothing more to do
+    if ( indexFilename.empty() )
+        return true;
+
+    // the index file
+    IndexStream = fopen(indexFilename.c_str(), "rb");
+    if ( IndexStream == 0 ) {
+        cerr << "FASTA error : Could not open " << indexFilename << " for reading." << endl;
+        return false;
+    }
+    IsIndexOpen = true;
+
+    // overall success now depends solely on loading the index data
+    HasIndex = LoadIndexData();
+    return HasIndex;
+}
+
+// seeks the FASTA stream back to the start of the file;
+// returns false if the file is not open or the seek fails
+bool Fasta::FastaPrivate::Rewind(void) {
+    if ( IsOpen ) {
+        const int seekResult = fseeko(Stream, 0, SEEK_SET);
+        return ( seekResult == 0 );
+    }
+    return false;
+}
+
+// Writes every index entry to the open index stream as one tab-separated line:
+// name, length, offset, line length, byte length.
+// Returns false if no index stream is open or if any line could not be written
+// (the remaining entries are still attempted).
+bool Fasta::FastaPrivate::WriteIndexData(void) {
+
+    // an open index stream is required
+    if ( !IsIndexOpen )
+        return false;
+
+    bool allWritten = true;
+    const size_t numEntries = Index.size();
+    for ( size_t i = 0; i < numEntries; ++i ) {
+
+        const FastaIndexData& entry = Index[i];
+
+        // format the entry as one line of text
+        ostringstream line;
+        line << entry.Name       << "\t"
+             << entry.Length     << "\t"
+             << entry.Offset     << "\t"
+             << entry.LineLength << "\t"
+             << entry.ByteLength << endl;
+
+        // append the line to the index file, remembering any failure
+        const string text = line.str();
+        if ( fputs(text.c_str(), IndexStream) < 0 )
+            allWritten = false;
+    }
+
+    return allWritten;
+}
+
+// --------------------------------
+// Fasta implementation
+
+// creates the private implementation object
+Fasta::Fasta(void)
+    : d(new FastaPrivate)
+{ }
+
+// destroys the private implementation object (its destructor closes any open files)
+Fasta::~Fasta(void) {
+    delete d;
+    d = 0;
+}
+
+// every method below simply forwards to the private implementation
+
+bool Fasta::Close(void) {
+    const bool ok = d->Close();
+    return ok;
+}
+
+bool Fasta::CreateIndex(const string& indexFilename) {
+    const bool ok = d->CreateIndex(indexFilename);
+    return ok;
+}
+
+bool Fasta::GetBase(const int& refId, const int& position, char& base) {
+    const bool ok = d->GetBase(refId, position, base);
+    return ok;
+}
+
+bool Fasta::GetSequence(const int& refId, const int& start, const int& stop, string& sequence) {
+    const bool ok = d->GetSequence(refId, start, stop, sequence);
+    return ok;
+}
+
+bool Fasta::Open(const string& filename, const string& indexFilename) {
+    const bool ok = d->Open(filename, indexFilename);
+    return ok;
+}
--- /dev/null
+// ***************************************************************************
+// bamtools_fasta.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 13 July 2010
+// ---------------------------------------------------------------------------
+// Provides FASTA reading/indexing functionality.
+// ***************************************************************************
+
+#ifndef BAMTOOLS_FASTA_H
+#define BAMTOOLS_FASTA_H
+
+#include <string>
+
+namespace BamTools {
+
+class Fasta { // reader for FASTA files with optional index support; all work is delegated to a private implementation object
+
+ // ctor & dtor - NOTE(review): the class owns 'd' through a raw pointer and declares no copy constructor/assignment, so copying a Fasta object looks unsafe - confirm it is never copied
+ public:
+ Fasta(void);
+ ~Fasta(void);
+
+ // file-handling methods
+ public:
+ bool Close(void);
+ bool Open(const std::string& filename, const std::string& indexFilename = ""); // an empty indexFilename opens the FASTA file without an index
+
+ // sequence access methods - refId is the 0-based position of the sequence within the file
+ public:
+ bool GetBase(const int& refID, const int& position, char& base);
+ bool GetSequence(const int& refId, const int& start, const int& stop, std::string& sequence); // start and stop are both inclusive
+
+ // index-handling methods
+ public:
+ bool CreateIndex(const std::string& indexFilename); // scans the open FASTA file and writes its index to indexFilename
+
+ // internal implementation
+ private:
+ struct FastaPrivate;
+ FastaPrivate* d;
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_FASTA_H
\ No newline at end of file
--- /dev/null
+// ***************************************************************************
+// bamtools_options.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 2 June 2010
+// ---------------------------------------------------------------------------
+// Parses command line arguments and creates a help menu
+// ---------------------------------------------------------------------------
+// Modified from:
+// The Mosaik suite's command line parser class: COptions
+// (c) 2006 - 2009 Michael Strömberg
+// Marth Lab, Department of Biology, Boston College
+// Dual licenced under the GNU General Public License 2.0+ license or as
+// a commercial license with the Marth Lab.
+//
+// * Modified slightly to fit BamTools, otherwise code is same.
+// * (BamTools namespace, added stdin/stdout) (DB)
+// ***************************************************************************
+
+#include "bamtools_options.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iomanip>
+#include <sstream>
+using namespace std;
+using namespace BamTools;
+
+string Options::m_programName; // the program name, printed on the usage line of the help menu
+string Options::m_description; // the main description, printed at the top of the help menu
+string Options::m_exampleArguments; // the example arguments, printed after the program name on the usage line
+vector<OptionGroup> Options::m_optionGroups; // stores the option groups, in creation order
+map<string, OptionValue> Options::m_optionsMap; // stores the options in a map, keyed by argument string (e.g. "-in")
+string Options::m_stdin = "stdin"; // string representation of stdin, returned by StandardIn()
+string Options::m_stdout = "stdout"; // string representation of stdout, returned by StandardOut()
+
+// registers a simple (flag-style, no value) option with the parser:
+// 'foundArgument' is set to true by Parse() when 'argument' appears on the
+// command line, and the option is listed under 'group' in the help menu
+void Options::AddOption(const string& argument, const string& optionDescription, bool& foundArgument, OptionGroup* group) {
+
+    // what Parse() needs: where to record that the flag was seen
+    OptionValue parseInfo;
+    parseInfo.pFoundArgument = &foundArgument;
+    parseInfo.StoreValue     = false;
+
+    // what DisplayHelp() needs: the menu entry
+    Option menuEntry;
+    menuEntry.Argument    = argument;
+    menuEntry.Description = optionDescription;
+    menuEntry.StoreValue  = false;
+
+    group->Options.push_back(menuEntry);
+    m_optionsMap[argument] = parseInfo;
+}
+
+// creates an option group named 'groupName' and returns a pointer to it.
+// CAUTION: the pointer refers to an element of the m_optionGroups vector, so
+// creating a further group may reallocate the vector and invalidate it -
+// finish adding options to one group before creating the next
+OptionGroup* Options::CreateOptionGroup(const string& groupName) {
+    m_optionGroups.push_back(OptionGroup());
+    OptionGroup& created = m_optionGroups.back();
+    created.Name = groupName;
+    return &created;
+}
+
+// Picks the point at which to wrap 'text' so that the first piece holds at most
+// 'limit' characters: 'cut' receives the length of the piece to print and
+// 'resume' the index at which the remaining text starts.
+// The break is made at the last space at or before index 'limit' (the space is
+// dropped); if there is no such space the text is cut hard at 'limit', which
+// keeps the search from running off the front of the string.
+// Expects text.size() > limit.
+static void FindLineBreak(const std::string& text, const size_t limit, size_t& cut, size_t& resume) {
+    cut = text.rfind(' ', limit);
+    if ( cut == std::string::npos ) {
+        cut    = limit;
+        resume = limit;
+    } else {
+        resume = cut + 1;
+    }
+}
+
+// displays the help menu: description, usage line, then every option group with
+// its options (argument, description and default value, wrapped to the menu
+// width). Does not return - the program exits with status 1 afterwards.
+void Options::DisplayHelp(void) {
+
+    // initialize
+    char argumentBuffer[ARGUMENT_LENGTH + 1];
+    ostringstream sb;
+
+    // indentation used for wrapped description rows
+    char indentBuffer[MAX_LINE_LENGTH - DESC_LENGTH + 1];
+    memset(indentBuffer, ' ', MAX_LINE_LENGTH - DESC_LENGTH);
+    indentBuffer[MAX_LINE_LENGTH - DESC_LENGTH] = 0;
+
+    // display the menu
+    printf("Description: %s.\n\n", m_description.c_str());
+    printf("Usage: ");
+    printf("%s", m_programName.c_str());
+    printf(" %s\n\n", m_exampleArguments.c_str());
+
+    vector<Option>::const_iterator optionIter;
+    vector<OptionGroup>::const_iterator groupIter;
+    for (groupIter = m_optionGroups.begin(); groupIter != m_optionGroups.end(); ++groupIter) {
+
+        printf("%s:\n", groupIter->Name.c_str());
+
+        for (optionIter = groupIter->Options.begin(); optionIter != groupIter->Options.end(); ++optionIter) {
+
+            // argument column (value-taking options also show their value description)
+            if (optionIter->StoreValue)
+                snprintf(argumentBuffer, ARGUMENT_LENGTH + 1, " %s <%s>", optionIter->Argument.c_str(), optionIter->ValueDescription.c_str());
+            else
+                snprintf(argumentBuffer, ARGUMENT_LENGTH + 1, " %s", optionIter->Argument.c_str());
+            printf("%-35s ", argumentBuffer);
+
+            string description = optionIter->Description;
+
+            // handle default values
+            if (optionIter->HasDefaultValue) {
+
+                sb.str("");
+                sb << description << " [";
+
+                if (optionIter->DefaultValue.is_type<unsigned int>()) {
+                    sb << (unsigned int)optionIter->DefaultValue;
+                } else if (optionIter->DefaultValue.is_type<unsigned char>()) {
+                    sb << (unsigned short)(unsigned char)optionIter->DefaultValue;
+                } else if (optionIter->DefaultValue.is_type<float>()) {
+                    sb << std::fixed << std::setprecision(2) << (float)optionIter->DefaultValue;
+                } else if (optionIter->DefaultValue.is_type<double>()) {
+                    sb << std::fixed << std::setprecision(4) << (double)optionIter->DefaultValue;
+                } else if (optionIter->DefaultValue.is_type<std::string>()) {
+                    const std::string stringValue = optionIter->DefaultValue;
+                    sb << stringValue;
+                } else {
+                    printf("ERROR: Found an unsupported data type for argument %s when casting the default value.\n", optionIter->Argument.c_str());
+                    exit(1);
+                }
+
+                sb << "]";
+                description = sb.str();
+            }
+
+            if ( description.size() <= DESC_LENGTH_FIRST_ROW ) {
+                printf("%s\n", description.c_str());
+            } else {
+
+                size_t cut    = 0;
+                size_t resume = 0;
+
+                // handle the first row
+                FindLineBreak(description, DESC_LENGTH_FIRST_ROW, cut, resume);
+                printf("%s\n", description.substr(0, cut).c_str());
+                description = description.substr(resume);
+
+                // handle subsequent rows
+                while ( description.size() > DESC_LENGTH ) {
+                    FindLineBreak(description, DESC_LENGTH, cut, resume);
+                    printf("%s%s\n", indentBuffer, description.substr(0, cut).c_str());
+                    description = description.substr(resume);
+                }
+
+                // handle last row
+                printf("%s%s\n", indentBuffer, description.c_str());
+            }
+        }
+
+        printf("\n");
+    }
+
+    printf("Help:\n");
+    printf(" --help, -h shows this help text\n");
+    exit(1);
+}
+
+// parses the command line: shows the help menu when asked (or when no arguments are given), stores the value of every recognized option through its registered pointer, and exits with status 1 after reporting all problems found
+void Options::Parse(int argc, char* argv[], int offset) {
+
+ // initialize
+ map<string, OptionValue>::const_iterator ovMapIter;
+ map<string, OptionValue>::const_iterator checkMapIter;
+ const int LAST_INDEX = argc - 1;
+ ostringstream errorBuilder; // collects every error message so they can be reported together
+ bool foundError = false;
+ char* end_ptr = NULL; // receives the end position from the strto* calls - NOTE(review): it is never inspected, so a non-numeric value is stored without any error
+ const string ERROR_SPACER(7, ' ');
+
+ // check if we should show the help menu
+ bool showHelpMenu = false;
+ if (argc > 1) {
+ for (int i = 1; i < argc; i++) { // this scan starts at index 1 regardless of 'offset'
+ const std::string argument = argv[i];
+ if ( (argument == "-h") || (argument == "--help") || (argument == "help") )
+ showHelpMenu = true;
+ }
+ } else showHelpMenu = true;
+
+ if (showHelpMenu)
+ DisplayHelp(); // DisplayHelp() ends with exit(1), so parsing does not continue past this point
+
+ // check each argument
+ for (int i = offset+1; i < argc; i++) { // arguments at indexes 0..offset are skipped
+
+ const string argument = argv[i];
+ ovMapIter = m_optionsMap.find(argument);
+
+ if (ovMapIter == m_optionsMap.end()) {
+ errorBuilder << ERROR_SPACER << "An unrecognized argument was found: " << argument << std::endl;
+ foundError = true;
+ } else {
+
+ *ovMapIter->second.pFoundArgument = true;
+
+ // grab the value
+ if (ovMapIter->second.StoreValue) {
+
+ if (i < LAST_INDEX) {
+
+ // check if the next argument is really a command line option
+ const string val = argv[i + 1];
+ checkMapIter = m_optionsMap.find(val);
+
+ if (checkMapIter == m_optionsMap.end()) { // the next argument is not a registered option name, so it is taken as this option's value
+
+ ++i; // consume the value
+
+ if (ovMapIter->second.VariantValue.is_type<unsigned int>()) { // convert according to the type registered for this option
+ const unsigned int uint32 = (unsigned int)strtoul(val.c_str(), &end_ptr, 10);
+ unsigned int* varValue = (unsigned int*)ovMapIter->second.pValue;
+ *varValue = uint32;
+ } else if (ovMapIter->second.VariantValue.is_type<unsigned char>()) {
+ const unsigned char uint8 = (unsigned char)strtoul(val.c_str(), &end_ptr, 10);
+ unsigned char* varValue = (unsigned char*)ovMapIter->second.pValue;
+ *varValue = uint8;
+ } else if (ovMapIter->second.VariantValue.is_type<uint64_t>()) {
+ const uint64_t uint64 = strtoui64(val.c_str(), &end_ptr, 10);
+ uint64_t* varValue = (uint64_t*)ovMapIter->second.pValue;
+ *varValue = uint64;
+ } else if (ovMapIter->second.VariantValue.is_type<double>()) {
+ const double d = strtod(val.c_str(), &end_ptr);
+ double* varValue = (double*)ovMapIter->second.pValue;
+ *varValue = d;
+ } else if (ovMapIter->second.VariantValue.is_type<float>()) {
+ const float f = (float)strtod(val.c_str(), &end_ptr);
+ float* varValue = (float*)ovMapIter->second.pValue;
+ *varValue = f;
+ } else if (ovMapIter->second.VariantValue.is_type<string>()) {
+ string* pStringValue = (string*)ovMapIter->second.pValue;
+ *pStringValue = val;
+ } else if (ovMapIter->second.VariantValue.is_type<vector<string> >()) { // list-valued option: each occurrence appends one more value
+ vector<string>* pVectorValue = (vector<string>*)ovMapIter->second.pValue;
+ pVectorValue->push_back(val);
+ } else {
+ printf("ERROR: Found an unsupported data type for argument %s when parsing the arguments.\n", argument.c_str());
+ exit(1);
+ }
+ } else { // the next argument is itself an option name: the value is missing
+ errorBuilder << ERROR_SPACER << "The argument (" << argument << ") expects a value, but none was found." << endl;
+ foundError = true;
+ }
+ } else { // the option is the last argument: the value is missing
+ errorBuilder << ERROR_SPACER << "The argument (" << argument << ") expects a value, but none was found." << endl;
+ foundError = true;
+ }
+ }
+ }
+ }
+
+ // check if we missed any required parameters
+ for (ovMapIter = m_optionsMap.begin(); ovMapIter != m_optionsMap.end(); ++ovMapIter) {
+ if (ovMapIter->second.IsRequired && !*ovMapIter->second.pFoundArgument) {
+ errorBuilder << ERROR_SPACER << ovMapIter->second.ValueTypeDescription << " was not specified. Please use the " << ovMapIter->first << " parameter." << endl;
+ foundError = true;
+ }
+ }
+
+ // print the errors if any were found
+ if (foundError) {
+ printf("ERROR: Some problems were encountered when parsing the command line options:\n");
+ printf("%s\n", errorBuilder.str().c_str());
+ printf("For a complete list of command line options, type \"%s help %s\"\n", argv[0], argv[1]); // argv[1] exists here: with argc <= 1 the help path above has already exited
+ exit(1);
+ }
+}
+
+// Stores the program name, its description, and the example argument string
+// in the parser's static members.
+void Options::SetProgramInfo(const string& programName, const string& description, const string& arguments) {
+    m_exampleArguments.assign(arguments);
+    m_description.assign(description);
+    m_programName.assign(programName);
+}
+
+// Returns the string the parser uses to represent stdin.
+const string& Options::StandardIn() {
+    return m_stdin;
+}
+
+// Returns the string the parser uses to represent stdout.
+const string& Options::StandardOut() {
+    return m_stdout;
+}
--- /dev/null
+// ***************************************************************************
+// bamtools_options.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 1 June 2010
+// ---------------------------------------------------------------------------
+// Parses command line arguments and creates a help menu
+// ---------------------------------------------------------------------------
+// Modified from:
+// The Mosaik suite's command line parser class: COptions
+// (c) 2006 - 2009 Michael Strömberg
+// Marth Lab, Department of Biology, Boston College
+// Dual licenced under the GNU General Public License 2.0+ license or as
+// a commercial license with the Marth Lab.
+//
+// * Modified to fit BamTools code-style, otherwise code is same. (DB)
+// ***************************************************************************
+
+#ifndef BAMTOOLS_OPTIONS_H
+#define BAMTOOLS_OPTIONS_H
+
+#include <map>
+#include <string>
+#include <vector>
+#include "bamtools_variant.h"
+
+#ifndef WIN32
+ #include <stdint.h>
+#endif
+
+namespace BamTools {
+
+// NOTE(review): these four constants are not used in the visible part of this
+// header; presumably they are character widths used when laying out the help
+// menu - confirm against Options::DisplayHelp().
+#define ARGUMENT_LENGTH 35
+#define DESC_LENGTH_FIRST_ROW 50
+#define DESC_LENGTH 39
+#define MAX_LINE_LENGTH 78
+
+// Portability shims. When WIN32 is defined (and <stdint.h> was therefore not
+// included above), snprintf and strtoui64 are mapped to their underscore-prefixed
+// names and the 64-bit integer types are typedef'ed from __int64; otherwise
+// strtoui64 is simply another name for strtoull.
+#ifdef WIN32
+ #define snprintf _snprintf
+ typedef __int64 int64_t;
+ typedef unsigned __int64 uint64_t;
+ #define strtoui64 _strtoui64
+#else
+ #define strtoui64 strtoull
+#endif
+
+// Describes one command line option as it is stored inside an OptionGroup.
+struct Option {
+
+    // the argument itself and the texts describing its value and its purpose
+    std::string Argument;
+    std::string ValueDescription;
+    std::string Description;
+
+    // flags: StoreValue starts out true, HasDefaultValue starts out false
+    bool StoreValue;
+    bool HasDefaultValue;
+
+    // default value; only assigned by the AddValueOption() overload that receives one
+    Variant DefaultValue;
+
+    // default constructor
+    Option()
+        : StoreValue(true), HasDefaultValue(false)
+    { }
+};
+
+// Parsing rule for one argument: where to record that the argument was seen
+// and where to store its parsed value.
+struct OptionValue {
+
+    // caller-owned flag and value destination (both NULL until registered)
+    bool* pFoundArgument;
+    void* pValue;
+
+    // description of the expected value; AddValueOption() treats a non-empty
+    // description as "this argument is required"
+    std::string ValueTypeDescription;
+
+    // flags: UseVector = false, StoreValue = true, IsRequired = false by default
+    bool UseVector;
+    bool StoreValue;
+    bool IsRequired;
+
+    // copy of the destination's value; its held type tells the parser how to
+    // convert the argument text before writing through pValue
+    Variant VariantValue;
+
+    // default constructor
+    OptionValue()
+        : pFoundArgument(NULL), pValue(NULL)
+        , UseVector(false), StoreValue(true), IsRequired(false)
+    { }
+};
+
+// A named list of options; AddValueOption() appends each Option it builds to
+// the group it is given.
+struct OptionGroup {
+ // name of the group
+ std::string Name;
+ // options registered in this group, in registration order
+ std::vector<Option> Options;
+};
+
+// Command line parser. Everything is static: option rules are registered with
+// the Add*() methods and stored in class-wide tables.
+class Options {
+
+    public:
+
+        // ---- registering option/argument rules ----
+
+        // adds a simple option (one that takes no value) to the parser
+        static void AddOption(const std::string& argument,
+                              const std::string& optionDescription,
+                              bool& foundArgument,
+                              OptionGroup* group);
+
+        // adds an option that takes a value of type T to the parser
+        template<typename T>
+        static void AddValueOption(const std::string& argument,
+                                   const std::string& valueDescription,
+                                   const std::string& optionDescription,
+                                   const std::string& valueTypeDescription,
+                                   bool& foundArgument,
+                                   T& val,
+                                   OptionGroup* group);
+
+        // same as above, additionally recording a default value of type D
+        template<typename T, typename D>
+        static void AddValueOption(const std::string& argument,
+                                   const std::string& valueDescription,
+                                   const std::string& optionDescription,
+                                   const std::string& valueTypeDescription,
+                                   bool& foundArgument,
+                                   T& val,
+                                   OptionGroup* group,
+                                   D& defaultValue);
+
+        // ---- remaining API ----
+
+        // creates an option group
+        static OptionGroup* CreateOptionGroup(const std::string& groupName);
+
+        // displays the help menu
+        static void DisplayHelp();
+
+        // parses the command line
+        static void Parse(int argc, char* argv[], int offset = 0);
+
+        // sets the program info
+        static void SetProgramInfo(const std::string& programName,
+                                   const std::string& description,
+                                   const std::string& arguments);
+
+        // returns the string representation of stdin
+        static const std::string& StandardIn();
+
+        // returns the string representation of stdout
+        static const std::string& StandardOut();
+
+    private:
+
+        // ---- static data members ----
+
+        static std::string m_programName;                       // the program name
+        static std::string m_description;                       // the main description
+        static std::string m_exampleArguments;                  // the example arguments
+        static std::vector<OptionGroup> m_optionGroups;         // the option groups
+        static std::map<std::string, OptionValue> m_optionsMap; // argument -> parsing rule
+        static std::string m_stdin;                             // string representation of stdin
+        static std::string m_stdout;                            // string representation of stdout
+};
+
+// Registers an option that takes a value (no default value): appends its
+// description to 'group' and stores, under 'argument', the rule that tells the
+// parser where the "found" flag and the destination value live.
+// A non-empty valueTypeDescription marks the option as required.
+template<typename T>
+void Options::AddValueOption(const std::string& argument,
+                             const std::string& valueDescription,
+                             const std::string& optionDescription,
+                             const std::string& valueTypeDescription,
+                             bool& foundArgument,
+                             T& val,
+                             OptionGroup* group)
+{
+    // descriptive entry kept in the group
+    Option entry;
+    entry.Description      = optionDescription;
+    entry.ValueDescription = valueDescription;
+    entry.Argument         = argument;
+    group->Options.push_back(entry);
+
+    // parsing rule kept in the options map
+    OptionValue rule;
+    rule.ValueTypeDescription = valueTypeDescription;
+    rule.IsRequired           = !valueTypeDescription.empty();
+    rule.VariantValue         = val;
+    rule.pValue               = (void*)&val;
+    rule.pFoundArgument       = &foundArgument;
+    m_optionsMap[argument] = rule;
+}
+
+// Registers an option that takes a value and has a default value.
+// The registration itself is identical to the overload without a default, so
+// it is delegated to that overload; the default is then recorded on the Option
+// that was just appended to 'group'.
+template<typename T, typename D>
+void Options::AddValueOption(const std::string& argument,
+                             const std::string& valueDescription,
+                             const std::string& optionDescription,
+                             const std::string& valueTypeDescription,
+                             bool& foundArgument,
+                             T& val,
+                             OptionGroup* group,
+                             D& defaultValue)
+{
+    // seven arguments => resolves to the overload without a default value
+    AddValueOption(argument,
+                   valueDescription,
+                   optionDescription,
+                   valueTypeDescription,
+                   foundArgument,
+                   val,
+                   group);
+
+    // the entry pushed by the call above is the last one in the group
+    Option& entry = group->Options.back();
+    entry.DefaultValue    = defaultValue;
+    entry.HasDefaultValue = true;
+}
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_OPTIONS_H
\ No newline at end of file
--- /dev/null
+// ***************************************************************************
+// bamtools_pileup.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 13 July 2010
+// ---------------------------------------------------------------------------
+// Provides pileup conversion functionality.
+//
+// The 'assembly' aspect of pileup makes this more complicated than the
+// simpler one-to-one conversion methods for other formats.
+// ***************************************************************************
+
+#include <vector>
+#include "BamMultiReader.h"
+#include "bamtools_pileup.h"
+using namespace std;
+using namespace BamTools;
+
+// Private implementation of Pileup: holds the settings and the running state
+// of a pileup conversion.
+struct Pileup::PileupPrivate {
+
+    // ---------------------
+    // data members
+
+    // IO & settings
+    BamMultiReader* Reader;        // alignment source (not deleted by this struct)
+    ostream* OutStream;            // destination of the pileup lines (not deleted by this struct)
+    string FastaFilename;          // set through Pileup::SetFastaFilename()
+    bool IsPrintingMapQualities;   // when true, a mapping-quality column is printed
+    BamRegion Region;              // set through Pileup::SetRegion()
+
+    // parsing data
+    int CurrentId;                     // reference ID currently being piled up
+    int CurrentPosition;               // position currently being piled up
+    vector<BamAlignment> CurrentData;  // alignments collected for the current position
+    RefVector References;              // reference data fetched from Reader in Run()
+
+    // ----------------------
+    // ctor
+
+    // CurrentId and CurrentPosition start at -1 so they never hold an
+    // indeterminate value before Run() assigns them from the first alignment.
+    PileupPrivate(BamMultiReader* reader, ostream* outStream)
+        : Reader(reader)
+        , OutStream(outStream)
+        , FastaFilename("")
+        , IsPrintingMapQualities(false)
+        , CurrentId(-1)
+        , CurrentPosition(-1)
+    { }
+
+    // ----------------------
+    // internal methods
+
+    // prints the pileup line for CurrentPosition
+    void PrintCurrentData(void);
+    // runs the whole conversion; returns success (true/false)
+    bool Run(void);
+};
+
+// Prints the pileup line for CurrentPosition on reference CurrentId.
+//
+// Alignments that end before CurrentPosition are first removed from
+// CurrentData. The line that is written to OutStream is tab-separated:
+//   reference name, CurrentPosition, reference base (always 'N' here),
+//   number of bases listed, bases, base qualities [, mapping qualities]
+// Bases are lower case for reverse-strand alignments, upper case otherwise.
+// Nothing is printed when no alignment contributes a base.
+//
+// NOTE(review): AlignedBases and Qualities are both indexed with the offset
+// (CurrentPosition - al.Position); presumably that is only exact for
+// alignments without insertions/deletions - confirm against BamAlignment.
+void Pileup::PileupPrivate::PrintCurrentData(void) {
+
+    // remove any data that ends before CurrentPosition: survivors are moved
+    // to the front in one pass (order preserved) and the tail is erased once,
+    // instead of calling vector::erase() for every removed element
+    size_t keep = 0;
+    for ( size_t i = 0; i < CurrentData.size(); ++i ) {
+        if ( CurrentData[i].GetEndPosition() < CurrentPosition ) continue;
+        if ( keep != i ) CurrentData[keep] = CurrentData[i];
+        ++keep;
+    }
+    CurrentData.erase(CurrentData.begin() + keep, CurrentData.end());
+
+    // if no data remains, return
+    if ( CurrentData.empty() ) return;
+
+    // References is indexed with CurrentId below, so it has to be a valid index
+    if ( CurrentId < 0 || CurrentId >= (int)References.size() ) return;
+
+    // initialize empty strings
+    string bases;
+    string baseQuals;
+    string mapQuals;
+
+    // iterate over alignments
+    vector<BamAlignment>::const_iterator dataIter = CurrentData.begin();
+    vector<BamAlignment>::const_iterator dataEnd  = CurrentData.end();
+    for ( ; dataIter != dataEnd; ++dataIter ) {
+
+        // retrieve alignment
+        const BamAlignment& al = (*dataIter);
+
+        // skip alignments that hold no base at CurrentPosition
+        // (previously this read outside of AlignedBases)
+        if ( CurrentPosition < al.Position ) continue;
+        const size_t offset = (size_t)(CurrentPosition - al.Position);
+        if ( offset >= al.AlignedBases.size() ) continue;
+
+        // determine current base character & store
+        // (tolower/toupper require a value representable as unsigned char)
+        const unsigned char base = (unsigned char)al.AlignedBases[offset];
+        if ( al.IsReverseStrand() )
+            bases.push_back( (char)tolower(base) );
+        else
+            bases.push_back( (char)toupper(base) );
+
+        // determine current base quality & store; '!' (ASCII 33, i.e. quality 0
+        // with the same +33 offset used for the mapping qualities below) stands
+        // in when no quality is stored at this offset
+        if ( offset < al.Qualities.size() )
+            baseQuals.push_back( al.Qualities[offset] );
+        else
+            baseQuals.push_back( '!' );
+
+        // if using mapQuals, determine current mapQual & store
+        // (shifted by 33 and capped at 126, the last printable ASCII character)
+        if ( IsPrintingMapQualities ) {
+            int mapQuality = (int)(al.MapQuality + 33);
+            if ( mapQuality > 126 ) mapQuality = 126;
+            mapQuals.push_back((char)mapQuality);
+        }
+    }
+
+    // nothing to report if no alignment contributed a base
+    if ( bases.empty() ) return;
+
+    // print results to OutStream
+    const string& refName = References[CurrentId].RefName;
+    const char refBase = 'N';
+
+    *OutStream << refName << "\t" << CurrentPosition << "\t" << refBase << "\t" << bases.size() << "\t" << bases << "\t" << baseQuals;
+    if ( IsPrintingMapQualities ) *OutStream << "\t" << mapQuals;
+    *OutStream << endl;
+}
+
+// Runs the pileup conversion: reads every alignment from Reader and calls
+// PrintCurrentData() once per position while alignments are held in CurrentData.
+//
+// The input is expected to be ordered by reference ID, then by position.
+// Returns false when Reader or OutStream is NULL, when nothing can be read,
+// or when an alignment is found out of order; returns true otherwise.
+//
+// Alignments whose RefID is negative are skipped: CurrentId is used as an
+// index into References when printing, so they cannot be piled up.
+//
+// NOTE(review): the Region member is not consulted here, so within this file
+// SetRegion() has no effect on the output.
+bool Pileup::PileupPrivate::Run(void) {
+
+    // -----------------------------
+    // validate input & output
+
+    if ( !Reader ) {
+        cerr << "Pileup::Run() : Invalid multireader" << endl;
+        return false;
+    }
+
+    if ( !OutStream) {
+        cerr << "Pileup::Run() : Invalid output stream" << endl;
+        return false;
+    }
+
+    References = Reader->GetReferenceData();
+
+    // -----------------------------
+    // process input data
+
+    // get first entry
+    BamAlignment al;
+    if ( !Reader->GetNextAlignment(al) ) {
+        cerr << "Pileup::Run() : Could not read from multireader" << endl;
+        return false;
+    }
+
+    // skip leading alignments that carry no reference ID;
+    // if nothing else is left there is simply nothing to print
+    while ( al.RefID < 0 ) {
+        if ( !Reader->GetNextAlignment(al) ) return true;
+    }
+
+    // set initial markers & store first entry
+    CurrentId       = al.RefID;
+    CurrentPosition = al.Position;
+    CurrentData.clear();
+    CurrentData.push_back(al);
+
+    // iterate over remaining data
+    while ( Reader->GetNextAlignment(al) ) {
+
+        // no reference ID => cannot be piled up (and must not be mistaken
+        // for a sorting error by the RefID comparison below)
+        if ( al.RefID < 0 ) continue;
+
+        // if same reference
+        if ( al.RefID == CurrentId ) {
+
+            // if same position, store and move on
+            if ( al.Position == CurrentPosition )
+                CurrentData.push_back(al);
+
+            // if less than CurrentPosition - sorting error => ABORT
+            else if ( al.Position < CurrentPosition ) {
+                cerr << "Pileup::Run() : Data not sorted correctly!" << endl;
+                return false;
+            }
+
+            // else print pileup data until 'catching up' to al.Position
+            else {
+                while ( al.Position > CurrentPosition ) {
+                    PrintCurrentData();
+
+                    // once CurrentData is empty nothing can be printed before
+                    // al.Position, so jump there instead of stepping through
+                    // every uncovered position
+                    if ( CurrentData.empty() ) {
+                        CurrentPosition = al.Position;
+                        break;
+                    }
+                    ++CurrentPosition;
+                }
+                CurrentData.push_back(al);
+            }
+        }
+
+        // if reference ID less than CurrentID - sorting error => ABORT
+        else if ( al.RefID < CurrentId ) {
+            cerr << "Pileup::Run() : Data not sorted correctly!" << endl;
+            return false;
+        }
+
+        // else moved forward onto next reference
+        else {
+
+            // print any remaining pileup data from previous reference
+            while ( !CurrentData.empty() ) {
+                PrintCurrentData();
+                ++CurrentPosition;
+            }
+
+            // store first entry on this new reference, update markers
+            CurrentData.clear();
+            CurrentData.push_back(al);
+            CurrentId       = al.RefID;
+            CurrentPosition = al.Position;
+        }
+    }
+
+    // ------------------------------------
+    // handle any remaining data entries
+
+    while ( !CurrentData.empty() ) {
+        PrintCurrentData();
+        ++CurrentPosition;
+    }
+
+    // -------------------------
+    // return success
+
+    return true;
+}
+
+// ----------------------------------------------------------
+// Pileup implementation
+
+// Creates the private implementation object, handing it the reader and the
+// output stream.
+Pileup::Pileup(BamMultiReader* reader, ostream* outStream)
+    : d(new PileupPrivate(reader, outStream))
+{ }
+
+// Destroys the private implementation object and clears the pointer to it.
+Pileup::~Pileup() {
+    delete d;
+    d = 0;
+}
+
+// Runs the conversion; forwards to PileupPrivate::Run() and returns its result.
+bool Pileup::Run() {
+    const bool succeeded = d->Run();
+    return succeeded;
+}
+
+// Stores the FASTA filename in the private implementation object.
+void Pileup::SetFastaFilename(const string& filename) {
+    PileupPrivate& impl = *d;
+    impl.FastaFilename = filename;
+}
+
+// Enables or disables the mapping-quality column of the output.
+void Pileup::SetIsPrintingMapQualities(bool ok) {
+    PileupPrivate& impl = *d;
+    impl.IsPrintingMapQualities = ok;
+}
+
+// Stores a copy of the given region in the private implementation object.
+void Pileup::SetRegion(const BamRegion& region) {
+    PileupPrivate& impl = *d;
+    impl.Region = region;
+}
--- /dev/null
+// ***************************************************************************
+// bamtools_pileup.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 13 July 2010
+// ---------------------------------------------------------------------------
+// Provides pileup conversion functionality.
+//
+// The 'assembly' aspect of pileup makes this more complicated than the
+// simpler one-to-one conversion methods for other formats.
+// ***************************************************************************
+
+#ifndef BAMTOOLS_PILEUP_H
+#define BAMTOOLS_PILEUP_H
+
+#include <iostream>
+#include <string>
+
+namespace BamTools {
+
+class BamMultiReader;
+class BamRegion;
+
+// Converts the alignments delivered by a BamMultiReader into pileup output
+// written to an output stream. The work is done by a private implementation
+// object (PileupPrivate) that is created in the constructor and deleted in
+// the destructor.
+class Pileup {
+
+    public:
+        // 'reader' supplies the alignments, 'outStream' receives the output;
+        // both are only stored here, neither is deleted by this class
+        Pileup(BamMultiReader* reader, std::ostream* outStream);
+        ~Pileup(void);
+
+    public:
+        // runs the conversion; returns success (true/false)
+        bool Run(void);
+        // stores the FASTA filename
+        void SetFastaFilename(const std::string& filename);
+        // switches the mapping-quality output column on or off
+        void SetIsPrintingMapQualities(bool ok);
+        // stores the region
+        void SetRegion(const BamRegion& region);
+
+    private:
+        // Copying is disabled (declared, never defined): a copy would share
+        // 'd', and both destructors would then delete the same object.
+        Pileup(const Pileup&);
+        Pileup& operator=(const Pileup&);
+
+    private:
+        struct PileupPrivate;
+        PileupPrivate* d;
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_PILEUP_H
\ No newline at end of file
--- /dev/null
+// ***************************************************************************
+// bamtools_utilities.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 2 June 2010
+// ---------------------------------------------------------------------------
+// Provides general utilities used by BamTools sub-tools.
+// ***************************************************************************
+
+#include <cstdlib>
+#include <sys/stat.h>
+#include "bamtools_utilities.h"
+#include "BamReader.h"
+#include "BamMultiReader.h"
+
+using namespace std;
+using namespace BamTools;
+
+namespace {
+
+// Splits a non-empty region string into chromosome names and positions.
+//
+// Recognized forms:
+//   chrom                          whole chromosome   (startPos = 0, stopPos = -1)
+//   chrom:start                    from start to end  (stopPos = -1)
+//   chrom:start..stop              range on one chromosome
+//   chrom1:start..chrom2:stop      range across two chromosomes
+//
+// A stopPos of -1 means "no stop position given". Positions are converted
+// with atoi(), so text that is not a number yields 0.
+void SplitRegionString(const string& regionString,
+                       string& startChrom, int& startPos,
+                       string& stopChrom,  int& stopPos)
+{
+    // look for a colon
+    const size_t foundFirstColon = regionString.find(':');
+
+    // no colon found: use the entire requested chromosome
+    if ( foundFirstColon == string::npos ) {
+        startChrom = regionString;
+        startPos   = 0;
+        stopChrom  = regionString;
+        stopPos    = -1;
+        return;
+    }
+
+    // colon found: everything before it is the start chromosome
+    startChrom = regionString.substr(0, foundFirstColon);
+
+    // look for ".." after the colon
+    const size_t foundRangeDots = regionString.find("..", foundFirstColon+1);
+
+    // no dots found: we have a startPos but no range
+    if ( foundRangeDots == string::npos ) {
+        startPos  = atoi( regionString.substr(foundFirstColon+1).c_str() );
+        stopChrom = startChrom;
+        stopPos   = -1;
+        return;
+    }
+
+    // ".." found: startPos sits between the first colon and the dots
+    startPos = atoi( regionString.substr(foundFirstColon+1, foundRangeDots-foundFirstColon-1).c_str() );
+
+    // look for second colon
+    const size_t foundSecondColon = regionString.find(':', foundRangeDots+1);
+
+    // no second colon: "standard" chrom:start..stop format (single chromosome)
+    if ( foundSecondColon == string::npos ) {
+        stopChrom = startChrom;
+        stopPos   = atoi( regionString.substr(foundRangeDots+2).c_str() );
+    }
+
+    // second colon found: range requested across 2 chromosomes
+    else {
+        stopChrom = regionString.substr(foundRangeDots+2, foundSecondColon-(foundRangeDots+2));
+        stopPos   = atoi( regionString.substr(foundSecondColon+1).c_str() );
+    }
+}
+
+// Shared implementation of both Utilities::ParseRegionString() overloads.
+// ReaderType has to provide GetReferenceData() and GetReferenceID(name),
+// both callable on a const reader.
+template<typename ReaderType>
+bool ParseRegionStringImpl(const string& regionString, const ReaderType& reader, BamRegion& region) {
+
+    // -------------------------------
+    // parse region string
+
+    // check first for empty string
+    if ( regionString.empty() )
+        return false;
+
+    string startChrom;
+    string stopChrom;
+    int startPos = 0;
+    int stopPos  = -1;
+    SplitRegionString(regionString, startChrom, startPos, stopChrom, stopPos);
+
+    // -------------------------------
+    // validate reference IDs & genomic positions
+
+    // bound by const reference: no copy is made if the reader returns a reference
+    const RefVector& references = reader.GetReferenceData();
+    const int referenceCount = (int)references.size();
+
+    // if startRefID not found (or otherwise not a valid index), return false
+    const int startRefID = reader.GetReferenceID(startChrom);
+    if ( startRefID < 0 || startRefID >= referenceCount ) return false;
+
+    // if startPos is negative or larger than reference, return false
+    const RefData& startReference = references[startRefID];
+    if ( startPos < 0 || startPos > startReference.RefLength ) return false;
+
+    // if stopRefID not found (or otherwise not a valid index), return false
+    const int stopRefID = reader.GetReferenceID(stopChrom);
+    if ( stopRefID < 0 || stopRefID >= referenceCount ) return false;
+
+    // if stopPos is larger than reference, or negative without being the
+    // "not specified" marker (-1), return false
+    const RefData& stopReference = references[stopRefID];
+    if ( stopPos < -1 || stopPos > stopReference.RefLength ) return false;
+
+    // if no stopPosition specified, set to reference end
+    if ( stopPos == -1 ) stopPos = stopReference.RefLength;
+
+    // -------------------------------
+    // set up Region struct & return
+
+    region.LeftRefID     = startRefID;
+    region.LeftPosition  = startPos;
+    region.RightRefID    = stopRefID;
+    region.RightPosition = stopPos;
+    return true;
+}
+
+} // unnamed namespace
+
+// Parses a region string, does validation (valid ID's, positions), stores in Region struct
+// Returns success (true/false); 'region' is only modified on success
+bool Utilities::ParseRegionString(const std::string& regionString, const BamReader& reader, BamRegion& region) {
+    return ParseRegionStringImpl(regionString, reader, region);
+}
+
+// Same as ParseRegionString() above, but accepts a BamMultiReader
+bool Utilities::ParseRegionString(const std::string& regionString, const BamMultiReader& reader, BamRegion& region) {
+    return ParseRegionStringImpl(regionString, reader, region);
+}
+
+// Returns true if stat() succeeds for the given filename, false otherwise.
+bool Utilities::FileExists(const std::string& filename) {
+    struct stat fileInfo;
+    const int status = stat(filename.c_str(), &fileInfo);
+    return ( status == 0 );
+}
--- /dev/null
+// ***************************************************************************
+// bamtools_utilities.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 2 June 2010
+// ---------------------------------------------------------------------------
+// Provides general utilities used by BamTools sub-tools.
+// ***************************************************************************
+
+#ifndef BAMTOOLS_UTILITIES_H
+#define BAMTOOLS_UTILITIES_H
+
+#include <string>
+#include "BamAux.h"
+
+namespace BamTools {
+
+class BamReader;
+class BamMultiReader;
+
+// General helpers shared by the BamTools sub-tools (static methods only).
+class Utilities {
+
+    public:
+        // Parses a region string, uses reader to do validation (valid ID's, positions), stores in Region struct
+        // Returns success (true/false)
+        static bool ParseRegionString(const std::string& regionString, const BamReader& reader, BamRegion& region);
+        // Same as above, but accepts a BamMultiReader
+        static bool ParseRegionString(const std::string& regionString, const BamMultiReader& reader, BamRegion& region);
+
+        // check if a file exists
+        // (parameter named 'filename' to match the definition in bamtools_utilities.cpp)
+        static bool FileExists(const std::string& filename);
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_UTILITIES_H
--- /dev/null
+// ***************************************************************************
+// bamtools_variant.h (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 2 June 2010
+// ---------------------------------------------------------------------------
+// Provides a template-based variant type
+// ---------------------------------------------------------------------------
+// Modified from:
+// variant_t - An Improved Variant Type Based on Member Templates
+// (c) 2000 Fernando Cacciola
+// Dr. Dobb's (http://www.ddj.com/cpp/184401293)
+//
+// * Modified to be in BamTools namespace, otherwise code is same. (DB)
+// ***************************************************************************
+
+#ifndef BAMTOOLS_VARIANT_H
+#define BAMTOOLS_VARIANT_H
+
+#include <stdexcept>
+#include <typeinfo>
+#include <string>
+
+namespace BamTools {
+
+// Holds a single value of any copyable type, or no value at all.
+// Copies of a Variant share the held value through a reference count; the
+// value is deleted when the last Variant referring to it releases it.
+class Variant {
+
+    public:
+        // constructs an empty variant (holds no value)
+        Variant(void) : data (NULL) { }
+
+        // copy constructor: shares other's value and takes a reference on it
+        Variant(const Variant& other)
+            : data(other.data)
+        {
+            if(data != NULL)
+                data->AddRef();
+        }
+
+        // releases this variant's reference on the held value (if any)
+        ~Variant(void) {
+            if(data != NULL) data->Release();
+        }
+
+        // NOTE: This code takes care of self-assignment.
+        // DO NOT CHANGE THE ORDER of the statements: the new value is
+        // referenced before the old one is released, so assigning a variant
+        // to itself never drops the count to zero.
+        Variant& operator=(const Variant& rhs) {
+            if(rhs.data != NULL)
+                rhs.data->AddRef();
+            if(data != NULL)
+                data->Release();
+            data = rhs.data;
+            return * this;
+        }
+
+        // This member template constructor allows you to
+        // instantiate a Variant with a value of any type.
+        // It is intentionally not 'explicit': assigning a plain value to a
+        // Variant relies on this implicit conversion.
+        template<typename T>
+        Variant(T v)
+            : data(new Impl<T>(v))
+        {
+            data->AddRef();
+        }
+
+        // This generic conversion operator lets you retrieve
+        // the value held. To avoid template specialization conflicts,
+        // it returns an instance of type T, which will be a COPY
+        // of the value contained.
+        // Throws std::invalid_argument if the variant is empty or holds a
+        // value of a type other than T.
+        template<typename T>
+        operator T() const {
+            return CastFromBase<T>(data)->data;
+        }
+
+        // This form returns a REFERENCE and not a COPY, which
+        // will be significant in some cases.
+        // Throws std::invalid_argument under the same conditions as above.
+        template<typename T>
+        const T& get(void) const {
+            return CastFromBase<T>(data)->data;
+        }
+
+        // Returns true if the held value has exactly type T.
+        // An empty variant holds no value, so the answer is false for every T
+        // (applying typeid to the dereferenced NULL pointer would throw
+        // std::bad_typeid instead).
+        template<typename T>
+        bool is_type(void) const {
+            return data != NULL && typeid(*data)==typeid(Impl<T>);
+        }
+
+        // Same test, with T deduced from the argument; the argument's value
+        // is not used. The held object is an Impl<T>, so that is the type to
+        // compare against - comparing against typeid(v) could never match.
+        template<typename T>
+        bool is_type(T /*v*/) const {
+            return is_type<T>();
+        }
+
+    private:
+        // reference-counted base of the value holder
+        struct ImplBase {
+
+            ImplBase() : refs(0) {}
+            virtual ~ImplBase() {}
+
+            void AddRef(void) { refs ++; }
+            // deletes the holder when the last reference is released
+            void Release(void) {
+                --refs;
+                if(refs == 0) delete this;
+            }
+
+            size_t refs;
+        };
+
+        // holder of a value of type T
+        template<typename T>
+        struct Impl : ImplBase {
+            Impl(T v) : data (v) { }
+            ~Impl(void) { }
+            T data;
+        };
+
+        // The following method is static because it doesn't
+        // operate on Variant instances.
+        template<typename T>
+        static Impl<T>* CastFromBase(ImplBase* v) {
+            // This downcast will fail if T is other than the T used
+            // with the constructor of the Variant (or if v is NULL).
+            Impl<T>* p = dynamic_cast<Impl<T>*> (v);
+            if (p == NULL)
+                throw std::invalid_argument(typeid(T).name()+std::string(" is not a valid type"));
+            return p;
+        }
+
+        // the shared value holder; NULL for an empty variant
+        ImplBase* data;
+};
+
+} // namespace BamTools
+
+#endif // BAMTOOLS_VARIANT_H