From: derek Date: Mon, 28 Nov 2011 23:55:31 +0000 (-0500) Subject: merge with remoteio branch X-Git-Url: https://git.donarmstrong.com/?p=bamtools.git;a=commitdiff_plain;h=8077f86ef52bfb08c17430b797c737d217d41cf3;hp=a50400c186a05160e1164bf544571b739ff6fac8 merge with remoteio branch --- diff --git a/src/api/BamAux.h b/src/api/BamAux.h index f451125..0dd3e99 100644 --- a/src/api/BamAux.h +++ b/src/api/BamAux.h @@ -2,7 +2,7 @@ // BamAux.h (c) 2009 Derek Barnett, Michael Strömberg // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) +// Last modified: 25 October 2011 (DB) // --------------------------------------------------------------------------- // Provides data structures & utility methods that are used throughout the API. // *************************************************************************** @@ -11,6 +11,7 @@ #define BAMAUX_H #include "api/api_global.h" +#include #include #include #include @@ -441,13 +442,25 @@ API_EXPORT inline unsigned short UnpackUnsignedShort(char* buffer) { \internal */ struct RaiiBuffer { + + // data members + char* Buffer; + const size_t NumBytes; + + // ctor & dtor RaiiBuffer(const size_t n) : Buffer( new char[n]() ) + , NumBytes(n) { } + ~RaiiBuffer(void) { delete[] Buffer; } - char* Buffer; + + // add'l methods + void Clear(void) { + memset(Buffer, 0, NumBytes); + } }; } // namespace BamTools diff --git a/src/api/BamMultiReader.cpp b/src/api/BamMultiReader.cpp index ef38469..f61aa26 100644 --- a/src/api/BamMultiReader.cpp +++ b/src/api/BamMultiReader.cpp @@ -2,7 +2,7 @@ // BamMultiReader.cpp (c) 2010 Erik Garrison, Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) +// Last modified: 25 October 2011 (DB) // --------------------------------------------------------------------------- 
// Convenience class for reading multiple BAM files. // @@ -13,7 +13,7 @@ // *************************************************************************** #include "api/BamMultiReader.h" -#include "api/internal/BamMultiReader_p.h" +#include "api/internal/bam/BamMultiReader_p.h" using namespace BamTools; #include diff --git a/src/api/BamMultiReader.h b/src/api/BamMultiReader.h index ea068d0..e5fc9c9 100644 --- a/src/api/BamMultiReader.h +++ b/src/api/BamMultiReader.h @@ -2,7 +2,7 @@ // BamMultiReader.h (c) 2010 Erik Garrison, Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) +// Last modified: 25 October 2011 (DB) // --------------------------------------------------------------------------- // Convenience class for reading multiple BAM files. // *************************************************************************** diff --git a/src/api/BamReader.cpp b/src/api/BamReader.cpp index 6080b36..ae2adec 100644 --- a/src/api/BamReader.cpp +++ b/src/api/BamReader.cpp @@ -2,13 +2,13 @@ // BamReader.cpp (c) 2009 Derek Barnett, Michael Strömberg // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) +// Last modified: 25 October 2011 (DB) // --------------------------------------------------------------------------- // Provides read access to BAM files. 
// *************************************************************************** #include "api/BamReader.h" -#include "api/internal/BamReader_p.h" +#include "api/internal/bam/BamReader_p.h" using namespace BamTools; using namespace BamTools::Internal; diff --git a/src/api/BamWriter.cpp b/src/api/BamWriter.cpp index b1582a8..cbbfdae 100644 --- a/src/api/BamWriter.cpp +++ b/src/api/BamWriter.cpp @@ -2,7 +2,7 @@ // BamWriter.cpp (c) 2009 Michael Strömberg, Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) +// Last modified: 25 October 2011 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for producing BAM files // *************************************************************************** @@ -10,7 +10,7 @@ #include "api/BamAlignment.h" #include "api/BamWriter.h" #include "api/SamHeader.h" -#include "api/internal/BamWriter_p.h" +#include "api/internal/bam/BamWriter_p.h" using namespace BamTools; using namespace BamTools::Internal; using namespace std; diff --git a/src/api/CMakeLists.txt b/src/api/CMakeLists.txt index c8504ed..539feca 100644 --- a/src/api/CMakeLists.txt +++ b/src/api/CMakeLists.txt @@ -12,7 +12,10 @@ include_directories( ${BamTools_SOURCE_DIR}/src ) add_definitions( -DBAMTOOLS_API_LIBRARY ) # (for proper exporting of library symbols) add_definitions( -fPIC ) # (attempt to force PIC compiling on CentOS, not being set on shared libs by CMake) -# list of all BamTools API source (.cpp) files +# fetch all internal source files +add_subdirectory ( internal ) + +# make list of all API source files set( BamToolsAPISources BamAlignment.cpp BamMultiReader.cpp @@ -25,26 +28,7 @@ set( BamToolsAPISources SamReadGroupDictionary.cpp SamSequence.cpp SamSequenceDictionary.cpp - internal/BamDeviceFactory_p.cpp - internal/BamException_p.cpp - internal/BamFile_p.cpp - 
internal/BamFtp_p.cpp - internal/BamHeader_p.cpp - internal/BamHttp_p.cpp - internal/BamIndexFactory_p.cpp - internal/BamMultiReader_p.cpp - internal/BamPipe_p.cpp - internal/BamRandomAccessController_p.cpp - internal/BamReader_p.cpp - internal/BamStandardIndex_p.cpp - internal/BamToolsIndex_p.cpp - internal/BamWriter_p.cpp - internal/BgzfStream_p.cpp - internal/ILocalIODevice_p.cpp - internal/IRemoteIODevice_p.cpp - internal/SamFormatParser_p.cpp - internal/SamFormatPrinter_p.cpp - internal/SamHeaderValidator_p.cpp + ${InternalSources} ) # create main BamTools API shared library @@ -52,38 +36,48 @@ add_library( BamTools SHARED ${BamToolsAPISources} ) set_target_properties( BamTools PROPERTIES SOVERSION "2.0.5" OUTPUT_NAME "bamtools" ) -target_link_libraries( BamTools z ) -install( TARGETS BamTools LIBRARY DESTINATION "lib/bamtools" RUNTIME DESTINATION "bin" ) # create main BamTools API static library add_library( BamTools-static STATIC ${BamToolsAPISources} ) -set_target_properties( BamTools-static PROPERTIES - OUTPUT_NAME "bamtools" +set_target_properties( BamTools-static PROPERTIES + OUTPUT_NAME "bamtools" PREFIX "lib" ) -target_link_libraries( BamTools-static z ) -install( TARGETS BamTools-static ARCHIVE DESTINATION "lib/bamtools" ) + +# link libraries with zlib automatically +if ( _WIN32 ) + set( APILibs z ws2_32 ) +else ( _WIN32 ) + set( APILibs z ) +endif ( _WIN32 ) + +target_link_libraries( BamTools ${APILibs} ) +target_link_libraries( BamTools-static ${APILibs} ) + +# set library install destinations +install( TARGETS BamTools LIBRARY DESTINATION "lib/bamtools" RUNTIME DESTINATION "bin") +install( TARGETS BamTools-static ARCHIVE DESTINATION "lib/bamtools") # export API headers -include( ../ExportHeader.cmake ) -set( ApiIncludeDir "api" ) -ExportHeader( APIHeaders api_global.h ${ApiIncludeDir} ) -ExportHeader( APIHeaders BamAlgorithms.h ${ApiIncludeDir} ) -ExportHeader( APIHeaders BamAlignment.h ${ApiIncludeDir} ) -ExportHeader( APIHeaders BamAux.h 
${ApiIncludeDir} ) -ExportHeader( APIHeaders BamConstants.h ${ApiIncludeDir} ) -ExportHeader( APIHeaders BamIndex.h ${ApiIncludeDir} ) -ExportHeader( APIHeaders BamMultiReader.h ${ApiIncludeDir} ) -ExportHeader( APIHeaders BamReader.h ${ApiIncludeDir} ) -ExportHeader( APIHeaders BamWriter.h ${ApiIncludeDir} ) -ExportHeader( APIHeaders IBamIODevice.h ${ApiIncludeDir} ) -ExportHeader( APIHeaders SamConstants.h ${ApiIncludeDir} ) -ExportHeader( APIHeaders SamHeader.h ${ApiIncludeDir} ) -ExportHeader( APIHeaders SamProgram.h ${ApiIncludeDir} ) -ExportHeader( APIHeaders SamProgramChain.h ${ApiIncludeDir} ) -ExportHeader( APIHeaders SamReadGroup.h ${ApiIncludeDir} ) -ExportHeader( APIHeaders SamReadGroupDictionary.h ${ApiIncludeDir} ) -ExportHeader( APIHeaders SamSequence.h ${ApiIncludeDir} ) -ExportHeader( APIHeaders SamSequenceDictionary.h ${ApiIncludeDir} ) +include(../ExportHeader.cmake) +set(ApiIncludeDir "api") +ExportHeader(APIHeaders api_global.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BamAlgorithms.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BamAlignment.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BamAux.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BamConstants.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BamIndex.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BamMultiReader.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BamReader.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BamWriter.h ${ApiIncludeDir}) +ExportHeader(APIHeaders IBamIODevice.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamConstants.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamHeader.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamProgram.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamProgramChain.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamReadGroup.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamReadGroupDictionary.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamSequence.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamSequenceDictionary.h ${ApiIncludeDir}) set( AlgorithmsIncludeDir 
"api/algorithms" ) ExportHeader( AlgorithmsHeaders algorithms/Sort.h ${AlgorithmsIncludeDir} ) diff --git a/src/api/IBamIODevice.h b/src/api/IBamIODevice.h index b34e449..cf64129 100644 --- a/src/api/IBamIODevice.h +++ b/src/api/IBamIODevice.h @@ -2,7 +2,7 @@ // IBamIODevice.h (c) 2011 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) +// Last modified: 10 November 2011 (DB) // --------------------------------------------------------------------------- // Base class for all BAM I/O devices (e.g. local file, pipe, HTTP, FTP, etc.) // @@ -19,6 +19,7 @@ #define IBAMIODEVICE_H #include "api/api_global.h" +#include #include namespace BamTools { @@ -26,9 +27,10 @@ namespace BamTools { class API_EXPORT IBamIODevice { // enums - public: enum OpenMode { NotOpen = 0 - , ReadOnly - , WriteOnly + public: enum OpenMode { NotOpen = 0x0000 + , ReadOnly = 0x0001 + , WriteOnly = 0x0002 + , ReadWrite = ReadOnly | WriteOnly }; // ctor & dtor @@ -38,14 +40,16 @@ class API_EXPORT IBamIODevice { // IBamIODevice interface public: + // TODO: add seek(pos, *from*) + // pure virtuals virtual void Close(void) =0; virtual bool IsRandomAccess(void) const =0; virtual bool Open(const OpenMode mode) =0; - virtual size_t Read(char* data, const unsigned int numBytes) =0; - virtual bool Seek(const int64_t& position) =0; + virtual int64_t Read(char* data, const unsigned int numBytes) =0; + virtual bool Seek(const int64_t& position, const int origin = SEEK_SET) =0; virtual int64_t Tell(void) const =0; - virtual size_t Write(const char* data, const unsigned int numBytes) =0; + virtual int64_t Write(const char* data, const unsigned int numBytes) =0; // default implementation provided virtual std::string GetErrorString(void); diff --git a/src/api/SamHeader.cpp b/src/api/SamHeader.cpp index 5de2abc..b2925f2 100644 --- a/src/api/SamHeader.cpp +++ b/src/api/SamHeader.cpp @@ 
-2,17 +2,17 @@ // SamHeader.cpp (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College // --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) +// Last modified: 25 October 2011 (DB) // --------------------------------------------------------------------------- // Provides direct read/write access to the SAM header data fields. // *************************************************************************** #include "api/SamConstants.h" #include "api/SamHeader.h" -#include "api/internal/BamException_p.h" -#include "api/internal/SamFormatParser_p.h" -#include "api/internal/SamFormatPrinter_p.h" -#include "api/internal/SamHeaderValidator_p.h" +#include "api/internal/utils/BamException_p.h" +#include "api/internal/sam/SamFormatParser_p.h" +#include "api/internal/sam/SamFormatPrinter_p.h" +#include "api/internal/sam/SamHeaderValidator_p.h" using namespace BamTools; using namespace BamTools::Internal; using namespace std; diff --git a/src/api/internal/BamDeviceFactory_p.cpp b/src/api/internal/BamDeviceFactory_p.cpp deleted file mode 100644 index 895d08c..0000000 --- a/src/api/internal/BamDeviceFactory_p.cpp +++ /dev/null @@ -1,37 +0,0 @@ -// *************************************************************************** -// BamDeviceFactory_p.cpp (c) 2011 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 September 2011 (DB) -// --------------------------------------------------------------------------- -// Creates built-in concrete implementations of IBamIODevices -// *************************************************************************** - -#include "api/internal/BamDeviceFactory_p.h" -#include "api/internal/BamFile_p.h" -#include "api/internal/BamFtp_p.h" -#include "api/internal/BamHttp_p.h" -#include "api/internal/BamPipe_p.h" -using namespace BamTools; -using namespace 
BamTools::Internal; - -#include -using namespace std; - -IBamIODevice* BamDeviceFactory::CreateDevice(const string& source) { - - // check for requested pipe - if ( source == "-" || source == "stdin" || source == "stdout" ) - return new BamPipe; - - // check for HTTP prefix - if ( source.find("http://") == 0 ) - return new BamHttp(source); - - // check for FTP prefix - if ( source.find("ftp://") == 0 ) - return new BamFtp(source); - - // otherwise assume a "normal" file - return new BamFile(source); -} diff --git a/src/api/internal/BamDeviceFactory_p.h b/src/api/internal/BamDeviceFactory_p.h deleted file mode 100644 index 1d48533..0000000 --- a/src/api/internal/BamDeviceFactory_p.h +++ /dev/null @@ -1,37 +0,0 @@ -// *************************************************************************** -// BamDeviceFactory_p.h (c) 2011 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) -// --------------------------------------------------------------------------- -// Creates built-in concrete implementations of IBamIODevices -// *************************************************************************** - -#ifndef BAMDEVICEFACTORY_P_H -#define BAMDEVICEFACTORY_P_H - -// ------------- -// W A R N I N G -// ------------- -// -// This file is not part of the BamTools API. It exists purely as an -// implementation detail. This header file may change from version to version -// without notice, or even be removed. -// -// We mean it. 
- -#include "api/IBamIODevice.h" -#include - -namespace BamTools { -namespace Internal { - -class BamDeviceFactory { - public: - static IBamIODevice* CreateDevice(const std::string& source); -}; - -} // namespace Internal -} // namespace BamTools - -#endif // BAMDEVICEFACTORY_P_H diff --git a/src/api/internal/BamException_p.cpp b/src/api/internal/BamException_p.cpp deleted file mode 100644 index 38469e7..0000000 --- a/src/api/internal/BamException_p.cpp +++ /dev/null @@ -1,15 +0,0 @@ -// *************************************************************************** -// BamException_p.cpp (c) 2011 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) -// --------------------------------------------------------------------------- -// Provides a basic exception class for BamTools internals -// *************************************************************************** - -#include "api/internal/BamException_p.h" -using namespace BamTools; -using namespace BamTools::Internal; -using namespace std; - -const string BamException::SEPARATOR = ": "; diff --git a/src/api/internal/BamException_p.h b/src/api/internal/BamException_p.h deleted file mode 100644 index 5199737..0000000 --- a/src/api/internal/BamException_p.h +++ /dev/null @@ -1,51 +0,0 @@ -// *************************************************************************** -// BamException_p.h (c) 2011 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 6 October 2011 (DB) -// --------------------------------------------------------------------------- -// Provides a basic exception class for BamTools internals -// *************************************************************************** - -#ifndef BAMEXCEPTION_P_H -#define BAMEXCEPTION_P_H - -// ------------- -// W A R N I N G -// 
------------- -// -// This file is not part of the BamTools API. It exists purely as an -// implementation detail. This header file may change from version to version -// without notice, or even be removed. -// -// We mean it. - -#include -#include - -namespace BamTools { -namespace Internal { - -class BamException : public std::exception { - - public: - inline BamException(const std::string& where, const std::string& message) - : std::exception() - , m_errorString(where + SEPARATOR + message) - { } - - inline ~BamException(void) throw() { } - - inline const char* what(void) const throw() { - return m_errorString.c_str(); - } - - private: - std::string m_errorString; - static const std::string SEPARATOR; -}; - -} // namespace Internal -} // namespace BamTools - -#endif // BAMEXCEPTION_P_H diff --git a/src/api/internal/BamFile_p.cpp b/src/api/internal/BamFile_p.cpp deleted file mode 100644 index 74c4ed6..0000000 --- a/src/api/internal/BamFile_p.cpp +++ /dev/null @@ -1,67 +0,0 @@ -// *************************************************************************** -// BamFile_p.cpp (c) 2011 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) -// --------------------------------------------------------------------------- -// Provides BAM file-specific IO behavior -// *************************************************************************** - -#include "api/internal/BamFile_p.h" -using namespace BamTools; -using namespace BamTools::Internal; - -#include -#include -using namespace std; - -BamFile::BamFile(const string& filename) - : ILocalIODevice() - , m_filename(filename) -{ } - -BamFile::~BamFile(void) { } - -void BamFile::Close(void) { - if ( IsOpen() ) { - m_filename.clear(); - ILocalIODevice::Close(); - } -} - -bool BamFile::IsRandomAccess(void) const { - return true; -} - -bool BamFile::Open(const IBamIODevice::OpenMode mode) { - - // 
make sure we're starting with a fresh file stream - Close(); - - // attempt to open FILE* depending on requested openmode - if ( mode == IBamIODevice::ReadOnly ) - m_stream = fopen(m_filename.c_str(), "rb"); - else if ( mode == IBamIODevice::WriteOnly ) - m_stream = fopen(m_filename.c_str(), "wb"); - else { - SetErrorString("BamFile::Open", "unknown open mode requested"); - return false; - } - - // check that we obtained a valid FILE* - if ( m_stream == 0 ) { - const string message_base = string("could not open file handle for "); - const string message = message_base + ( (m_filename.empty()) ? "empty filename" : m_filename ); - SetErrorString("BamFile::Open", message); - return false; - } - - // store current IO mode & return success - m_mode = mode; - return true; -} - -bool BamFile::Seek(const int64_t& position) { - BT_ASSERT_X( m_stream, "BamFile::Seek() - null stream" ); - return ( fseek64(m_stream, position, SEEK_SET) == 0 ); -} diff --git a/src/api/internal/BamFile_p.h b/src/api/internal/BamFile_p.h deleted file mode 100644 index 873e71a..0000000 --- a/src/api/internal/BamFile_p.h +++ /dev/null @@ -1,51 +0,0 @@ -// *************************************************************************** -// BamFile_p.h (c) 2011 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) -// --------------------------------------------------------------------------- -// Provides BAM file-specific IO behavior -// *************************************************************************** - -#ifndef BAMFILE_P_H -#define BAMFILE_P_H - -// ------------- -// W A R N I N G -// ------------- -// -// This file is not part of the BamTools API. It exists purely as an -// implementation detail. This header file may change from version to version -// without notice, or even be removed. -// -// We mean it. 
- -#include "api/internal/ILocalIODevice_p.h" -#include - -namespace BamTools { -namespace Internal { - -class BamFile : public ILocalIODevice { - - // ctor & dtor - public: - BamFile(const std::string& filename); - ~BamFile(void); - - // ILocalIODevice implementation - public: - void Close(void); - bool IsRandomAccess(void) const; - bool Open(const IBamIODevice::OpenMode mode); - bool Seek(const int64_t& position); - - // data members - private: - std::string m_filename; -}; - -} // namespace Internal -} // namespace BamTools - -#endif // BAMFILE_P_H diff --git a/src/api/internal/BamFtp_p.cpp b/src/api/internal/BamFtp_p.cpp deleted file mode 100644 index 779d099..0000000 --- a/src/api/internal/BamFtp_p.cpp +++ /dev/null @@ -1,56 +0,0 @@ -// *************************************************************************** -// BamFtp_p.cpp (c) 2011 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) -// --------------------------------------------------------------------------- -// Provides reading/writing of BAM files on FTP server -// *************************************************************************** - -#include "api/internal/BamFtp_p.h" -using namespace BamTools; -using namespace BamTools::Internal; - -using namespace std; - -BamFtp::BamFtp(const string& url) - : IBamIODevice() -{ - BT_ASSERT_X(false, "BamFtp not yet implemented"); -} - -BamFtp::~BamFtp(void) { } - -void BamFtp::Close(void) { - return ; -} - -bool BamFtp::IsRandomAccess(void) const { - return true; -} - -bool BamFtp::Open(const IBamIODevice::OpenMode mode) { - (void) mode; - return true; -} - -size_t BamFtp::Read(char* data, const unsigned int numBytes) { - (void)data; - (void)numBytes; - return 0; -} - -bool BamFtp::Seek(const int64_t& position) { - (void)position; - return true; -} - -int64_t BamFtp::Tell(void) const { - return -1; -} - -size_t 
BamFtp::Write(const char* data, const unsigned int numBytes) { - (void)data; - (void)numBytes; - return 0; -} diff --git a/src/api/internal/BamFtp_p.h b/src/api/internal/BamFtp_p.h deleted file mode 100644 index 1f5ee0f..0000000 --- a/src/api/internal/BamFtp_p.h +++ /dev/null @@ -1,56 +0,0 @@ -// *************************************************************************** -// BamFtp_p.h (c) 2011 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) -// --------------------------------------------------------------------------- -// Provides reading/writing of BAM files on FTP server -// *************************************************************************** - -#ifndef BAMFTP_P_H -#define BAMFTP_P_H - -// ------------- -// W A R N I N G -// ------------- -// -// This file is not part of the BamTools API. It exists purely as an -// implementation detail. This header file may change from version to version -// without notice, or even be removed. -// -// We mean it. 
- -#include "api/IBamIODevice.h" -#include - -namespace BamTools { -namespace Internal { - -class BamFtp : public IBamIODevice { - - // ctor & dtor - public: - BamFtp(const std::string& url); - ~BamFtp(void); - - // IBamIODevice implementation - public: - void Close(void); - bool IsRandomAccess(void) const; - bool Open(const IBamIODevice::OpenMode mode); - size_t Read(char* data, const unsigned int numBytes); - bool Seek(const int64_t& position); - int64_t Tell(void) const; - size_t Write(const char* data, const unsigned int numBytes); - - // internal methods - private: - - // data members - private: -}; - -} // namespace Internal -} // namespace BamTools - -#endif // BAMFTP_P_H diff --git a/src/api/internal/BamHeader_p.cpp b/src/api/internal/BamHeader_p.cpp deleted file mode 100644 index dc734bf..0000000 --- a/src/api/internal/BamHeader_p.cpp +++ /dev/null @@ -1,120 +0,0 @@ -// *************************************************************************** -// BamHeader_p.cpp (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) -// --------------------------------------------------------------------------- -// Provides the basic functionality for handling BAM headers. 
-// *************************************************************************** - -#include "api/BamAux.h" -#include "api/BamConstants.h" -#include "api/internal/BamException_p.h" -#include "api/internal/BamHeader_p.h" -#include "api/internal/BgzfStream_p.h" -using namespace BamTools; -using namespace BamTools::Internal; - -#include -#include -using namespace std; - -// ------------------------ -// static utility methods -// ------------------------ - -static inline -bool isValidMagicNumber(const char* buffer) { - return ( strncmp(buffer, Constants::BAM_HEADER_MAGIC, - Constants::BAM_HEADER_MAGIC_LENGTH) == 0 ); -} - -// -------------------------- -// BamHeader implementation -// -------------------------- - -// ctor -BamHeader::BamHeader(void) { } - -// dtor -BamHeader::~BamHeader(void) { } - -// reads magic number from BGZF stream, returns true if valid -void BamHeader::CheckMagicNumber(BgzfStream* stream) { - - // try to read magic number - char buffer[Constants::BAM_HEADER_MAGIC_LENGTH]; - const size_t numBytesRead = stream->Read(buffer, Constants::BAM_HEADER_MAGIC_LENGTH); - if ( numBytesRead != (int)Constants::BAM_HEADER_MAGIC_LENGTH ) - throw BamException("BamHeader::CheckMagicNumber", "could not read magic number"); - - // validate magic number - if ( !isValidMagicNumber(buffer) ) - throw BamException("BamHeader::CheckMagicNumber", "invalid magic number"); -} - -// clear SamHeader data -void BamHeader::Clear(void) { - m_header.Clear(); -} - -// return true if SamHeader data is valid -bool BamHeader::IsValid(void) const { - return m_header.IsValid(); -} - -// load BAM header ('magic number' and SAM header text) from BGZF stream -void BamHeader::Load(BgzfStream* stream) { - - // read & check magic number - CheckMagicNumber(stream); - - // read header (length, then actual text) - uint32_t length(0); - ReadHeaderLength(stream, length); - ReadHeaderText(stream, length); -} - -// reads SAM header text length from BGZF stream, stores it in @length -void 
BamHeader::ReadHeaderLength(BgzfStream* stream, uint32_t& length) { - - // read BAM header text length - char buffer[sizeof(uint32_t)]; - const size_t numBytesRead = stream->Read(buffer, sizeof(uint32_t)); - if ( numBytesRead != sizeof(uint32_t) ) - throw BamException("BamHeader::ReadHeaderLength", "could not read header length"); - - // convert char buffer to length - length = BamTools::UnpackUnsignedInt(buffer); - if ( BamTools::SystemIsBigEndian() ) - BamTools::SwapEndian_32(length); -} - -// reads SAM header text from BGZF stream, stores in SamHeader object -void BamHeader::ReadHeaderText(BgzfStream* stream, const uint32_t& length) { - - // read header text - char* headerText = (char*)calloc(length + 1, 1); - const size_t bytesRead = stream->Read(headerText, length); - - // if error reading, clean up buffer & throw - if ( bytesRead != length ) { - free(headerText); - throw BamException("BamHeader::ReadHeaderText", "could not read header text"); - } - - // otherwise, text was read OK - // store & cleanup - m_header.SetHeaderText( (string)((const char*)headerText) ); - free(headerText); -} - -// returns *copy* of SamHeader data object -SamHeader BamHeader::ToSamHeader(void) const { - return m_header; -} - -// returns SAM-formatted string of header data -string BamHeader::ToString(void) const { - return m_header.ToString(); -} diff --git a/src/api/internal/BamHeader_p.h b/src/api/internal/BamHeader_p.h deleted file mode 100644 index 499ad96..0000000 --- a/src/api/internal/BamHeader_p.h +++ /dev/null @@ -1,69 +0,0 @@ -// *************************************************************************** -// BamHeader_p.h (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) -// --------------------------------------------------------------------------- -// Provides the basic functionality for handling BAM headers. 
-// *************************************************************************** - -#ifndef BAMHEADER_P_H -#define BAMHEADER_P_H - -// ------------- -// W A R N I N G -// ------------- -// -// This file is not part of the BamTools API. It exists purely as an -// implementation detail. This header file may change from version to version -// without notice, or even be removed. -// -// We mean it. - -#include "api/SamHeader.h" -#include - -namespace BamTools { -namespace Internal { - -class BgzfStream; - -class BamHeader { - - // ctor & dtor - public: - BamHeader(void); - ~BamHeader(void); - - // BamHeader interface - public: - // clear SamHeader data - void Clear(void); - // return true if SamHeader data is valid - bool IsValid(void) const; - // load BAM header ('magic number' and SAM header text) from BGZF stream - // returns true if all OK - void Load(BgzfStream* stream); - // returns (editable) copy of SamHeader data object - SamHeader ToSamHeader(void) const; - // returns SAM-formatted string of header data - std::string ToString(void) const; - - // internal methods - private: - // reads magic number from BGZF stream - void CheckMagicNumber(BgzfStream* stream); - // reads SAM header length from BGZF stream, stores it in @length - void ReadHeaderLength(BgzfStream* stream, uint32_t& length); - // reads SAM header text from BGZF stream, stores in SamHeader object - void ReadHeaderText(BgzfStream* stream, const uint32_t& length); - - // data members - private: - SamHeader m_header; -}; - -} // namespace Internal -} // namespace BamTools - -#endif // BAMHEADER_P_H diff --git a/src/api/internal/BamHttp_p.cpp b/src/api/internal/BamHttp_p.cpp deleted file mode 100644 index 83b4c3b..0000000 --- a/src/api/internal/BamHttp_p.cpp +++ /dev/null @@ -1,56 +0,0 @@ -// *************************************************************************** -// BamHttp_p.cpp (c) 2011 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// 
--------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) -// --------------------------------------------------------------------------- -// Provides reading/writing of BAM files on HTTP server -// *************************************************************************** - -#include "api/internal/BamHttp_p.h" -using namespace BamTools; -using namespace BamTools::Internal; - -using namespace std; - -BamHttp::BamHttp(const string& url) - : IBamIODevice() -{ - BT_ASSERT_X(false, "BamHttp not yet implemented"); -} - -BamHttp::~BamHttp(void) { } - -void BamHttp::Close(void) { - return ; -} - -bool BamHttp::IsRandomAccess(void) const { - return true; -} - -bool BamHttp::Open(const IBamIODevice::OpenMode mode) { - (void) mode; - return true; -} - -size_t BamHttp::Read(char* data, const unsigned int numBytes) { - (void)data; - (void)numBytes; - return 0; -} - -bool BamHttp::Seek(const int64_t& position) { - (void)position; - return true; -} - -int64_t BamHttp::Tell(void) const { - return -1; -} - -size_t BamHttp::Write(const char* data, const unsigned int numBytes) { - (void)data; - (void)numBytes; - return 0; -} diff --git a/src/api/internal/BamHttp_p.h b/src/api/internal/BamHttp_p.h deleted file mode 100644 index 38e94b7..0000000 --- a/src/api/internal/BamHttp_p.h +++ /dev/null @@ -1,56 +0,0 @@ -// *************************************************************************** -// BamHttp_p.h (c) 2011 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) -// --------------------------------------------------------------------------- -// Provides reading/writing of BAM files on HTTP server -// *************************************************************************** - -#ifndef BAMHTTP_P_H -#define BAMHTTP_P_H - -// ------------- -// W A R N I N G -// ------------- -// -// This file 
is not part of the BamTools API. It exists purely as an -// implementation detail. This header file may change from version to version -// without notice, or even be removed. -// -// We mean it. - -#include "api/IBamIODevice.h" -#include - -namespace BamTools { -namespace Internal { - -class BamHttp : public IBamIODevice { - - // ctor & dtor - public: - BamHttp(const std::string& url); - ~BamHttp(void); - - // IBamIODevice implementation - public: - void Close(void); - bool IsRandomAccess(void) const; - bool Open(const IBamIODevice::OpenMode mode); - size_t Read(char* data, const unsigned int numBytes); - bool Seek(const int64_t& position); - int64_t Tell(void) const; - size_t Write(const char* data, const unsigned int numBytes); - - // internal methods - private: - - // data members - private: -}; - -} // namespace Internal -} // namespace BamTools - -#endif // BAMHTTP_P_H diff --git a/src/api/internal/BamIndexFactory_p.cpp b/src/api/internal/BamIndexFactory_p.cpp deleted file mode 100644 index 2cf871f..0000000 --- a/src/api/internal/BamIndexFactory_p.cpp +++ /dev/null @@ -1,112 +0,0 @@ -// *************************************************************************** -// BamIndexFactory_p.cpp (c) 2011 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) -// --------------------------------------------------------------------------- -// Provides interface for generating BamIndex implementations -// *************************************************************************** - -#include "api/BamAux.h" -#include "api/internal/BamIndexFactory_p.h" -#include "api/internal/BamStandardIndex_p.h" -#include "api/internal/BamToolsIndex_p.h" -using namespace BamTools; -using namespace BamTools::Internal; -using namespace std; - -// generates index filename from BAM filename (depending on requested type) -// if type is unknown, returns empty 
string -const string BamIndexFactory::CreateIndexFilename(const string& bamFilename, - const BamIndex::IndexType& type) -{ - switch ( type ) { - case ( BamIndex::STANDARD ) : return ( bamFilename + BamStandardIndex::Extension() ); - case ( BamIndex::BAMTOOLS ) : return ( bamFilename + BamToolsIndex::Extension() ); - default : - return string(); - } -} - -// creates a new BamIndex object, depending on extension of @indexFilename -BamIndex* BamIndexFactory::CreateIndexFromFilename(const string& indexFilename, BamReaderPrivate* reader) { - - // if file doesn't exist, return null index - if ( !BamTools::FileExists(indexFilename) ) - return 0; - - // get file extension from index filename, including dot (".EXT") - // if can't get file extension, return null index - const string extension = FileExtension(indexFilename); - if ( extension.empty() ) - return 0; - - // create index based on extension - if ( extension == BamStandardIndex::Extension() ) return new BamStandardIndex(reader); - else if ( extension == BamToolsIndex::Extension() ) return new BamToolsIndex(reader); - else - return 0; -} - -// creates a new BamIndex, object of requested @type -BamIndex* BamIndexFactory::CreateIndexOfType(const BamIndex::IndexType& type, - BamReaderPrivate* reader) -{ - switch ( type ) { - case ( BamIndex::STANDARD ) : return new BamStandardIndex(reader); - case ( BamIndex::BAMTOOLS ) : return new BamToolsIndex(reader); - default : - return 0; - } -} - -// retrieves file extension (including '.') -const string BamIndexFactory::FileExtension(const string& filename) { - - // if filename cannot contain valid path + extension, return empty string - if ( filename.empty() || filename.length() <= 4 ) - return string(); - - // look for last dot in filename - const size_t lastDotPosition = filename.find_last_of('.'); - - // if none found, return empty string - if ( lastDotPosition == string::npos ) - return string(); - - // return substring from last dot position - return 
filename.substr(lastDotPosition); -} - -// returns name of existing index file that corresponds to @bamFilename -// will defer to @preferredType if possible, if not will attempt to load any supported type -// returns empty string if not found -const string BamIndexFactory::FindIndexFilename(const string& bamFilename, - const BamIndex::IndexType& preferredType) -{ - // skip if BAM filename provided is empty - if ( bamFilename.empty() ) - return string(); - - // try to find index of preferred type first - // return index filename if found - string indexFilename = CreateIndexFilename(bamFilename, preferredType); - if ( !indexFilename.empty() && BamTools::FileExists(indexFilename) ) - return indexFilename; - - // couldn't find preferred type, try the other supported types - // return index filename if found - if ( preferredType != BamIndex::STANDARD ) { - indexFilename = CreateIndexFilename(bamFilename, BamIndex::STANDARD); - if ( !indexFilename.empty() && BamTools::FileExists(indexFilename) ) - return indexFilename; - } - if ( preferredType != BamIndex::BAMTOOLS ) { - indexFilename = CreateIndexFilename(bamFilename, BamIndex::BAMTOOLS); - if ( !indexFilename.empty() && BamTools::FileExists(indexFilename) ) - return indexFilename; - } - - // otherwise couldn't find any index matching this filename - return string(); -} diff --git a/src/api/internal/BamIndexFactory_p.h b/src/api/internal/BamIndexFactory_p.h deleted file mode 100644 index 4e4f1cf..0000000 --- a/src/api/internal/BamIndexFactory_p.h +++ /dev/null @@ -1,49 +0,0 @@ -// *************************************************************************** -// BamIndexFactory_p.h (c) 2011 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) -// --------------------------------------------------------------------------- -// Provides interface for generating BamIndex implementations -// 
*************************************************************************** - -#ifndef BAMINDEX_FACTORY_P_H -#define BAMINDEX_FACTORY_P_H - -#include "api/BamIndex.h" -#include - -namespace BamTools { -namespace Internal { - -class BamIndexFactory { - - // static interface methods - public: - // creates a new BamIndex object, depending on extension of @indexFilename - static BamIndex* CreateIndexFromFilename(const std::string& indexFilename, - BamReaderPrivate* reader); - // creates a new BamIndex object, of requested @type - static BamIndex* CreateIndexOfType(const BamIndex::IndexType& type, - BamReaderPrivate* reader); - // returns name of existing index file that corresponds to @bamFilename - // will defer to @preferredType if possible - // if @preferredType not found, will attempt to load any supported index type - // returns empty string if no index file (of any type) is found - static const std::string FindIndexFilename(const std::string& bamFilename, - const BamIndex::IndexType& preferredType); - - // internal methods - public: - // generates index filename from BAM filename (depending on requested type) - // if type is unknown, returns empty string - static const std::string CreateIndexFilename(const std::string& bamFilename, - const BamIndex::IndexType& type); - // retrieves file extension (including '.') - static const std::string FileExtension(const std::string& filename); -}; - -} // namespace Internal -} // namespace BamTools - -#endif // BAMINDEX_FACTORY_P_H diff --git a/src/api/internal/BamMultiMerger_p.h b/src/api/internal/BamMultiMerger_p.h deleted file mode 100644 index 3000097..0000000 --- a/src/api/internal/BamMultiMerger_p.h +++ /dev/null @@ -1,266 +0,0 @@ -// *************************************************************************** -// BamMultiMerger_p.h (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 October 2011 
(DB) -// --------------------------------------------------------------------------- -// Provides merging functionality for BamMultiReader. At this point, supports -// sorting results by (refId, position) or by read name. -// *************************************************************************** - -#ifndef BAMMULTIMERGER_P_H -#define BAMMULTIMERGER_P_H - -// ------------- -// W A R N I N G -// ------------- -// -// This file is not part of the BamTools API. It exists purely as an -// implementation detail. This header file may change from version to version -// without notice, or even be removed. -// -// We mean it. - -#include "api/BamAlignment.h" -#include "api/BamReader.h" -#include "api/algorithms/Sort.h" -#include -#include -#include -#include - -namespace BamTools { -namespace Internal { - -struct MergeItem { - - // data members - BamReader* Reader; - BamAlignment* Alignment; - - // ctors & dtor - MergeItem(BamReader* reader = 0, - BamAlignment* alignment = 0) - : Reader(reader) - , Alignment(alignment) - { } - - MergeItem(const MergeItem& other) - : Reader(other.Reader) - , Alignment(other.Alignment) - { } - - ~MergeItem(void) { } -}; - -template -struct MergeItemSorter : public std::binary_function { - - public: - MergeItemSorter(const Compare& comp = Compare()) - : m_comp(comp) - { } - - bool operator()(const MergeItem& lhs, const MergeItem& rhs) { - const BamAlignment& l = *lhs.Alignment; - const BamAlignment& r = *rhs.Alignment; - return m_comp(l,r); - } - - private: - Compare m_comp; -}; - -// pure ABC so we can just work polymorphically with any specific merger implementation -class IMultiMerger { - - public: - IMultiMerger(void) { } - virtual ~IMultiMerger(void) { } - public: - virtual void Add(MergeItem item) =0; - virtual void Clear(void) =0; - virtual const MergeItem& First(void) const =0; - virtual bool IsEmpty(void) const =0; - virtual void Remove(BamReader* reader) =0; - virtual int Size(void) const =0; - virtual MergeItem TakeFirst(void) 
=0; -}; - -// general merger -template -class MultiMerger : public IMultiMerger { - - public: - typedef Compare CompareType; - typedef MergeItemSorter MergeType; - - public: - explicit MultiMerger(const Compare& comp = Compare()) - : IMultiMerger() - , m_data( MergeType(comp) ) - { } - ~MultiMerger(void) { } - - public: - void Add(MergeItem item); - void Clear(void); - const MergeItem& First(void) const; - bool IsEmpty(void) const; - void Remove(BamReader* reader); - int Size(void) const; - MergeItem TakeFirst(void); - - private: - typedef MergeItem ValueType; - typedef std::multiset ContainerType; - typedef typename ContainerType::iterator DataIterator; - typedef typename ContainerType::const_iterator DataConstIterator; - ContainerType m_data; -}; - -template -inline void MultiMerger::Add(MergeItem item) { - - // N.B. - any future custom Compare types must define this method - // see algorithms/Sort.h - - if ( CompareType::UsesCharData() ) - item.Alignment->BuildCharData(); - m_data.insert(item); -} - -template -inline void MultiMerger::Clear(void) { - m_data.clear(); -} - -template -inline const MergeItem& MultiMerger::First(void) const { - const ValueType& entry = (*m_data.begin()); - return entry; -} - -template -inline bool MultiMerger::IsEmpty(void) const { - return m_data.empty(); -} -template -inline void MultiMerger::Remove(BamReader* reader) { - - if ( reader == 0 ) return; - const std::string& filenameToRemove = reader->GetFilename(); - - // iterate over readers in cache - DataIterator dataIter = m_data.begin(); - DataIterator dataEnd = m_data.end(); - for ( ; dataIter != dataEnd; ++dataIter ) { - const MergeItem& item = (*dataIter); - const BamReader* itemReader = item.Reader; - if ( itemReader == 0 ) continue; - - // remove iterator on match - if ( itemReader->GetFilename() == filenameToRemove ) { - m_data.erase(dataIter); - return; - } - } -} -template -inline int MultiMerger::Size(void) const { - return m_data.size(); -} - -template -inline MergeItem 
MultiMerger::TakeFirst(void) { - DataIterator firstIter = m_data.begin(); - MergeItem firstItem = (*firstIter); - m_data.erase(firstIter); - return firstItem; -} - -// unsorted "merger" -template<> -class MultiMerger : public IMultiMerger { - - public: - explicit MultiMerger(const Algorithms::Sort::Unsorted& comp = Algorithms::Sort::Unsorted()) - : IMultiMerger() - { } - ~MultiMerger(void) { } - - public: - void Add(MergeItem item); - void Clear(void); - const MergeItem& First(void) const; - bool IsEmpty(void) const; - void Remove(BamReader* reader); - int Size(void) const; - MergeItem TakeFirst(void); - - private: - typedef MergeItem ValueType; - typedef std::deque ContainerType; - typedef ContainerType::iterator DataIterator; - typedef ContainerType::const_iterator DataConstIterator; - ContainerType m_data; -}; - -inline -void MultiMerger::Add(MergeItem item) { - m_data.push_back(item); -} - -inline -void MultiMerger::Clear(void) { - m_data.clear(); -} - -inline -const MergeItem& MultiMerger::First(void) const { - return m_data.front(); -} - -inline -bool MultiMerger::IsEmpty(void) const { - return m_data.empty(); -} - -inline -void MultiMerger::Remove(BamReader* reader) { - - if ( reader == 0 ) return; - const std::string filenameToRemove = reader->GetFilename(); - - // iterate over readers in cache - DataIterator dataIter = m_data.begin(); - DataIterator dataEnd = m_data.end(); - for ( ; dataIter != dataEnd; ++dataIter ) { - const MergeItem& item = (*dataIter); - const BamReader* itemReader = item.Reader; - if ( itemReader == 0 ) continue; - - // remove iterator on match - if ( itemReader->GetFilename() == filenameToRemove ) { - m_data.erase(dataIter); - return; - } - } -} - -inline -int MultiMerger::Size(void) const { - return m_data.size(); -} - -inline -MergeItem MultiMerger::TakeFirst(void) { - MergeItem firstItem = m_data.front(); - m_data.pop_front(); - return firstItem; -} - -} // namespace Internal -} // namespace BamTools - -#endif // 
BAMMULTIMERGER_P_H diff --git a/src/api/internal/BamMultiReader_p.cpp b/src/api/internal/BamMultiReader_p.cpp deleted file mode 100644 index 55ae615..0000000 --- a/src/api/internal/BamMultiReader_p.cpp +++ /dev/null @@ -1,799 +0,0 @@ -// *************************************************************************** -// BamMultiReader_p.cpp (c) 2010 Derek Barnett, Erik Garrison -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 14 October 2011 (DB) -// --------------------------------------------------------------------------- -// Functionality for simultaneously reading multiple BAM files -// ************************************************************************* - -#include "api/BamAlignment.h" -#include "api/BamMultiReader.h" -#include "api/SamConstants.h" -#include "api/algorithms/Sort.h" -#include "api/internal/BamMultiReader_p.h" -using namespace BamTools; -using namespace BamTools::Internal; - -#include -#include -#include -#include -#include -using namespace std; - -// ctor -BamMultiReaderPrivate::BamMultiReaderPrivate(void) - : m_alignmentCache(0) -{ } - -// dtor -BamMultiReaderPrivate::~BamMultiReaderPrivate(void) { - Close(); -} - -// close all BAM files -bool BamMultiReaderPrivate::Close(void) { - - m_errorString.clear(); - - if ( CloseFiles(Filenames()) ) - return true; - else { - const string currentError = m_errorString; - const string message = string("error encountered while closing all files: \n\t") + currentError; - SetErrorString("BamMultiReader::Close", message); - return false; - } -} - -// close requested BAM file -bool BamMultiReaderPrivate::CloseFile(const string& filename) { - - m_errorString.clear(); - - vector filenames(1, filename); - if ( CloseFiles(filenames) ) - return true; - else { - const string currentError = m_errorString; - const string message = string("error while closing file: ") + filename + "\n" + currentError; - 
SetErrorString("BamMultiReader::CloseFile", message); - return false; - } -} - -// close requested BAM files -bool BamMultiReaderPrivate::CloseFiles(const vector& filenames) { - - bool errorsEncountered = false; - m_errorString.clear(); - - // iterate over filenames - vector::const_iterator filesIter = filenames.begin(); - vector::const_iterator filesEnd = filenames.end(); - for ( ; filesIter != filesEnd; ++filesIter ) { - const string& filename = (*filesIter); - if ( filename.empty() ) continue; - - // iterate over readers - vector::iterator readerIter = m_readers.begin(); - vector::iterator readerEnd = m_readers.end(); - for ( ; readerIter != readerEnd; ++readerIter ) { - MergeItem& item = (*readerIter); - BamReader* reader = item.Reader; - if ( reader == 0 ) continue; - - // if reader matches requested filename - if ( reader->GetFilename() == filename ) { - - // remove reader's entry from alignment cache - m_alignmentCache->Remove(reader); - - // clean up reader & its alignment - if ( !reader->Close() ) { - m_errorString.append(1, '\t'); - m_errorString.append(reader->GetErrorString()); - m_errorString.append(1, '\n'); - errorsEncountered = true; - } - delete reader; - reader = 0; - - // delete reader's alignment entry - BamAlignment* alignment = item.Alignment; - delete alignment; - alignment = 0; - - // remove reader from reader list - m_readers.erase(readerIter); - - // on match, just go on to next filename - // (no need to keep looking and item iterator is invalid now anyway) - break; - } - } - } - - // make sure alignment cache is cleaned up if all readers closed - if ( m_readers.empty() && m_alignmentCache ) { - m_alignmentCache->Clear(); - delete m_alignmentCache; - m_alignmentCache = 0; - } - - // return whether all readers closed OK - return !errorsEncountered; -} - -// creates index files for BAM files that don't have them -bool BamMultiReaderPrivate::CreateIndexes(const BamIndex::IndexType& type) { - - bool errorsEncountered = false; - 
m_errorString.clear(); - - // iterate over readers - vector::iterator itemIter = m_readers.begin(); - vector::iterator itemEnd = m_readers.end(); - for ( ; itemIter != itemEnd; ++itemIter ) { - MergeItem& item = (*itemIter); - BamReader* reader = item.Reader; - if ( reader == 0 ) continue; - - // if reader doesn't have an index, create one - if ( !reader->HasIndex() ) { - if ( !reader->CreateIndex(type) ) { - m_errorString.append(1, '\t'); - m_errorString.append(reader->GetErrorString()); - m_errorString.append(1, '\n'); - errorsEncountered = true; - } - } - } - - // check for errors encountered before returning success/fail - if ( errorsEncountered ) { - const string currentError = m_errorString; - const string message = string("error while creating index files: ") + "\n" + currentError; - SetErrorString("BamMultiReader::CreateIndexes", message); - return false; - } else - return true; -} - -IMultiMerger* BamMultiReaderPrivate::CreateAlignmentCache(void) const { - - // fetch SamHeader - SamHeader header = GetHeader(); - - // if BAM files are sorted by position - if ( header.SortOrder == Constants::SAM_HD_SORTORDER_COORDINATE ) - return new MultiMerger(); - - // if BAM files are sorted by read name - if ( header.SortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME ) - return new MultiMerger(); - - // otherwise "unknown" or "unsorted", use unsorted merger and just read in - return new MultiMerger(); -} - -const vector BamMultiReaderPrivate::Filenames(void) const { - - // init filename container - vector filenames; - filenames.reserve( m_readers.size() ); - - // iterate over readers - vector::const_iterator itemIter = m_readers.begin(); - vector::const_iterator itemEnd = m_readers.end(); - for ( ; itemIter != itemEnd; ++itemIter ) { - const MergeItem& item = (*itemIter); - const BamReader* reader = item.Reader; - if ( reader == 0 ) continue; - - // store filename if not empty - const string& filename = reader->GetFilename(); - if ( !filename.empty() ) - 
filenames.push_back(filename); - } - - // return result - return filenames; -} - -string BamMultiReaderPrivate::GetErrorString(void) const { - return m_errorString; -} - -SamHeader BamMultiReaderPrivate::GetHeader(void) const { - const string& text = GetHeaderText(); - return SamHeader(text); -} - -// makes a virtual, unified header for all the bam files in the multireader -string BamMultiReaderPrivate::GetHeaderText(void) const { - - // N.B. - right now, simply copies all header data from first BAM, - // and then appends RG's from other BAM files - // TODO: make this more intelligent wrt other header lines/fields - - // if no readers open - const size_t numReaders = m_readers.size(); - if ( numReaders == 0 ) return string(); - - // retrieve first reader's header - const MergeItem& firstItem = m_readers.front(); - const BamReader* reader = firstItem.Reader; - if ( reader == 0 ) return string(); - SamHeader mergedHeader = reader->GetHeader(); - - // iterate over any remaining readers (skipping the first) - for ( size_t i = 1; i < numReaders; ++i ) { - const MergeItem& item = m_readers.at(i); - const BamReader* reader = item.Reader; - if ( reader == 0 ) continue; - - // retrieve current reader's header - const SamHeader currentHeader = reader->GetHeader(); - - // append current reader's RG entries to merged header - // N.B. - SamReadGroupDictionary handles duplicate-checking - mergedHeader.ReadGroups.Add(currentHeader.ReadGroups); - - // TODO: merge anything else?? 
- } - - // return stringified header - return mergedHeader.ToString(); -} - -// get next alignment among all files -bool BamMultiReaderPrivate::GetNextAlignment(BamAlignment& al) { - return PopNextCachedAlignment(al, true); -} - -// get next alignment among all files without parsing character data from alignments -bool BamMultiReaderPrivate::GetNextAlignmentCore(BamAlignment& al) { - return PopNextCachedAlignment(al, false); -} - -// --------------------------------------------------------------------------------------- -// -// NB: The following GetReferenceX() functions assume that we have identical -// references for all BAM files. We enforce this by invoking the -// ValidateReaders() method to verify that our reference data is the same -// across all files on Open - so we will not encounter a situation in which -// there is a mismatch and we are still live. -// -// --------------------------------------------------------------------------------------- - -// returns the number of reference sequences -int BamMultiReaderPrivate::GetReferenceCount(void) const { - - // handle empty multireader - if ( m_readers.empty() ) return 0; - - // return reference count from first reader - const MergeItem& item = m_readers.front(); - const BamReader* reader = item.Reader; - if ( reader == 0 ) return 0; - else - return reader->GetReferenceCount(); -} - -// returns vector of reference objects -const RefVector BamMultiReaderPrivate::GetReferenceData(void) const { - - // handle empty multireader - if ( m_readers.empty() ) return RefVector(); - - // return reference data from first BamReader - const MergeItem& item = m_readers.front(); - const BamReader* reader = item.Reader; - if ( reader == 0 ) return RefVector(); - else - return reader->GetReferenceData(); -} - -// returns refID from reference name -int BamMultiReaderPrivate::GetReferenceID(const string& refName) const { - - // handle empty multireader - if ( m_readers.empty() ) return -1; - - // return reference ID from first 
BamReader - const MergeItem& item = m_readers.front(); - const BamReader* reader = item.Reader; - if ( reader == 0 ) return -1; - else - return reader->GetReferenceID(refName); -} -// --------------------------------------------------------------------------------------- - -// returns true if all readers have index data available -// this is useful to indicate whether Jump() or SetRegion() are possible -bool BamMultiReaderPrivate::HasIndexes(void) const { - - // handle empty multireader - if ( m_readers.empty() ) - return false; - - bool result = true; - - // iterate over readers - vector::const_iterator readerIter = m_readers.begin(); - vector::const_iterator readerEnd = m_readers.end(); - for ( ; readerIter != readerEnd; ++readerIter ) { - const MergeItem& item = (*readerIter); - const BamReader* reader = item.Reader; - if ( reader == 0 ) continue; - - // see if current reader has index data - result &= reader->HasIndex(); - } - - return result; -} - -// returns true if multireader has open readers -bool BamMultiReaderPrivate::HasOpenReaders(void) { - - // iterate over readers - vector::const_iterator readerIter = m_readers.begin(); - vector::const_iterator readerEnd = m_readers.end(); - for ( ; readerIter != readerEnd; ++readerIter ) { - const MergeItem& item = (*readerIter); - const BamReader* reader = item.Reader; - if ( reader == 0 ) continue; - - // return true whenever an open reader is found - if ( reader->IsOpen() ) return true; - } - - // no readers open - return false; -} - -// performs random-access jump using (refID, position) as a left-bound -bool BamMultiReaderPrivate::Jump(int refID, int position) { - - // NB: While it may make sense to track readers in which we can - // successfully Jump, in practice a failure of Jump means "no - // alignments here." It makes sense to simply accept the failure, - // UpdateAlignments(), and continue. 
- - // iterate over readers - vector::iterator readerIter = m_readers.begin(); - vector::iterator readerEnd = m_readers.end(); - for ( ; readerIter != readerEnd; ++readerIter ) { - MergeItem& item = (*readerIter); - BamReader* reader = item.Reader; - if ( reader == 0 ) continue; - - // jump in each BamReader to position of interest - reader->Jump(refID, position); - } - - // returns status of cache update - return UpdateAlignmentCache(); -} - -// locate (& load) index files for BAM readers that don't already have one loaded -bool BamMultiReaderPrivate::LocateIndexes(const BamIndex::IndexType& preferredType) { - - bool errorsEncountered = false; - m_errorString.clear(); - - // iterate over readers - vector::iterator readerIter = m_readers.begin(); - vector::iterator readerEnd = m_readers.end(); - for ( ; readerIter != readerEnd; ++readerIter ) { - MergeItem& item = (*readerIter); - BamReader* reader = item.Reader; - if ( reader == 0 ) continue; - - // if reader has no index, try to locate one - if ( !reader->HasIndex() ) { - if ( !reader->LocateIndex(preferredType) ) { - m_errorString.append(1, '\t'); - m_errorString.append(reader->GetErrorString()); - m_errorString.append(1, '\n'); - errorsEncountered = true; - } - } - } - - // check for errors encountered before returning success/fail - if ( errorsEncountered ) { - const string currentError = m_errorString; - const string message = string("error while locating index files: ") + "\n" + currentError; - SetErrorString("BamMultiReader::LocatingIndexes", message); - return false; - } else - return true; -} - -// opens BAM files -bool BamMultiReaderPrivate::Open(const vector& filenames) { - - m_errorString.clear(); - - // put all current readers back at beginning (refreshes alignment cache) - if ( !Rewind() ) { - const string currentError = m_errorString; - const string message = string("unable to rewind existing readers: \n\t") + currentError; - SetErrorString("BamMultiReader::Open", message); - return false; - } - - 
// iterate over filenames - bool errorsEncountered = false; - vector::const_iterator filenameIter = filenames.begin(); - vector::const_iterator filenameEnd = filenames.end(); - for ( ; filenameIter != filenameEnd; ++filenameIter ) { - const string& filename = (*filenameIter); - if ( filename.empty() ) continue; - - // attempt to open BamReader - BamReader* reader = new BamReader; - const bool readerOpened = reader->Open(filename); - - // if opened OK, store it - if ( readerOpened ) - m_readers.push_back( MergeItem(reader, new BamAlignment) ); - - // otherwise store error & clean up invalid reader - else { - m_errorString.append(1, '\t'); - m_errorString += string("unable to open file: ") + filename; - m_errorString.append(1, '\n'); - errorsEncountered = true; - - delete reader; - reader = 0; - } - } - - // check for errors while opening - if ( errorsEncountered ) { - const string currentError = m_errorString; - const string message = string("unable to open all files: \t\n") + currentError; - SetErrorString("BamMultiReader::Open", message); - return false; - } - - // check for BAM file consistency - if ( !ValidateReaders() ) { - const string currentError = m_errorString; - const string message = string("unable to open inconsistent files: \t\n") + currentError; - SetErrorString("BamMultiReader::Open", message); - return false; - } - - // update alignment cache - return UpdateAlignmentCache(); -} - -bool BamMultiReaderPrivate::OpenFile(const std::string& filename) { - vector filenames(1, filename); - if ( Open(filenames) ) - return true; - else { - const string currentError = m_errorString; - const string message = string("could not open file: ") + filename + "\n\t" + currentError; - SetErrorString("BamMultiReader::OpenFile", message); - return false; - } -} - -bool BamMultiReaderPrivate::OpenIndexes(const vector& indexFilenames) { - - // TODO: This needs to be cleaner - should not assume same order. - // And either way, shouldn't start at first reader. 
Should start at - // first reader without an index? - - // make sure same number of index filenames as readers - if ( m_readers.size() != indexFilenames.size() ) { - const string message("size of index file list does not match current BAM file count"); - SetErrorString("BamMultiReader::OpenIndexes", message); - return false; - } - - bool errorsEncountered = false; - m_errorString.clear(); - - // iterate over BamReaders - vector::const_iterator indexFilenameIter = indexFilenames.begin(); - vector::const_iterator indexFilenameEnd = indexFilenames.end(); - vector::iterator readerIter = m_readers.begin(); - vector::iterator readerEnd = m_readers.end(); - for ( ; readerIter != readerEnd; ++readerIter ) { - MergeItem& item = (*readerIter); - BamReader* reader = item.Reader; - - // open index filename on reader - if ( reader ) { - const string& indexFilename = (*indexFilenameIter); - if ( !reader->OpenIndex(indexFilename) ) { - m_errorString.append(1, '\t'); - m_errorString += reader->GetErrorString(); - m_errorString.append(1, '\n'); - errorsEncountered = true; - } - } - - // increment filename iterator, skip if no more index files to open - if ( ++indexFilenameIter == indexFilenameEnd ) - break; - } - - // return success/fail - if ( errorsEncountered ) { - const string currentError = m_errorString; - const string message = string("could not open all index files: \n\t") + currentError; - SetErrorString("BamMultiReader::OpenIndexes", message); - return false; - } else - return true; -} - -bool BamMultiReaderPrivate::PopNextCachedAlignment(BamAlignment& al, const bool needCharData) { - - // skip if no alignments available - if ( m_alignmentCache == 0 || m_alignmentCache->IsEmpty() ) - return false; - - // pop next merge item entry from cache - MergeItem item = m_alignmentCache->TakeFirst(); - BamReader* reader = item.Reader; - BamAlignment* alignment = item.Alignment; - if ( reader == 0 || alignment == 0 ) - return false; - - // set char data if requested - if ( 
needCharData ) { - alignment->BuildCharData(); - alignment->Filename = reader->GetFilename(); - } - - // store cached alignment into destination parameter (by copy) - al = *alignment; - - // load next alignment from reader & store in cache - SaveNextAlignment(reader, alignment); - return true; -} - -// returns BAM file pointers to beginning of alignment data & resets alignment cache -bool BamMultiReaderPrivate::Rewind(void) { - - // skip if no readers open - if ( m_readers.empty() ) - return true; - - // attempt to rewind files - if ( !RewindReaders() ) { - const string currentError = m_errorString; - const string message = string("could not rewind readers: \n\t") + currentError; - SetErrorString("BamMultiReader::Rewind", message); - return false; - } - - // return status of cache update - return UpdateAlignmentCache(); -} - -// returns BAM file pointers to beginning of alignment data -bool BamMultiReaderPrivate::RewindReaders(void) { - - m_errorString.clear(); - bool errorsEncountered = false; - - // iterate over readers - vector::iterator readerIter = m_readers.begin(); - vector::iterator readerEnd = m_readers.end(); - for ( ; readerIter != readerEnd; ++readerIter ) { - MergeItem& item = (*readerIter); - BamReader* reader = item.Reader; - if ( reader == 0 ) continue; - - // attempt rewind on BamReader - if ( !reader->Rewind() ) { - m_errorString.append(1, '\t'); - m_errorString.append( reader->GetErrorString() ); - m_errorString.append(1, '\n'); - errorsEncountered = true; - } - } - - return !errorsEncountered; -} - -void BamMultiReaderPrivate::SaveNextAlignment(BamReader* reader, BamAlignment* alignment) { - - // if can read alignment from reader, store in cache - // - // N.B. 
- lazy building of alignment's char data - populated only: - // automatically by alignment cache to maintain its sorting OR - // on demand from client call to future call to GetNextAlignment() - - if ( reader->GetNextAlignmentCore(*alignment) ) - m_alignmentCache->Add( MergeItem(reader, alignment) ); -} - -void BamMultiReaderPrivate::SetErrorString(const string& where, const string& what) const { - static const string SEPARATOR = ": "; - m_errorString = where + SEPARATOR + what; -} - -bool BamMultiReaderPrivate::SetRegion(const BamRegion& region) { - - // NB: While it may make sense to track readers in which we can - // successfully SetRegion, In practice a failure of SetRegion means "no - // alignments here." It makes sense to simply accept the failure, - // UpdateAlignments(), and continue. - - // iterate over alignments - vector::iterator readerIter = m_readers.begin(); - vector::iterator readerEnd = m_readers.end(); - for ( ; readerIter != readerEnd; ++readerIter ) { - MergeItem& item = (*readerIter); - BamReader* reader = item.Reader; - if ( reader == 0 ) continue; - - // set region of interest - reader->SetRegion(region); - } - - // return status of cache update - return UpdateAlignmentCache(); -} - -// updates our alignment cache -bool BamMultiReaderPrivate::UpdateAlignmentCache(void) { - - // create alignment cache if not created yet - if ( m_alignmentCache == 0 ) { - m_alignmentCache = CreateAlignmentCache(); - if ( m_alignmentCache == 0 ) { - SetErrorString("BamMultiReader::UpdateAlignmentCache", "unable to create new alignment cache"); - return false; - } - } - - // clear any prior cache data - m_alignmentCache->Clear(); - - // iterate over readers - vector::iterator readerIter = m_readers.begin(); - vector::iterator readerEnd = m_readers.end(); - for ( ; readerIter != readerEnd; ++readerIter ) { - MergeItem& item = (*readerIter); - BamReader* reader = item.Reader; - BamAlignment* alignment = item.Alignment; - if ( reader == 0 || alignment == 0 ) 
continue; - - // save next alignment from each reader in cache - SaveNextAlignment(reader, alignment); - } - - // if we get here, ok - return true; -} - -// ValidateReaders checks that all the readers point to BAM files representing -// alignments against the same set of reference sequences, and that the -// sequences are identically ordered. If these checks fail the operation of -// the multireader is undefined, so we force program exit. -bool BamMultiReaderPrivate::ValidateReaders(void) const { - - m_errorString.clear(); - - // skip if 0 or 1 readers opened - if ( m_readers.empty() || (m_readers.size() == 1) ) - return true; - - // retrieve first reader - const MergeItem& firstItem = m_readers.front(); - const BamReader* firstReader = firstItem.Reader; - if ( firstReader == 0 ) return false; - - // retrieve first reader's header data - const SamHeader& firstReaderHeader = firstReader->GetHeader(); - const string& firstReaderSortOrder = firstReaderHeader.SortOrder; - - // retrieve first reader's reference data - const RefVector& firstReaderRefData = firstReader->GetReferenceData(); - const int firstReaderRefCount = firstReader->GetReferenceCount(); - const int firstReaderRefSize = firstReaderRefData.size(); - - // iterate over all readers - vector::const_iterator readerIter = m_readers.begin(); - vector::const_iterator readerEnd = m_readers.end(); - for ( ; readerIter != readerEnd; ++readerIter ) { - const MergeItem& item = (*readerIter); - BamReader* reader = item.Reader; - if ( reader == 0 ) continue; - - // get current reader's header data - const SamHeader& currentReaderHeader = reader->GetHeader(); - const string& currentReaderSortOrder = currentReaderHeader.SortOrder; - - // check compatible sort order - if ( currentReaderSortOrder != firstReaderSortOrder ) { - const string message = string("mismatched sort order in ") + reader->GetFilename() + - ", expected " + firstReaderSortOrder + - ", but found " + currentReaderSortOrder; - 
SetErrorString("BamMultiReader::ValidateReaders", message); - return false; - } - - // get current reader's reference data - const RefVector currentReaderRefData = reader->GetReferenceData(); - const int currentReaderRefCount = reader->GetReferenceCount(); - const int currentReaderRefSize = currentReaderRefData.size(); - - // init reference data iterators - RefVector::const_iterator firstRefIter = firstReaderRefData.begin(); - RefVector::const_iterator firstRefEnd = firstReaderRefData.end(); - RefVector::const_iterator currentRefIter = currentReaderRefData.begin(); - - // compare reference counts from BamReader ( & container size, in case of BR error) - if ( (currentReaderRefCount != firstReaderRefCount) || - (firstReaderRefSize != currentReaderRefSize) ) - { - stringstream s(""); - s << "mismatched reference count in " << reader->GetFilename() - << ", expected " << firstReaderRefCount - << ", but found " << currentReaderRefCount; - SetErrorString("BamMultiReader::ValidateReaders", s.str()); - return false; - } - - // this will be ok; we just checked above that we have identically-sized sets of references - // here we simply check if they are all, in fact, equal in content - while ( firstRefIter != firstRefEnd ) { - const RefData& firstRef = (*firstRefIter); - const RefData& currentRef = (*currentRefIter); - - // compare reference name & length - if ( (firstRef.RefName != currentRef.RefName) || - (firstRef.RefLength != currentRef.RefLength) ) - { - stringstream s(""); - s << "mismatched references found in" << reader->GetFilename() - << "expected: " << endl; - - // print first reader's reference data - RefVector::const_iterator refIter = firstReaderRefData.begin(); - RefVector::const_iterator refEnd = firstReaderRefData.end(); - for ( ; refIter != refEnd; ++refIter ) { - const RefData& entry = (*refIter); - stringstream s(""); - s << entry.RefName << " " << endl; - } - - s << "but found: " << endl; - - // print current reader's reference data - refIter = 
currentReaderRefData.begin(); - refEnd = currentReaderRefData.end(); - for ( ; refIter != refEnd; ++refIter ) { - const RefData& entry = (*refIter); - s << entry.RefName << " " << entry.RefLength << endl; - } - - SetErrorString("BamMultiReader::ValidateReaders", s.str()); - return false; - } - - // update iterators - ++firstRefIter; - ++currentRefIter; - } - } - - // if we get here, everything checks out - return true; -} diff --git a/src/api/internal/BamMultiReader_p.h b/src/api/internal/BamMultiReader_p.h deleted file mode 100644 index 9d001f5..0000000 --- a/src/api/internal/BamMultiReader_p.h +++ /dev/null @@ -1,99 +0,0 @@ -// *************************************************************************** -// BamMultiReader_p.h (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) -// --------------------------------------------------------------------------- -// Functionality for simultaneously reading multiple BAM files -// ************************************************************************* - -#ifndef BAMMULTIREADER_P_H -#define BAMMULTIREADER_P_H - -// ------------- -// W A R N I N G -// ------------- -// -// This file is not part of the BamTools API. It exists purely as an -// implementation detail. This header file may change from version to version -// without notice, or even be removed. -// -// We mean it. 
- -#include "api/SamHeader.h" -#include "api/BamMultiReader.h" -#include "api/internal/BamMultiMerger_p.h" -#include -#include - -namespace BamTools { -namespace Internal { - -class BamMultiReaderPrivate { - - // typedefs - public: - typedef std::pair ReaderAlignment; - - // constructor / destructor - public: - BamMultiReaderPrivate(void); - ~BamMultiReaderPrivate(void); - - // public interface - public: - - // file operations - bool Close(void); - bool CloseFile(const std::string& filename); - const std::vector Filenames(void) const; - bool Jump(int refID, int position = 0); - bool Open(const std::vector& filenames); - bool OpenFile(const std::string& filename); - bool Rewind(void); - bool SetRegion(const BamRegion& region); - - // access alignment data - bool GetNextAlignment(BamAlignment& al); - bool GetNextAlignmentCore(BamAlignment& al); - bool HasOpenReaders(void); - - // access auxiliary data - SamHeader GetHeader(void) const; - std::string GetHeaderText(void) const; - int GetReferenceCount(void) const; - const BamTools::RefVector GetReferenceData(void) const; - int GetReferenceID(const std::string& refName) const; - - // BAM index operations - bool CreateIndexes(const BamIndex::IndexType& type = BamIndex::STANDARD); - bool HasIndexes(void) const; - bool LocateIndexes(const BamIndex::IndexType& preferredType = BamIndex::STANDARD); - bool OpenIndexes(const std::vector& indexFilenames); - - // error handling - std::string GetErrorString(void) const; - - // 'internal' methods - public: - - bool CloseFiles(const std::vector& filenames); - IMultiMerger* CreateAlignmentCache(void) const; - bool PopNextCachedAlignment(BamAlignment& al, const bool needCharData); - bool RewindReaders(void); - void SaveNextAlignment(BamReader* reader, BamAlignment* alignment); - void SetErrorString(const std::string& where, const std::string& what) const; // - bool UpdateAlignmentCache(void); - bool ValidateReaders(void) const; - - // data members - public: - std::vector m_readers; - 
IMultiMerger* m_alignmentCache; - mutable std::string m_errorString; -}; - -} // namespace Internal -} // namespace BamTools - -#endif // BAMMULTIREADER_P_H diff --git a/src/api/internal/BamPipe_p.cpp b/src/api/internal/BamPipe_p.cpp deleted file mode 100644 index e13ad7c..0000000 --- a/src/api/internal/BamPipe_p.cpp +++ /dev/null @@ -1,57 +0,0 @@ -// *************************************************************************** -// BamPipe_p.cpp (c) 2011 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) -// --------------------------------------------------------------------------- -// Provides BAM pipe-specific IO behavior -// *************************************************************************** - -#include "api/internal/BamPipe_p.h" -using namespace BamTools; -using namespace BamTools::Internal; - -#include -#include -using namespace std; - -BamPipe::BamPipe(void) : ILocalIODevice() { } - -BamPipe::~BamPipe(void) { } - -bool BamPipe::IsRandomAccess(void) const { - return false; -} - -bool BamPipe::Open(const IBamIODevice::OpenMode mode) { - - // make sure we're starting with a fresh pipe - Close(); - - // open stdin/stdout depending on requested openmode - if ( mode == IBamIODevice::ReadOnly ) - m_stream = freopen(0, "rb", stdin); - else if ( mode == IBamIODevice::WriteOnly ) - m_stream = freopen(0, "wb", stdout); - else { - SetErrorString("BamPipe::Open", "unknown open mode requested"); - return false; - } - - // check that we obtained a valid FILE* - if ( m_stream == 0 ) { - const string message_base = string("could not open handle on "); - const string message = message_base + ( (mode == IBamIODevice::ReadOnly) ? 
"stdin" : "stdout" ); - SetErrorString("BamPipe::Open", message); - return false; - } - - // store current IO mode & return success - m_mode = mode; - return true; -} - -bool BamPipe::Seek(const int64_t& ) { - SetErrorString("BamPipe::Seek", "random access not allowed in FIFO pipe"); - return false; -} diff --git a/src/api/internal/BamPipe_p.h b/src/api/internal/BamPipe_p.h deleted file mode 100644 index 8996766..0000000 --- a/src/api/internal/BamPipe_p.h +++ /dev/null @@ -1,46 +0,0 @@ -// *************************************************************************** -// BamPipe_p.h (c) 2011 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) -// --------------------------------------------------------------------------- -// Provides BAM pipe-specific IO behavior -// *************************************************************************** - -#ifndef BAMPIPE_P_H -#define BAMPIPE_P_H - -// ------------- -// W A R N I N G -// ------------- -// -// This file is not part of the BamTools API. It exists purely as an -// implementation detail. This header file may change from version to version -// without notice, or even be removed. -// -// We mean it. 
- -#include "api/internal/ILocalIODevice_p.h" -#include - -namespace BamTools { -namespace Internal { - -class BamPipe : public ILocalIODevice { - - // ctor & dtor - public: - BamPipe(void); - ~BamPipe(void); - - // IBamIODevice implementation - public: - bool IsRandomAccess(void) const; - bool Open(const IBamIODevice::OpenMode mode); - bool Seek(const int64_t& position); -}; - -} // namespace Internal -} // namespace BamTools - -#endif // BAMPIPE_P_H diff --git a/src/api/internal/BamRandomAccessController_p.cpp b/src/api/internal/BamRandomAccessController_p.cpp deleted file mode 100644 index c223ed7..0000000 --- a/src/api/internal/BamRandomAccessController_p.cpp +++ /dev/null @@ -1,289 +0,0 @@ -// *************************************************************************** -// BamRandomAccessController_p.cpp (c) 2011 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 October 2011(DB) -// --------------------------------------------------------------------------- -// Manages random access operations in a BAM file -// ************************************************************************** - -#include "api/BamIndex.h" -#include "api/internal/BamException_p.h" -#include "api/internal/BamRandomAccessController_p.h" -#include "api/internal/BamReader_p.h" -#include "api/internal/BamIndexFactory_p.h" -using namespace BamTools; -using namespace BamTools::Internal; - -#include -#include -using namespace std; - -BamRandomAccessController::BamRandomAccessController(void) - : m_index(0) - , m_hasAlignmentsInRegion(true) -{ } - -BamRandomAccessController::~BamRandomAccessController(void) { - Close(); -} - -void BamRandomAccessController::AdjustRegion(const int& referenceCount) { - - // skip if no index available - if ( m_index == 0 ) - return; - - // see if any references in region have alignments - m_hasAlignmentsInRegion = false; - int currentId = 
m_region.LeftRefID; - const int rightBoundRefId = ( m_region.isRightBoundSpecified() ? m_region.RightRefID : referenceCount - 1 ); - while ( currentId <= rightBoundRefId ) { - m_hasAlignmentsInRegion = m_index->HasAlignments(currentId); - if ( m_hasAlignmentsInRegion ) break; - ++currentId; - } - - // if no data found on any reference in region - if ( !m_hasAlignmentsInRegion ) - return; - - // if left bound of desired region had no data, use first reference that had data - // otherwise, leave requested region as-is - if ( currentId != m_region.LeftRefID ) { - m_region.LeftRefID = currentId; - m_region.LeftPosition = 0; - } -} - -// returns alignments' "RegionState": { Before|Overlaps|After } current region -BamRandomAccessController::RegionState -BamRandomAccessController::AlignmentState(const BamAlignment& alignment) const { - - // if region has no left bound at all - if ( !m_region.isLeftBoundSpecified() ) - return OverlapsRegion; - - // handle unmapped reads - return AFTER region to halt processing - if ( alignment.RefID == -1 ) - return AfterRegion; - - // if alignment is on any reference before left bound reference - if ( alignment.RefID < m_region.LeftRefID ) - return BeforeRegion; - - // if alignment is on left bound reference - else if ( alignment.RefID == m_region.LeftRefID ) { - - // if alignment starts at or after left bound position - if ( alignment.Position >= m_region.LeftPosition) { - - if ( m_region.isRightBoundSpecified() && // right bound is specified AND - m_region.LeftRefID == m_region.RightRefID && // left & right bounds on same reference AND - alignment.Position >= m_region.RightPosition ) // alignment starts on or after right bound position - return AfterRegion; - - // otherwise, alignment overlaps region - else return OverlapsRegion; - } - - // alignment starts before left bound position - else { - - // if alignment overlaps left bound position - if ( alignment.GetEndPosition() > m_region.LeftPosition ) - return OverlapsRegion; - else - 
return BeforeRegion; - } - } - - // otherwise alignment is on a reference after left bound reference - else { - - // if region has a right bound - if ( m_region.isRightBoundSpecified() ) { - - // alignment is on any reference between boundaries - if ( alignment.RefID < m_region.RightRefID ) - return OverlapsRegion; - - // alignment is on any reference after right boundary - else if ( alignment.RefID > m_region.RightRefID ) - return AfterRegion; - - // alignment is on right bound reference - else { - - // if alignment starts before right bound position - if ( alignment.Position < m_region.RightPosition ) - return OverlapsRegion; - else - return AfterRegion; - } - } - - // otherwise, alignment starts after left bound and there is no right bound given - else return OverlapsRegion; - } -} - -void BamRandomAccessController::Close(void) { - ClearIndex(); - ClearRegion(); -} - -void BamRandomAccessController::ClearIndex(void) { - if ( m_index ) { - delete m_index; - m_index = 0; - } -} - -void BamRandomAccessController::ClearRegion(void) { - m_region.clear(); - m_hasAlignmentsInRegion = true; -} - -bool BamRandomAccessController::CreateIndex(BamReaderPrivate* reader, - const BamIndex::IndexType& type) -{ - // skip if reader is invalid - assert(reader); - if ( !reader->IsOpen() ) { - SetErrorString("BamRandomAccessController::CreateIndex", - "cannot create index for unopened reader"); - return false; - } - - // create new index of requested type - BamIndex* newIndex = BamIndexFactory::CreateIndexOfType(type, reader); - if ( newIndex == 0 ) { - stringstream s(""); - s << "could not create index of type: " << type; - SetErrorString("BamRandomAccessController::CreateIndex", s.str()); - return false; - } - - // attempt to build index from current BamReader file - if ( !newIndex->Create() ) { - const string indexError = newIndex->GetErrorString(); - const string message = "could not create index: \n\t" + indexError; - SetErrorString("BamRandomAccessController::CreateIndex", 
message); - return false; - } - - // save new index & return success - SetIndex(newIndex); - return true; -} - -string BamRandomAccessController::GetErrorString(void) const { - return m_errorString; -} - -bool BamRandomAccessController::HasIndex(void) const { - return ( m_index != 0 ); -} - -bool BamRandomAccessController::HasRegion(void) const { - return ( !m_region.isNull() ); -} - -bool BamRandomAccessController::IndexHasAlignmentsForReference(const int& refId) { - return m_index->HasAlignments(refId); -} - -bool BamRandomAccessController::LocateIndex(BamReaderPrivate* reader, - const BamIndex::IndexType& preferredType) -{ - // look up index filename, deferring to preferredType if possible - assert(reader); - const string& indexFilename = BamIndexFactory::FindIndexFilename(reader->Filename(), preferredType); - - // if no index file found (of any type) - if ( indexFilename.empty() ) { - const string message = string("could not find index file for:") + reader->Filename(); - SetErrorString("BamRandomAccessController::LocateIndex", message); - return false; - } - - // otherwise open & use index file that was found - return OpenIndex(indexFilename, reader); -} - -bool BamRandomAccessController::OpenIndex(const string& indexFilename, BamReaderPrivate* reader) { - - // attempt create new index of type based on filename - BamIndex* index = BamIndexFactory::CreateIndexFromFilename(indexFilename, reader); - if ( index == 0 ) { - const string message = string("could not open index file: ") + indexFilename; - SetErrorString("BamRandomAccessController::OpenIndex", message); - return false; - } - - // attempt to load data from index file - if ( !index->Load(indexFilename) ) { - const string indexError = index->GetErrorString(); - const string message = string("could not load index data from file: ") + indexFilename + - "\n\t" + indexError; - SetErrorString("BamRandomAccessController::OpenIndex", message); - return false; - } - - // save new index & return success - 
SetIndex(index); - return true; -} - -bool BamRandomAccessController::RegionHasAlignments(void) const { - return m_hasAlignmentsInRegion; -} - -void BamRandomAccessController::SetErrorString(const string& where, const string& what) { - m_errorString = where + ": " + what; -} - -void BamRandomAccessController::SetIndex(BamIndex* index) { - if ( m_index ) - ClearIndex(); - m_index = index; -} - -bool BamRandomAccessController::SetRegion(const BamRegion& region, const int& referenceCount) { - - // store region - m_region = region; - - // cannot jump when no index is available - if ( !HasIndex() ) { - SetErrorString("BamRandomAccessController", "cannot jump if no index data available"); - return false; - } - - // adjust region as necessary to reflect where data actually begins - AdjustRegion(referenceCount); - - // if no data present, return true - // * Not an error, but future attempts to access alignments in this region will not return data - // Returning true is useful in a BamMultiReader setting where some BAM files may - // lack alignments in regions where other files still have data available. - if ( !m_hasAlignmentsInRegion ) - return true; - - // return success/failure of jump to specified region, - // - // * Index::Jump() is allowed to modify the m_hasAlignmentsInRegion flag - // This covers 'corner case' where a region is requested that lies beyond the last - // alignment on a reference. If this occurs, any subsequent calls to GetNextAlignment[Core] - // will not return data. BamMultiReader will still be able to successfully pull alignments - // from a region from other files even if this one has no data. 
- if ( !m_index->Jump(m_region, &m_hasAlignmentsInRegion) ) { - const string indexError = m_index->GetErrorString(); - const string message = string("could not set region\n\t") + indexError; - SetErrorString("BamRandomAccessController::OpenIndex", message); - return false; - } - else - return true; -} diff --git a/src/api/internal/BamRandomAccessController_p.h b/src/api/internal/BamRandomAccessController_p.h deleted file mode 100644 index 9262a61..0000000 --- a/src/api/internal/BamRandomAccessController_p.h +++ /dev/null @@ -1,94 +0,0 @@ -// *************************************************************************** -// BamRandomAccessController_p.h (c) 2011 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 October 2011(DB) -// --------------------------------------------------------------------------- -// Manages random access operations in a BAM file -// *************************************************************************** - -#ifndef BAMRACONTROLLER_P_H -#define BAMRACONTROLLER_P_H - -// ------------- -// W A R N I N G -// ------------- -// -// This file is not part of the BamTools API. It exists purely as an -// implementation detail. This header file may change from version to version -// without notice, or even be removed. -// -// We mean it. 
- -#include "api/BamAux.h" -#include "api/BamIndex.h" - -namespace BamTools { - -class BamAlignment; - -namespace Internal { - -class BamReaderPrivate; - -class BamRandomAccessController { - - // enums - public: enum RegionState { BeforeRegion = 0 - , OverlapsRegion - , AfterRegion - }; - - // ctor & dtor - public: - BamRandomAccessController(void); - ~BamRandomAccessController(void); - - // BamRandomAccessController interface - public: - - // index methods - void ClearIndex(void); - bool CreateIndex(BamReaderPrivate* reader, const BamIndex::IndexType& type); - bool HasIndex(void) const; - bool IndexHasAlignmentsForReference(const int& refId); - bool LocateIndex(BamReaderPrivate* reader, const BamIndex::IndexType& preferredType); - bool OpenIndex(const std::string& indexFilename, BamReaderPrivate* reader); - void SetIndex(BamIndex* index); - - // region methods - void ClearRegion(void); - bool HasRegion(void) const; - RegionState AlignmentState(const BamAlignment& alignment) const; - bool RegionHasAlignments(void) const; - bool SetRegion(const BamRegion& region, const int& referenceCount); - - // general methods - void Close(void); - std::string GetErrorString(void) const; - - // internal methods - private: - // adjusts requested region if necessary (depending on where data actually begins) - void AdjustRegion(const int& referenceCount); - // error-string handling - void SetErrorString(const std::string& where, const std::string& what); - - // data members - private: - - // index data - BamIndex* m_index; // owns the index, not a copy - responsible for deleting - - // region data - BamRegion m_region; - bool m_hasAlignmentsInRegion; - - // general data - std::string m_errorString; -}; - -} // namespace Internal -} // namespace BamTools - -#endif // BAMRACONTROLLER_P_H diff --git a/src/api/internal/BamReader_p.cpp b/src/api/internal/BamReader_p.cpp deleted file mode 100644 index a344358..0000000 --- a/src/api/internal/BamReader_p.cpp +++ /dev/null @@ -1,465 +0,0 @@ 
-// *************************************************************************** -// BamReader_p.cpp (c) 2009 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 14 November 2011 (DB) -// --------------------------------------------------------------------------- -// Provides the basic functionality for reading BAM files -// *************************************************************************** - -#include "api/BamConstants.h" -#include "api/BamReader.h" -#include "api/IBamIODevice.h" -#include "api/internal/BamDeviceFactory_p.h" -#include "api/internal/BamException_p.h" -#include "api/internal/BamHeader_p.h" -#include "api/internal/BamRandomAccessController_p.h" -#include "api/internal/BamReader_p.h" -#include "api/internal/BamStandardIndex_p.h" -#include "api/internal/BamToolsIndex_p.h" -using namespace BamTools; -using namespace BamTools::Internal; - -#include -#include -#include -#include -#include -using namespace std; - -// constructor -BamReaderPrivate::BamReaderPrivate(BamReader* parent) - : m_alignmentsBeginOffset(0) - , m_parent(parent) -{ - m_isBigEndian = BamTools::SystemIsBigEndian(); -} - -// destructor -BamReaderPrivate::~BamReaderPrivate(void) { - Close(); -} - -// closes the BAM file -bool BamReaderPrivate::Close(void) { - - // clear BAM metadata - m_references.clear(); - m_header.Clear(); - - // clear filename - m_filename.clear(); - - // close random access controller - m_randomAccessController.Close(); - - // if stream is open, attempt close - if ( IsOpen() ) { - try { - m_stream.Close(); - } catch ( BamException& e ) { - const string streamError = e.what(); - const string message = string("encountered error closing BAM file: \n\t") + streamError; - SetErrorString("BamReader::Close", message); - return false; - } - } - - // return success - return true; -} - -// creates an index file of requested type on current BAM file -bool 
BamReaderPrivate::CreateIndex(const BamIndex::IndexType& type) { - - // skip if BAM file not open - if ( !IsOpen() ) { - SetErrorString("BamReader::CreateIndex", "cannot create index on unopened BAM file"); - return false; - } - - // attempt to create index - if ( m_randomAccessController.CreateIndex(this, type) ) - return true; - else { - const string bracError = m_randomAccessController.GetErrorString(); - const string message = string("could not create index: \n\t") + bracError; - SetErrorString("BamReader::CreateIndex", message); - return false; - } -} - -// return path & filename of current BAM file -const string BamReaderPrivate::Filename(void) const { - return m_filename; -} - -string BamReaderPrivate::GetErrorString(void) const { - return m_errorString; -} - -// return header data as std::string -string BamReaderPrivate::GetHeaderText(void) const { - return m_header.ToString(); -} - -// return header data as SamHeader object -SamHeader BamReaderPrivate::GetSamHeader(void) const { - return m_header.ToSamHeader(); -} - -// get next alignment (with character data fully parsed) -bool BamReaderPrivate::GetNextAlignment(BamAlignment& alignment) { - - // if valid alignment found - if ( GetNextAlignmentCore(alignment) ) { - - // store alignment's "source" filename - alignment.Filename = m_filename; - - // return success/failure of parsing char data - if ( alignment.BuildCharData() ) - return true; - else { - const string alError = alignment.GetErrorString(); - const string message = string("could not populate alignment data: \n\t") + alError; - SetErrorString("BamReader::GetNextAlignment", message); - return false; - } - } - - // no valid alignment found - return false; -} - -// retrieves next available alignment core data (returns success/fail) -// ** DOES NOT populate any character data fields (read name, bases, qualities, tag data, filename) -// these can be accessed, if necessary, from the supportData -// useful for operations requiring ONLY positional or other 
alignment-related information -bool BamReaderPrivate::GetNextAlignmentCore(BamAlignment& alignment) { - - // skip if stream not opened - if ( !m_stream.IsOpen() ) - return false; - - try { - - // skip if region is set but has no alignments - if ( m_randomAccessController.HasRegion() && - !m_randomAccessController.RegionHasAlignments() ) - { - return false; - } - - // if can't read next alignment - if ( !LoadNextAlignment(alignment) ) - return false; - - // check alignment's region-overlap state - BamRandomAccessController::RegionState state = m_randomAccessController.AlignmentState(alignment); - - // if alignment starts after region, no need to keep reading - if ( state == BamRandomAccessController::AfterRegion ) - return false; - - // read until overlap is found - while ( state != BamRandomAccessController::OverlapsRegion ) { - - // if can't read next alignment - if ( !LoadNextAlignment(alignment) ) - return false; - - // check alignment's region-overlap state - state = m_randomAccessController.AlignmentState(alignment); - - // if alignment starts after region, no need to keep reading - if ( state == BamRandomAccessController::AfterRegion ) - return false; - } - - // if we get here, we found the next 'valid' alignment - // (e.g. 
overlaps current region if one was set, simply the next alignment if not) - alignment.SupportData.HasCoreOnly = true; - return true; - - } catch ( BamException& e ) { - const string streamError = e.what(); - const string message = string("encountered error reading BAM alignment: \n\t") + streamError; - SetErrorString("BamReader::GetNextAlignmentCore", message); - return false; - } -} - -int BamReaderPrivate::GetReferenceCount(void) const { - return m_references.size(); -} - -const RefVector& BamReaderPrivate::GetReferenceData(void) const { - return m_references; -} - -// returns RefID for given RefName (returns References.size() if not found) -int BamReaderPrivate::GetReferenceID(const string& refName) const { - - // retrieve names from reference data - vector refNames; - RefVector::const_iterator refIter = m_references.begin(); - RefVector::const_iterator refEnd = m_references.end(); - for ( ; refIter != refEnd; ++refIter) - refNames.push_back( (*refIter).RefName ); - - // return 'index-of' refName (or -1 if not found) - int index = distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName)); - if ( index == (int)m_references.size() ) return -1; - else return index; -} - -bool BamReaderPrivate::HasIndex(void) const { - return m_randomAccessController.HasIndex(); -} - -bool BamReaderPrivate::IsOpen(void) const { - return m_stream.IsOpen(); -} - -// load BAM header data -void BamReaderPrivate::LoadHeaderData(void) { - m_header.Load(&m_stream); -} - -// populates BamAlignment with alignment data under file pointer, returns success/fail -bool BamReaderPrivate::LoadNextAlignment(BamAlignment& alignment) { - - // read in the 'block length' value, make sure it's not zero - char buffer[sizeof(uint32_t)]; - m_stream.Read(buffer, sizeof(uint32_t)); - alignment.SupportData.BlockLength = BamTools::UnpackUnsignedInt(buffer); - if ( m_isBigEndian ) BamTools::SwapEndian_32(alignment.SupportData.BlockLength); - if ( alignment.SupportData.BlockLength == 0 ) - 
return false; - - // read in core alignment data, make sure the right size of data was read - char x[Constants::BAM_CORE_SIZE]; - if ( m_stream.Read(x, Constants::BAM_CORE_SIZE) != Constants::BAM_CORE_SIZE ) - return false; - - // swap core endian-ness if necessary - if ( m_isBigEndian ) { - for ( unsigned int i = 0; i < Constants::BAM_CORE_SIZE; i+=sizeof(uint32_t) ) - BamTools::SwapEndian_32p(&x[i]); - } - - // set BamAlignment 'core' and 'support' data - alignment.RefID = BamTools::UnpackSignedInt(&x[0]); - alignment.Position = BamTools::UnpackSignedInt(&x[4]); - - unsigned int tempValue = BamTools::UnpackUnsignedInt(&x[8]); - alignment.Bin = tempValue >> 16; - alignment.MapQuality = tempValue >> 8 & 0xff; - alignment.SupportData.QueryNameLength = tempValue & 0xff; - - tempValue = BamTools::UnpackUnsignedInt(&x[12]); - alignment.AlignmentFlag = tempValue >> 16; - alignment.SupportData.NumCigarOperations = tempValue & 0xffff; - - alignment.SupportData.QuerySequenceLength = BamTools::UnpackUnsignedInt(&x[16]); - alignment.MateRefID = BamTools::UnpackSignedInt(&x[20]); - alignment.MatePosition = BamTools::UnpackSignedInt(&x[24]); - alignment.InsertSize = BamTools::UnpackSignedInt(&x[28]); - - // set BamAlignment length - alignment.Length = alignment.SupportData.QuerySequenceLength; - - // read in character data - make sure proper data size was read - bool readCharDataOK = false; - const unsigned int dataLength = alignment.SupportData.BlockLength - Constants::BAM_CORE_SIZE; - RaiiBuffer allCharData(dataLength); - - if ( m_stream.Read(allCharData.Buffer, dataLength) == dataLength ) { - - // store 'allCharData' in supportData structure - alignment.SupportData.AllCharData.assign((const char*)allCharData.Buffer, dataLength); - - // set success flag - readCharDataOK = true; - - // save CIGAR ops - // need to calculate this here so that BamAlignment::GetEndPosition() performs correctly, - // even when GetNextAlignmentCore() is called - const unsigned int cigarDataOffset = 
alignment.SupportData.QueryNameLength; - uint32_t* cigarData = (uint32_t*)(allCharData.Buffer + cigarDataOffset); - CigarOp op; - alignment.CigarData.clear(); - alignment.CigarData.reserve(alignment.SupportData.NumCigarOperations); - for ( unsigned int i = 0; i < alignment.SupportData.NumCigarOperations; ++i ) { - - // swap endian-ness if necessary - if ( m_isBigEndian ) BamTools::SwapEndian_32(cigarData[i]); - - // build CigarOp structure - op.Length = (cigarData[i] >> Constants::BAM_CIGAR_SHIFT); - op.Type = Constants::BAM_CIGAR_LOOKUP[ (cigarData[i] & Constants::BAM_CIGAR_MASK) ]; - - // save CigarOp - alignment.CigarData.push_back(op); - } - } - - // return success/failure - return readCharDataOK; -} - -// loads reference data from BAM file -bool BamReaderPrivate::LoadReferenceData(void) { - - // get number of reference sequences - char buffer[sizeof(uint32_t)]; - m_stream.Read(buffer, sizeof(uint32_t)); - uint32_t numberRefSeqs = BamTools::UnpackUnsignedInt(buffer); - if ( m_isBigEndian ) BamTools::SwapEndian_32(numberRefSeqs); - m_references.reserve((int)numberRefSeqs); - - // iterate over all references in header - for ( unsigned int i = 0; i != numberRefSeqs; ++i ) { - - // get length of reference name - m_stream.Read(buffer, sizeof(uint32_t)); - uint32_t refNameLength = BamTools::UnpackUnsignedInt(buffer); - if ( m_isBigEndian ) BamTools::SwapEndian_32(refNameLength); - RaiiBuffer refName(refNameLength); - - // get reference name and reference sequence length - m_stream.Read(refName.Buffer, refNameLength); - m_stream.Read(buffer, sizeof(int32_t)); - int32_t refLength = BamTools::UnpackSignedInt(buffer); - if ( m_isBigEndian ) BamTools::SwapEndian_32(refLength); - - // store data for reference - RefData aReference; - aReference.RefName = (string)((const char*)refName.Buffer); - aReference.RefLength = refLength; - m_references.push_back(aReference); - } - - // return success - return true; -} - -bool BamReaderPrivate::LocateIndex(const BamIndex::IndexType& 
preferredType) { - - if ( m_randomAccessController.LocateIndex(this, preferredType) ) - return true; - else { - const string bracError = m_randomAccessController.GetErrorString(); - const string message = string("could not locate index: \n\t") + bracError; - SetErrorString("BamReader::LocateIndex", message); - return false; - } -} - -// opens BAM file (and index) -bool BamReaderPrivate::Open(const string& filename) { - - try { - - // make sure we're starting with fresh state - Close(); - - // open BgzfStream - m_stream.Open(filename, IBamIODevice::ReadOnly); - - // load BAM metadata - LoadHeaderData(); - LoadReferenceData(); - - // store filename & offset of first alignment - m_filename = filename; - m_alignmentsBeginOffset = m_stream.Tell(); - - // return success - return true; - - } catch ( BamException& e ) { - const string error = e.what(); - const string message = string("could not open file: ") + filename + - "\n\t" + error; - SetErrorString("BamReader::Open", message); - return false; - } -} - -bool BamReaderPrivate::OpenIndex(const std::string& indexFilename) { - - if ( m_randomAccessController.OpenIndex(indexFilename, this) ) - return true; - else { - const string bracError = m_randomAccessController.GetErrorString(); - const string message = string("could not open index: \n\t") + bracError; - SetErrorString("BamReader::OpenIndex", message); - return false; - } -} - -// returns BAM file pointer to beginning of alignment data -bool BamReaderPrivate::Rewind(void) { - - // reset region - m_randomAccessController.ClearRegion(); - - // return status of seeking back to first alignment - if ( Seek(m_alignmentsBeginOffset) ) - return true; - else { - const string currentError = m_errorString; - const string message = string("could not rewind: \n\t") + currentError; - SetErrorString("BamReader::Rewind", message); - return false; - } -} - -bool BamReaderPrivate::Seek(const int64_t& position) { - - // skip if BAM file not open - if ( !IsOpen() ) { - 
SetErrorString("BamReader::Seek", "cannot seek on unopened BAM file"); - return false; - } - - try { - m_stream.Seek(position); - return true; - } - catch ( BamException& e ) { - const string streamError = e.what(); - const string message = string("could not seek in BAM file: \n\t") + streamError; - SetErrorString("BamReader::Seek", message); - return false; - } -} - -void BamReaderPrivate::SetErrorString(const string& where, const string& what) { - static const string SEPARATOR = ": "; - m_errorString = where + SEPARATOR + what; -} - -void BamReaderPrivate::SetIndex(BamIndex* index) { - m_randomAccessController.SetIndex(index); -} - -// sets current region & attempts to jump to it -// returns success/failure -bool BamReaderPrivate::SetRegion(const BamRegion& region) { - - if ( m_randomAccessController.SetRegion(region, m_references.size()) ) - return true; - else { - const string bracError = m_randomAccessController.GetErrorString(); - const string message = string("could not set region: \n\t") + bracError; - SetErrorString("BamReader::SetRegion", message); - return false; - } -} - -int64_t BamReaderPrivate::Tell(void) const { - return m_stream.Tell(); -} diff --git a/src/api/internal/BamReader_p.h b/src/api/internal/BamReader_p.h deleted file mode 100644 index f928273..0000000 --- a/src/api/internal/BamReader_p.h +++ /dev/null @@ -1,118 +0,0 @@ -// *************************************************************************** -// BamReader_p.h (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) -// --------------------------------------------------------------------------- -// Provides the basic functionality for reading BAM files -// *************************************************************************** - -#ifndef BAMREADER_P_H -#define BAMREADER_P_H - -// ------------- -// W A R N I N G -// ------------- -// -// This 
file is not part of the BamTools API. It exists purely as an -// implementation detail. This header file may change from version to version -// without notice, or even be removed. -// -// We mean it. - -#include "api/BamAlignment.h" -#include "api/BamIndex.h" -#include "api/BamReader.h" -#include "api/SamHeader.h" -#include "api/internal/BamHeader_p.h" -#include "api/internal/BamRandomAccessController_p.h" -#include "api/internal/BgzfStream_p.h" -#include - -namespace BamTools { -namespace Internal { - -class BamReaderPrivate { - - // ctor & dtor - public: - BamReaderPrivate(BamReader* parent); - ~BamReaderPrivate(void); - - // BamReader interface - public: - - // file operations - bool Close(void); - const std::string Filename(void) const; - bool IsOpen(void) const; - bool Open(const std::string& filename); - bool Rewind(void); - bool SetRegion(const BamRegion& region); - - // access alignment data - bool GetNextAlignment(BamAlignment& alignment); - bool GetNextAlignmentCore(BamAlignment& alignment); - - // access auxiliary data - std::string GetHeaderText(void) const; - SamHeader GetSamHeader(void) const; - int GetReferenceCount(void) const; - const RefVector& GetReferenceData(void) const; - int GetReferenceID(const std::string& refName) const; - - // index operations - bool CreateIndex(const BamIndex::IndexType& type); - bool HasIndex(void) const; - bool LocateIndex(const BamIndex::IndexType& preferredType); - bool OpenIndex(const std::string& indexFilename); - void SetIndex(BamIndex* index); - - // error handling - std::string GetErrorString(void) const; - void SetErrorString(const std::string& where, const std::string& what); - - // internal methods, but available as a BamReaderPrivate 'interface' - // - // these methods should only be used by BamTools::Internal classes - // (currently only used by the BamIndex subclasses) - public: - // retrieves header text from BAM file - void LoadHeaderData(void); - // retrieves BAM alignment under file pointer - // (does 
no overlap checking or character data parsing) - bool LoadNextAlignment(BamAlignment& alignment); - // builds reference data structure from BAM file - bool LoadReferenceData(void); - // seek reader to file position - bool Seek(const int64_t& position); - // return reader's file position - int64_t Tell(void) const; - - // data members - public: - - // general BAM file data - int64_t m_alignmentsBeginOffset; - std::string m_filename; - RefVector m_references; - - // system data - bool m_isBigEndian; - - // parent BamReader - BamReader* m_parent; - - // BamReaderPrivate components - BamHeader m_header; - BamRandomAccessController m_randomAccessController; - BgzfStream m_stream; - - // error handling - std::string m_errorString; -}; - -} // namespace Internal -} // namespace BamTools - -#endif // BAMREADER_P_H diff --git a/src/api/internal/BamStandardIndex_p.cpp b/src/api/internal/BamStandardIndex_p.cpp deleted file mode 100644 index 8b23f74..0000000 --- a/src/api/internal/BamStandardIndex_p.cpp +++ /dev/null @@ -1,954 +0,0 @@ -// *************************************************************************** -// BamStandardIndex.cpp (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) -// --------------------------------------------------------------------------- -// Provides index operations for the standardized BAM index format (".bai") -// *************************************************************************** - -#include "api/BamAlignment.h" -#include "api/internal/BamException_p.h" -#include "api/internal/BamReader_p.h" -#include "api/internal/BamStandardIndex_p.h" -using namespace BamTools; -using namespace BamTools::Internal; - -#include -#include -#include -#include -#include -using namespace std; - -// ----------------------------------- -// static BamStandardIndex constants -// ----------------------------------- - 
-const int BamStandardIndex::MAX_BIN = 37450; // =(8^6-1)/7+1 -const int BamStandardIndex::BAM_LIDX_SHIFT = 14; -const string BamStandardIndex::BAI_EXTENSION = ".bai"; -const char* const BamStandardIndex::BAI_MAGIC = "BAI\1"; -const int BamStandardIndex::SIZEOF_ALIGNMENTCHUNK = sizeof(uint64_t)*2; -const int BamStandardIndex::SIZEOF_BINCORE = sizeof(uint32_t) + sizeof(int32_t); -const int BamStandardIndex::SIZEOF_LINEAROFFSET = sizeof(uint64_t); - -// ---------------------------- -// RaiiWrapper implementation -// ---------------------------- - -BamStandardIndex::RaiiWrapper::RaiiWrapper(void) - : IndexStream(0) - , Buffer(0) -{ } - -BamStandardIndex::RaiiWrapper::~RaiiWrapper(void) { - - if ( IndexStream ) { - fclose(IndexStream); - IndexStream = 0; - } - - if ( Buffer ) { - delete[] Buffer; - Buffer = 0; - } -} - -// --------------------------------- -// BamStandardIndex implementation -// --------------------------------- - -// ctor -BamStandardIndex::BamStandardIndex(Internal::BamReaderPrivate* reader) - : BamIndex(reader) - , m_bufferLength(0) -{ - m_isBigEndian = BamTools::SystemIsBigEndian(); -} - -// dtor -BamStandardIndex::~BamStandardIndex(void) { - CloseFile(); -} - -void BamStandardIndex::AdjustRegion(const BamRegion& region, uint32_t& begin, uint32_t& end) { - - // retrieve references from reader - const RefVector& references = m_reader->GetReferenceData(); - - // LeftPosition cannot be greater than or equal to reference length - if ( region.LeftPosition >= references.at(region.LeftRefID).RefLength ) - throw BamException("BamStandardIndex::AdjustRegion", "invalid region requested"); - - // set region 'begin' - begin = (unsigned int)region.LeftPosition; - - // if right bound specified AND left&right bounds are on same reference - // OK to use right bound position as region 'end' - if ( region.isRightBoundSpecified() && ( region.LeftRefID == region.RightRefID ) ) - end = (unsigned int)region.RightPosition; - - // otherwise, set region 'end' to last 
reference base - else end = (unsigned int)references.at(region.LeftRefID).RefLength; -} - -// [begin, end) -void BamStandardIndex::CalculateCandidateBins(const uint32_t& begin, - const uint32_t& end, - set& candidateBins) -{ - // initialize list, bin '0' is always a valid bin - candidateBins.insert(0); - - // get rest of bins that contain this region - unsigned int k; - for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { candidateBins.insert(k); } - for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { candidateBins.insert(k); } - for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { candidateBins.insert(k); } - for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { candidateBins.insert(k); } - for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { candidateBins.insert(k); } -} - -void BamStandardIndex::CalculateCandidateOffsets(const BaiReferenceSummary& refSummary, - const uint64_t& minOffset, - set& candidateBins, - vector& offsets) -{ - // seek to first bin - Seek(refSummary.FirstBinFilePosition, SEEK_SET); - - // iterate over reference bins - uint32_t binId; - int32_t numAlignmentChunks; - set::iterator candidateBinIter; - for ( int i = 0; i < refSummary.NumBins; ++i ) { - - // read bin contents (if successful, alignment chunks are now in m_buffer) - ReadBinIntoBuffer(binId, numAlignmentChunks); - - // see if bin is a 'candidate bin' - candidateBinIter = candidateBins.find(binId); - - // if not, move on to next bin - if ( candidateBinIter == candidateBins.end() ) - continue; - - // otherwise, check bin's contents against for overlap - else { - - size_t offset = 0; - uint64_t chunkStart; - uint64_t chunkStop; - - // iterate over alignment chunks - for ( int j = 0; j < numAlignmentChunks; ++j ) { - - // read chunk start & stop from buffer - memcpy((char*)&chunkStart, Resources.Buffer+offset, sizeof(uint64_t)); - offset += sizeof(uint64_t); - memcpy((char*)&chunkStop, Resources.Buffer+offset, sizeof(uint64_t)); - offset += sizeof(uint64_t); - - // swap 
endian-ness if necessary - if ( m_isBigEndian ) { - SwapEndian_64(chunkStart); - SwapEndian_64(chunkStop); - } - - // store alignment chunk's start offset - // if its stop offset is larger than our 'minOffset' - if ( chunkStop >= minOffset ) - offsets.push_back(chunkStart); - } - - // 'pop' bin ID from candidate bins set - candidateBins.erase(candidateBinIter); - - // quit if no more candidates - if ( candidateBins.empty() ) - break; - } - } -} - -uint64_t BamStandardIndex::CalculateMinOffset(const BaiReferenceSummary& refSummary, - const uint32_t& begin) -{ - // if no linear offsets exist, return 0 - if ( refSummary.NumLinearOffsets == 0 ) - return 0; - - // if 'begin' starts beyond last linear offset, use the last linear offset as minimum - // else use the offset corresponding to the requested start position - const int shiftedBegin = begin>>BamStandardIndex::BAM_LIDX_SHIFT; - if ( shiftedBegin >= refSummary.NumLinearOffsets ) - return LookupLinearOffset( refSummary, refSummary.NumLinearOffsets-1 ); - else - return LookupLinearOffset( refSummary, shiftedBegin ); -} - -void BamStandardIndex::CheckBufferSize(char*& buffer, - unsigned int& bufferLength, - const unsigned int& requestedBytes) -{ - try { - if ( requestedBytes > bufferLength ) { - bufferLength = requestedBytes + 10; - delete[] buffer; - buffer = new char[bufferLength]; - } - } catch ( std::bad_alloc& ) { - stringstream s(""); - s << "out of memory when allocating " << requestedBytes << " bytes"; - throw BamException("BamStandardIndex::CheckBufferSize", s.str()); - } -} - -void BamStandardIndex::CheckBufferSize(unsigned char*& buffer, - unsigned int& bufferLength, - const unsigned int& requestedBytes) -{ - try { - if ( requestedBytes > bufferLength ) { - bufferLength = requestedBytes + 10; - delete[] buffer; - buffer = new unsigned char[bufferLength]; - } - } catch ( std::bad_alloc& ) { - stringstream s(""); - s << "out of memory when allocating " << requestedBytes << " bytes"; - throw 
BamException("BamStandardIndex::CheckBufferSize", s.str()); - } -} - -void BamStandardIndex::CheckMagicNumber(void) { - - // check 'magic number' to see if file is BAI index - char magic[4]; - const size_t elementsRead = fread(magic, sizeof(char), 4, Resources.IndexStream); - if ( elementsRead != 4 ) - throw BamException("BamStandardIndex::CheckMagicNumber", "could not read BAI magic number"); - - // compare to expected value - if ( strncmp(magic, BamStandardIndex::BAI_MAGIC, 4) != 0 ) - throw BamException("BamStandardIndex::CheckMagicNumber", "invalid BAI magic number"); -} - -void BamStandardIndex::ClearReferenceEntry(BaiReferenceEntry& refEntry) { - refEntry.ID = -1; - refEntry.Bins.clear(); - refEntry.LinearOffsets.clear(); -} - -void BamStandardIndex::CloseFile(void) { - - // close file stream - if ( IsFileOpen() ) { - fclose(Resources.IndexStream); - Resources.IndexStream = 0; - } - - // clear index file summary data - m_indexFileSummary.clear(); - - // clean up I/O buffer - delete[] Resources.Buffer; - Resources.Buffer = 0; - m_bufferLength = 0; -} - -// builds index from associated BAM file & writes out to index file -bool BamStandardIndex::Create(void) { - - // skip if BamReader is invalid or not open - if ( m_reader == 0 || !m_reader->IsOpen() ) { - SetErrorString("BamStandardIndex::Create", "could not create index: reader is not open"); - return false; - } - - // rewind BamReader - if ( !m_reader->Rewind() ) { - const string readerError = m_reader->GetErrorString(); - const string message = "could not create index: \n\t" + readerError; - SetErrorString("BamStandardIndex::Create", message); - return false; - } - - try { - - // open new index file (read & write) - string indexFilename = m_reader->Filename() + Extension(); - OpenFile(indexFilename, "w+b"); - - // initialize BaiFileSummary with number of references - const int& numReferences = m_reader->GetReferenceCount(); - ReserveForSummary(numReferences); - - // initialize output file - WriteHeader(); - 
- // set up bin, ID, offset, & coordinate markers - const uint32_t defaultValue = 0xffffffffu; - uint32_t currentBin = defaultValue; - uint32_t lastBin = defaultValue; - int32_t currentRefID = defaultValue; - int32_t lastRefID = defaultValue; - uint64_t currentOffset = (uint64_t)m_reader->Tell(); - uint64_t lastOffset = currentOffset; - int32_t lastPosition = defaultValue; - - // iterate through alignments in BAM file - BamAlignment al; - BaiReferenceEntry refEntry; - while ( m_reader->LoadNextAlignment(al) ) { - - // changed to new reference - if ( lastRefID != al.RefID ) { - - // if not first reference, save previous reference data - if ( lastRefID != (int32_t)defaultValue ) { - - SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset); - WriteReferenceEntry(refEntry); - ClearReferenceEntry(refEntry); - - // write any empty references between (but *NOT* including) lastRefID & al.RefID - for ( int i = lastRefID+1; i < al.RefID; ++i ) { - BaiReferenceEntry emptyEntry(i); - WriteReferenceEntry(emptyEntry); - } - - // update bin markers - currentOffset = lastOffset; - currentBin = al.Bin; - lastBin = al.Bin; - currentRefID = al.RefID; - } - - // otherwise, this is first pass - // be sure to write any empty references up to (but *NOT* including) current RefID - else { - for ( int i = 0; i < al.RefID; ++i ) { - BaiReferenceEntry emptyEntry(i); - WriteReferenceEntry(emptyEntry); - } - } - - // update reference markers - refEntry.ID = al.RefID; - lastRefID = al.RefID; - lastBin = defaultValue; - } - - // if lastPosition greater than current alignment position - file not sorted properly - else if ( lastPosition > al.Position ) { - stringstream s(""); - s << "BAM file is not properly sorted by coordinate" << endl - << "Current alignment position: " << al.Position - << " < previous alignment position: " << lastPosition - << " on reference ID: " << al.RefID << endl; - SetErrorString("BamStandardIndex::Create", s.str()); - return false; - } - - // if 
alignment's ref ID is valid & its bin is not a 'leaf' - if ( (al.RefID >= 0) && (al.Bin < 4681) ) - SaveLinearOffsetEntry(refEntry.LinearOffsets, al.Position, al.GetEndPosition(), lastOffset); - - // changed to new BAI bin - if ( al.Bin != lastBin ) { - - // if not first bin on reference, save previous bin data - if ( currentBin != defaultValue ) - SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset); - - // update markers - currentOffset = lastOffset; - currentBin = al.Bin; - lastBin = al.Bin; - currentRefID = al.RefID; - - // if invalid RefID, break out - if ( currentRefID < 0 ) - break; - } - - // make sure that current file pointer is beyond lastOffset - if ( m_reader->Tell() <= (int64_t)lastOffset ) { - SetErrorString("BamStandardIndex::Create", "calculating offsets failed"); - return false; - } - - // update lastOffset & lastPosition - lastOffset = m_reader->Tell(); - lastPosition = al.Position; - } - - // after finishing alignments, if any data was read, check: - if ( currentRefID >= 0 ) { - - // store last alignment chunk to its bin, then write last reference entry with data - SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset); - WriteReferenceEntry(refEntry); - - // then write any empty references remaining at end of file - for ( int i = currentRefID+1; i < numReferences; ++i ) { - BaiReferenceEntry emptyEntry(i); - WriteReferenceEntry(emptyEntry); - } - } - - } catch ( BamException& e) { - m_errorString = e.what(); - return false; - } - - // rewind BamReader - if ( !m_reader->Rewind() ) { - const string readerError = m_reader->GetErrorString(); - const string message = "could not create index: \n\t" + readerError; - SetErrorString("BamStandardIndex::Create", message); - return false; - } - - // return success - return true; -} - -// returns format's file extension -const string BamStandardIndex::Extension(void) { - return BamStandardIndex::BAI_EXTENSION; -} - -void BamStandardIndex::GetOffset(const 
BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) { - - // cannot calculate offsets if unknown/invalid reference ID requested - if ( region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size() ) - throw BamException("BamStandardIndex::GetOffset", "invalid reference ID requested"); - - // retrieve index summary for left bound reference - const BaiReferenceSummary& refSummary = m_indexFileSummary.at(region.LeftRefID); - - // set up region boundaries based on actual BamReader data - uint32_t begin; - uint32_t end; - AdjustRegion(region, begin, end); - - // retrieve all candidate bin IDs for region - set candidateBins; - CalculateCandidateBins(begin, end, candidateBins); - - // use reference's linear offsets to calculate the minimum offset - // that must be considered to find overlap - const uint64_t& minOffset = CalculateMinOffset(refSummary, begin); - - // attempt to use reference summary, minOffset, & candidateBins to calculate offsets - // no data should not be error, just bail - vector offsets; - CalculateCandidateOffsets(refSummary, minOffset, candidateBins, offsets); - if ( offsets.empty() ) - return; - - // ensure that offsets are sorted before processing - sort( offsets.begin(), offsets.end() ); - - // binary search for an overlapping block (may not be first one though) - BamAlignment al; - typedef vector::const_iterator OffsetConstIterator; - OffsetConstIterator offsetFirst = offsets.begin(); - OffsetConstIterator offsetIter = offsetFirst; - OffsetConstIterator offsetLast = offsets.end(); - iterator_traits::difference_type count = distance(offsetFirst, offsetLast); - iterator_traits::difference_type step; - while ( count > 0 ) { - offsetIter = offsetFirst; - step = count/2; - advance(offsetIter, step); - - // attempt seek to candidate offset - const int64_t& candidateOffset = (*offsetIter); - if ( !m_reader->Seek(candidateOffset) ) { - const string readerError = m_reader->GetErrorString(); - const string message = "could not seek 
in BAM file: \n\t" + readerError; - throw BamException("BamToolsIndex::GetOffset", message); - } - - // load first available alignment, setting flag to true if data exists - *hasAlignmentsInRegion = m_reader->LoadNextAlignment(al); - - // check alignment against region - if ( al.GetEndPosition() <= region.LeftPosition ) { - offsetFirst = ++offsetIter; - count -= step+1; - } else count = step; - } - - // step back to the offset before the 'current offset' (to make sure we cover overlaps) - if ( offsetIter != offsets.begin() ) - --offsetIter; - offset = (*offsetIter); -} - -// returns whether reference has alignments or no -bool BamStandardIndex::HasAlignments(const int& referenceID) const { - if ( referenceID < 0 || referenceID >= (int)m_indexFileSummary.size() ) - return false; - const BaiReferenceSummary& refSummary = m_indexFileSummary.at(referenceID); - return ( refSummary.NumBins > 0 ); -} - -bool BamStandardIndex::IsFileOpen(void) const { - return ( Resources.IndexStream != 0 ); -} - -// attempts to use index data to jump to @region, returns success/fail -// a "successful" jump indicates no error, but not whether this region has data -// * thus, the method sets a flag to indicate whether there are alignments -// available after the jump position -bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { - - // clear out flag - *hasAlignmentsInRegion = false; - - // skip if invalid reader or not open - if ( m_reader == 0 || !m_reader->IsOpen() ) { - SetErrorString("BamStandardIndex::Jump", "could not jump: reader is not open"); - return false; - } - - // calculate nearest offset to jump to - int64_t offset; - try { - GetOffset(region, offset, hasAlignmentsInRegion); - } catch ( BamException& e ) { - m_errorString = e.what(); - return false; - } - - // if region has alignments, return success/fail of seeking there - if ( *hasAlignmentsInRegion ) - return m_reader->Seek(offset); - - // otherwise, simply return true (but 
hasAlignmentsInRegion flag has been set to false) - // (this is OK, BamReader will check this flag before trying to load data) - return true; -} - -// loads existing data from file into memory -bool BamStandardIndex::Load(const std::string& filename) { - - try { - - // attempt to open file (read-only) - OpenFile(filename, "rb"); - - // validate format - CheckMagicNumber(); - - // load in-memory summary of index data - SummarizeIndexFile(); - - // return success - return true; - - } catch ( BamException& e ) { - m_errorString = e.what(); - return false; - } -} - -uint64_t BamStandardIndex::LookupLinearOffset(const BaiReferenceSummary& refSummary, const int& index) { - - // attempt seek to proper index file position - const int64_t linearOffsetFilePosition = (int64_t)refSummary.FirstLinearOffsetFilePosition + - index*BamStandardIndex::SIZEOF_LINEAROFFSET; - Seek(linearOffsetFilePosition, SEEK_SET); - - // read linear offset from BAI file - uint64_t linearOffset; - ReadLinearOffset(linearOffset); - return linearOffset; -} - -void BamStandardIndex::MergeAlignmentChunks(BaiAlignmentChunkVector& chunks) { - - // skip if chunks are empty, nothing to merge - if ( chunks.empty() ) - return; - - // set up merged alignment chunk container - BaiAlignmentChunkVector mergedChunks; - mergedChunks.push_back( chunks[0] ); - - // iterate over chunks - int i = 0; - BaiAlignmentChunkVector::iterator chunkIter = chunks.begin(); - BaiAlignmentChunkVector::iterator chunkEnd = chunks.end(); - for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) { - - // get 'currentMergeChunk' based on numeric index - BaiAlignmentChunk& currentMergeChunk = mergedChunks[i]; - - // get sourceChunk based on source vector iterator - BaiAlignmentChunk& sourceChunk = (*chunkIter); - - // if currentMergeChunk ends where sourceChunk starts, then merge the two - if ( currentMergeChunk.Stop>>16 == sourceChunk.Start>>16 ) - currentMergeChunk.Stop = sourceChunk.Stop; - - // otherwise - else { - // append 
sourceChunk after currentMergeChunk - mergedChunks.push_back(sourceChunk); - - // update i, so the next iteration will consider the - // recently-appended sourceChunk as new mergeChunk candidate - ++i; - } - } - - // saved newly-merged chunks into (parameter) chunks - chunks = mergedChunks; -} - -void BamStandardIndex::OpenFile(const std::string& filename, const char* mode) { - - // make sure any previous index file is closed - CloseFile(); - - // attempt to open file - Resources.IndexStream = fopen(filename.c_str(), mode); - if ( !IsFileOpen() ) { - const string message = string("could not open file: ") + filename; - throw BamException("BamStandardIndex::OpenFile", message); - } -} - -void BamStandardIndex::ReadBinID(uint32_t& binId) { - const size_t elementsRead = fread(&binId, sizeof(binId), 1, Resources.IndexStream); - if ( m_isBigEndian ) SwapEndian_32(binId); - if ( elementsRead != 1 ) - throw BamException("BamStandardIndex::ReadBinID", "could not read BAI bin ID"); -} - -void BamStandardIndex::ReadBinIntoBuffer(uint32_t& binId, int32_t& numAlignmentChunks) { - - // read bin header - ReadBinID(binId); - ReadNumAlignmentChunks(numAlignmentChunks); - - // read bin contents - const unsigned int bytesRequested = numAlignmentChunks*BamStandardIndex::SIZEOF_ALIGNMENTCHUNK; - ReadIntoBuffer(bytesRequested); -} - -void BamStandardIndex::ReadIntoBuffer(const unsigned int& bytesRequested) { - - // ensure that our buffer is big enough for request - BamStandardIndex::CheckBufferSize(Resources.Buffer, m_bufferLength, bytesRequested); - - // read from BAI file stream - const size_t bytesRead = fread( Resources.Buffer, sizeof(char), bytesRequested, Resources.IndexStream ); - if ( bytesRead != (size_t)bytesRequested ) { - stringstream s(""); - s << "expected to read: " << bytesRequested << " bytes, " - << "but instead read: " << bytesRead; - throw BamException("BamStandardIndex::ReadIntoBuffer", s.str()); - } -} - -void BamStandardIndex::ReadLinearOffset(uint64_t& 
linearOffset) { - const size_t elementsRead = fread(&linearOffset, sizeof(linearOffset), 1, Resources.IndexStream); - if ( m_isBigEndian ) SwapEndian_64(linearOffset); - if ( elementsRead != 1 ) - throw BamException("BamStandardIndex::ReadLinearOffset", "could not read BAI linear offset"); -} - -void BamStandardIndex::ReadNumAlignmentChunks(int& numAlignmentChunks) { - const size_t elementsRead = fread(&numAlignmentChunks, sizeof(numAlignmentChunks), 1, Resources.IndexStream); - if ( m_isBigEndian ) SwapEndian_32(numAlignmentChunks); - if ( elementsRead != 1 ) - throw BamException("BamStandardIndex::ReadNumAlignmentChunks", "could not read BAI chunk count"); -} - -void BamStandardIndex::ReadNumBins(int& numBins) { - const size_t elementsRead = fread(&numBins, sizeof(numBins), 1, Resources.IndexStream); - if ( m_isBigEndian ) SwapEndian_32(numBins); - if ( elementsRead != 1 ) - throw BamException("BamStandardIndex::ReadNumBins", "could not read BAI bin count"); -} - -void BamStandardIndex::ReadNumLinearOffsets(int& numLinearOffsets) { - const size_t elementsRead = fread(&numLinearOffsets, sizeof(numLinearOffsets), 1, Resources.IndexStream); - if ( m_isBigEndian ) SwapEndian_32(numLinearOffsets); - if ( elementsRead != 1 ) - throw BamException("BamStandardIndex::ReadNumAlignmentChunks", "could not read BAI linear offset count"); -} - -void BamStandardIndex::ReadNumReferences(int& numReferences) { - const size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, Resources.IndexStream); - if ( m_isBigEndian ) SwapEndian_32(numReferences); - if ( elementsRead != 1 ) - throw BamException("BamStandardIndex::ReadNumReferences", "could not read reference count"); -} - -void BamStandardIndex::ReserveForSummary(const int& numReferences) { - m_indexFileSummary.clear(); - m_indexFileSummary.assign( numReferences, BaiReferenceSummary() ); -} - -void BamStandardIndex::SaveAlignmentChunkToBin(BaiBinMap& binMap, - const uint32_t& currentBin, - const uint64_t& 
currentOffset, - const uint64_t& lastOffset) -{ - // create new alignment chunk - BaiAlignmentChunk newChunk(currentOffset, lastOffset); - - // if no entry exists yet for this bin, create one and store alignment chunk - BaiBinMap::iterator binIter = binMap.find(currentBin); - if ( binIter == binMap.end() ) { - BaiAlignmentChunkVector newChunks; - newChunks.push_back(newChunk); - binMap.insert( pair(currentBin, newChunks)); - } - - // otherwise, just append alignment chunk - else { - BaiAlignmentChunkVector& binChunks = (*binIter).second; - binChunks.push_back( newChunk ); - } -} - -void BamStandardIndex::SaveBinsSummary(const int& refId, const int& numBins) { - BaiReferenceSummary& refSummary = m_indexFileSummary.at(refId); - refSummary.NumBins = numBins; - refSummary.FirstBinFilePosition = Tell(); -} - -void BamStandardIndex::SaveLinearOffsetEntry(BaiLinearOffsetVector& offsets, - const int& alignmentStartPosition, - const int& alignmentStopPosition, - const uint64_t& lastOffset) -{ - // get converted offsets - const int beginOffset = alignmentStartPosition >> BamStandardIndex::BAM_LIDX_SHIFT; - const int endOffset = (alignmentStopPosition - 1) >> BamStandardIndex::BAM_LIDX_SHIFT; - - // resize vector if necessary - int oldSize = offsets.size(); - int newSize = endOffset + 1; - if ( oldSize < newSize ) - offsets.resize(newSize, 0); - - // store offset - for( int i = beginOffset + 1; i <= endOffset; ++i ) { - if ( offsets[i] == 0 ) - offsets[i] = lastOffset; - } -} - -void BamStandardIndex::SaveLinearOffsetsSummary(const int& refId, const int& numLinearOffsets) { - BaiReferenceSummary& refSummary = m_indexFileSummary.at(refId); - refSummary.NumLinearOffsets = numLinearOffsets; - refSummary.FirstLinearOffsetFilePosition = Tell(); -} - -// seek to position in index file stream -void BamStandardIndex::Seek(const int64_t& position, const int& origin) { - if ( fseek64(Resources.IndexStream, position, origin) != 0 ) - throw BamException("BamStandardIndex::Seek", "could 
not seek in BAI file"); -} - -void BamStandardIndex::SkipBins(const int& numBins) { - uint32_t binId; - int32_t numAlignmentChunks; - for (int i = 0; i < numBins; ++i) - ReadBinIntoBuffer(binId, numAlignmentChunks); // results & buffer ignored -} - -void BamStandardIndex::SkipLinearOffsets(const int& numLinearOffsets) { - const unsigned int bytesRequested = numLinearOffsets*BamStandardIndex::SIZEOF_LINEAROFFSET; - ReadIntoBuffer(bytesRequested); -} - -void BamStandardIndex::SortLinearOffsets(BaiLinearOffsetVector& linearOffsets) { - sort( linearOffsets.begin(), linearOffsets.end() ); -} - -void BamStandardIndex::SummarizeBins(BaiReferenceSummary& refSummary) { - - // load number of bins - int numBins; - ReadNumBins(numBins); - - // store bins summary for this reference - refSummary.NumBins = numBins; - refSummary.FirstBinFilePosition = Tell(); - - // skip this reference's bins - SkipBins(numBins); -} - -void BamStandardIndex::SummarizeIndexFile(void) { - - // load number of reference sequences - int numReferences; - ReadNumReferences(numReferences); - - // initialize file summary data - ReserveForSummary(numReferences); - - // iterate over reference entries - BaiFileSummary::iterator summaryIter = m_indexFileSummary.begin(); - BaiFileSummary::iterator summaryEnd = m_indexFileSummary.end(); - for ( int i = 0; summaryIter != summaryEnd; ++summaryIter, ++i ) - SummarizeReference(*summaryIter); -} - -void BamStandardIndex::SummarizeLinearOffsets(BaiReferenceSummary& refSummary) { - - // load number of linear offsets - int numLinearOffsets; - ReadNumLinearOffsets(numLinearOffsets); - - // store bin summary data for this reference - refSummary.NumLinearOffsets = numLinearOffsets; - refSummary.FirstLinearOffsetFilePosition = Tell(); - - // skip linear offsets in index file - SkipLinearOffsets(numLinearOffsets); -} - -void BamStandardIndex::SummarizeReference(BaiReferenceSummary& refSummary) { - SummarizeBins(refSummary); - SummarizeLinearOffsets(refSummary); -} - -// 
return position of file pointer in index file stream -int64_t BamStandardIndex::Tell(void) const { - return ftell64(Resources.IndexStream); -} - -void BamStandardIndex::WriteAlignmentChunk(const BaiAlignmentChunk& chunk) { - - // localize alignment chunk offsets - uint64_t start = chunk.Start; - uint64_t stop = chunk.Stop; - - // swap endian-ness if necessary - if ( m_isBigEndian ) { - SwapEndian_64(start); - SwapEndian_64(stop); - } - - // write to index file - size_t elementsWritten = 0; - elementsWritten += fwrite(&start, sizeof(start), 1, Resources.IndexStream); - elementsWritten += fwrite(&stop, sizeof(stop), 1, Resources.IndexStream); - if ( elementsWritten != 2 ) - throw BamException("BamStandardIndex::WriteAlignmentChunk", "could not write BAI alignment chunk"); -} - -void BamStandardIndex::WriteAlignmentChunks(BaiAlignmentChunkVector& chunks) { - - // make sure chunks are merged (simplified) before writing & saving summary - MergeAlignmentChunks(chunks); - - // write chunks - int32_t chunkCount = chunks.size(); - if ( m_isBigEndian ) SwapEndian_32(chunkCount); - const size_t elementsWritten = fwrite(&chunkCount, sizeof(chunkCount), 1, Resources.IndexStream); - if ( elementsWritten != 1 ) - throw BamException("BamStandardIndex::WriteAlignmentChunks", "could not write BAI chunk count"); - - // iterate over chunks - BaiAlignmentChunkVector::const_iterator chunkIter = chunks.begin(); - BaiAlignmentChunkVector::const_iterator chunkEnd = chunks.end(); - for ( ; chunkIter != chunkEnd; ++chunkIter ) - WriteAlignmentChunk( (*chunkIter) ); -} - -void BamStandardIndex::WriteBin(const uint32_t& binId, BaiAlignmentChunkVector& chunks) { - - // write BAM bin ID - uint32_t binKey = binId; - if ( m_isBigEndian ) SwapEndian_32(binKey); - const size_t elementsWritten = fwrite(&binKey, sizeof(binKey), 1, Resources.IndexStream); - if ( elementsWritten != 1 ) - throw BamException("BamStandardIndex::WriteBin", "could not write bin ID"); - - // write bin's alignment chunks - 
WriteAlignmentChunks(chunks); -} - -void BamStandardIndex::WriteBins(const int& refId, BaiBinMap& bins) { - - // write number of bins - int32_t binCount = bins.size(); - if ( m_isBigEndian ) SwapEndian_32(binCount); - const size_t elementsWritten = fwrite(&binCount, sizeof(binCount), 1, Resources.IndexStream); - if ( elementsWritten != 1 ) - throw BamException("BamStandardIndex::WriteBins", "could not write bin count"); - - // save summary for reference's bins - SaveBinsSummary(refId, bins.size()); - - // iterate over bins - BaiBinMap::iterator binIter = bins.begin(); - BaiBinMap::iterator binEnd = bins.end(); - for ( ; binIter != binEnd; ++binIter ) - WriteBin( (*binIter).first, (*binIter).second ); -} - -void BamStandardIndex::WriteHeader(void) { - - size_t elementsWritten = 0; - - // write magic number - elementsWritten += fwrite(BamStandardIndex::BAI_MAGIC, sizeof(char), 4, Resources.IndexStream); - - // write number of reference sequences - int32_t numReferences = m_indexFileSummary.size(); - if ( m_isBigEndian ) SwapEndian_32(numReferences); - elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, Resources.IndexStream); - - if ( elementsWritten != 5 ) - throw BamException("BamStandardIndex::WriteHeader", "could not write BAI header"); -} - -void BamStandardIndex::WriteLinearOffsets(const int& refId, BaiLinearOffsetVector& linearOffsets) { - - // make sure linear offsets are sorted before writing & saving summary - SortLinearOffsets(linearOffsets); - - size_t elementsWritten = 0; - - // write number of linear offsets - int32_t offsetCount = linearOffsets.size(); - if ( m_isBigEndian ) SwapEndian_32(offsetCount); - elementsWritten += fwrite(&offsetCount, sizeof(offsetCount), 1, Resources.IndexStream); - - // save summary for reference's linear offsets - SaveLinearOffsetsSummary(refId, linearOffsets.size()); - - // iterate over linear offsets - BaiLinearOffsetVector::const_iterator offsetIter = linearOffsets.begin(); - 
BaiLinearOffsetVector::const_iterator offsetEnd = linearOffsets.end(); - for ( ; offsetIter != offsetEnd; ++offsetIter ) { - - // write linear offset - uint64_t linearOffset = (*offsetIter); - if ( m_isBigEndian ) SwapEndian_64(linearOffset); - elementsWritten += fwrite(&linearOffset, sizeof(linearOffset), 1, Resources.IndexStream); - } - - if ( elementsWritten != (linearOffsets.size() + 1) ) - throw BamException("BamStandardIndex::WriteLinearOffsets", "could not write BAI linear offsets"); -} - -void BamStandardIndex::WriteReferenceEntry(BaiReferenceEntry& refEntry) { - WriteBins(refEntry.ID, refEntry.Bins); - WriteLinearOffsets(refEntry.ID, refEntry.LinearOffsets); -} diff --git a/src/api/internal/BamStandardIndex_p.h b/src/api/internal/BamStandardIndex_p.h deleted file mode 100644 index e49bc26..0000000 --- a/src/api/internal/BamStandardIndex_p.h +++ /dev/null @@ -1,237 +0,0 @@ -// *************************************************************************** -// BamStandardIndex.h (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) -// --------------------------------------------------------------------------- -// Provides index operations for the standardized BAM index format (".bai") -// *************************************************************************** - -#ifndef BAM_STANDARD_INDEX_FORMAT_H -#define BAM_STANDARD_INDEX_FORMAT_H - -// ------------- -// W A R N I N G -// ------------- -// -// This file is not part of the BamTools API. It exists purely as an -// implementation detail. This header file may change from version to -// version without notice, or even be removed. -// -// We mean it. 
- -#include "api/BamAux.h" -#include "api/BamIndex.h" -#include -#include -#include -#include - -namespace BamTools { -namespace Internal { - -// ----------------------------------------------------------------------------- -// BamStandardIndex data structures - -// defines start and end of a contiguous run of alignments -struct BaiAlignmentChunk { - - // data members - uint64_t Start; - uint64_t Stop; - - // constructor - BaiAlignmentChunk(const uint64_t& start = 0, - const uint64_t& stop = 0) - : Start(start) - , Stop(stop) - { } -}; - -// comparison operator (for sorting) -inline -bool operator<(const BaiAlignmentChunk& lhs, const BaiAlignmentChunk& rhs) { - return lhs.Start < rhs.Start; -} - -// convenience typedef for a list of all alignment 'chunks' in a BAI bin -typedef std::vector BaiAlignmentChunkVector; - -// convenience typedef for a map of all BAI bins in a reference (ID => chunks) -typedef std::map BaiBinMap; - -// convenience typedef for a list of all 'linear offsets' in a reference -typedef std::vector BaiLinearOffsetVector; - -// contains all fields necessary for building, loading, & writing -// full BAI index data for a single reference -struct BaiReferenceEntry { - - // data members - int32_t ID; - BaiBinMap Bins; - BaiLinearOffsetVector LinearOffsets; - - // ctor - BaiReferenceEntry(const int32_t& id = -1) - : ID(id) - { } -}; - -// provides (persistent) summary of BaiReferenceEntry's index data -struct BaiReferenceSummary { - - // data members - int NumBins; - int NumLinearOffsets; - uint64_t FirstBinFilePosition; - uint64_t FirstLinearOffsetFilePosition; - - // ctor - BaiReferenceSummary(void) - : NumBins(0) - , NumLinearOffsets(0) - , FirstBinFilePosition(0) - , FirstLinearOffsetFilePosition(0) - { } -}; - -// convenience typedef for describing a full BAI index file summary -typedef std::vector BaiFileSummary; - -// end BamStandardIndex data structures -// ----------------------------------------------------------------------------- - -class 
BamStandardIndex : public BamIndex { - - // ctor & dtor - public: - BamStandardIndex(Internal::BamReaderPrivate* reader); - ~BamStandardIndex(void); - - // BamIndex implementation - public: - // builds index from associated BAM file & writes out to index file - bool Create(void); - // returns whether reference has alignments or no - bool HasAlignments(const int& referenceID) const; - // attempts to use index data to jump to @region, returns success/fail - // a "successful" jump indicates no error, but not whether this region has data - // * thus, the method sets a flag to indicate whether there are alignments - // available after the jump position - bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); - // loads existing data from file into memory - bool Load(const std::string& filename); - BamIndex::IndexType Type(void) const { return BamIndex::STANDARD; } - public: - // returns format's file extension - static const std::string Extension(void); - - // internal methods - private: - - // index file ops - void CheckMagicNumber(void); - void CloseFile(void); - bool IsFileOpen(void) const; - void OpenFile(const std::string& filename, const char* mode); - void Seek(const int64_t& position, const int& origin); - int64_t Tell(void) const; - - // BAI index building methods - void ClearReferenceEntry(BaiReferenceEntry& refEntry); - void SaveAlignmentChunkToBin(BaiBinMap& binMap, - const uint32_t& currentBin, - const uint64_t& currentOffset, - const uint64_t& lastOffset); - void SaveLinearOffsetEntry(BaiLinearOffsetVector& offsets, - const int& alignmentStartPosition, - const int& alignmentStopPosition, - const uint64_t& lastOffset); - - // random-access methods - void AdjustRegion(const BamRegion& region, uint32_t& begin, uint32_t& end); - void CalculateCandidateBins(const uint32_t& begin, - const uint32_t& end, - std::set& candidateBins); - void CalculateCandidateOffsets(const BaiReferenceSummary& refSummary, - const uint64_t& minOffset, - std::set& 
candidateBins, - std::vector& offsets); - uint64_t CalculateMinOffset(const BaiReferenceSummary& refSummary, const uint32_t& begin); - void GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion); - uint64_t LookupLinearOffset(const BaiReferenceSummary& refSummary, const int& index); - - // BAI summary (create/load) methods - void ReserveForSummary(const int& numReferences); - void SaveBinsSummary(const int& refId, const int& numBins); - void SaveLinearOffsetsSummary(const int& refId, const int& numLinearOffsets); - void SkipBins(const int& numBins); - void SkipLinearOffsets(const int& numLinearOffsets); - void SummarizeBins(BaiReferenceSummary& refSummary); - void SummarizeIndexFile(void); - void SummarizeLinearOffsets(BaiReferenceSummary& refSummary); - void SummarizeReference(BaiReferenceSummary& refSummary); - - // BAI full index input methods - void ReadBinID(uint32_t& binId); - void ReadBinIntoBuffer(uint32_t& binId, int32_t& numAlignmentChunks); - void ReadIntoBuffer(const unsigned int& bytesRequested); - void ReadLinearOffset(uint64_t& linearOffset); - void ReadNumAlignmentChunks(int& numAlignmentChunks); - void ReadNumBins(int& numBins); - void ReadNumLinearOffsets(int& numLinearOffsets); - void ReadNumReferences(int& numReferences); - - // BAI full index output methods - void MergeAlignmentChunks(BaiAlignmentChunkVector& chunks); - void SortLinearOffsets(BaiLinearOffsetVector& linearOffsets); - void WriteAlignmentChunk(const BaiAlignmentChunk& chunk); - void WriteAlignmentChunks(BaiAlignmentChunkVector& chunks); - void WriteBin(const uint32_t& binId, BaiAlignmentChunkVector& chunks); - void WriteBins(const int& refId, BaiBinMap& bins); - void WriteHeader(void); - void WriteLinearOffsets(const int& refId, BaiLinearOffsetVector& linearOffsets); - void WriteReferenceEntry(BaiReferenceEntry& refEntry); - - // data members - private: - bool m_isBigEndian; - BaiFileSummary m_indexFileSummary; - - // our input buffer - unsigned int 
m_bufferLength; - - struct RaiiWrapper { - FILE* IndexStream; - char* Buffer; - RaiiWrapper(void); - ~RaiiWrapper(void); - }; - RaiiWrapper Resources; - - // static methods - private: - // checks if the buffer is large enough to accomodate the requested size - static void CheckBufferSize(char*& buffer, - unsigned int& bufferLength, - const unsigned int& requestedBytes); - // checks if the buffer is large enough to accomodate the requested size - static void CheckBufferSize(unsigned char*& buffer, - unsigned int& bufferLength, - const unsigned int& requestedBytes); - // static constants - private: - static const int MAX_BIN; - static const int BAM_LIDX_SHIFT; - static const std::string BAI_EXTENSION; - static const char* const BAI_MAGIC; - static const int SIZEOF_ALIGNMENTCHUNK; - static const int SIZEOF_BINCORE; - static const int SIZEOF_LINEAROFFSET; -}; - -} // namespace Internal -} // namespace BamTools - -#endif // BAM_STANDARD_INDEX_FORMAT_H diff --git a/src/api/internal/BamToolsIndex_p.cpp b/src/api/internal/BamToolsIndex_p.cpp deleted file mode 100644 index cdf3d10..0000000 --- a/src/api/internal/BamToolsIndex_p.cpp +++ /dev/null @@ -1,615 +0,0 @@ -// *************************************************************************** -// BamToolsIndex.cpp (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) -// --------------------------------------------------------------------------- -// Provides index operations for the BamTools index format (".bti") -// *************************************************************************** - -#include "api/BamAlignment.h" -#include "api/internal/BamException_p.h" -#include "api/internal/BamReader_p.h" -#include "api/internal/BamToolsIndex_p.h" -#include "api/internal/BgzfStream_p.h" -using namespace BamTools; -using namespace BamTools::Internal; - -#include -#include -#include 
-#include -#include -#include -#include -using namespace std; - -// -------------------------------- -// static BamToolsIndex constants -// -------------------------------- - -const uint32_t BamToolsIndex::DEFAULT_BLOCK_LENGTH = 1000; -const string BamToolsIndex::BTI_EXTENSION = ".bti"; -const char* const BamToolsIndex::BTI_MAGIC = "BTI\1"; -const int BamToolsIndex::SIZEOF_BLOCK = sizeof(int32_t)*2 + sizeof(int64_t); - -// ---------------------------- -// RaiiWrapper implementation -// ---------------------------- - -BamToolsIndex::RaiiWrapper::RaiiWrapper(void) - : IndexStream(0) -{ } - -BamToolsIndex::RaiiWrapper::~RaiiWrapper(void) { - if ( IndexStream ) - fclose(IndexStream); -} - -// ------------------------------ -// BamToolsIndex implementation -// ------------------------------ - -// ctor -BamToolsIndex::BamToolsIndex(Internal::BamReaderPrivate* reader) - : BamIndex(reader) - , m_blockSize(BamToolsIndex::DEFAULT_BLOCK_LENGTH) - , m_inputVersion(0) - , m_outputVersion(BTI_2_0) // latest version - used for writing new index files -{ - m_isBigEndian = BamTools::SystemIsBigEndian(); -} - -// dtor -BamToolsIndex::~BamToolsIndex(void) { - CloseFile(); -} - -void BamToolsIndex::CheckMagicNumber(void) { - - // read magic number - char magic[4]; - size_t elementsRead = fread(magic, sizeof(char), 4, Resources.IndexStream); - if ( elementsRead != 4 ) - throw BamException("BamToolsIndex::CheckMagicNumber", "could not read BTI magic number"); - - // validate expected magic number - if ( strncmp(magic, BamToolsIndex::BTI_MAGIC, 4) != 0 ) - throw BamException("BamToolsIndex::CheckMagicNumber", "invalid BTI magic number"); -} - -// check index file version, return true if OK -void BamToolsIndex::CheckVersion(void) { - - // read version from file - size_t elementsRead = fread(&m_inputVersion, sizeof(m_inputVersion), 1, Resources.IndexStream); - if ( elementsRead != 1 ) - throw BamException("BamToolsIndex::CheckVersion", "could not read format version"); - if ( m_isBigEndian 
) SwapEndian_32(m_inputVersion); - - // if version is negative, or zero - if ( m_inputVersion <= 0 ) - throw BamException("BamToolsIndex::CheckVersion", "invalid format version"); - - // if version is newer than can be supported by this version of bamtools - else if ( m_inputVersion > m_outputVersion ) { - const string message = "unsupported format: this index was created by a newer version of BamTools. " - "Update your local version of BamTools to use the index file."; - throw BamException("BamToolsIndex::CheckVersion", message); - } - - // ------------------------------------------------------------------ - // check for deprecated, unsupported versions - // (the format had to be modified to accomodate a particular bug fix) - - // Version 2.0: introduced support for half-open intervals, instead of the old closed intervals - // respondBy: throwing exception - we're not going to try to handle the old BTI files. - else if ( (Version)m_inputVersion < BamToolsIndex::BTI_2_0 ) { - const string message = "unsupported format: this version of the index may not properly handle " - "coordinate intervals. 
Please run 'bamtools index -bti -in yourData.bam' " - "to generate an up-to-date, fixed BTI file."; - throw BamException("BamToolsIndex::CheckVersion", message); - } -} - -void BamToolsIndex::ClearReferenceEntry(BtiReferenceEntry& refEntry) { - refEntry.ID = -1; - refEntry.Blocks.clear(); -} - -void BamToolsIndex::CloseFile(void) { - if ( IsFileOpen() ) { - fclose(Resources.IndexStream); - Resources.IndexStream = 0; - } - m_indexFileSummary.clear(); -} - -// builds index from associated BAM file & writes out to index file -bool BamToolsIndex::Create(void) { - - // skip if BamReader is invalid or not open - if ( m_reader == 0 || !m_reader->IsOpen() ) { - SetErrorString("BamToolsIndex::Create", "could not create index: reader is not open"); - return false; - } - - // rewind BamReader - if ( !m_reader->Rewind() ) { - const string readerError = m_reader->GetErrorString(); - const string message = "could not create index: \n\t" + readerError; - SetErrorString("BamToolsIndex::Create", message); - return false; - } - - try { - // open new index file (read & write) - const string indexFilename = m_reader->Filename() + Extension(); - OpenFile(indexFilename, "w+b"); - - // initialize BtiFileSummary with number of references - const int& numReferences = m_reader->GetReferenceCount(); - InitializeFileSummary(numReferences); - - // intialize output file header - WriteHeader(); - - // index building markers - uint32_t currentBlockCount = 0; - int64_t currentAlignmentOffset = m_reader->Tell(); - int32_t blockRefId = -1; - int32_t blockMaxEndPosition = -1; - int64_t blockStartOffset = currentAlignmentOffset; - int32_t blockStartPosition = -1; - - // plow through alignments, storing index entries - BamAlignment al; - BtiReferenceEntry refEntry; - while ( m_reader->LoadNextAlignment(al) ) { - - // if moved to new reference - if ( al.RefID != blockRefId ) { - - // if first pass, check: - if ( currentBlockCount == 0 ) { - - // write any empty references up to (but not including) 
al.RefID - for ( int i = 0; i < al.RefID; ++i ) - WriteReferenceEntry( BtiReferenceEntry(i) ); - } - - // not first pass: - else { - - // store previous BTI block data in reference entry - const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition); - refEntry.Blocks.push_back(block); - - // write reference entry, then clear - WriteReferenceEntry(refEntry); - ClearReferenceEntry(refEntry); - - // write any empty references between (but not including) - // the last blockRefID and current al.RefID - for ( int i = blockRefId+1; i < al.RefID; ++i ) - WriteReferenceEntry( BtiReferenceEntry(i) ); - - // reset block count - currentBlockCount = 0; - } - - // set ID for new reference entry - refEntry.ID = al.RefID; - } - - // if beginning of block, update counters - if ( currentBlockCount == 0 ) { - blockRefId = al.RefID; - blockStartOffset = currentAlignmentOffset; - blockStartPosition = al.Position; - blockMaxEndPosition = al.GetEndPosition(); - } - - // increment block counter - ++currentBlockCount; - - // check end position - const int32_t alignmentEndPosition = al.GetEndPosition(); - if ( alignmentEndPosition > blockMaxEndPosition ) - blockMaxEndPosition = alignmentEndPosition; - - // if block is full, get offset for next block, reset currentBlockCount - if ( currentBlockCount == m_blockSize ) { - - // store previous block data in reference entry - const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition); - refEntry.Blocks.push_back(block); - - // update markers - blockStartOffset = m_reader->Tell(); - currentBlockCount = 0; - } - - // not the best name, but for the next iteration, this value will be the offset of the - // *current* alignment. 
this is necessary because we won't know if this next alignment - // is on a new reference until we actually read it - currentAlignmentOffset = m_reader->Tell(); - } - - // after finishing alignments, if any data was read, check: - if ( blockRefId >= 0 ) { - - // store last BTI block data in reference entry - const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition); - refEntry.Blocks.push_back(block); - - // write last reference entry, then clear - WriteReferenceEntry(refEntry); - ClearReferenceEntry(refEntry); - - // then write any empty references remaining at end of file - for ( int i = blockRefId+1; i < numReferences; ++i ) - WriteReferenceEntry( BtiReferenceEntry(i) ); - } - - } catch ( BamException& e ) { - m_errorString = e.what(); - return false; - } - - // rewind BamReader - if ( !m_reader->Rewind() ) { - const string readerError = m_reader->GetErrorString(); - const string message = "could not create index: \n\t" + readerError; - SetErrorString("BamToolsIndex::Create", message); - return false; - } - - // return success - return true; -} - -// returns format's file extension -const std::string BamToolsIndex::Extension(void) { - return BamToolsIndex::BTI_EXTENSION; -} - -void BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) { - - // return false ref ID is not a valid index in file summary data - if ( region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size() ) - throw BamException("BamToolsIndex::GetOffset", "invalid region requested"); - - // retrieve reference index data for left bound reference - BtiReferenceEntry refEntry(region.LeftRefID); - ReadReferenceEntry(refEntry); - - // binary search for an overlapping block (may not be first one though) - bool found = false; - typedef BtiBlockVector::const_iterator BtiBlockConstIterator; - BtiBlockConstIterator blockFirst = refEntry.Blocks.begin(); - BtiBlockConstIterator blockIter = blockFirst; - BtiBlockConstIterator 
blockLast = refEntry.Blocks.end(); - iterator_traits::difference_type count = distance(blockFirst, blockLast); - iterator_traits::difference_type step; - while ( count > 0 ) { - blockIter = blockFirst; - step = count/2; - advance(blockIter, step); - - const BtiBlock& block = (*blockIter); - if ( block.StartPosition <= region.RightPosition ) { - if ( block.MaxEndPosition > region.LeftPosition ) { - offset = block.StartOffset; - break; - } - blockFirst = ++blockIter; - count -= step+1; - } - else count = step; - } - - // if we didn't search "off the end" of the blocks - if ( blockIter != blockLast ) { - - // "walk back" until we've gone too far - while ( blockIter != blockFirst ) { - const BtiBlock& currentBlock = (*blockIter); - - --blockIter; - const BtiBlock& previousBlock = (*blockIter); - if ( previousBlock.MaxEndPosition <= region.LeftPosition ) { - offset = currentBlock.StartOffset; - found = true; - break; - } - } - - // if we walked all the way to first block, just return that and let the reader's - // region overlap parsing do the rest - if ( blockIter == blockFirst ) { - const BtiBlock& block = (*blockIter); - offset = block.StartOffset; - found = true; - } - } - - - // sets to false if blocks container is empty, or if no matching block could be found - *hasAlignmentsInRegion = found; -} - -// returns whether reference has alignments or no -bool BamToolsIndex::HasAlignments(const int& referenceID) const { - if ( referenceID < 0 || referenceID >= (int)m_indexFileSummary.size() ) - return false; - const BtiReferenceSummary& refSummary = m_indexFileSummary.at(referenceID); - return ( refSummary.NumBlocks > 0 ); -} - -// pre-allocates space for each reference's summary data -void BamToolsIndex::InitializeFileSummary(const int& numReferences) { - m_indexFileSummary.clear(); - for ( int i = 0; i < numReferences; ++i ) - m_indexFileSummary.push_back( BtiReferenceSummary() ); -} - -// returns true if the index stream is open -bool BamToolsIndex::IsFileOpen(void) 
const { - return ( Resources.IndexStream != 0 ); -} - -// attempts to use index data to jump to @region, returns success/fail -// a "successful" jump indicates no error, but not whether this region has data -// * thus, the method sets a flag to indicate whether there are alignments -// available after the jump position -bool BamToolsIndex::Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion) { - - // clear flag - *hasAlignmentsInRegion = false; - - // skip if invalid reader or not open - if ( m_reader == 0 || !m_reader->IsOpen() ) { - SetErrorString("BamToolsIndex::Jump", "could not jump: reader is not open"); - return false; - } - - // make sure left-bound position is valid - const RefVector& references = m_reader->GetReferenceData(); - if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) { - SetErrorString("BamToolsIndex::Jump", "could not create index: invalid region requested"); - return false; - } - - // calculate nearest offset to jump to - int64_t offset; - try { - GetOffset(region, offset, hasAlignmentsInRegion); - } catch ( BamException& e ) { - m_errorString = e.what(); - return false; - } - - // return success/failure of seek - return m_reader->Seek(offset); -} - -// loads existing data from file into memory -bool BamToolsIndex::Load(const std::string& filename) { - - try { - - // attempt to open file (read-only) - OpenFile(filename, "rb"); - - // load metadata & generate in-memory summary - LoadHeader(); - LoadFileSummary(); - - // return success - return true; - - } catch ( BamException& e ) { - m_errorString = e.what(); - return false; - } -} - -void BamToolsIndex::LoadFileSummary(void) { - - // load number of reference sequences - int numReferences; - LoadNumReferences(numReferences); - - // initialize file summary data - InitializeFileSummary(numReferences); - - // load summary for each reference - BtiFileSummary::iterator summaryIter = m_indexFileSummary.begin(); - BtiFileSummary::iterator summaryEnd = 
m_indexFileSummary.end(); - for ( ; summaryIter != summaryEnd; ++summaryIter ) - LoadReferenceSummary(*summaryIter); -} - -void BamToolsIndex::LoadHeader(void) { - - // check BTI file metadata - CheckMagicNumber(); - CheckVersion(); - - // use file's BTI block size to set member variable - const size_t elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, Resources.IndexStream); - if ( m_isBigEndian ) SwapEndian_32(m_blockSize); - if ( elementsRead != 1 ) - throw BamException("BamToolsIndex::LoadHeader", "could not read BTI block size"); -} - -void BamToolsIndex::LoadNumBlocks(int& numBlocks) { - const size_t elementsRead = fread(&numBlocks, sizeof(numBlocks), 1, Resources.IndexStream); - if ( m_isBigEndian ) SwapEndian_32(numBlocks); - if ( elementsRead != 1 ) - throw BamException("BamToolsIndex::LoadNumBlocks", "could not read number of BTI blocks"); -} - -void BamToolsIndex::LoadNumReferences(int& numReferences) { - const size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, Resources.IndexStream); - if ( m_isBigEndian ) SwapEndian_32(numReferences); - if ( elementsRead != 1 ) - throw BamException("BamToolsIndex::LoadNumReferences", "could not read number of references"); -} - -void BamToolsIndex::LoadReferenceSummary(BtiReferenceSummary& refSummary) { - - // load number of blocks - int numBlocks; - LoadNumBlocks(numBlocks); - - // store block summary data for this reference - refSummary.NumBlocks = numBlocks; - refSummary.FirstBlockFilePosition = Tell(); - - // skip reference's blocks - SkipBlocks(numBlocks); -} - -void BamToolsIndex::OpenFile(const std::string& filename, const char* mode) { - - // make sure any previous index file is closed - CloseFile(); - - // attempt to open file - Resources.IndexStream = fopen(filename.c_str(), mode); - if ( !IsFileOpen() ) { - const string message = string("could not open file: ") + filename; - throw BamException("BamToolsIndex::OpenFile", message); - } -} - -void BamToolsIndex::ReadBlock(BtiBlock& 
block) { - - // read in block data members - size_t elementsRead = 0; - elementsRead += fread(&block.MaxEndPosition, sizeof(block.MaxEndPosition), 1, Resources.IndexStream); - elementsRead += fread(&block.StartOffset, sizeof(block.StartOffset), 1, Resources.IndexStream); - elementsRead += fread(&block.StartPosition, sizeof(block.StartPosition), 1, Resources.IndexStream); - - // swap endian-ness if necessary - if ( m_isBigEndian ) { - SwapEndian_32(block.MaxEndPosition); - SwapEndian_64(block.StartOffset); - SwapEndian_32(block.StartPosition); - } - - if ( elementsRead != 3 ) - throw BamException("BamToolsIndex::ReadBlock", "could not read block"); -} - -void BamToolsIndex::ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks) { - - // prep blocks container - blocks.clear(); - blocks.reserve(refSummary.NumBlocks); - - // skip to first block entry - Seek( refSummary.FirstBlockFilePosition, SEEK_SET ); - - // read & store block entries - BtiBlock block; - for ( int i = 0; i < refSummary.NumBlocks; ++i ) { - ReadBlock(block); - blocks.push_back(block); - } -} - -void BamToolsIndex::ReadReferenceEntry(BtiReferenceEntry& refEntry) { - - // return false if refId not valid index in file summary structure - if ( refEntry.ID < 0 || refEntry.ID >= (int)m_indexFileSummary.size() ) - throw BamException("BamToolsIndex::ReadReferenceEntry", "invalid reference requested"); - - // use index summary to assist reading the reference's BTI blocks - const BtiReferenceSummary& refSummary = m_indexFileSummary.at(refEntry.ID); - ReadBlocks(refSummary, refEntry.Blocks); -} - -void BamToolsIndex::Seek(const int64_t& position, const int& origin) { - if ( fseek64(Resources.IndexStream, position, origin) != 0 ) - throw BamException("BamToolsIndex::Seek", "could not seek in BAI file"); -} - -void BamToolsIndex::SkipBlocks(const int& numBlocks) { - Seek( numBlocks*BamToolsIndex::SIZEOF_BLOCK, SEEK_CUR ); -} - -int64_t BamToolsIndex::Tell(void) const { - return 
ftell64(Resources.IndexStream); -} - -void BamToolsIndex::WriteBlock(const BtiBlock& block) { - - // copy entry data - int32_t maxEndPosition = block.MaxEndPosition; - int64_t startOffset = block.StartOffset; - int32_t startPosition = block.StartPosition; - - // swap endian-ness if necessary - if ( m_isBigEndian ) { - SwapEndian_32(maxEndPosition); - SwapEndian_64(startOffset); - SwapEndian_32(startPosition); - } - - // write the reference index entry - size_t elementsWritten = 0; - elementsWritten += fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, Resources.IndexStream); - elementsWritten += fwrite(&startOffset, sizeof(startOffset), 1, Resources.IndexStream); - elementsWritten += fwrite(&startPosition, sizeof(startPosition), 1, Resources.IndexStream); - if ( elementsWritten != 3 ) - throw BamException("BamToolsIndex::WriteBlock", "could not write BTI block"); -} - -void BamToolsIndex::WriteBlocks(const BtiBlockVector& blocks) { - BtiBlockVector::const_iterator blockIter = blocks.begin(); - BtiBlockVector::const_iterator blockEnd = blocks.end(); - for ( ; blockIter != blockEnd; ++blockIter ) - WriteBlock(*blockIter); -} - -void BamToolsIndex::WriteHeader(void) { - - size_t elementsWritten = 0; - - // write BTI index format 'magic number' - elementsWritten += fwrite(BamToolsIndex::BTI_MAGIC, 1, 4, Resources.IndexStream); - - // write BTI index format version - int32_t currentVersion = (int32_t)m_outputVersion; - if ( m_isBigEndian ) SwapEndian_32(currentVersion); - elementsWritten += fwrite(¤tVersion, sizeof(currentVersion), 1, Resources.IndexStream); - - // write block size - uint32_t blockSize = m_blockSize; - if ( m_isBigEndian ) SwapEndian_32(blockSize); - elementsWritten += fwrite(&blockSize, sizeof(blockSize), 1, Resources.IndexStream); - - // write number of references - int32_t numReferences = m_indexFileSummary.size(); - if ( m_isBigEndian ) SwapEndian_32(numReferences); - elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, 
Resources.IndexStream); - - if ( elementsWritten != 7 ) - throw BamException("BamToolsIndex::WriteHeader", "could not write BTI header"); -} - -void BamToolsIndex::WriteReferenceEntry(const BtiReferenceEntry& refEntry) { - - // write number of blocks this reference - uint32_t numBlocks = refEntry.Blocks.size(); - if ( m_isBigEndian ) SwapEndian_32(numBlocks); - const size_t elementsWritten = fwrite(&numBlocks, sizeof(numBlocks), 1, Resources.IndexStream); - if ( elementsWritten != 1 ) - throw BamException("BamToolsIndex::WriteReferenceEntry", "could not write number of blocks"); - - // write actual block entries - WriteBlocks(refEntry.Blocks); -} diff --git a/src/api/internal/BamToolsIndex_p.h b/src/api/internal/BamToolsIndex_p.h deleted file mode 100644 index 1e9ec18..0000000 --- a/src/api/internal/BamToolsIndex_p.h +++ /dev/null @@ -1,185 +0,0 @@ -// *************************************************************************** -// BamToolsIndex.h (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) -// --------------------------------------------------------------------------- -// Provides index operations for the BamTools index format (".bti") -// *************************************************************************** - -#ifndef BAMTOOLS_INDEX_FORMAT_H -#define BAMTOOLS_INDEX_FORMAT_H - -// ------------- -// W A R N I N G -// ------------- -// -// This file is not part of the BamTools API. It exists purely as an -// implementation detail. This header file may change from version to -// version without notice, or even be removed. -// -// We mean it. 
- -#include "api/BamAux.h" -#include "api/BamIndex.h" -#include -#include -#include - -namespace BamTools { -namespace Internal { - -// contains data for each 'block' in a BTI index -struct BtiBlock { - - // data members - int32_t MaxEndPosition; - int64_t StartOffset; - int32_t StartPosition; - - // ctor - BtiBlock(const int32_t& maxEndPosition = 0, - const int64_t& startOffset = 0, - const int32_t& startPosition = 0) - : MaxEndPosition(maxEndPosition) - , StartOffset(startOffset) - , StartPosition(startPosition) - { } -}; - -// convenience typedef for describing a a list of BTI blocks on a reference -typedef std::vector BtiBlockVector; - -// contains all fields necessary for building, loading, & writing -// full BTI index data for a single reference -struct BtiReferenceEntry { - - // data members - int32_t ID; - BtiBlockVector Blocks; - - // ctor - BtiReferenceEntry(const int& id = -1) - : ID(id) - { } -}; - -// provides (persistent) summary of BtiReferenceEntry's index data -struct BtiReferenceSummary { - - // data members - int NumBlocks; - uint64_t FirstBlockFilePosition; - - // ctor - BtiReferenceSummary(void) - : NumBlocks(0) - , FirstBlockFilePosition(0) - { } -}; - -// convenience typedef for describing a full BTI index file summary -typedef std::vector BtiFileSummary; - -class BamToolsIndex : public BamIndex { - - // keep a list of any supported versions here - // (might be useful later to handle any 'legacy' versions if the format changes) - // listed for example like: BTI_1_0 = 1, BTI_1_1 = 2, BTI_1_2 = 3, BTI_2_0 = 4, and so on - // - // so a change introduced in BTI_1_2 may be handled from then on by: - // - // if ( indexVersion >= BTI_1_2 ) - // do something new - // else - // do the old thing - enum Version { BTI_1_0 = 1 - , BTI_1_1 - , BTI_1_2 - , BTI_2_0 - }; - - // ctor & dtor - public: - BamToolsIndex(Internal::BamReaderPrivate* reader); - ~BamToolsIndex(void); - - // BamIndex implementation - public: - // builds index from associated BAM file & 
writes out to index file - bool Create(void); - // returns whether reference has alignments or no - bool HasAlignments(const int& referenceID) const; - // attempts to use index data to jump to @region, returns success/fail - // a "successful" jump indicates no error, but not whether this region has data - // * thus, the method sets a flag to indicate whether there are alignments - // available after the jump position - bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); - // loads existing data from file into memory - bool Load(const std::string& filename); - BamIndex::IndexType Type(void) const { return BamIndex::BAMTOOLS; } - public: - // returns format's file extension - static const std::string Extension(void); - - // internal methods - private: - - // index file ops - void CheckMagicNumber(void); - void CheckVersion(void); - void CloseFile(void); - bool IsFileOpen(void) const; - void OpenFile(const std::string& filename, const char* mode); - void Seek(const int64_t& position, const int& origin); - int64_t Tell(void) const; - - // index-creation methods - void ClearReferenceEntry(BtiReferenceEntry& refEntry); - void WriteBlock(const BtiBlock& block); - void WriteBlocks(const BtiBlockVector& blocks); - void WriteHeader(void); - void WriteReferenceEntry(const BtiReferenceEntry& refEntry); - - // random-access methods - void GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion); - void ReadBlock(BtiBlock& block); - void ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks); - void ReadReferenceEntry(BtiReferenceEntry& refEntry); - - // BTI summary data methods - void InitializeFileSummary(const int& numReferences); - void LoadFileSummary(void); - void LoadHeader(void); - void LoadNumBlocks(int& numBlocks); - void LoadNumReferences(int& numReferences); - void LoadReferenceSummary(BtiReferenceSummary& refSummary); - void SkipBlocks(const int& numBlocks); - - // data members - private: - bool 
m_isBigEndian; - BtiFileSummary m_indexFileSummary; - uint32_t m_blockSize; - int32_t m_inputVersion; // Version is serialized as int - Version m_outputVersion; - - struct RaiiWrapper { - FILE* IndexStream; - RaiiWrapper(void); - ~RaiiWrapper(void); - }; - RaiiWrapper Resources; - - // static constants - private: - static const uint32_t DEFAULT_BLOCK_LENGTH; - static const std::string BTI_EXTENSION; - static const char* const BTI_MAGIC; - static const int SIZEOF_BLOCK; -}; - -} // namespace Internal -} // namespace BamTools - -#endif // BAMTOOLS_INDEX_FORMAT_H diff --git a/src/api/internal/BamWriter_p.cpp b/src/api/internal/BamWriter_p.cpp deleted file mode 100644 index 1b1a3f2..0000000 --- a/src/api/internal/BamWriter_p.cpp +++ /dev/null @@ -1,462 +0,0 @@ -// *************************************************************************** -// BamWriter_p.cpp (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) -// --------------------------------------------------------------------------- -// Provides the basic functionality for producing BAM files -// *************************************************************************** - -#include "api/BamAlignment.h" -#include "api/BamConstants.h" -#include "api/IBamIODevice.h" -#include "api/internal/BamException_p.h" -#include "api/internal/BamWriter_p.h" -using namespace BamTools; -using namespace BamTools::Internal; - -#include -#include -using namespace std; - -// ctor -BamWriterPrivate::BamWriterPrivate(void) - : m_isBigEndian( BamTools::SystemIsBigEndian() ) -{ } - -// dtor -BamWriterPrivate::~BamWriterPrivate(void) { - Close(); -} - -// calculates minimum bin for a BAM alignment interval [begin, end) -uint32_t BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const { - --end; - if ( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14); - if ( (begin >> 17) == 
(end >> 17) ) return 585 + (begin >> 17); - if ( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20); - if ( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23); - if ( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26); - return 0; -} - -// closes the alignment archive -void BamWriterPrivate::Close(void) { - - // skip if file not open - if ( !IsOpen() ) return; - - // close output stream - try { - m_stream.Close(); - } catch ( BamException& e ) { - m_errorString = e.what(); - } -} - -// creates a cigar string from the supplied alignment -void BamWriterPrivate::CreatePackedCigar(const vector& cigarOperations, string& packedCigar) { - - // initialize - const size_t numCigarOperations = cigarOperations.size(); - packedCigar.resize(numCigarOperations * Constants::BAM_SIZEOF_INT); - - // pack the cigar data into the string - unsigned int* pPackedCigar = (unsigned int*)packedCigar.data(); - - // iterate over cigar operations - vector::const_iterator coIter = cigarOperations.begin(); - vector::const_iterator coEnd = cigarOperations.end(); - for ( ; coIter != coEnd; ++coIter ) { - - // store op in packedCigar - uint8_t cigarOp; - switch ( coIter->Type ) { - case (Constants::BAM_CIGAR_MATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_MATCH; break; - case (Constants::BAM_CIGAR_INS_CHAR) : cigarOp = Constants::BAM_CIGAR_INS; break; - case (Constants::BAM_CIGAR_DEL_CHAR) : cigarOp = Constants::BAM_CIGAR_DEL; break; - case (Constants::BAM_CIGAR_REFSKIP_CHAR) : cigarOp = Constants::BAM_CIGAR_REFSKIP; break; - case (Constants::BAM_CIGAR_SOFTCLIP_CHAR) : cigarOp = Constants::BAM_CIGAR_SOFTCLIP; break; - case (Constants::BAM_CIGAR_HARDCLIP_CHAR) : cigarOp = Constants::BAM_CIGAR_HARDCLIP; break; - case (Constants::BAM_CIGAR_PAD_CHAR) : cigarOp = Constants::BAM_CIGAR_PAD; break; - case (Constants::BAM_CIGAR_SEQMATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_SEQMATCH; break; - case (Constants::BAM_CIGAR_MISMATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_MISMATCH; break; - default: 
- const string message = string("invalid CIGAR operation type") + coIter->Type; - throw BamException("BamWriter::CreatePackedCigar", message); - } - - *pPackedCigar = coIter->Length << Constants::BAM_CIGAR_SHIFT | cigarOp; - pPackedCigar++; - } -} - -// encodes the supplied query sequence into 4-bit notation -void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) { - - // prepare the encoded query string - const size_t queryLength = query.size(); - const size_t encodedQueryLength = static_cast((queryLength+1)/2); - encodedQuery.resize(encodedQueryLength); - char* pEncodedQuery = (char*)encodedQuery.data(); - const char* pQuery = (const char*)query.data(); - - // walk through original query sequence, encoding its bases - unsigned char nucleotideCode; - bool useHighWord = true; - while ( *pQuery ) { - switch ( *pQuery ) { - case (Constants::BAM_DNA_EQUAL) : nucleotideCode = Constants::BAM_BASECODE_EQUAL; break; - case (Constants::BAM_DNA_A) : nucleotideCode = Constants::BAM_BASECODE_A; break; - case (Constants::BAM_DNA_C) : nucleotideCode = Constants::BAM_BASECODE_C; break; - case (Constants::BAM_DNA_M) : nucleotideCode = Constants::BAM_BASECODE_M; break; - case (Constants::BAM_DNA_G) : nucleotideCode = Constants::BAM_BASECODE_G; break; - case (Constants::BAM_DNA_R) : nucleotideCode = Constants::BAM_BASECODE_R; break; - case (Constants::BAM_DNA_S) : nucleotideCode = Constants::BAM_BASECODE_S; break; - case (Constants::BAM_DNA_V) : nucleotideCode = Constants::BAM_BASECODE_V; break; - case (Constants::BAM_DNA_T) : nucleotideCode = Constants::BAM_BASECODE_T; break; - case (Constants::BAM_DNA_W) : nucleotideCode = Constants::BAM_BASECODE_W; break; - case (Constants::BAM_DNA_Y) : nucleotideCode = Constants::BAM_BASECODE_Y; break; - case (Constants::BAM_DNA_H) : nucleotideCode = Constants::BAM_BASECODE_H; break; - case (Constants::BAM_DNA_K) : nucleotideCode = Constants::BAM_BASECODE_K; break; - case (Constants::BAM_DNA_D) : nucleotideCode = 
Constants::BAM_BASECODE_D; break; - case (Constants::BAM_DNA_B) : nucleotideCode = Constants::BAM_BASECODE_B; break; - case (Constants::BAM_DNA_N) : nucleotideCode = Constants::BAM_BASECODE_N; break; - default: - const string message = string("invalid base: ") + *pQuery; - throw BamException("BamWriter::EncodeQuerySequence", message); - } - - // pack the nucleotide code - if ( useHighWord ) { - *pEncodedQuery = nucleotideCode << 4; - useHighWord = false; - } else { - *pEncodedQuery |= nucleotideCode; - ++pEncodedQuery; - useHighWord = true; - } - - // increment the query position - ++pQuery; - } -} - -// returns a description of the last error that occurred -std::string BamWriterPrivate::GetErrorString(void) const { - return m_errorString; -} - -// returns whether BAM file is open for writing or not -bool BamWriterPrivate::IsOpen(void) const { - return m_stream.IsOpen(); -} - -// opens the alignment archive -bool BamWriterPrivate::Open(const string& filename, - const string& samHeaderText, - const RefVector& referenceSequences) -{ - try { - - // open the BGZF file for writing - m_stream.Open(filename, IBamIODevice::WriteOnly); - - // write BAM file 'metadata' components - WriteMagicNumber(); - WriteSamHeaderText(samHeaderText); - WriteReferences(referenceSequences); - - // return success - return true; - - } catch ( BamException& e ) { - m_errorString = e.what(); - return false; - } -} - -// saves the alignment to the alignment archive -bool BamWriterPrivate::SaveAlignment(const BamAlignment& al) { - - try { - - // if BamAlignment contains only the core data and a raw char data buffer - // (as a result of BamReader::GetNextAlignmentCore()) - if ( al.SupportData.HasCoreOnly ) - WriteCoreAlignment(al); - - // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc - // (resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code) - else WriteAlignment(al); - - // if we get here, everything OK - 
return true; - - } catch ( BamException& e ) { - m_errorString = e.what(); - return false; - } -} - -void BamWriterPrivate::SetWriteCompressed(bool ok) { - // modifying compression is not allowed if BAM file is open - if ( !IsOpen() ) - m_stream.SetWriteCompressed(ok); -} - -void BamWriterPrivate::WriteAlignment(const BamAlignment& al) { - - // calculate char lengths - const unsigned int nameLength = al.Name.size() + 1; - const unsigned int numCigarOperations = al.CigarData.size(); - const unsigned int queryLength = al.QueryBases.size(); - const unsigned int tagDataLength = al.TagData.size(); - - // no way to tell if alignment's bin is already defined (there is no default, invalid value) - // so we'll go ahead calculate its bin ID before storing - const uint32_t alignmentBin = CalculateMinimumBin(al.Position, al.GetEndPosition()); - - // create our packed cigar string - string packedCigar; - CreatePackedCigar(al.CigarData, packedCigar); - const unsigned int packedCigarLength = packedCigar.size(); - - // encode the query - string encodedQuery; - EncodeQuerySequence(al.QueryBases, encodedQuery); - const unsigned int encodedQueryLength = encodedQuery.size(); - - // write the block size - const unsigned int dataBlockSize = nameLength + - packedCigarLength + - encodedQueryLength + - queryLength + - tagDataLength; - unsigned int blockSize = Constants::BAM_CORE_SIZE + dataBlockSize; - if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize); - m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT); - - // assign the BAM core data - uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE]; - buffer[0] = al.RefID; - buffer[1] = al.Position; - buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength; - buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations; - buffer[4] = queryLength; - buffer[5] = al.MateRefID; - buffer[6] = al.MatePosition; - buffer[7] = al.InsertSize; - - // swap BAM core endian-ness, if necessary - if ( m_isBigEndian ) { - for ( int i = 0; i < 
8; ++i ) - BamTools::SwapEndian_32(buffer[i]); - } - - // write the BAM core - m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE); - - // write the query name - m_stream.Write(al.Name.c_str(), nameLength); - - // write the packed cigar - if ( m_isBigEndian ) { - char* cigarData = new char[packedCigarLength](); - memcpy(cigarData, packedCigar.data(), packedCigarLength); - if ( m_isBigEndian ) { - for ( size_t i = 0; i < packedCigarLength; ++i ) - BamTools::SwapEndian_32p(&cigarData[i]); - } - m_stream.Write(cigarData, packedCigarLength); - delete[] cigarData; // TODO: cleanup on Write exception thrown? - } - else - m_stream.Write(packedCigar.data(), packedCigarLength); - - // write the encoded query sequence - m_stream.Write(encodedQuery.data(), encodedQueryLength); - - // write the base qualities - char* pBaseQualities = (char*)al.Qualities.data(); - for ( size_t i = 0; i < queryLength; ++i ) - pBaseQualities[i] -= 33; // FASTQ conversion - m_stream.Write(pBaseQualities, queryLength); - - // write the read group tag - if ( m_isBigEndian ) { - - char* tagData = new char[tagDataLength](); - memcpy(tagData, al.TagData.data(), tagDataLength); - - size_t i = 0; - while ( i < tagDataLength ) { - - i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.) 
- const char type = tagData[i]; // get tag type at position i - ++i; - - switch ( type ) { - - case(Constants::BAM_TAG_TYPE_ASCII) : - case(Constants::BAM_TAG_TYPE_INT8) : - case(Constants::BAM_TAG_TYPE_UINT8) : - ++i; - break; - - case(Constants::BAM_TAG_TYPE_INT16) : - case(Constants::BAM_TAG_TYPE_UINT16) : - BamTools::SwapEndian_16p(&tagData[i]); - i += sizeof(uint16_t); - break; - - case(Constants::BAM_TAG_TYPE_FLOAT) : - case(Constants::BAM_TAG_TYPE_INT32) : - case(Constants::BAM_TAG_TYPE_UINT32) : - BamTools::SwapEndian_32p(&tagData[i]); - i += sizeof(uint32_t); - break; - - case(Constants::BAM_TAG_TYPE_HEX) : - case(Constants::BAM_TAG_TYPE_STRING) : - // no endian swapping necessary for hex-string/string data - while ( tagData[i] ) - ++i; - // increment one more for null terminator - ++i; - break; - - case(Constants::BAM_TAG_TYPE_ARRAY) : - - { - // read array type - const char arrayType = tagData[i]; - ++i; - - // swap endian-ness of number of elements in place, then retrieve for loop - BamTools::SwapEndian_32p(&tagData[i]); - int32_t numElements; - memcpy(&numElements, &tagData[i], sizeof(uint32_t)); - i += sizeof(uint32_t); - - // swap endian-ness of array elements - for ( int j = 0; j < numElements; ++j ) { - switch (arrayType) { - case (Constants::BAM_TAG_TYPE_INT8) : - case (Constants::BAM_TAG_TYPE_UINT8) : - // no endian-swapping necessary - ++i; - break; - case (Constants::BAM_TAG_TYPE_INT16) : - case (Constants::BAM_TAG_TYPE_UINT16) : - BamTools::SwapEndian_16p(&tagData[i]); - i += sizeof(uint16_t); - break; - case (Constants::BAM_TAG_TYPE_FLOAT) : - case (Constants::BAM_TAG_TYPE_INT32) : - case (Constants::BAM_TAG_TYPE_UINT32) : - BamTools::SwapEndian_32p(&tagData[i]); - i += sizeof(uint32_t); - break; - default: - delete[] tagData; - const string message = string("invalid binary array type: ") + arrayType; - throw BamException("BamWriter::SaveAlignment", message); - } - } - - break; - } - - default : - delete[] tagData; - const string message = 
string("invalid tag type: ") + type; - throw BamException("BamWriter::SaveAlignment", message); - } - } - - m_stream.Write(tagData, tagDataLength); - delete[] tagData; // TODO: cleanup on Write exception thrown? - } - else - m_stream.Write(al.TagData.data(), tagDataLength); -} - -void BamWriterPrivate::WriteCoreAlignment(const BamAlignment& al) { - - // write the block size - unsigned int blockSize = al.SupportData.BlockLength; - if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize); - m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT); - - // re-calculate bin (in case BamAlignment's position has been previously modified) - const uint32_t alignmentBin = CalculateMinimumBin(al.Position, al.GetEndPosition()); - - // assign the BAM core data - uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE]; - buffer[0] = al.RefID; - buffer[1] = al.Position; - buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength; - buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations; - buffer[4] = al.SupportData.QuerySequenceLength; - buffer[5] = al.MateRefID; - buffer[6] = al.MatePosition; - buffer[7] = al.InsertSize; - - // swap BAM core endian-ness, if necessary - if ( m_isBigEndian ) { - for ( int i = 0; i < 8; ++i ) - BamTools::SwapEndian_32(buffer[i]); - } - - // write the BAM core - m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE); - - // write the raw char data - m_stream.Write((char*)al.SupportData.AllCharData.data(), - al.SupportData.BlockLength-Constants::BAM_CORE_SIZE); -} - -void BamWriterPrivate::WriteMagicNumber(void) { - // write BAM file 'magic number' - m_stream.Write(Constants::BAM_HEADER_MAGIC, Constants::BAM_HEADER_MAGIC_LENGTH); -} - -void BamWriterPrivate::WriteReferences(const BamTools::RefVector& referenceSequences) { - - // write the number of reference sequences - uint32_t numReferenceSequences = referenceSequences.size(); - if ( m_isBigEndian ) BamTools::SwapEndian_32(numReferenceSequences); - 
m_stream.Write((char*)&numReferenceSequences, Constants::BAM_SIZEOF_INT); - - // foreach reference sequence - RefVector::const_iterator rsIter = referenceSequences.begin(); - RefVector::const_iterator rsEnd = referenceSequences.end(); - for ( ; rsIter != rsEnd; ++rsIter ) { - - // write the reference sequence name length - uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1; - if ( m_isBigEndian ) BamTools::SwapEndian_32(referenceSequenceNameLen); - m_stream.Write((char*)&referenceSequenceNameLen, Constants::BAM_SIZEOF_INT); - - // write the reference sequence name - m_stream.Write(rsIter->RefName.c_str(), referenceSequenceNameLen); - - // write the reference sequence length - int32_t referenceLength = rsIter->RefLength; - if ( m_isBigEndian ) BamTools::SwapEndian_32(referenceLength); - m_stream.Write((char*)&referenceLength, Constants::BAM_SIZEOF_INT); - } -} - -void BamWriterPrivate::WriteSamHeaderText(const std::string& samHeaderText) { - - // write the SAM header text length - uint32_t samHeaderLen = samHeaderText.size(); - if ( m_isBigEndian ) BamTools::SwapEndian_32(samHeaderLen); - m_stream.Write((char*)&samHeaderLen, Constants::BAM_SIZEOF_INT); - - // write the SAM header text - if ( samHeaderLen > 0 ) - m_stream.Write(samHeaderText.data(), samHeaderLen); -} diff --git a/src/api/internal/BamWriter_p.h b/src/api/internal/BamWriter_p.h deleted file mode 100644 index cf10941..0000000 --- a/src/api/internal/BamWriter_p.h +++ /dev/null @@ -1,73 +0,0 @@ -// *************************************************************************** -// BamWriter_p.h (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) -// --------------------------------------------------------------------------- -// Provides the basic functionality for producing BAM files -// 
*************************************************************************** - -#ifndef BAMWRITER_P_H -#define BAMWRITER_P_H - -// ------------- -// W A R N I N G -// ------------- -// -// This file is not part of the BamTools API. It exists purely as an -// implementation detail. This header file may change from version to -// version without notice, or even be removed. -// -// We mean it. - -#include "api/BamAux.h" -#include "api/internal/BgzfStream_p.h" -#include -#include - -namespace BamTools { - -class BamAlignment; - -namespace Internal { - -class BamWriterPrivate { - - // ctor & dtor - public: - BamWriterPrivate(void); - ~BamWriterPrivate(void); - - // interface methods - public: - void Close(void); - std::string GetErrorString(void) const; - bool IsOpen(void) const; - bool Open(const std::string& filename, - const std::string& samHeaderText, - const BamTools::RefVector& referenceSequences); - bool SaveAlignment(const BamAlignment& al); - void SetWriteCompressed(bool ok); - - // 'internal' methods - public: - uint32_t CalculateMinimumBin(const int begin, int end) const; - void CreatePackedCigar(const std::vector& cigarOperations, std::string& packedCigar); - void EncodeQuerySequence(const std::string& query, std::string& encodedQuery); - void WriteAlignment(const BamAlignment& al); - void WriteCoreAlignment(const BamAlignment& al); - void WriteMagicNumber(void); - void WriteReferences(const BamTools::RefVector& referenceSequences); - void WriteSamHeaderText(const std::string& samHeaderText); - - // data members - private: - BgzfStream m_stream; - bool m_isBigEndian; - std::string m_errorString; -}; - -} // namespace Internal -} // namespace BamTools - -#endif // BAMWRITER_P_H diff --git a/src/api/internal/BgzfStream_p.cpp b/src/api/internal/BgzfStream_p.cpp deleted file mode 100644 index 5891067..0000000 --- a/src/api/internal/BgzfStream_p.cpp +++ /dev/null @@ -1,460 +0,0 @@ -// *************************************************************************** -// 
BgzfStream_p.cpp (c) 2011 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 11 October 2011(DB) -// --------------------------------------------------------------------------- -// Based on BGZF routines developed at the Broad Institute. -// Provides the basic functionality for reading & writing BGZF files -// Replaces the old BGZF.* files to avoid clashing with other toolkits -// *************************************************************************** - -#include "api/BamAux.h" -#include "api/BamConstants.h" -#include "api/internal/BamDeviceFactory_p.h" -#include "api/internal/BamException_p.h" -#include "api/internal/BgzfStream_p.h" -using namespace BamTools; -using namespace BamTools::Internal; - -#include "zlib.h" - -#include -#include -#include -#include -using namespace std; - -// ---------------------------- -// RaiiWrapper implementation -// ---------------------------- - -BgzfStream::RaiiWrapper::RaiiWrapper(void) { - CompressedBlock = new char[Constants::BGZF_MAX_BLOCK_SIZE]; - UncompressedBlock = new char[Constants::BGZF_DEFAULT_BLOCK_SIZE]; -} - -BgzfStream::RaiiWrapper::~RaiiWrapper(void) { - - // clean up buffers - delete[] CompressedBlock; - delete[] UncompressedBlock; - CompressedBlock = 0; - UncompressedBlock = 0; -} - -// --------------------------- -// BgzfStream implementation -// --------------------------- - -// constructor -BgzfStream::BgzfStream(void) - : m_blockLength(0) - , m_blockOffset(0) - , m_blockAddress(0) - , m_isWriteCompressed(true) - , m_device(0) -{ } - -// destructor -BgzfStream::~BgzfStream(void) { - Close(); -} - -// checks BGZF block header -bool BgzfStream::CheckBlockHeader(char* header) { - return (header[0] == Constants::GZIP_ID1 && - header[1] == Constants::GZIP_ID2 && - header[2] == Z_DEFLATED && - (header[3] & Constants::FLG_FEXTRA) != 0 && - BamTools::UnpackUnsignedShort(&header[10]) == 
Constants::BGZF_XLEN && - header[12] == Constants::BGZF_ID1 && - header[13] == Constants::BGZF_ID2 && - BamTools::UnpackUnsignedShort(&header[14]) == Constants::BGZF_LEN ); -} - -// closes BGZF file -void BgzfStream::Close(void) { - - // skip if no device open - if ( m_device == 0 ) return; - - // if writing to file, flush the current BGZF block, - // then write an empty block (as EOF marker) - if ( m_device->IsOpen() && (m_device->Mode() == IBamIODevice::WriteOnly) ) { - FlushBlock(); - const size_t blockLength = DeflateBlock(); - m_device->Write(Resources.CompressedBlock, blockLength); - } - - // close device - m_device->Close(); - delete m_device; - m_device = 0; - - // reset state - m_blockLength = 0; - m_blockOffset = 0; - m_blockAddress = 0; - m_isWriteCompressed = true; -} - -// compresses the current block -size_t BgzfStream::DeflateBlock(void) { - - // initialize the gzip header - char* buffer = Resources.CompressedBlock; - memset(buffer, 0, 18); - buffer[0] = Constants::GZIP_ID1; - buffer[1] = Constants::GZIP_ID2; - buffer[2] = Constants::CM_DEFLATE; - buffer[3] = Constants::FLG_FEXTRA; - buffer[9] = Constants::OS_UNKNOWN; - buffer[10] = Constants::BGZF_XLEN; - buffer[12] = Constants::BGZF_ID1; - buffer[13] = Constants::BGZF_ID2; - buffer[14] = Constants::BGZF_LEN; - - // set compression level - const int compressionLevel = ( m_isWriteCompressed ? 
Z_DEFAULT_COMPRESSION : 0 ); - - // loop to retry for blocks that do not compress enough - int inputLength = m_blockOffset; - size_t compressedLength = 0; - const unsigned int bufferSize = Constants::BGZF_MAX_BLOCK_SIZE; - - while ( true ) { - - // initialize zstream values - z_stream zs; - zs.zalloc = NULL; - zs.zfree = NULL; - zs.next_in = (Bytef*)Resources.UncompressedBlock; - zs.avail_in = inputLength; - zs.next_out = (Bytef*)&buffer[Constants::BGZF_BLOCK_HEADER_LENGTH]; - zs.avail_out = bufferSize - - Constants::BGZF_BLOCK_HEADER_LENGTH - - Constants::BGZF_BLOCK_FOOTER_LENGTH; - - // initialize the zlib compression algorithm - int status = deflateInit2(&zs, - compressionLevel, - Z_DEFLATED, - Constants::GZIP_WINDOW_BITS, - Constants::Z_DEFAULT_MEM_LEVEL, - Z_DEFAULT_STRATEGY); - if ( status != Z_OK ) - throw BamException("BgzfStream::DeflateBlock", "zlib deflateInit2 failed"); - - // compress the data - status = deflate(&zs, Z_FINISH); - - // if not at stream end - if ( status != Z_STREAM_END ) { - - deflateEnd(&zs); - - // there was not enough space available in buffer - // try to reduce the input length & re-start loop - if ( status == Z_OK ) { - inputLength -= 1024; - if ( inputLength < 0 ) - throw BamException("BgzfStream::DeflateBlock", "input reduction failed"); - continue; - } - - throw BamException("BgzfStream::DeflateBlock", "zlib deflate failed"); - } - - // finalize the compression routine - status = deflateEnd(&zs); - if ( status != Z_OK ) - throw BamException("BgzfStream::DeflateBlock", "zlib deflateEnd failed"); - - // update compressedLength - compressedLength = zs.total_out + - Constants::BGZF_BLOCK_HEADER_LENGTH + - Constants::BGZF_BLOCK_FOOTER_LENGTH; - if ( compressedLength > Constants::BGZF_MAX_BLOCK_SIZE ) - throw BamException("BgzfStream::DeflateBlock", "deflate overflow"); - - // quit while loop - break; - } - - // store the compressed length - BamTools::PackUnsignedShort(&buffer[16], static_cast(compressedLength - 1)); - - // store the 
CRC32 checksum - uint32_t crc = crc32(0, NULL, 0); - crc = crc32(crc, (Bytef*)Resources.UncompressedBlock, inputLength); - BamTools::PackUnsignedInt(&buffer[compressedLength - 8], crc); - BamTools::PackUnsignedInt(&buffer[compressedLength - 4], inputLength); - - // ensure that we have less than a block of data left - int remaining = m_blockOffset - inputLength; - if ( remaining > 0 ) { - if ( remaining > inputLength ) - throw BamException("BgzfStream::DeflateBlock", "after deflate, remainder too large"); - memcpy(Resources.UncompressedBlock, Resources.UncompressedBlock + inputLength, remaining); - } - - // update block data - m_blockOffset = remaining; - - // return result - return compressedLength; -} - -// flushes the data in the BGZF block -void BgzfStream::FlushBlock(void) { - - BT_ASSERT_X( m_device, "BgzfStream::FlushBlock() - attempting to flush to null device" ); - - // flush all of the remaining blocks - while ( m_blockOffset > 0 ) { - - // compress the data block - const size_t blockLength = DeflateBlock(); - - // flush the data to our output device - const size_t numBytesWritten = m_device->Write(Resources.CompressedBlock, blockLength); - if ( numBytesWritten != blockLength ) { - stringstream s(""); - s << "expected to write " << blockLength - << " bytes during flushing, but wrote " << numBytesWritten; - throw BamException("BgzfStream::FlushBlock", s.str()); - } - - // update block data - m_blockAddress += blockLength; - } -} - -// decompresses the current block -size_t BgzfStream::InflateBlock(const size_t& blockLength) { - - // setup zlib stream object - z_stream zs; - zs.zalloc = NULL; - zs.zfree = NULL; - zs.next_in = (Bytef*)Resources.CompressedBlock + 18; - zs.avail_in = blockLength - 16; - zs.next_out = (Bytef*)Resources.UncompressedBlock; - zs.avail_out = Constants::BGZF_DEFAULT_BLOCK_SIZE; - - // initialize - int status = inflateInit2(&zs, Constants::GZIP_WINDOW_BITS); - if ( status != Z_OK ) - throw BamException("BgzfStream::InflateBlock", 
"zlib inflateInit failed"); - - // decompress - status = inflate(&zs, Z_FINISH); - if ( status != Z_STREAM_END ) { - inflateEnd(&zs); - throw BamException("BgzfStream::InflateBlock", "zlib inflate failed"); - } - - // finalize - status = inflateEnd(&zs); - if ( status != Z_OK ) { - inflateEnd(&zs); - throw BamException("BgzfStream::InflateBlock", "zlib inflateEnd failed"); - } - - // return result - return zs.total_out; -} - -bool BgzfStream::IsOpen(void) const { - if ( m_device == 0 ) - return false; - return m_device->IsOpen(); -} - -void BgzfStream::Open(const string& filename, const IBamIODevice::OpenMode mode) { - - // close current device if necessary - Close(); - BT_ASSERT_X( (m_device == 0), "BgzfStream::Open() - unable to properly close previous IO device" ); - - // retrieve new IO device depending on filename - m_device = BamDeviceFactory::CreateDevice(filename); - BT_ASSERT_X( m_device, "BgzfStream::Open() - unable to create IO device from filename" ); - - // if device fails to open - if ( !m_device->Open(mode) ) { - const string deviceError = m_device->GetErrorString(); - const string message = string("could not open BGZF stream: \n\t") + deviceError; - throw BamException("BgzfStream::Open", message); - } -} - -// reads BGZF data into a byte buffer -size_t BgzfStream::Read(char* data, const size_t dataLength) { - - if ( dataLength == 0 ) - return 0; - - // if stream not open for reading - BT_ASSERT_X( m_device, "BgzfStream::Read() - trying to read from null device"); - if ( !m_device->IsOpen() || (m_device->Mode() != IBamIODevice::ReadOnly) ) - return 0; - - // read blocks as needed until desired data length is retrieved - char* output = data; - size_t numBytesRead = 0; - while ( numBytesRead < dataLength ) { - - // determine bytes available in current block - int bytesAvailable = m_blockLength - m_blockOffset; - - // read (and decompress) next block if needed - if ( bytesAvailable <= 0 ) { - ReadBlock(); - bytesAvailable = m_blockLength - 
m_blockOffset; - if ( bytesAvailable <= 0 ) - break; - } - - // copy data from uncompressed source buffer into data destination buffer - const size_t copyLength = min( (dataLength-numBytesRead), (size_t)bytesAvailable ); - memcpy(output, Resources.UncompressedBlock + m_blockOffset, copyLength); - - // update counters - m_blockOffset += copyLength; - output += copyLength; - numBytesRead += copyLength; - } - - // update block data - if ( m_blockOffset == m_blockLength ) { - m_blockAddress = m_device->Tell(); - m_blockOffset = 0; - m_blockLength = 0; - - } - - // return actual number of bytes read - return numBytesRead; -} - -// reads a BGZF block -void BgzfStream::ReadBlock(void) { - - BT_ASSERT_X( m_device, "BgzfStream::ReadBlock() - trying to read from null IO device"); - - // store block's starting address - int64_t blockAddress = m_device->Tell(); - - // read block header from file - char header[Constants::BGZF_BLOCK_HEADER_LENGTH]; - size_t numBytesRead = m_device->Read(header, Constants::BGZF_BLOCK_HEADER_LENGTH); - - // if block header empty - if ( numBytesRead == 0 ) { - m_blockLength = 0; - return; - } - - // if block header invalid size - if ( numBytesRead != Constants::BGZF_BLOCK_HEADER_LENGTH ) - throw BamException("BgzfStream::ReadBlock", "invalid block header size"); - - // validate block header contents - if ( !BgzfStream::CheckBlockHeader(header) ) - throw BamException("BgzfStream::ReadBlock", "invalid block header contents"); - - // copy header contents to compressed buffer - const size_t blockLength = BamTools::UnpackUnsignedShort(&header[16]) + 1; - memcpy(Resources.CompressedBlock, header, Constants::BGZF_BLOCK_HEADER_LENGTH); - - // read remainder of block - const size_t remaining = blockLength - Constants::BGZF_BLOCK_HEADER_LENGTH; - numBytesRead = m_device->Read(&Resources.CompressedBlock[Constants::BGZF_BLOCK_HEADER_LENGTH], remaining); - if ( numBytesRead != remaining ) - throw BamException("BgzfStream::ReadBlock", "could not read data from 
block"); - - // decompress block data - numBytesRead = InflateBlock(blockLength); - - // update block data - if ( m_blockLength != 0 ) - m_blockOffset = 0; - m_blockAddress = blockAddress; - m_blockLength = numBytesRead; -} - -// seek to position in BGZF file -void BgzfStream::Seek(const int64_t& position) { - - BT_ASSERT_X( m_device, "BgzfStream::Seek() - trying to seek on null IO device"); - - // skip if device is not open - if ( !IsOpen() ) return; - - // determine adjusted offset & address - int blockOffset = (position & 0xFFFF); - int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL; - - // attempt seek in file - if ( m_device->IsRandomAccess() && m_device->Seek(blockAddress) ) { - - // update block data & return success - m_blockLength = 0; - m_blockAddress = blockAddress; - m_blockOffset = blockOffset; - } - else { - stringstream s(""); - s << "unable to seek to position: " << position; - throw BamException("BgzfStream::Seek", s.str()); - } -} - -void BgzfStream::SetWriteCompressed(bool ok) { - m_isWriteCompressed = ok; -} - -// get file position in BGZF file -int64_t BgzfStream::Tell(void) const { - if ( !IsOpen() ) - return 0; - return ( (m_blockAddress << 16) | (m_blockOffset & 0xFFFF) ); -} - -// writes the supplied data into the BGZF buffer -size_t BgzfStream::Write(const char* data, const size_t dataLength) { - - BT_ASSERT_X( m_device, "BgzfStream::Write() - trying to write to null IO device"); - BT_ASSERT_X( (m_device->Mode() == IBamIODevice::WriteOnly), - "BgzfStream::Write() - trying to write to non-writable IO device"); - - // skip if file not open for writing - if ( !IsOpen() ) - return 0; - - // write blocks as needed til all data is written - size_t numBytesWritten = 0; - const char* input = data; - const size_t blockLength = Constants::BGZF_DEFAULT_BLOCK_SIZE; - while ( numBytesWritten < dataLength ) { - - // copy data contents to uncompressed output buffer - unsigned int copyLength = min(blockLength - m_blockOffset, dataLength - 
numBytesWritten); - char* buffer = Resources.UncompressedBlock; - memcpy(buffer + m_blockOffset, input, copyLength); - - // update counter - m_blockOffset += copyLength; - input += copyLength; - numBytesWritten += copyLength; - - // flush (& compress) output buffer when full - if ( m_blockOffset == blockLength ) - FlushBlock(); - } - - // return actual number of bytes written - return numBytesWritten; -} diff --git a/src/api/internal/BgzfStream_p.h b/src/api/internal/BgzfStream_p.h deleted file mode 100644 index 88d7472..0000000 --- a/src/api/internal/BgzfStream_p.h +++ /dev/null @@ -1,97 +0,0 @@ -// *************************************************************************** -// BgzfStream_p.h (c) 2011 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 October 2011(DB) -// --------------------------------------------------------------------------- -// Based on BGZF routines developed at the Broad Institute. -// Provides the basic functionality for reading & writing BGZF files -// Replaces the old BGZF.* files to avoid clashing with other toolkits -// *************************************************************************** - -#ifndef BGZFSTREAM_P_H -#define BGZFSTREAM_P_H - -// ------------- -// W A R N I N G -// ------------- -// -// This file is not part of the BamTools API. It exists purely as an -// implementation detail. This header file may change from version to version -// without notice, or even be removed. -// -// We mean it. 
- -#include "api/api_global.h" -#include "api/IBamIODevice.h" -#include - -namespace BamTools { -namespace Internal { - -class BgzfStream { - - // constructor & destructor - public: - BgzfStream(void); - ~BgzfStream(void); - - // main interface methods - public: - // closes BGZF file - void Close(void); - // returns true if BgzfStream open for IO - bool IsOpen(void) const; - // opens the BGZF file - void Open(const std::string& filename, const IBamIODevice::OpenMode mode); - // reads BGZF data into a byte buffer - size_t Read(char* data, const size_t dataLength); - // seek to position in BGZF file - void Seek(const int64_t& position); - // sets IO device (closes previous, if any, but does not attempt to open) - void SetIODevice(IBamIODevice* device); - // enable/disable compressed output - void SetWriteCompressed(bool ok); - // get file position in BGZF file - int64_t Tell(void) const; - // writes the supplied data into the BGZF buffer - size_t Write(const char* data, const size_t dataLength); - - // internal methods - private: - // compresses the current block - size_t DeflateBlock(void); - // flushes the data in the BGZF block - void FlushBlock(void); - // de-compresses the current block - size_t InflateBlock(const size_t& blockLength); - // reads a BGZF block - void ReadBlock(void); - - // static 'utility' methods - public: - // checks BGZF block header - static bool CheckBlockHeader(char* header); - - // data members - public: - unsigned int m_blockLength; - unsigned int m_blockOffset; - uint64_t m_blockAddress; - - bool m_isWriteCompressed; - IBamIODevice* m_device; - - struct RaiiWrapper { - RaiiWrapper(void); - ~RaiiWrapper(void); - char* UncompressedBlock; - char* CompressedBlock; - }; - RaiiWrapper Resources; -}; - -} // namespace Internal -} // namespace BamTools - -#endif // BGZFSTREAM_P_H diff --git a/src/api/internal/CMakeLists.txt b/src/api/internal/CMakeLists.txt new file mode 100644 index 0000000..1e7b8dd --- /dev/null +++ 
b/src/api/internal/CMakeLists.txt @@ -0,0 +1,25 @@ +# ========================== +# BamTools CMakeLists.txt +# (c) 2011 Derek Barnett +# +# src/api/internal +# ========================== + +set ( InternalDir "internal" ) + +add_subdirectory ( bam ) +add_subdirectory ( index ) +add_subdirectory ( io ) +add_subdirectory ( sam ) +add_subdirectory ( utils ) + +set ( InternalSources + ${InternalBamSources} + ${InternalIndexSources} + ${InternalIOSources} + ${InternalSamSources} + ${InternalUtilsSources} + + PARENT_SCOPE # <-- leave this last + ) + diff --git a/src/api/internal/ILocalIODevice_p.cpp b/src/api/internal/ILocalIODevice_p.cpp deleted file mode 100644 index 8730a91..0000000 --- a/src/api/internal/ILocalIODevice_p.cpp +++ /dev/null @@ -1,56 +0,0 @@ -// *************************************************************************** -// ILocalIODevice_p.cpp (c) 2011 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) -// --------------------------------------------------------------------------- -// Provides shared behavior for files & pipes -// *************************************************************************** - -#include "api/internal/ILocalIODevice_p.h" -using namespace BamTools; -using namespace BamTools::Internal; - -#include -using namespace std; - -ILocalIODevice::ILocalIODevice(void) - : IBamIODevice() - , m_stream(0) -{ } - -ILocalIODevice::~ILocalIODevice(void) { - Close(); -} - -void ILocalIODevice::Close(void) { - - // skip if not open - if ( !IsOpen() ) - return; - - // flush & close FILE* - fflush(m_stream); - fclose(m_stream); - m_stream = 0; - - // reset other device state - m_mode = IBamIODevice::NotOpen; -} - -size_t ILocalIODevice::Read(char* data, const unsigned int numBytes) { - BT_ASSERT_X( m_stream, "ILocalIODevice::Read: trying to read from null stream" ); - BT_ASSERT_X( (m_mode == 
IBamIODevice::ReadOnly), "ILocalIODevice::Read: device not in read-only mode"); - return fread(data, sizeof(char), numBytes, m_stream); -} - -int64_t ILocalIODevice::Tell(void) const { - BT_ASSERT_X( m_stream, "ILocalIODevice::Tell: trying to get file position fromnull stream" ); - return ftell64(m_stream); -} - -size_t ILocalIODevice::Write(const char* data, const unsigned int numBytes) { - BT_ASSERT_X( m_stream, "ILocalIODevice::Write: tryint to write to null stream" ); - BT_ASSERT_X( (m_mode == IBamIODevice::WriteOnly), "ILocalIODevice::Write: device not in write-only mode" ); - return fwrite(data, sizeof(char), numBytes, m_stream); -} diff --git a/src/api/internal/ILocalIODevice_p.h b/src/api/internal/ILocalIODevice_p.h deleted file mode 100644 index a71f378..0000000 --- a/src/api/internal/ILocalIODevice_p.h +++ /dev/null @@ -1,50 +0,0 @@ -// *************************************************************************** -// ILocalIODevice_p.h (c) 2011 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) -// --------------------------------------------------------------------------- -// Provides shared behavior for files & pipes -// *************************************************************************** - -#ifndef ILOCALIODEVICE_P_H -#define ILOCALIODEVICE_P_H - -// ------------- -// W A R N I N G -// ------------- -// -// This file is not part of the BamTools API. It exists purely as an -// implementation detail. This header file may change from version to version -// without notice, or even be removed. -// -// We mean it. 
- -#include "api/IBamIODevice.h" - -namespace BamTools { -namespace Internal { - -class ILocalIODevice : public IBamIODevice { - - // ctor & dtor - public: - ILocalIODevice(void); - virtual ~ILocalIODevice(void); - - // IBamIODevice implementation - public: - virtual void Close(void); - virtual size_t Read(char* data, const unsigned int numBytes); - virtual int64_t Tell(void) const; - virtual size_t Write(const char* data, const unsigned int numBytes); - - // data members - protected: - FILE* m_stream; -}; - -} // namespace Internal -} // namespace BamTools - -#endif // ILOCALIODEVICE_P_H diff --git a/src/api/internal/IRemoteIODevice_p.cpp b/src/api/internal/IRemoteIODevice_p.cpp deleted file mode 100644 index e69de29..0000000 diff --git a/src/api/internal/IRemoteIODevice_p.h b/src/api/internal/IRemoteIODevice_p.h deleted file mode 100644 index e69de29..0000000 diff --git a/src/api/internal/SamFormatParser_p.cpp b/src/api/internal/SamFormatParser_p.cpp deleted file mode 100644 index 195fdcd..0000000 --- a/src/api/internal/SamFormatParser_p.cpp +++ /dev/null @@ -1,222 +0,0 @@ -// *************************************************************************** -// SamFormatParser.cpp (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) -// --------------------------------------------------------------------------- -// Provides functionality for parsing SAM header text into SamHeader object -// *************************************************************************** - -#include "api/SamConstants.h" -#include "api/SamHeader.h" -#include "api/internal/BamException_p.h" -#include "api/internal/SamFormatParser_p.h" -using namespace BamTools; -using namespace BamTools::Internal; - -#include -#include -#include -using namespace std; - -SamFormatParser::SamFormatParser(SamHeader& header) - : m_header(header) -{ } - 
-SamFormatParser::~SamFormatParser(void) { } - -void SamFormatParser::Parse(const string& headerText) { - - // clear header's prior contents - m_header.Clear(); - - // empty header is OK, but skip processing - if ( headerText.empty() ) - return; - - // other wise parse SAM lines - istringstream headerStream(headerText); - string headerLine(""); - while ( getline(headerStream, headerLine) ) - ParseSamLine(headerLine); -} - -void SamFormatParser::ParseSamLine(const string& line) { - - // skip if line is not long enough to contain true values - if ( line.length() < 5 ) return; - - // determine token at beginning of line - const string firstToken = line.substr(0,3); - string restOfLine = line.substr(4); - if ( firstToken == Constants::SAM_HD_BEGIN_TOKEN) ParseHDLine(restOfLine); - else if ( firstToken == Constants::SAM_SQ_BEGIN_TOKEN) ParseSQLine(restOfLine); - else if ( firstToken == Constants::SAM_RG_BEGIN_TOKEN) ParseRGLine(restOfLine); - else if ( firstToken == Constants::SAM_PG_BEGIN_TOKEN) ParsePGLine(restOfLine); - else if ( firstToken == Constants::SAM_CO_BEGIN_TOKEN) ParseCOLine(restOfLine); - else { - const string message = string("unknown token: ") + firstToken; - throw BamException("SamFormatParser::ParseSamLine", message); - } -} - -void SamFormatParser::ParseHDLine(const string& line) { - - // split HD lines into tokens - vector tokens = Split(line, Constants::SAM_TAB); - - // iterate over tokens - vector::const_iterator tokenIter = tokens.begin(); - vector::const_iterator tokenEnd = tokens.end(); - for ( ; tokenIter != tokenEnd; ++tokenIter ) { - - // get tag/value - const string tokenTag = (*tokenIter).substr(0,2); - const string tokenValue = (*tokenIter).substr(3); - - // set header contents - if ( tokenTag == Constants::SAM_HD_VERSION_TAG ) m_header.Version = tokenValue; - else if ( tokenTag == Constants::SAM_HD_SORTORDER_TAG ) m_header.SortOrder = tokenValue; - else if ( tokenTag == Constants::SAM_HD_GROUPORDER_TAG ) m_header.GroupOrder = tokenValue; 
- else { - const string message = string("unknown HD tag: ") + tokenTag; - throw BamException("SamFormatParser::ParseHDLine", message); - } - } - - // check for required tags - if ( !m_header.HasVersion() ) - throw BamException("SamFormatParser::ParseHDLine", "@HD line is missing VN tag"); -} - -void SamFormatParser::ParseSQLine(const string& line) { - - SamSequence seq; - - // split SQ line into tokens - vector tokens = Split(line, Constants::SAM_TAB); - - // iterate over tokens - vector::const_iterator tokenIter = tokens.begin(); - vector::const_iterator tokenEnd = tokens.end(); - for ( ; tokenIter != tokenEnd; ++tokenIter ) { - - // get tag/value - const string tokenTag = (*tokenIter).substr(0,2); - const string tokenValue = (*tokenIter).substr(3); - - // set sequence contents - if ( tokenTag == Constants::SAM_SQ_NAME_TAG ) seq.Name = tokenValue; - else if ( tokenTag == Constants::SAM_SQ_LENGTH_TAG ) seq.Length = tokenValue; - else if ( tokenTag == Constants::SAM_SQ_ASSEMBLYID_TAG ) seq.AssemblyID = tokenValue; - else if ( tokenTag == Constants::SAM_SQ_CHECKSUM_TAG ) seq.Checksum = tokenValue; - else if ( tokenTag == Constants::SAM_SQ_SPECIES_TAG ) seq.Species = tokenValue; - else if ( tokenTag == Constants::SAM_SQ_URI_TAG ) seq.URI = tokenValue; - else { - const string message = string("unknown SQ tag: ") + tokenTag; - throw BamException("SamFormatParser::ParseSQLine", message); - } - } - - // check for required tags - if ( !seq.HasName() ) - throw BamException("SamFormatParser::ParseSQLine", "@SQ line is missing SN tag"); - if ( !seq.HasLength() ) - throw BamException("SamFormatParser::ParseSQLine", "@SQ line is missing LN tag"); - - // store SAM sequence entry - m_header.Sequences.Add(seq); -} - -void SamFormatParser::ParseRGLine(const string& line) { - - SamReadGroup rg; - - // split string into tokens - vector tokens = Split(line, Constants::SAM_TAB); - - // iterate over tokens - vector::const_iterator tokenIter = tokens.begin(); - vector::const_iterator 
tokenEnd = tokens.end(); - for ( ; tokenIter != tokenEnd; ++tokenIter ) { - - // get token tag/value - const string tokenTag = (*tokenIter).substr(0,2); - const string tokenValue = (*tokenIter).substr(3); - - // set read group contents - if ( tokenTag == Constants::SAM_RG_ID_TAG ) rg.ID = tokenValue; - else if ( tokenTag == Constants::SAM_RG_DESCRIPTION_TAG ) rg.Description = tokenValue; - else if ( tokenTag == Constants::SAM_RG_FLOWORDER_TAG ) rg.FlowOrder = tokenValue; - else if ( tokenTag == Constants::SAM_RG_KEYSEQUENCE_TAG ) rg.KeySequence = tokenValue; - else if ( tokenTag == Constants::SAM_RG_LIBRARY_TAG ) rg.Library = tokenValue; - else if ( tokenTag == Constants::SAM_RG_PLATFORMUNIT_TAG ) rg.PlatformUnit = tokenValue; - else if ( tokenTag == Constants::SAM_RG_PREDICTEDINSERTSIZE_TAG ) rg.PredictedInsertSize = tokenValue; - else if ( tokenTag == Constants::SAM_RG_PRODUCTIONDATE_TAG ) rg.ProductionDate = tokenValue; - else if ( tokenTag == Constants::SAM_RG_PROGRAM_TAG ) rg.Program = tokenValue; - else if ( tokenTag == Constants::SAM_RG_SAMPLE_TAG ) rg.Sample = tokenValue; - else if ( tokenTag == Constants::SAM_RG_SEQCENTER_TAG ) rg.SequencingCenter = tokenValue; - else if ( tokenTag == Constants::SAM_RG_SEQTECHNOLOGY_TAG ) rg.SequencingTechnology = tokenValue; - else { - const string message = string("unknown RG tag: ") + tokenTag; - throw BamException("SamFormatParser::ParseRGLine", message); - } - } - - // check for required tags - if ( !rg.HasID() ) - throw BamException("SamFormatParser::ParseRGLine", "@RG line is missing ID tag"); - - // store SAM read group entry - m_header.ReadGroups.Add(rg); -} - -void SamFormatParser::ParsePGLine(const string& line) { - - SamProgram pg; - - // split string into tokens - vector tokens = Split(line, Constants::SAM_TAB); - - // iterate over tokens - vector::const_iterator tokenIter = tokens.begin(); - vector::const_iterator tokenEnd = tokens.end(); - for ( ; tokenIter != tokenEnd; ++tokenIter ) { - - // get token 
tag/value - const string tokenTag = (*tokenIter).substr(0,2); - const string tokenValue = (*tokenIter).substr(3); - - // set program record contents - if ( tokenTag == Constants::SAM_PG_ID_TAG ) pg.ID = tokenValue; - else if ( tokenTag == Constants::SAM_PG_NAME_TAG ) pg.Name = tokenValue; - else if ( tokenTag == Constants::SAM_PG_COMMANDLINE_TAG ) pg.CommandLine = tokenValue; - else if ( tokenTag == Constants::SAM_PG_PREVIOUSPROGRAM_TAG ) pg.PreviousProgramID = tokenValue; - else if ( tokenTag == Constants::SAM_PG_VERSION_TAG ) pg.Version = tokenValue; - else { - const string message = string("unknown PG tag: ") + tokenTag; - throw BamException("SamFormatParser::ParsePGLine", message); - } - } - - // check for required tags - if ( !pg.HasID() ) - throw BamException("SamFormatParser::ParsePGLine", "@PG line is missing ID tag"); - - // store SAM program entry - m_header.Programs.Add(pg); -} - -void SamFormatParser::ParseCOLine(const string& line) { - // simply add line to comments list - m_header.Comments.push_back(line); -} - -const vector SamFormatParser::Split(const string& line, const char delim) { - vector tokens; - stringstream lineStream(line); - string token; - while ( getline(lineStream, token, delim) ) - tokens.push_back(token); - return tokens; -} diff --git a/src/api/internal/SamFormatParser_p.h b/src/api/internal/SamFormatParser_p.h deleted file mode 100644 index cf6d54c..0000000 --- a/src/api/internal/SamFormatParser_p.h +++ /dev/null @@ -1,61 +0,0 @@ -// *************************************************************************** -// SamFormatParser.h (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 23 December 2010 (DB) -// --------------------------------------------------------------------------- -// Provides functionality for parsing SAM header text into SamHeader object -// 
*************************************************************************** - -#ifndef SAM_FORMAT_PARSER_H -#define SAM_FORMAT_PARSER_H - -// ------------- -// W A R N I N G -// ------------- -// -// This file is not part of the BamTools API. It exists purely as an -// implementation detail. This header file may change from version to version -// without notice, or even be removed. -// -// We mean it. - -#include -#include - -namespace BamTools { - -class SamHeader; - -namespace Internal { - -class SamFormatParser { - - // ctor & dtor - public: - SamFormatParser(BamTools::SamHeader& header); - ~SamFormatParser(void); - - // parse text & populate header data - public: - void Parse(const std::string& headerText); - - // internal methods - private: - void ParseSamLine(const std::string& line); - void ParseHDLine(const std::string& line); - void ParseSQLine(const std::string& line); - void ParseRGLine(const std::string& line); - void ParsePGLine(const std::string& line); - void ParseCOLine(const std::string& line); - const std::vector Split(const std::string& line, const char delim); - - // data members - private: - SamHeader& m_header; -}; - -} // namespace Internal -} // namespace BamTools - -#endif // SAM_FORMAT_PARSER_H diff --git a/src/api/internal/SamFormatPrinter_p.cpp b/src/api/internal/SamFormatPrinter_p.cpp deleted file mode 100644 index f9a118e..0000000 --- a/src/api/internal/SamFormatPrinter_p.cpp +++ /dev/null @@ -1,219 +0,0 @@ -// *************************************************************************** -// SamFormatPrinter.cpp (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 14 October 2011 (DB) -// --------------------------------------------------------------------------- -// Provides functionality for printing formatted SAM header to string -// *************************************************************************** - 
-#include "api/SamConstants.h" -#include "api/SamHeader.h" -#include "api/internal/SamFormatPrinter_p.h" -using namespace BamTools; -using namespace BamTools::Internal; - -#include -#include -#include -using namespace std; - -// ------------------------ -// static utility methods -// ------------------------ - -static inline -const string FormatTag(const string& tag, const string& value) { - return string(Constants::SAM_TAB + tag + Constants::SAM_COLON + value); -} - -// --------------------------------- -// SamFormatPrinter implementation -// --------------------------------- - -SamFormatPrinter::SamFormatPrinter(const SamHeader& header) - : m_header(header) -{ } - -SamFormatPrinter::~SamFormatPrinter(void) { } - -const string SamFormatPrinter::ToString(void) const { - - // clear out stream - stringstream out(""); - - // generate formatted header text - PrintHD(out); - PrintSQ(out); - PrintRG(out); - PrintPG(out); - PrintCO(out); - - // return result - return out.str(); -} - -void SamFormatPrinter::PrintHD(std::stringstream& out) const { - - // if header has @HD data - if ( m_header.HasVersion() ) { - - // @HD VN: - out << Constants::SAM_HD_BEGIN_TOKEN - << FormatTag(Constants::SAM_HD_VERSION_TAG, m_header.Version); - - // SO: - if ( m_header.HasSortOrder() ) - out << FormatTag(Constants::SAM_HD_SORTORDER_TAG, m_header.SortOrder); - - // GO: - if ( m_header.HasGroupOrder() ) - out << FormatTag(Constants::SAM_HD_GROUPORDER_TAG, m_header.GroupOrder); - - // newline - out << endl; - } -} - -void SamFormatPrinter::PrintSQ(std::stringstream& out) const { - - // iterate over sequence entries - SamSequenceConstIterator seqIter = m_header.Sequences.ConstBegin(); - SamSequenceConstIterator seqEnd = m_header.Sequences.ConstEnd(); - for ( ; seqIter != seqEnd; ++seqIter ) { - const SamSequence& seq = (*seqIter); - - // @SQ SN: LN: - out << Constants::SAM_SQ_BEGIN_TOKEN - << FormatTag(Constants::SAM_SQ_NAME_TAG, seq.Name) - << FormatTag(Constants::SAM_SQ_LENGTH_TAG, 
seq.Length); - - // AS: - if ( seq.HasAssemblyID() ) - out << FormatTag(Constants::SAM_SQ_ASSEMBLYID_TAG, seq.AssemblyID); - - // M5: - if ( seq.HasChecksum() ) - out << FormatTag(Constants::SAM_SQ_CHECKSUM_TAG, seq.Checksum); - - // SP: - if ( seq.HasSpecies() ) - out << FormatTag(Constants::SAM_SQ_SPECIES_TAG, seq.Species); - - // UR: - if ( seq.HasURI() ) - out << FormatTag(Constants::SAM_SQ_URI_TAG, seq.URI); - - // newline - out << endl; - } -} - -void SamFormatPrinter::PrintRG(std::stringstream& out) const { - - // iterate over read group entries - SamReadGroupConstIterator rgIter = m_header.ReadGroups.ConstBegin(); - SamReadGroupConstIterator rgEnd = m_header.ReadGroups.ConstEnd(); - for ( ; rgIter != rgEnd; ++rgIter ) { - const SamReadGroup& rg = (*rgIter); - - // @RG ID: - out << Constants::SAM_RG_BEGIN_TOKEN - << FormatTag(Constants::SAM_RG_ID_TAG, rg.ID); - - // CN: - if ( rg.HasSequencingCenter() ) - out << FormatTag(Constants::SAM_RG_SEQCENTER_TAG, rg.SequencingCenter); - - // DS: - if ( rg.HasDescription() ) - out << FormatTag(Constants::SAM_RG_DESCRIPTION_TAG, rg.Description); - - // DT: - if ( rg.HasProductionDate() ) - out << FormatTag(Constants::SAM_RG_PRODUCTIONDATE_TAG, rg.ProductionDate); - - // FO: - if ( rg.HasFlowOrder() ) - out << FormatTag(Constants::SAM_RG_FLOWORDER_TAG, rg.FlowOrder); - - // KS: - if ( rg.HasKeySequence() ) - out << FormatTag(Constants::SAM_RG_KEYSEQUENCE_TAG, rg.KeySequence); - - // LB: - if ( rg.HasLibrary() ) - out << FormatTag(Constants::SAM_RG_LIBRARY_TAG, rg.Library); - - // PG: - if ( rg.HasProgram() ) - out << FormatTag(Constants::SAM_RG_PROGRAM_TAG, rg.Program); - - // PI: - if ( rg.HasPredictedInsertSize() ) - out << FormatTag(Constants::SAM_RG_PREDICTEDINSERTSIZE_TAG, rg.PredictedInsertSize); - - // PL: - if ( rg.HasSequencingTechnology() ) - out << FormatTag(Constants::SAM_RG_SEQTECHNOLOGY_TAG, rg.SequencingTechnology); - - // PU: - if ( rg.HasPlatformUnit() ) - out << 
FormatTag(Constants::SAM_RG_PLATFORMUNIT_TAG, rg.PlatformUnit); - - // SM: - if ( rg.HasSample() ) - out << FormatTag(Constants::SAM_RG_SAMPLE_TAG, rg.Sample); - - // newline - out << endl; - } -} - -void SamFormatPrinter::PrintPG(std::stringstream& out) const { - - // iterate over program record entries - SamProgramConstIterator pgIter = m_header.Programs.ConstBegin(); - SamProgramConstIterator pgEnd = m_header.Programs.ConstEnd(); - for ( ; pgIter != pgEnd; ++pgIter ) { - const SamProgram& pg = (*pgIter); - - // @PG ID: - out << Constants::SAM_PG_BEGIN_TOKEN - << FormatTag(Constants::SAM_PG_ID_TAG, pg.ID); - - // PN: - if ( pg.HasName() ) - out << FormatTag(Constants::SAM_PG_NAME_TAG, pg.Name); - - // CL: - if ( pg.HasCommandLine() ) - out << FormatTag(Constants::SAM_PG_COMMANDLINE_TAG, pg.CommandLine); - - // PP: - if ( pg.HasPreviousProgramID() ) - out << FormatTag(Constants::SAM_PG_PREVIOUSPROGRAM_TAG, pg.PreviousProgramID); - - // VN: - if ( pg.HasVersion() ) - out << FormatTag(Constants::SAM_PG_VERSION_TAG, pg.Version); - - // newline - out << endl; - } -} - -void SamFormatPrinter::PrintCO(std::stringstream& out) const { - - // iterate over comments - vector::const_iterator commentIter = m_header.Comments.begin(); - vector::const_iterator commentEnd = m_header.Comments.end(); - for ( ; commentIter != commentEnd; ++commentIter ) { - - // @CO - out << Constants::SAM_CO_BEGIN_TOKEN - << Constants::SAM_TAB - << (*commentIter) - << endl; - } -} diff --git a/src/api/internal/SamFormatPrinter_p.h b/src/api/internal/SamFormatPrinter_p.h deleted file mode 100644 index ea29181..0000000 --- a/src/api/internal/SamFormatPrinter_p.h +++ /dev/null @@ -1,59 +0,0 @@ -// *************************************************************************** -// SamFormatPrinter.h (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 6 October 2011 (DB) -// 
--------------------------------------------------------------------------- -// Provides functionality for printing formatted SAM header to string -// *************************************************************************** - -#ifndef SAM_FORMAT_PRINTER_H -#define SAM_FORMAT_PRINTER_H - -// ------------- -// W A R N I N G -// ------------- -// -// This file is not part of the BamTools API. It exists purely as an -// implementation detail. This header file may change from version to version -// without notice, or even be removed. -// -// We mean it. - -#include -#include - -namespace BamTools { - -class SamHeader; - -namespace Internal { - -class SamFormatPrinter { - - // ctor & dtor - public: - SamFormatPrinter(const BamTools::SamHeader& header); - ~SamFormatPrinter(void); - - // generates SAM-formatted string from header data - public: - const std::string ToString(void) const; - - // internal methods - private: - void PrintHD(std::stringstream& out) const; - void PrintSQ(std::stringstream& out) const; - void PrintRG(std::stringstream& out) const; - void PrintPG(std::stringstream& out) const; - void PrintCO(std::stringstream& out) const; - - // data members - private: - const SamHeader& m_header; -}; - -} // namespace Internal -} // namespace BamTools - -#endif // SAM_FORMAT_PRINTER_H diff --git a/src/api/internal/SamHeaderValidator_p.cpp b/src/api/internal/SamHeaderValidator_p.cpp deleted file mode 100644 index c76fff9..0000000 --- a/src/api/internal/SamHeaderValidator_p.cpp +++ /dev/null @@ -1,524 +0,0 @@ -// *************************************************************************** -// SamHeaderValidator.cpp (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 14 October 2011 (DB) -// --------------------------------------------------------------------------- -// Provides functionality for validating SamHeader data -// 
*************************************************************************** - -#include "api/SamConstants.h" -#include "api/SamHeader.h" -#include "api/internal/SamHeaderValidator_p.h" -#include "api/internal/SamHeaderVersion_p.h" -using namespace BamTools; -using namespace BamTools::Internal; - -#include -#include -#include -using namespace std; - -// ------------------------ -// static utility methods -// ------------------------- - -static -bool caseInsensitiveCompare(const string& lhs, const string& rhs) { - - // can omit checking chars if lengths not equal - const int lhsLength = lhs.length(); - const int rhsLength = rhs.length(); - if ( lhsLength != rhsLength ) - return false; - - // do *basic* toupper checks on each string char's - for ( int i = 0; i < lhsLength; ++i ) { - if ( toupper( (int)lhs.at(i)) != toupper( (int)rhs.at(i)) ) - return false; - } - - // otherwise OK - return true; -} - -// ------------------------------------------------------------------------ -// Allow validation rules to vary, as needed, between SAM header versions -// -// use SAM_VERSION_X_Y to tag important changes -// -// Together, they will allow for comparisons like: -// if ( m_version < SAM_VERSION_2_0 ) { -// // use some older rule -// else -// // use rule introduced with version 2.0 - -static const SamHeaderVersion SAM_VERSION_1_0 = SamHeaderVersion(1,0); -static const SamHeaderVersion SAM_VERSION_1_1 = SamHeaderVersion(1,1); -static const SamHeaderVersion SAM_VERSION_1_2 = SamHeaderVersion(1,2); -static const SamHeaderVersion SAM_VERSION_1_3 = SamHeaderVersion(1,3); -static const SamHeaderVersion SAM_VERSION_1_4 = SamHeaderVersion(1,4); - -// TODO: This functionality is currently unused. -// Make validation "version-aware." 
-// -// ------------------------------------------------------------------------ - -const string SamHeaderValidator::ERROR_PREFIX = "ERROR: "; -const string SamHeaderValidator::WARN_PREFIX = "WARNING: "; -const string SamHeaderValidator::NEWLINE = "\n"; - -SamHeaderValidator::SamHeaderValidator(const SamHeader& header) - : m_header(header) -{ } - -SamHeaderValidator::~SamHeaderValidator(void) { } - -void SamHeaderValidator::AddError(const string& message) { - m_errorMessages.push_back(ERROR_PREFIX + message + NEWLINE); -} - -void SamHeaderValidator::AddWarning(const string& message) { - m_warningMessages.push_back(WARN_PREFIX + message + NEWLINE); -} - -void SamHeaderValidator::PrintErrorMessages(ostream& stream) { - - // skip if no error messages - if ( m_errorMessages.empty() ) - return; - - // print error header line - stream << "* SAM header has " << m_errorMessages.size() << " errors:" << endl; - - // print each error message - vector::const_iterator errorIter = m_errorMessages.begin(); - vector::const_iterator errorEnd = m_errorMessages.end(); - for ( ; errorIter != errorEnd; ++errorIter ) - stream << (*errorIter); -} - -void SamHeaderValidator::PrintMessages(ostream& stream) { - PrintErrorMessages(stream); - PrintWarningMessages(stream); -} - -void SamHeaderValidator::PrintWarningMessages(ostream& stream) { - - // skip if no warning messages - if ( m_warningMessages.empty() ) - return; - - // print warning header line - stream << "* SAM header has " << m_warningMessages.size() << " warnings:" << endl; - - // print each warning message - vector::const_iterator warnIter = m_warningMessages.begin(); - vector::const_iterator warnEnd = m_warningMessages.end(); - for ( ; warnIter != warnEnd; ++warnIter ) - stream << (*warnIter); -} - -// entry point for validation -bool SamHeaderValidator::Validate(void) { - bool isValid = true; - isValid &= ValidateMetadata(); - isValid &= ValidateSequenceDictionary(); - isValid &= ValidateReadGroupDictionary(); - isValid &= 
ValidateProgramChain(); - return isValid; -} - -// check all SAM header 'metadata' -bool SamHeaderValidator::ValidateMetadata(void) { - bool isValid = true; - isValid &= ValidateVersion(); - isValid &= ValidateSortOrder(); - isValid &= ValidateGroupOrder(); - return isValid; -} - -// check SAM header version tag -bool SamHeaderValidator::ValidateVersion(void) { - - const string& version = m_header.Version; - - // warn if version not present - if ( version.empty() ) { - AddWarning("Version (VN) missing. Not required, but strongly recommended"); - return true; - } - - // invalid if version does not contain a period - const size_t periodFound = version.find(Constants::SAM_PERIOD); - if ( periodFound == string::npos ) { - AddError("Invalid version (VN) format: " + version); - return false; - } - - // invalid if major version is empty or contains non-digits - const string majorVersion = version.substr(0, periodFound); - if ( majorVersion.empty() || !ContainsOnlyDigits(majorVersion) ) { - AddError("Invalid version (VN) format: " + version); - return false; - } - - // invalid if major version is empty or contains non-digits - const string minorVersion = version.substr(periodFound + 1); - if ( minorVersion.empty() || !ContainsOnlyDigits(minorVersion) ) { - AddError("Invalid version (VN) format: " + version); - return false; - } - - // TODO: check if version is not just syntactically OK, - // but is also a valid SAM version ( 1.0 .. 
CURRENT ) - - // all checked out this far, then version is OK - return true; -} - -// assumes non-empty input string -bool SamHeaderValidator::ContainsOnlyDigits(const string& s) { - const size_t nonDigitPosition = s.find_first_not_of(Constants::SAM_DIGITS); - return ( nonDigitPosition == string::npos ) ; -} - -// validate SAM header sort order tag -bool SamHeaderValidator::ValidateSortOrder(void) { - - const string& sortOrder = m_header.SortOrder; - - // warn if sort order not present - if ( sortOrder.empty() ) { - AddWarning("Sort order (SO) missing. Not required, but strongly recommended"); - return true; - } - - // if sort order is valid keyword - if ( sortOrder == Constants::SAM_HD_SORTORDER_COORDINATE || - sortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME || - sortOrder == Constants::SAM_HD_SORTORDER_UNSORTED - ) - { - return true; - } - - // otherwise - AddError("Invalid sort order (SO): " + sortOrder); - return false; -} - -// validate SAM header group order tag -bool SamHeaderValidator::ValidateGroupOrder(void) { - - const string& groupOrder = m_header.GroupOrder; - - // if no group order, no problem, just return OK - if ( groupOrder.empty() ) - return true; - - // if group order is valid keyword - if ( groupOrder == Constants::SAM_HD_GROUPORDER_NONE || - groupOrder == Constants::SAM_HD_GROUPORDER_QUERY || - groupOrder == Constants::SAM_HD_GROUPORDER_REFERENCE - ) - { - return true; - } - - // otherwise - AddError("Invalid group order (GO): " + groupOrder); - return false; -} - -// validate SAM header sequence dictionary -bool SamHeaderValidator::ValidateSequenceDictionary(void) { - - bool isValid = true; - - // check for unique sequence names - isValid &= ContainsUniqueSequenceNames(); - - // iterate over sequences - const SamSequenceDictionary& sequences = m_header.Sequences; - SamSequenceConstIterator seqIter = sequences.ConstBegin(); - SamSequenceConstIterator seqEnd = sequences.ConstEnd(); - for ( ; seqIter != seqEnd; ++seqIter ) { - const 
SamSequence& seq = (*seqIter); - isValid &= ValidateSequence(seq); - } - - // return validation state - return isValid; -} - -// make sure all SQ names are unique -bool SamHeaderValidator::ContainsUniqueSequenceNames(void) { - - bool isValid = true; - set sequenceNames; - set::iterator nameIter; - - // iterate over sequences - const SamSequenceDictionary& sequences = m_header.Sequences; - SamSequenceConstIterator seqIter = sequences.ConstBegin(); - SamSequenceConstIterator seqEnd = sequences.ConstEnd(); - for ( ; seqIter != seqEnd; ++seqIter ) { - const SamSequence& seq = (*seqIter); - - // lookup sequence name - const string& name = seq.Name; - nameIter = sequenceNames.find(name); - - // error if found (duplicate entry) - if ( nameIter != sequenceNames.end() ) { - AddError("Sequence name (SN): " + name + " is not unique"); - isValid = false; - } - - // otherwise ok, store name - sequenceNames.insert(name); - } - - // return validation state - return isValid; -} - -// validate SAM header sequence entry -bool SamHeaderValidator::ValidateSequence(const SamSequence& seq) { - bool isValid = true; - isValid &= CheckNameFormat(seq.Name); - isValid &= CheckLengthInRange(seq.Length); - return isValid; -} - -// check sequence name is valid format -bool SamHeaderValidator::CheckNameFormat(const string& name) { - - // invalid if name is empty - if ( name.empty() ) { - AddError("Sequence entry (@SQ) is missing SN tag"); - return false; - } - - // invalid if first character is a reserved char - const char firstChar = name.at(0); - if ( firstChar == Constants::SAM_EQUAL || firstChar == Constants::SAM_STAR ) { - AddError("Invalid sequence name (SN): " + name); - return false; - } - // otherwise OK - return true; -} - -// check that sequence length is within accepted range -bool SamHeaderValidator::CheckLengthInRange(const string& length) { - - // invalid if empty - if ( length.empty() ) { - AddError("Sequence entry (@SQ) is missing LN tag"); - return false; - } - - // convert 
string length to numeric - stringstream lengthStream(length); - unsigned int sequenceLength; - lengthStream >> sequenceLength; - - // invalid if length outside accepted range - if ( sequenceLength < Constants::SAM_SQ_LENGTH_MIN || sequenceLength > Constants::SAM_SQ_LENGTH_MAX ) { - AddError("Sequence length (LN): " + length + " out of range"); - return false; - } - - // otherwise OK - return true; -} - -// validate SAM header read group dictionary -bool SamHeaderValidator::ValidateReadGroupDictionary(void) { - - bool isValid = true; - - // check for unique read group IDs & platform units - isValid &= ContainsUniqueIDsAndPlatformUnits(); - - // iterate over read groups - const SamReadGroupDictionary& readGroups = m_header.ReadGroups; - SamReadGroupConstIterator rgIter = readGroups.ConstBegin(); - SamReadGroupConstIterator rgEnd = readGroups.ConstEnd(); - for ( ; rgIter != rgEnd; ++rgIter ) { - const SamReadGroup& rg = (*rgIter); - isValid &= ValidateReadGroup(rg); - } - - // return validation state - return isValid; -} - -// make sure RG IDs and platform units are unique -bool SamHeaderValidator::ContainsUniqueIDsAndPlatformUnits(void) { - - bool isValid = true; - set readGroupIds; - set platformUnits; - set::iterator idIter; - set::iterator puIter; - - // iterate over sequences - const SamReadGroupDictionary& readGroups = m_header.ReadGroups; - SamReadGroupConstIterator rgIter = readGroups.ConstBegin(); - SamReadGroupConstIterator rgEnd = readGroups.ConstEnd(); - for ( ; rgIter != rgEnd; ++rgIter ) { - const SamReadGroup& rg = (*rgIter); - - // -------------------------------- - // check for unique ID - - // lookup read group ID - const string& id = rg.ID; - idIter = readGroupIds.find(id); - - // error if found (duplicate entry) - if ( idIter != readGroupIds.end() ) { - AddError("Read group ID (ID): " + id + " is not unique"); - isValid = false; - } - - // otherwise ok, store id - readGroupIds.insert(id); - - // -------------------------------- - // check for 
unique platform unit - - // lookup platform unit - const string& pu = rg.PlatformUnit; - puIter = platformUnits.find(pu); - - // error if found (duplicate entry) - if ( puIter != platformUnits.end() ) { - AddError("Platform unit (PU): " + pu + " is not unique"); - isValid = false; - } - - // otherwise ok, store platform unit - platformUnits.insert(pu); - } - - // return validation state - return isValid; -} - -// validate SAM header read group entry -bool SamHeaderValidator::ValidateReadGroup(const SamReadGroup& rg) { - bool isValid = true; - isValid &= CheckReadGroupID(rg.ID); - isValid &= CheckSequencingTechnology(rg.SequencingTechnology); - return isValid; -} - -// make sure RG ID exists -bool SamHeaderValidator::CheckReadGroupID(const string& id) { - - // invalid if empty - if ( id.empty() ) { - AddError("Read group entry (@RG) is missing ID tag"); - return false; - } - - // otherwise OK - return true; -} - -// make sure RG sequencing tech is one of the accepted keywords -bool SamHeaderValidator::CheckSequencingTechnology(const string& technology) { - - // if no technology provided, no problem, just return OK - if ( technology.empty() ) - return true; - - // if technology is valid keyword - if ( caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_CAPILLARY) || - caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_HELICOS) || - caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA) || - caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_IONTORRENT) || - caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_LS454) || - caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_PACBIO) || - caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_SOLID) - ) - { - return true; - } - - // otherwise - AddError("Invalid read group sequencing platform (PL): " + technology); - return false; -} - -// validate the SAM header "program chain" -bool 
SamHeaderValidator::ValidateProgramChain(void) { - bool isValid = true; - isValid &= ContainsUniqueProgramIds(); - isValid &= ValidatePreviousProgramIds(); - return isValid; -} - -// make sure all PG IDs are unique -bool SamHeaderValidator::ContainsUniqueProgramIds(void) { - - bool isValid = true; - set programIds; - set::iterator pgIdIter; - - // iterate over program records - const SamProgramChain& programs = m_header.Programs; - SamProgramConstIterator pgIter = programs.ConstBegin(); - SamProgramConstIterator pgEnd = programs.ConstEnd(); - for ( ; pgIter != pgEnd; ++pgIter ) { - const SamProgram& pg = (*pgIter); - - // lookup program ID - const string& pgId = pg.ID; - pgIdIter = programIds.find(pgId); - - // error if found (duplicate entry) - if ( pgIdIter != programIds.end() ) { - AddError("Program ID (ID): " + pgId + " is not unique"); - isValid = false; - } - - // otherwise ok, store ID - programIds.insert(pgId); - } - - // return validation state - return isValid; -} - -// make sure that any PP tags present point to existing @PG IDs -bool SamHeaderValidator::ValidatePreviousProgramIds(void) { - - bool isValid = true; - - // iterate over program records - const SamProgramChain& programs = m_header.Programs; - SamProgramConstIterator pgIter = programs.ConstBegin(); - SamProgramConstIterator pgEnd = programs.ConstEnd(); - for ( ; pgIter != pgEnd; ++pgIter ) { - const SamProgram& pg = (*pgIter); - - // ignore record for validation if PreviousProgramID is empty - const string& ppId = pg.PreviousProgramID; - if ( ppId.empty() ) - continue; - - // see if program "chain" contains an entry for ppId - if ( !programs.Contains(ppId) ) { - AddError("PreviousProgramID (PP): " + ppId + " is not a known ID"); - isValid = false; - } - } - - // return validation state - return isValid; -} diff --git a/src/api/internal/SamHeaderValidator_p.h b/src/api/internal/SamHeaderValidator_p.h deleted file mode 100644 index 7d0c60a..0000000 --- a/src/api/internal/SamHeaderValidator_p.h 
+++ /dev/null @@ -1,105 +0,0 @@ -// *************************************************************************** -// SamHeaderValidator.h (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 6 October 2011 (DB) -// --------------------------------------------------------------------------- -// Provides functionality for validating SamHeader data -// *************************************************************************** - -#ifndef SAM_HEADER_VALIDATOR_P_H -#define SAM_HEADER_VALIDATOR_P_H - -// ------------- -// W A R N I N G -// ------------- -// -// This file is not part of the BamTools API. It exists purely as an -// implementation detail. This header file may change from version to version -// without notice, or even be removed. -// -// We mean it. - -#include -#include -#include - -namespace BamTools { - -class SamHeader; -class SamReadGroup; -class SamSequence; - -namespace Internal { - -class SamHeaderValidator { - - // ctor & dtor - public: - SamHeaderValidator(const SamHeader& header); - ~SamHeaderValidator(void); - - // SamHeaderValidator interface - public: - - // prints error & warning messages - void PrintMessages(std::ostream& stream); - - // validates SamHeader data, returns true/false accordingly - bool Validate(void); - - // internal methods - private: - - // validate header metadata - bool ValidateMetadata(void); - bool ValidateVersion(void); - bool ContainsOnlyDigits(const std::string& s); - bool ValidateSortOrder(void); - bool ValidateGroupOrder(void); - - // validate sequence dictionary - bool ValidateSequenceDictionary(void); - bool ContainsUniqueSequenceNames(void); - bool CheckNameFormat(const std::string& name); - bool ValidateSequence(const SamSequence& seq); - bool CheckLengthInRange(const std::string& length); - - // validate read group dictionary - bool ValidateReadGroupDictionary(void); - bool 
ContainsUniqueIDsAndPlatformUnits(void); - bool ValidateReadGroup(const SamReadGroup& rg); - bool CheckReadGroupID(const std::string& id); - bool CheckSequencingTechnology(const std::string& technology); - - // validate program data - bool ValidateProgramChain(void); - bool ContainsUniqueProgramIds(void); - bool ValidatePreviousProgramIds(void); - - // error reporting - void AddError(const std::string& message); - void AddWarning(const std::string& message); - void PrintErrorMessages(std::ostream& stream); - void PrintWarningMessages(std::ostream& stream); - - // data members - private: - - // SamHeader being validated - const SamHeader& m_header; - - // error reporting helpers - static const std::string ERROR_PREFIX; - static const std::string WARN_PREFIX; - static const std::string NEWLINE; - - // error reporting messages - std::vector m_errorMessages; - std::vector m_warningMessages; -}; - -} // namespace Internal -} // namespace BamTools - -#endif // SAM_HEADER_VALIDATOR_P_H diff --git a/src/api/internal/SamHeaderVersion_p.h b/src/api/internal/SamHeaderVersion_p.h deleted file mode 100644 index 4f85df0..0000000 --- a/src/api/internal/SamHeaderVersion_p.h +++ /dev/null @@ -1,134 +0,0 @@ -// *************************************************************************** -// SamHeaderVersion.h (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// --------------------------------------------------------------------------- -// Last modified: 10 October 2011 (DB) -// --------------------------------------------------------------------------- -// Provides functionality for comparing SAM header versions -// ************************************************************************* - -#ifndef SAM_HEADERVERSION_P_H -#define SAM_HEADERVERSION_P_H - -// ------------- -// W A R N I N G -// ------------- -// -// This file is not part of the BamTools API. It exists purely as an -// implementation detail. 
This header file may change from version to version -// without notice, or even be removed. -// -// We mean it. - -#include "api/SamConstants.h" -#include -#include - -namespace BamTools { -namespace Internal { - -class SamHeaderVersion { - - // ctors & dtor - public: - SamHeaderVersion(void) - : m_majorVersion(0) - , m_minorVersion(0) - { } - - explicit SamHeaderVersion(const std::string& version) - : m_majorVersion(0) - , m_minorVersion(0) - { - SetVersion(version); - } - - SamHeaderVersion(const unsigned int& major, const unsigned int& minor) - : m_majorVersion(major) - , m_minorVersion(minor) - { } - - ~SamHeaderVersion(void) { - m_majorVersion = 0; - m_minorVersion = 0; - } - - // acess data - public: - unsigned int MajorVersion(void) const { return m_majorVersion; } - unsigned int MinorVersion(void) const { return m_minorVersion; } - - void SetVersion(const std::string& version); - std::string ToString(void) const; - - // data members - private: - unsigned int m_majorVersion; - unsigned int m_minorVersion; -}; - -inline -void SamHeaderVersion::SetVersion(const std::string& version) { - - // do nothing if version is empty - if ( !version.empty() ) { - - std::stringstream versionStream(""); - - // do nothing if period not found - const size_t periodFound = version.find(Constants::SAM_PERIOD); - if ( periodFound != std::string::npos ) { - - // store major version if non-empty and contains only digits - const std::string& majorVersion = version.substr(0, periodFound); - versionStream.str(majorVersion); - if ( !majorVersion.empty() ) { - const size_t nonDigitFound = majorVersion.find_first_not_of(Constants::SAM_DIGITS); - if ( nonDigitFound == std::string::npos ) - versionStream >> m_majorVersion; - } - - // store minor version if non-empty and contains only digits - const std::string& minorVersion = version.substr(periodFound + 1); - versionStream.str(minorVersion); - if ( !minorVersion.empty() ) { - const size_t nonDigitFound = 
minorVersion.find_first_not_of(Constants::SAM_DIGITS); - if ( nonDigitFound == std::string::npos ) - versionStream >> m_minorVersion; - } - } - } -} - -// ----------------------------------------------------- -// printing - -inline std::string SamHeaderVersion::ToString(void) const { - std::stringstream version; - version << m_majorVersion << Constants::SAM_PERIOD << m_minorVersion; - return version.str(); -} - -// ----------------------------------------------------- -// comparison operators - -inline bool operator==(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) { - return (lhs.MajorVersion() == rhs.MajorVersion()) && - (lhs.MinorVersion() == rhs.MinorVersion()); -} - -inline bool operator<(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) { - if ( lhs.MajorVersion() == rhs.MajorVersion() ) - return lhs.MinorVersion() < rhs.MinorVersion(); - else - return lhs.MajorVersion() < rhs.MajorVersion(); -} - -inline bool operator> (const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) { return rhs < lhs; } -inline bool operator<=(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) { return !(lhs>rhs); } -inline bool operator>=(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) { return !(lhs +#include +using namespace std; + +// ------------------------ +// static utility methods +// ------------------------ + +static inline +bool isValidMagicNumber(const char* buffer) { + return ( strncmp(buffer, Constants::BAM_HEADER_MAGIC, + Constants::BAM_HEADER_MAGIC_LENGTH) == 0 ); +} + +// -------------------------- +// BamHeader implementation +// -------------------------- + +// ctor +BamHeader::BamHeader(void) { } + +// dtor +BamHeader::~BamHeader(void) { } + +// reads magic number from BGZF stream, returns true if valid +void BamHeader::CheckMagicNumber(BgzfStream* stream) { + + // try to read magic number + char buffer[Constants::BAM_HEADER_MAGIC_LENGTH]; + const size_t numBytesRead = stream->Read(buffer, 
Constants::BAM_HEADER_MAGIC_LENGTH); + if ( numBytesRead != (int)Constants::BAM_HEADER_MAGIC_LENGTH ) + throw BamException("BamHeader::CheckMagicNumber", "could not read magic number"); + + // validate magic number + if ( !isValidMagicNumber(buffer) ) + throw BamException("BamHeader::CheckMagicNumber", "invalid magic number"); +} + +// clear SamHeader data +void BamHeader::Clear(void) { + m_header.Clear(); +} + +// return true if SamHeader data is valid +bool BamHeader::IsValid(void) const { + return m_header.IsValid(); +} + +// load BAM header ('magic number' and SAM header text) from BGZF stream +void BamHeader::Load(BgzfStream* stream) { + + // read & check magic number + CheckMagicNumber(stream); + + // read header (length, then actual text) + uint32_t length(0); + ReadHeaderLength(stream, length); + ReadHeaderText(stream, length); +} + +// reads SAM header text length from BGZF stream, stores it in @length +void BamHeader::ReadHeaderLength(BgzfStream* stream, uint32_t& length) { + + // read BAM header text length + char buffer[sizeof(uint32_t)]; + const size_t numBytesRead = stream->Read(buffer, sizeof(uint32_t)); + if ( numBytesRead != sizeof(uint32_t) ) + throw BamException("BamHeader::ReadHeaderLength", "could not read header length"); + + // convert char buffer to length + length = BamTools::UnpackUnsignedInt(buffer); + if ( BamTools::SystemIsBigEndian() ) + BamTools::SwapEndian_32(length); +} + +// reads SAM header text from BGZF stream, stores in SamHeader object +void BamHeader::ReadHeaderText(BgzfStream* stream, const uint32_t& length) { + + // read header text + char* headerText = (char*)calloc(length + 1, 1); + const size_t bytesRead = stream->Read(headerText, length); + + // if error reading, clean up buffer & throw + if ( bytesRead != length ) { + free(headerText); + throw BamException("BamHeader::ReadHeaderText", "could not read header text"); + } + + // otherwise, text was read OK + // store & cleanup + m_header.SetHeaderText( (string)((const 
char*)headerText) ); + free(headerText); +} + +// returns *copy* of SamHeader data object +SamHeader BamHeader::ToSamHeader(void) const { + return m_header; +} + +// returns SAM-formatted string of header data +string BamHeader::ToString(void) const { + return m_header.ToString(); +} diff --git a/src/api/internal/bam/BamHeader_p.h b/src/api/internal/bam/BamHeader_p.h new file mode 100644 index 0000000..499ad96 --- /dev/null +++ b/src/api/internal/bam/BamHeader_p.h @@ -0,0 +1,69 @@ +// *************************************************************************** +// BamHeader_p.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides the basic functionality for handling BAM headers. +// *************************************************************************** + +#ifndef BAMHEADER_P_H +#define BAMHEADER_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. 
+ +#include "api/SamHeader.h" +#include + +namespace BamTools { +namespace Internal { + +class BgzfStream; + +class BamHeader { + + // ctor & dtor + public: + BamHeader(void); + ~BamHeader(void); + + // BamHeader interface + public: + // clear SamHeader data + void Clear(void); + // return true if SamHeader data is valid + bool IsValid(void) const; + // load BAM header ('magic number' and SAM header text) from BGZF stream + // returns true if all OK + void Load(BgzfStream* stream); + // returns (editable) copy of SamHeader data object + SamHeader ToSamHeader(void) const; + // returns SAM-formatted string of header data + std::string ToString(void) const; + + // internal methods + private: + // reads magic number from BGZF stream + void CheckMagicNumber(BgzfStream* stream); + // reads SAM header length from BGZF stream, stores it in @length + void ReadHeaderLength(BgzfStream* stream, uint32_t& length); + // reads SAM header text from BGZF stream, stores in SamHeader object + void ReadHeaderText(BgzfStream* stream, const uint32_t& length); + + // data members + private: + SamHeader m_header; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMHEADER_P_H diff --git a/src/api/internal/bam/BamMultiMerger_p.h b/src/api/internal/bam/BamMultiMerger_p.h new file mode 100644 index 0000000..3000097 --- /dev/null +++ b/src/api/internal/bam/BamMultiMerger_p.h @@ -0,0 +1,266 @@ +// *************************************************************************** +// BamMultiMerger_p.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides merging functionality for BamMultiReader. At this point, supports +// sorting results by (refId, position) or by read name. 
+// *************************************************************************** + +#ifndef BAMMULTIMERGER_P_H +#define BAMMULTIMERGER_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. + +#include "api/BamAlignment.h" +#include "api/BamReader.h" +#include "api/algorithms/Sort.h" +#include +#include +#include +#include + +namespace BamTools { +namespace Internal { + +struct MergeItem { + + // data members + BamReader* Reader; + BamAlignment* Alignment; + + // ctors & dtor + MergeItem(BamReader* reader = 0, + BamAlignment* alignment = 0) + : Reader(reader) + , Alignment(alignment) + { } + + MergeItem(const MergeItem& other) + : Reader(other.Reader) + , Alignment(other.Alignment) + { } + + ~MergeItem(void) { } +}; + +template +struct MergeItemSorter : public std::binary_function { + + public: + MergeItemSorter(const Compare& comp = Compare()) + : m_comp(comp) + { } + + bool operator()(const MergeItem& lhs, const MergeItem& rhs) { + const BamAlignment& l = *lhs.Alignment; + const BamAlignment& r = *rhs.Alignment; + return m_comp(l,r); + } + + private: + Compare m_comp; +}; + +// pure ABC so we can just work polymorphically with any specific merger implementation +class IMultiMerger { + + public: + IMultiMerger(void) { } + virtual ~IMultiMerger(void) { } + public: + virtual void Add(MergeItem item) =0; + virtual void Clear(void) =0; + virtual const MergeItem& First(void) const =0; + virtual bool IsEmpty(void) const =0; + virtual void Remove(BamReader* reader) =0; + virtual int Size(void) const =0; + virtual MergeItem TakeFirst(void) =0; +}; + +// general merger +template +class MultiMerger : public IMultiMerger { + + public: + typedef Compare CompareType; + typedef MergeItemSorter MergeType; + + public: + explicit MultiMerger(const Compare& comp = 
Compare()) + : IMultiMerger() + , m_data( MergeType(comp) ) + { } + ~MultiMerger(void) { } + + public: + void Add(MergeItem item); + void Clear(void); + const MergeItem& First(void) const; + bool IsEmpty(void) const; + void Remove(BamReader* reader); + int Size(void) const; + MergeItem TakeFirst(void); + + private: + typedef MergeItem ValueType; + typedef std::multiset ContainerType; + typedef typename ContainerType::iterator DataIterator; + typedef typename ContainerType::const_iterator DataConstIterator; + ContainerType m_data; +}; + +template +inline void MultiMerger::Add(MergeItem item) { + + // N.B. - any future custom Compare types must define this method + // see algorithms/Sort.h + + if ( CompareType::UsesCharData() ) + item.Alignment->BuildCharData(); + m_data.insert(item); +} + +template +inline void MultiMerger::Clear(void) { + m_data.clear(); +} + +template +inline const MergeItem& MultiMerger::First(void) const { + const ValueType& entry = (*m_data.begin()); + return entry; +} + +template +inline bool MultiMerger::IsEmpty(void) const { + return m_data.empty(); +} +template +inline void MultiMerger::Remove(BamReader* reader) { + + if ( reader == 0 ) return; + const std::string& filenameToRemove = reader->GetFilename(); + + // iterate over readers in cache + DataIterator dataIter = m_data.begin(); + DataIterator dataEnd = m_data.end(); + for ( ; dataIter != dataEnd; ++dataIter ) { + const MergeItem& item = (*dataIter); + const BamReader* itemReader = item.Reader; + if ( itemReader == 0 ) continue; + + // remove iterator on match + if ( itemReader->GetFilename() == filenameToRemove ) { + m_data.erase(dataIter); + return; + } + } +} +template +inline int MultiMerger::Size(void) const { + return m_data.size(); +} + +template +inline MergeItem MultiMerger::TakeFirst(void) { + DataIterator firstIter = m_data.begin(); + MergeItem firstItem = (*firstIter); + m_data.erase(firstIter); + return firstItem; +} + +// unsorted "merger" +template<> +class MultiMerger : 
public IMultiMerger { + + public: + explicit MultiMerger(const Algorithms::Sort::Unsorted& comp = Algorithms::Sort::Unsorted()) + : IMultiMerger() + { } + ~MultiMerger(void) { } + + public: + void Add(MergeItem item); + void Clear(void); + const MergeItem& First(void) const; + bool IsEmpty(void) const; + void Remove(BamReader* reader); + int Size(void) const; + MergeItem TakeFirst(void); + + private: + typedef MergeItem ValueType; + typedef std::deque ContainerType; + typedef ContainerType::iterator DataIterator; + typedef ContainerType::const_iterator DataConstIterator; + ContainerType m_data; +}; + +inline +void MultiMerger::Add(MergeItem item) { + m_data.push_back(item); +} + +inline +void MultiMerger::Clear(void) { + m_data.clear(); +} + +inline +const MergeItem& MultiMerger::First(void) const { + return m_data.front(); +} + +inline +bool MultiMerger::IsEmpty(void) const { + return m_data.empty(); +} + +inline +void MultiMerger::Remove(BamReader* reader) { + + if ( reader == 0 ) return; + const std::string filenameToRemove = reader->GetFilename(); + + // iterate over readers in cache + DataIterator dataIter = m_data.begin(); + DataIterator dataEnd = m_data.end(); + for ( ; dataIter != dataEnd; ++dataIter ) { + const MergeItem& item = (*dataIter); + const BamReader* itemReader = item.Reader; + if ( itemReader == 0 ) continue; + + // remove iterator on match + if ( itemReader->GetFilename() == filenameToRemove ) { + m_data.erase(dataIter); + return; + } + } +} + +inline +int MultiMerger::Size(void) const { + return m_data.size(); +} + +inline +MergeItem MultiMerger::TakeFirst(void) { + MergeItem firstItem = m_data.front(); + m_data.pop_front(); + return firstItem; +} + +} // namespace Internal +} // namespace BamTools + +#endif // BAMMULTIMERGER_P_H diff --git a/src/api/internal/bam/BamMultiReader_p.cpp b/src/api/internal/bam/BamMultiReader_p.cpp new file mode 100644 index 0000000..d3f2b15 --- /dev/null +++ b/src/api/internal/bam/BamMultiReader_p.cpp @@ -0,0 
+1,799 @@ +// *************************************************************************** +// BamMultiReader_p.cpp (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 25 October 2011 (DB) +// --------------------------------------------------------------------------- +// Functionality for simultaneously reading multiple BAM files +// ************************************************************************* + +#include "api/BamAlignment.h" +#include "api/BamMultiReader.h" +#include "api/SamConstants.h" +#include "api/algorithms/Sort.h" +#include "api/internal/bam/BamMultiReader_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include +#include +#include +#include +#include +using namespace std; + +// ctor +BamMultiReaderPrivate::BamMultiReaderPrivate(void) + : m_alignmentCache(0) +{ } + +// dtor +BamMultiReaderPrivate::~BamMultiReaderPrivate(void) { + Close(); +} + +// close all BAM files +bool BamMultiReaderPrivate::Close(void) { + + m_errorString.clear(); + + if ( CloseFiles(Filenames()) ) + return true; + else { + const string currentError = m_errorString; + const string message = string("error encountered while closing all files: \n\t") + currentError; + SetErrorString("BamMultiReader::Close", message); + return false; + } +} + +// close requested BAM file +bool BamMultiReaderPrivate::CloseFile(const string& filename) { + + m_errorString.clear(); + + vector filenames(1, filename); + if ( CloseFiles(filenames) ) + return true; + else { + const string currentError = m_errorString; + const string message = string("error while closing file: ") + filename + "\n" + currentError; + SetErrorString("BamMultiReader::CloseFile", message); + return false; + } +} + +// close requested BAM files +bool BamMultiReaderPrivate::CloseFiles(const vector& filenames) { + + bool errorsEncountered = false; + 
m_errorString.clear(); + + // iterate over filenames + vector::const_iterator filesIter = filenames.begin(); + vector::const_iterator filesEnd = filenames.end(); + for ( ; filesIter != filesEnd; ++filesIter ) { + const string& filename = (*filesIter); + if ( filename.empty() ) continue; + + // iterate over readers + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; + if ( reader == 0 ) continue; + + // if reader matches requested filename + if ( reader->GetFilename() == filename ) { + + // remove reader's entry from alignment cache + m_alignmentCache->Remove(reader); + + // clean up reader & its alignment + if ( !reader->Close() ) { + m_errorString.append(1, '\t'); + m_errorString.append(reader->GetErrorString()); + m_errorString.append(1, '\n'); + errorsEncountered = true; + } + delete reader; + reader = 0; + + // delete reader's alignment entry + BamAlignment* alignment = item.Alignment; + delete alignment; + alignment = 0; + + // remove reader from reader list + m_readers.erase(readerIter); + + // on match, just go on to next filename + // (no need to keep looking and item iterator is invalid now anyway) + break; + } + } + } + + // make sure alignment cache is cleaned up if all readers closed + if ( m_readers.empty() && m_alignmentCache ) { + m_alignmentCache->Clear(); + delete m_alignmentCache; + m_alignmentCache = 0; + } + + // return whether all readers closed OK + return !errorsEncountered; +} + +// creates index files for BAM files that don't have them +bool BamMultiReaderPrivate::CreateIndexes(const BamIndex::IndexType& type) { + + bool errorsEncountered = false; + m_errorString.clear(); + + // iterate over readers + vector::iterator itemIter = m_readers.begin(); + vector::iterator itemEnd = m_readers.end(); + for ( ; itemIter != itemEnd; ++itemIter ) { + MergeItem& item = (*itemIter); + 
BamReader* reader = item.Reader; + if ( reader == 0 ) continue; + + // if reader doesn't have an index, create one + if ( !reader->HasIndex() ) { + if ( !reader->CreateIndex(type) ) { + m_errorString.append(1, '\t'); + m_errorString.append(reader->GetErrorString()); + m_errorString.append(1, '\n'); + errorsEncountered = true; + } + } + } + + // check for errors encountered before returning success/fail + if ( errorsEncountered ) { + const string currentError = m_errorString; + const string message = string("error while creating index files: ") + "\n" + currentError; + SetErrorString("BamMultiReader::CreateIndexes", message); + return false; + } else + return true; +} + +IMultiMerger* BamMultiReaderPrivate::CreateAlignmentCache(void) const { + + // fetch SamHeader + SamHeader header = GetHeader(); + + // if BAM files are sorted by position + if ( header.SortOrder == Constants::SAM_HD_SORTORDER_COORDINATE ) + return new MultiMerger(); + + // if BAM files are sorted by read name + if ( header.SortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME ) + return new MultiMerger(); + + // otherwise "unknown" or "unsorted", use unsorted merger and just read in + return new MultiMerger(); +} + +const vector BamMultiReaderPrivate::Filenames(void) const { + + // init filename container + vector filenames; + filenames.reserve( m_readers.size() ); + + // iterate over readers + vector::const_iterator itemIter = m_readers.begin(); + vector::const_iterator itemEnd = m_readers.end(); + for ( ; itemIter != itemEnd; ++itemIter ) { + const MergeItem& item = (*itemIter); + const BamReader* reader = item.Reader; + if ( reader == 0 ) continue; + + // store filename if not empty + const string& filename = reader->GetFilename(); + if ( !filename.empty() ) + filenames.push_back(filename); + } + + // return result + return filenames; +} + +string BamMultiReaderPrivate::GetErrorString(void) const { + return m_errorString; +} + +SamHeader BamMultiReaderPrivate::GetHeader(void) const { + const 
string& text = GetHeaderText(); + return SamHeader(text); +} + +// makes a virtual, unified header for all the bam files in the multireader +string BamMultiReaderPrivate::GetHeaderText(void) const { + + // N.B. - right now, simply copies all header data from first BAM, + // and then appends RG's from other BAM files + // TODO: make this more intelligent wrt other header lines/fields + + // if no readers open + const size_t numReaders = m_readers.size(); + if ( numReaders == 0 ) return string(); + + // retrieve first reader's header + const MergeItem& firstItem = m_readers.front(); + const BamReader* reader = firstItem.Reader; + if ( reader == 0 ) return string(); + SamHeader mergedHeader = reader->GetHeader(); + + // iterate over any remaining readers (skipping the first) + for ( size_t i = 1; i < numReaders; ++i ) { + const MergeItem& item = m_readers.at(i); + const BamReader* reader = item.Reader; + if ( reader == 0 ) continue; + + // retrieve current reader's header + const SamHeader currentHeader = reader->GetHeader(); + + // append current reader's RG entries to merged header + // N.B. - SamReadGroupDictionary handles duplicate-checking + mergedHeader.ReadGroups.Add(currentHeader.ReadGroups); + + // TODO: merge anything else?? + } + + // return stringified header + return mergedHeader.ToString(); +} + +// get next alignment among all files +bool BamMultiReaderPrivate::GetNextAlignment(BamAlignment& al) { + return PopNextCachedAlignment(al, true); +} + +// get next alignment among all files without parsing character data from alignments +bool BamMultiReaderPrivate::GetNextAlignmentCore(BamAlignment& al) { + return PopNextCachedAlignment(al, false); +} + +// --------------------------------------------------------------------------------------- +// +// NB: The following GetReferenceX() functions assume that we have identical +// references for all BAM files. 
We enforce this by invoking the +// ValidateReaders() method to verify that our reference data is the same +// across all files on Open - so we will not encounter a situation in which +// there is a mismatch and we are still live. +// +// --------------------------------------------------------------------------------------- + +// returns the number of reference sequences +int BamMultiReaderPrivate::GetReferenceCount(void) const { + + // handle empty multireader + if ( m_readers.empty() ) return 0; + + // return reference count from first reader + const MergeItem& item = m_readers.front(); + const BamReader* reader = item.Reader; + if ( reader == 0 ) return 0; + else + return reader->GetReferenceCount(); +} + +// returns vector of reference objects +const RefVector BamMultiReaderPrivate::GetReferenceData(void) const { + + // handle empty multireader + if ( m_readers.empty() ) return RefVector(); + + // return reference data from first BamReader + const MergeItem& item = m_readers.front(); + const BamReader* reader = item.Reader; + if ( reader == 0 ) return RefVector(); + else + return reader->GetReferenceData(); +} + +// returns refID from reference name +int BamMultiReaderPrivate::GetReferenceID(const string& refName) const { + + // handle empty multireader + if ( m_readers.empty() ) return -1; + + // return reference ID from first BamReader + const MergeItem& item = m_readers.front(); + const BamReader* reader = item.Reader; + if ( reader == 0 ) return -1; + else + return reader->GetReferenceID(refName); +} +// --------------------------------------------------------------------------------------- + +// returns true if all readers have index data available +// this is useful to indicate whether Jump() or SetRegion() are possible +bool BamMultiReaderPrivate::HasIndexes(void) const { + + // handle empty multireader + if ( m_readers.empty() ) + return false; + + bool result = true; + + // iterate over readers + vector::const_iterator readerIter = m_readers.begin(); 
+ vector::const_iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + const MergeItem& item = (*readerIter); + const BamReader* reader = item.Reader; + if ( reader == 0 ) continue; + + // see if current reader has index data + result &= reader->HasIndex(); + } + + return result; +} + +// returns true if multireader has open readers +bool BamMultiReaderPrivate::HasOpenReaders(void) { + + // iterate over readers + vector::const_iterator readerIter = m_readers.begin(); + vector::const_iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + const MergeItem& item = (*readerIter); + const BamReader* reader = item.Reader; + if ( reader == 0 ) continue; + + // return true whenever an open reader is found + if ( reader->IsOpen() ) return true; + } + + // no readers open + return false; +} + +// performs random-access jump using (refID, position) as a left-bound +bool BamMultiReaderPrivate::Jump(int refID, int position) { + + // NB: While it may make sense to track readers in which we can + // successfully Jump, in practice a failure of Jump means "no + // alignments here." It makes sense to simply accept the failure, + // UpdateAlignments(), and continue. 
+ + // iterate over readers + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; + if ( reader == 0 ) continue; + + // jump in each BamReader to position of interest + reader->Jump(refID, position); + } + + // returns status of cache update + return UpdateAlignmentCache(); +} + +// locate (& load) index files for BAM readers that don't already have one loaded +bool BamMultiReaderPrivate::LocateIndexes(const BamIndex::IndexType& preferredType) { + + bool errorsEncountered = false; + m_errorString.clear(); + + // iterate over readers + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; + if ( reader == 0 ) continue; + + // if reader has no index, try to locate one + if ( !reader->HasIndex() ) { + if ( !reader->LocateIndex(preferredType) ) { + m_errorString.append(1, '\t'); + m_errorString.append(reader->GetErrorString()); + m_errorString.append(1, '\n'); + errorsEncountered = true; + } + } + } + + // check for errors encountered before returning success/fail + if ( errorsEncountered ) { + const string currentError = m_errorString; + const string message = string("error while locating index files: ") + "\n" + currentError; + SetErrorString("BamMultiReader::LocatingIndexes", message); + return false; + } else + return true; +} + +// opens BAM files +bool BamMultiReaderPrivate::Open(const vector& filenames) { + + m_errorString.clear(); + + // put all current readers back at beginning (refreshes alignment cache) + if ( !Rewind() ) { + const string currentError = m_errorString; + const string message = string("unable to rewind existing readers: \n\t") + currentError; + SetErrorString("BamMultiReader::Open", message); + return false; + } + + 
// iterate over filenames + bool errorsEncountered = false; + vector::const_iterator filenameIter = filenames.begin(); + vector::const_iterator filenameEnd = filenames.end(); + for ( ; filenameIter != filenameEnd; ++filenameIter ) { + const string& filename = (*filenameIter); + if ( filename.empty() ) continue; + + // attempt to open BamReader + BamReader* reader = new BamReader; + const bool readerOpened = reader->Open(filename); + + // if opened OK, store it + if ( readerOpened ) + m_readers.push_back( MergeItem(reader, new BamAlignment) ); + + // otherwise store error & clean up invalid reader + else { + m_errorString.append(1, '\t'); + m_errorString += string("unable to open file: ") + filename; + m_errorString.append(1, '\n'); + errorsEncountered = true; + + delete reader; + reader = 0; + } + } + + // check for errors while opening + if ( errorsEncountered ) { + const string currentError = m_errorString; + const string message = string("unable to open all files: \t\n") + currentError; + SetErrorString("BamMultiReader::Open", message); + return false; + } + + // check for BAM file consistency + if ( !ValidateReaders() ) { + const string currentError = m_errorString; + const string message = string("unable to open inconsistent files: \t\n") + currentError; + SetErrorString("BamMultiReader::Open", message); + return false; + } + + // update alignment cache + return UpdateAlignmentCache(); +} + +bool BamMultiReaderPrivate::OpenFile(const std::string& filename) { + vector filenames(1, filename); + if ( Open(filenames) ) + return true; + else { + const string currentError = m_errorString; + const string message = string("could not open file: ") + filename + "\n\t" + currentError; + SetErrorString("BamMultiReader::OpenFile", message); + return false; + } +} + +bool BamMultiReaderPrivate::OpenIndexes(const vector& indexFilenames) { + + // TODO: This needs to be cleaner - should not assume same order. + // And either way, shouldn't start at first reader. 
Should start at + // first reader without an index? + + // make sure same number of index filenames as readers + if ( m_readers.size() != indexFilenames.size() ) { + const string message("size of index file list does not match current BAM file count"); + SetErrorString("BamMultiReader::OpenIndexes", message); + return false; + } + + bool errorsEncountered = false; + m_errorString.clear(); + + // iterate over BamReaders + vector::const_iterator indexFilenameIter = indexFilenames.begin(); + vector::const_iterator indexFilenameEnd = indexFilenames.end(); + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; + + // open index filename on reader + if ( reader ) { + const string& indexFilename = (*indexFilenameIter); + if ( !reader->OpenIndex(indexFilename) ) { + m_errorString.append(1, '\t'); + m_errorString += reader->GetErrorString(); + m_errorString.append(1, '\n'); + errorsEncountered = true; + } + } + + // increment filename iterator, skip if no more index files to open + if ( ++indexFilenameIter == indexFilenameEnd ) + break; + } + + // return success/fail + if ( errorsEncountered ) { + const string currentError = m_errorString; + const string message = string("could not open all index files: \n\t") + currentError; + SetErrorString("BamMultiReader::OpenIndexes", message); + return false; + } else + return true; +} + +bool BamMultiReaderPrivate::PopNextCachedAlignment(BamAlignment& al, const bool needCharData) { + + // skip if no alignments available + if ( m_alignmentCache == 0 || m_alignmentCache->IsEmpty() ) + return false; + + // pop next merge item entry from cache + MergeItem item = m_alignmentCache->TakeFirst(); + BamReader* reader = item.Reader; + BamAlignment* alignment = item.Alignment; + if ( reader == 0 || alignment == 0 ) + return false; + + // set char data if requested + if ( 
needCharData ) { + alignment->BuildCharData(); + alignment->Filename = reader->GetFilename(); + } + + // store cached alignment into destination parameter (by copy) + al = *alignment; + + // load next alignment from reader & store in cache + SaveNextAlignment(reader, alignment); + return true; +} + +// returns BAM file pointers to beginning of alignment data & resets alignment cache +bool BamMultiReaderPrivate::Rewind(void) { + + // skip if no readers open + if ( m_readers.empty() ) + return true; + + // attempt to rewind files + if ( !RewindReaders() ) { + const string currentError = m_errorString; + const string message = string("could not rewind readers: \n\t") + currentError; + SetErrorString("BamMultiReader::Rewind", message); + return false; + } + + // return status of cache update + return UpdateAlignmentCache(); +} + +// returns BAM file pointers to beginning of alignment data +bool BamMultiReaderPrivate::RewindReaders(void) { + + m_errorString.clear(); + bool errorsEncountered = false; + + // iterate over readers + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; + if ( reader == 0 ) continue; + + // attempt rewind on BamReader + if ( !reader->Rewind() ) { + m_errorString.append(1, '\t'); + m_errorString.append( reader->GetErrorString() ); + m_errorString.append(1, '\n'); + errorsEncountered = true; + } + } + + return !errorsEncountered; +} + +void BamMultiReaderPrivate::SaveNextAlignment(BamReader* reader, BamAlignment* alignment) { + + // if can read alignment from reader, store in cache + // + // N.B. 
- lazy building of alignment's char data - populated only: + // automatically by alignment cache to maintain its sorting OR + // on demand from client call to future call to GetNextAlignment() + + if ( reader->GetNextAlignmentCore(*alignment) ) + m_alignmentCache->Add( MergeItem(reader, alignment) ); +} + +void BamMultiReaderPrivate::SetErrorString(const string& where, const string& what) const { + static const string SEPARATOR = ": "; + m_errorString = where + SEPARATOR + what; +} + +bool BamMultiReaderPrivate::SetRegion(const BamRegion& region) { + + // NB: While it may make sense to track readers in which we can + // successfully SetRegion, In practice a failure of SetRegion means "no + // alignments here." It makes sense to simply accept the failure, + // UpdateAlignments(), and continue. + + // iterate over alignments + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; + if ( reader == 0 ) continue; + + // set region of interest + reader->SetRegion(region); + } + + // return status of cache update + return UpdateAlignmentCache(); +} + +// updates our alignment cache +bool BamMultiReaderPrivate::UpdateAlignmentCache(void) { + + // create alignment cache if not created yet + if ( m_alignmentCache == 0 ) { + m_alignmentCache = CreateAlignmentCache(); + if ( m_alignmentCache == 0 ) { + SetErrorString("BamMultiReader::UpdateAlignmentCache", "unable to create new alignment cache"); + return false; + } + } + + // clear any prior cache data + m_alignmentCache->Clear(); + + // iterate over readers + vector::iterator readerIter = m_readers.begin(); + vector::iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + MergeItem& item = (*readerIter); + BamReader* reader = item.Reader; + BamAlignment* alignment = item.Alignment; + if ( reader == 0 || alignment == 0 ) 
continue;
+
+        // save next alignment from each reader in cache
+        SaveNextAlignment(reader, alignment);
+    }
+
+    // if we get here, ok
+    return true;
+}
+
+// ValidateReaders checks that all open readers point to BAM files whose
+// alignments were made against the same set of reference sequences (same
+// names, same lengths, same order) and that all files share one sort order.
+// Merging files that disagree on any of these is undefined.
+//
+// Returns true if 0 or 1 readers are open, or if every reader agrees with the
+// first one. Returns false on the first mismatch (or if the first reader is
+// null); a description is then available through GetErrorString().
+bool BamMultiReaderPrivate::ValidateReaders(void) const {
+
+    m_errorString.clear();
+
+    // skip if 0 or 1 readers opened - nothing to compare
+    if ( m_readers.empty() || (m_readers.size() == 1) )
+        return true;
+
+    // retrieve first reader - it is the baseline for every comparison below
+    const MergeItem& firstItem = m_readers.front();
+    const BamReader* firstReader = firstItem.Reader;
+    if ( firstReader == 0 ) {
+        SetErrorString("BamMultiReader::ValidateReaders", "first reader is invalid (null)");
+        return false;
+    }
+
+    // retrieve first reader's header data
+    const SamHeader& firstReaderHeader = firstReader->GetHeader();
+    const string& firstReaderSortOrder = firstReaderHeader.SortOrder;
+
+    // retrieve first reader's reference data
+    const RefVector& firstReaderRefData = firstReader->GetReferenceData();
+    const int firstReaderRefCount = firstReader->GetReferenceCount();
+    const int firstReaderRefSize = firstReaderRefData.size();
+
+    // iterate over all readers
+    vector<MergeItem>::const_iterator readerIter = m_readers.begin();
+    vector<MergeItem>::const_iterator readerEnd  = m_readers.end();
+    for ( ; readerIter != readerEnd; ++readerIter ) {
+        const MergeItem& item = (*readerIter);
+        BamReader* reader = item.Reader;
+        if ( reader == 0 ) continue;
+
+        // get current reader's header data
+        const SamHeader& currentReaderHeader = reader->GetHeader();
+        const string& currentReaderSortOrder = currentReaderHeader.SortOrder;
+
+        // check compatible sort order
+        if ( currentReaderSortOrder != firstReaderSortOrder ) {
+            const string message = string("mismatched sort order in ") + reader->GetFilename() +
+                                   ", expected " + firstReaderSortOrder +
+                                   ", but found " + currentReaderSortOrder;
+            SetErrorString("BamMultiReader::ValidateReaders", message);
+            return false;
+        }
+
+        // get current reader's reference data
+        // (bound by reference, like the first reader's, to avoid copying the vector)
+        const RefVector& currentReaderRefData = reader->GetReferenceData();
+        const int currentReaderRefCount = reader->GetReferenceCount();
+        const int currentReaderRefSize = currentReaderRefData.size();
+
+        // compare reference counts from BamReader ( & container size, in case of BR error)
+        if ( (currentReaderRefCount != firstReaderRefCount) ||
+             (firstReaderRefSize != currentReaderRefSize) )
+        {
+            stringstream s("");
+            s << "mismatched reference count in " << reader->GetFilename()
+              << ", expected " << firstReaderRefCount
+              << ", but found " << currentReaderRefCount;
+            SetErrorString("BamMultiReader::ValidateReaders", s.str());
+            return false;
+        }
+
+        // compare reference names & lengths pairwise, in order;
+        // the size check above guarantees both ranges have the same length
+        RefVector::const_iterator firstRefIter   = firstReaderRefData.begin();
+        RefVector::const_iterator firstRefEnd    = firstReaderRefData.end();
+        RefVector::const_iterator currentRefIter = currentReaderRefData.begin();
+        for ( ; firstRefIter != firstRefEnd; ++firstRefIter, ++currentRefIter ) {
+            const RefData& firstRef   = (*firstRefIter);
+            const RefData& currentRef = (*currentRefIter);
+
+            // this pair matches - keep going
+            if ( (firstRef.RefName == currentRef.RefName) &&
+                 (firstRef.RefLength == currentRef.RefLength) )
+                continue;
+
+            // otherwise report both complete reference lists in ONE message stream
+            // (a second, shadowing stream here would silently drop the 'expected' list)
+            stringstream s("");
+            s << "mismatched references found in " << reader->GetFilename()
+              << ", expected: " << endl;
+
+            // print first reader's reference data
+            RefVector::const_iterator refIter = firstReaderRefData.begin();
+            RefVector::const_iterator refEnd  = firstReaderRefData.end();
+            for ( ; refIter != refEnd; ++refIter ) {
+                const RefData& entry = (*refIter);
+                s << entry.RefName << " " << entry.RefLength << endl;
+            }
+
+            s << "but found: " << endl;
+
+            // print current reader's reference data
+            refIter = currentReaderRefData.begin();
+            refEnd  = currentReaderRefData.end();
+            for ( ; refIter != refEnd; ++refIter ) {
+                const RefData& entry = (*refIter);
+                s << entry.RefName << " " << entry.RefLength << endl;
+            }
+
+            SetErrorString("BamMultiReader::ValidateReaders", s.str());
+            return false;
+        }
+    }
+
+    // if we get here, everything checks out
+    return true;
+}
diff --git a/src/api/internal/bam/BamMultiReader_p.h b/src/api/internal/bam/BamMultiReader_p.h
new file mode 100644
index 0000000..9d7c39a
--- /dev/null
+++ b/src/api/internal/bam/BamMultiReader_p.h
@@ -0,0 +1,99 @@
+// ***************************************************************************
+// BamMultiReader_p.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Functionality for simultaneously reading multiple BAM files
+// *************************************************************************
+
+#ifndef BAMMULTIREADER_P_H
+#define BAMMULTIREADER_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include "api/SamHeader.h"
+#include "api/BamMultiReader.h"
+#include "api/internal/bam/BamMultiMerger_p.h"
+#include <string>
+#include <vector>
+
+namespace BamTools {
+namespace Internal {
+
+// Private implementation behind BamMultiReader: owns a set of per-file readers
+// and an alignment cache used to hand alignments out in merged order.
+class BamMultiReaderPrivate {
+
+    // typedefs
+    public:
+        // NOTE(review): template arguments were lost in extraction; restored from
+        // the SaveNextAlignment(BamReader*, BamAlignment*) signature - confirm
+        typedef std::pair<BamReader*, BamAlignment*> ReaderAlignment;
+
+    // constructor / destructor
+    public:
+        BamMultiReaderPrivate(void);
+        ~BamMultiReaderPrivate(void);
+
+    // public interface
+    public:
+
+        // file operations
+        bool Close(void);
+        bool CloseFile(const std::string& filename);
+        const std::vector<std::string> Filenames(void) const;
+        bool Jump(int refID, int position = 0);
+        bool Open(const std::vector<std::string>& filenames);
+        bool OpenFile(const std::string& filename);
+        bool Rewind(void);
+        bool SetRegion(const BamRegion& region);
+
+        // access alignment data
+        bool GetNextAlignment(BamAlignment& al);
+        bool GetNextAlignmentCore(BamAlignment& al);
+        bool HasOpenReaders(void);
+
+        // access auxiliary data
+        SamHeader GetHeader(void) const;
+        std::string GetHeaderText(void) const;
+        int GetReferenceCount(void) const;
+        const BamTools::RefVector GetReferenceData(void) const;
+        int GetReferenceID(const std::string& refName) const;
+
+        // BAM index operations
+        bool CreateIndexes(const BamIndex::IndexType& type = BamIndex::STANDARD);
+        bool HasIndexes(void) const;
+        bool LocateIndexes(const BamIndex::IndexType& preferredType = BamIndex::STANDARD);
+        bool OpenIndexes(const std::vector<std::string>& indexFilenames);
+
+        // error handling
+        std::string GetErrorString(void) const;
+
+    // 'internal' methods
+    public:
+
+        bool CloseFiles(const std::vector<std::string>& filenames);
+        IMultiMerger* CreateAlignmentCache(void) const;
+        bool PopNextCachedAlignment(BamAlignment& al, const bool needCharData);
+        bool RewindReaders(void);
+        void SaveNextAlignment(BamReader* reader, BamAlignment* alignment);
+        // const, so that const validators can report; writes the mutable m_errorString
+        void SetErrorString(const std::string& where, const std::string& what) const;
+        bool UpdateAlignmentCache(void);
+        bool ValidateReaders(void) const;
+
+    // data members
+    public:
+        std::vector<MergeItem> m_readers;
+        IMultiMerger* m_alignmentCache;
+        mutable std::string m_errorString;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMMULTIREADER_P_H
diff --git a/src/api/internal/bam/BamRandomAccessController_p.cpp b/src/api/internal/bam/BamRandomAccessController_p.cpp
new file mode 100644
index 0000000..848fafd
--- /dev/null
+++ b/src/api/internal/bam/BamRandomAccessController_p.cpp
@@ -0,0 +1,289 @@
+// ***************************************************************************
+// BamRandomAccessController_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011(DB)
+// ---------------------------------------------------------------------------
+// Manages random access operations in a BAM file
+// **************************************************************************
+
+#include "api/BamIndex.h"
+#include "api/internal/bam/BamRandomAccessController_p.h"
+#include "api/internal/bam/BamReader_p.h"
+#include "api/internal/index/BamIndexFactory_p.h"
+#include "api/internal/utils/BamException_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cassert>
+#include <sstream>
+using namespace std;
+
+// starts with no index and no region (so every alignment 'overlaps')
+BamRandomAccessController::BamRandomAccessController(void)
+    : m_index(0)
+    , m_hasAlignmentsInRegion(true)
+{ }
+
+// releases the owned index
+BamRandomAccessController::~BamRandomAccessController(void) {
+    Close();
+}
+
+// Moves the region's left bound forward to the first reference (within the
+// region) that the index reports alignments for. Sets m_hasAlignmentsInRegion
+// to false if no reference in the region has any. No-op without an index.
+void BamRandomAccessController::AdjustRegion(const int& referenceCount) {
+
+    // skip if no index available
+    if ( m_index == 0 )
+        return;
+
+    // see if any references in region have alignments
+    // (an open-ended region runs through the last reference)
+    m_hasAlignmentsInRegion = false;
+    int currentId = m_region.LeftRefID;
+    const int rightBoundRefId = ( m_region.isRightBoundSpecified() ? m_region.RightRefID : referenceCount - 1 );
+    while ( currentId <= rightBoundRefId ) {
+        m_hasAlignmentsInRegion = m_index->HasAlignments(currentId);
+        if ( m_hasAlignmentsInRegion ) break;
+        ++currentId;
+    }
+
+    // if no data found on any reference in region
+    if ( !m_hasAlignmentsInRegion )
+        return;
+
+    // if left bound of desired region had no data, use first reference that had data
+    // otherwise, leave requested region as-is
+    if ( currentId != m_region.LeftRefID ) {
+        m_region.LeftRefID = currentId;
+        m_region.LeftPosition = 0;
+    }
+}
+
+// returns alignments' "RegionState": { Before|Overlaps|After } current region
+BamRandomAccessController::RegionState
+BamRandomAccessController::AlignmentState(const BamAlignment& alignment) const {
+
+    // if region has no left bound at all
+    if ( !m_region.isLeftBoundSpecified() )
+        return OverlapsRegion;
+
+    // handle unmapped reads - return AFTER region to halt processing
+    if ( alignment.RefID == -1 )
+        return AfterRegion;
+
+    // if alignment is on any reference before left bound reference
+    if ( alignment.RefID < m_region.LeftRefID )
+        return BeforeRegion;
+
+    // if alignment is on left bound reference
+    else if ( alignment.RefID == m_region.LeftRefID ) {
+
+        // if alignment starts at or after left bound position
+        if ( alignment.Position >= m_region.LeftPosition) {
+
+            if ( m_region.isRightBoundSpecified() &&             // right bound is specified AND
+                 m_region.LeftRefID == m_region.RightRefID &&    // left & right bounds on same reference AND
+                 alignment.Position >= m_region.RightPosition )  // alignment starts on or after right bound position
+                return AfterRegion;
+
+            // otherwise, alignment overlaps region
+            else return OverlapsRegion;
+        }
+
+        // alignment starts before left bound position
+        else {
+
+            // if alignment overlaps left bound position
+            if ( alignment.GetEndPosition() > m_region.LeftPosition )
+                return OverlapsRegion;
+            else
+                return BeforeRegion;
+        }
+    }
+
+    // otherwise alignment is on a reference after left bound reference
+    else {
+
+        // if region has a right bound
+        if ( m_region.isRightBoundSpecified() ) {
+
+            // alignment is on any reference between boundaries
+            if ( alignment.RefID < m_region.RightRefID )
+                return OverlapsRegion;
+
+            // alignment is on any reference after right boundary
+            else if ( alignment.RefID > m_region.RightRefID )
+                return AfterRegion;
+
+            // alignment is on right bound reference
+            else {
+
+                // if alignment starts before right bound position
+                if ( alignment.Position < m_region.RightPosition )
+                    return OverlapsRegion;
+                else
+                    return AfterRegion;
+            }
+        }
+
+        // otherwise, alignment starts after left bound and there is no right bound given
+        else return OverlapsRegion;
+    }
+}
+
+// drops both the index and the region
+void BamRandomAccessController::Close(void) {
+    ClearIndex();
+    ClearRegion();
+}
+
+// deletes the owned index (if any)
+void BamRandomAccessController::ClearIndex(void) {
+    if ( m_index ) {
+        delete m_index;
+        m_index = 0;
+    }
+}
+
+// forgets the current region; with no region, data is assumed available
+void BamRandomAccessController::ClearRegion(void) {
+    m_region.clear();
+    m_hasAlignmentsInRegion = true;
+}
+
+// Builds a new index of the requested type from the reader's BAM file and, on
+// success, installs it (replacing any previous index). Returns false and sets
+// the error string on failure; the previous index is left untouched then.
+bool BamRandomAccessController::CreateIndex(BamReaderPrivate* reader,
+                                            const BamIndex::IndexType& type)
+{
+    // skip if reader is invalid
+    assert(reader);
+    if ( !reader->IsOpen() ) {
+        SetErrorString("BamRandomAccessController::CreateIndex",
+                       "cannot create index for unopened reader");
+        return false;
+    }
+
+    // create new index of requested type
+    BamIndex* newIndex = BamIndexFactory::CreateIndexOfType(type, reader);
+    if ( newIndex == 0 ) {
+        stringstream s("");
+        s << "could not create index of type: " << type;
+        SetErrorString("BamRandomAccessController::CreateIndex", s.str());
+        return false;
+    }
+
+    // attempt to build index from current BamReader file
+    if ( !newIndex->Create() ) {
+        const string indexError = newIndex->GetErrorString();
+        const string message = "could not create index: \n\t" + indexError;
+        SetErrorString("BamRandomAccessController::CreateIndex", message);
+
+        // we still own the half-built index here - release it
+        delete newIndex;
+        return false;
+    }
+
+    // save new index & return success
+    SetIndex(newIndex);
+    return true;
+}
+
+string BamRandomAccessController::GetErrorString(void) const {
+    return m_errorString;
+}
+
+bool BamRandomAccessController::HasIndex(void) const {
+    return ( m_index != 0 );
+}
+
+bool BamRandomAccessController::HasRegion(void) const {
+    return ( !m_region.isNull() );
+}
+
+// returns false when no index is loaded (nothing can be reported)
+bool BamRandomAccessController::IndexHasAlignmentsForReference(const int& refId) {
+    if ( m_index == 0 )
+        return false;
+    return m_index->HasAlignments(refId);
+}
+
+// Looks for an index file next to the reader's BAM file (preferring
+// 'preferredType') and opens it. Returns false and sets the error string if
+// none is found or it cannot be opened.
+bool BamRandomAccessController::LocateIndex(BamReaderPrivate* reader,
+                                            const BamIndex::IndexType& preferredType)
+{
+    // look up index filename, deferring to preferredType if possible
+    assert(reader);
+    const string& indexFilename = BamIndexFactory::FindIndexFilename(reader->Filename(), preferredType);
+
+    // if no index file found (of any type)
+    if ( indexFilename.empty() ) {
+        const string message = string("could not find index file for: ") + reader->Filename();
+        SetErrorString("BamRandomAccessController::LocateIndex", message);
+        return false;
+    }
+
+    // otherwise open & use index file that was found
+    return OpenIndex(indexFilename, reader);
+}
+
+// Creates an index object matching the file's type, loads its data and, on
+// success, installs it (replacing any previous index). Returns false and sets
+// the error string on failure; the previous index is left untouched then.
+bool BamRandomAccessController::OpenIndex(const string& indexFilename, BamReaderPrivate* reader) {
+
+    // attempt create new index of type based on filename
+    BamIndex* index = BamIndexFactory::CreateIndexFromFilename(indexFilename, reader);
+    if ( index == 0 ) {
+        const string message = string("could not open index file: ") + indexFilename;
+        SetErrorString("BamRandomAccessController::OpenIndex", message);
+        return false;
+    }
+
+    // attempt to load data from index file
+    if ( !index->Load(indexFilename) ) {
+        const string indexError = index->GetErrorString();
+        const string message = string("could not load index data from file: ") + indexFilename +
+                               "\n\t" + indexError;
+        SetErrorString("BamRandomAccessController::OpenIndex", message);
+
+        // we still own the unloaded index here - release it
+        delete index;
+        return false;
+    }
+
+    // save new index & return success
+    SetIndex(index);
+    return true;
+}
+
+bool BamRandomAccessController::RegionHasAlignments(void) const {
+    return m_hasAlignmentsInRegion;
+}
+
+void BamRandomAccessController::SetErrorString(const string& where, const string& what) {
+    m_errorString = where + ": " + what;
+}
+
+// takes ownership of 'index', deleting any index held before
+void BamRandomAccessController::SetIndex(BamIndex* index) {
+
+    // re-installing the index we already own must not delete it
+    if ( index == m_index )
+        return;
+
+    ClearIndex();
+    m_index = index;
+}
+
+// Stores the region and jumps the index to it. Returns false (with the error
+// string set) if there is no index or the jump fails. Returns true when the
+// jump succeeds or when the index reports no data anywhere in the region.
+bool BamRandomAccessController::SetRegion(const BamRegion& region, const int& referenceCount) {
+
+    // store region
+    m_region = region;
+
+    // cannot jump when no index is available
+    if ( !HasIndex() ) {
+        SetErrorString("BamRandomAccessController::SetRegion", "cannot jump if no index data available");
+        return false;
+    }
+
+    // adjust region as necessary to reflect where data actually begins
+    AdjustRegion(referenceCount);
+
+    // if no data present, return true
+    //   * Not an error, but future attempts to access alignments in this region will not return data
+    //     Returning true is useful in a BamMultiReader setting where some BAM files may
+    //     lack alignments in regions where other files still have data available.
+    if ( !m_hasAlignmentsInRegion )
+        return true;
+
+    // return success/failure of jump to specified region,
+    //
+    //  * Index::Jump() is allowed to modify the m_hasAlignmentsInRegion flag
+    //    This covers 'corner case' where a region is requested that lies beyond the last
+    //    alignment on a reference. If this occurs, any subsequent calls to GetNextAlignment[Core]
+    //    will not return data. BamMultiReader will still be able to successfully pull alignments
+    //    from a region from other files even if this one has no data.
+    if ( !m_index->Jump(m_region, &m_hasAlignmentsInRegion) ) {
+        const string indexError = m_index->GetErrorString();
+        const string message = string("could not set region\n\t") + indexError;
+        SetErrorString("BamRandomAccessController::SetRegion", message);
+        return false;
+    }
+    else
+        return true;
+}
diff --git a/src/api/internal/bam/BamRandomAccessController_p.h b/src/api/internal/bam/BamRandomAccessController_p.h
new file mode 100644
index 0000000..9262a61
--- /dev/null
+++ b/src/api/internal/bam/BamRandomAccessController_p.h
@@ -0,0 +1,94 @@
+// ***************************************************************************
+// BamRandomAccessController_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011(DB)
+// ---------------------------------------------------------------------------
+// Manages random access operations in a BAM file
+// ***************************************************************************
+
+#ifndef BAMRACONTROLLER_P_H
+#define BAMRACONTROLLER_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include "api/BamAux.h"
+#include "api/BamIndex.h"
+
+namespace BamTools {
+
+class BamAlignment;
+
+namespace Internal {
+
+class BamReaderPrivate;
+
+// Holds the index and the current region of interest for one BAM reader, and
+// classifies alignments as before / overlapping / after that region.
+class BamRandomAccessController {
+
+    // enums
+    // where an alignment lies relative to the current region
+    public: enum RegionState { BeforeRegion = 0
+                             , OverlapsRegion
+                             , AfterRegion
+                             };
+
+    // ctor & dtor
+    public:
+        BamRandomAccessController(void);
+        ~BamRandomAccessController(void);
+
+    // BamRandomAccessController interface
+    public:
+
+        // index methods
+        // (the bool-returning methods report failure details via GetErrorString())
+        void ClearIndex(void);
+        bool CreateIndex(BamReaderPrivate* reader, const BamIndex::IndexType& type);
+        bool HasIndex(void) const;
+        bool IndexHasAlignmentsForReference(const int& refId);
+        bool LocateIndex(BamReaderPrivate* reader, const BamIndex::IndexType& preferredType);
+        bool OpenIndex(const std::string& indexFilename, BamReaderPrivate* reader);
+        // takes ownership of 'index' (see m_index below)
+        void SetIndex(BamIndex* index);
+
+        // region methods
+        void ClearRegion(void);
+        bool HasRegion(void) const;
+        RegionState AlignmentState(const BamAlignment& alignment) const;
+        bool RegionHasAlignments(void) const;
+        // 'referenceCount' bounds the search when the region has no right bound
+        bool SetRegion(const BamRegion& region, const int& referenceCount);
+
+        // general methods
+        // clears both index and region
+        void Close(void);
+        std::string GetErrorString(void) const;
+
+    // internal methods
+    private:
+        // adjusts requested region if necessary (depending on where data actually begins)
+        void AdjustRegion(const int& referenceCount);
+        // error-string handling
+        void SetErrorString(const std::string& where, const std::string& what);
+
+    // data members
+    private:
+
+        // index data
+        BamIndex* m_index;  // owns the index, not a copy - responsible for deleting
+
+        // region data
+        BamRegion m_region;
+        bool m_hasAlignmentsInRegion;
+
+        // general data
+        std::string m_errorString;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMRACONTROLLER_P_H
diff --git a/src/api/internal/bam/BamReader_p.cpp b/src/api/internal/bam/BamReader_p.cpp
new file mode 100644
index 0000000..6904da7
--- /dev/null
+++ b/src/api/internal/bam/BamReader_p.cpp
@@ -0,0
+1,469 @@
+// ***************************************************************************
+// BamReader_p.cpp (c) 2009 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 14 November 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for reading BAM files
+// ***************************************************************************
+
+#include "api/BamConstants.h"
+#include "api/BamReader.h"
+#include "api/IBamIODevice.h"
+#include "api/internal/bam/BamHeader_p.h"
+#include "api/internal/bam/BamRandomAccessController_p.h"
+#include "api/internal/bam/BamReader_p.h"
+#include "api/internal/index/BamStandardIndex_p.h"
+#include "api/internal/index/BamToolsIndex_p.h"
+#include "api/internal/io/BamDeviceFactory_p.h"
+#include "api/internal/utils/BamException_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+// NOTE(review): the five header names below were lost in extraction and are
+// presumed - confirm against the repository
+#include <algorithm>
+#include <cassert>
+#include <iostream>
+#include <iterator>
+#include <vector>
+using namespace std;
+
+// constructor
+BamReaderPrivate::BamReaderPrivate(BamReader* parent)
+    : m_alignmentsBeginOffset(0)
+    , m_parent(parent)
+{
+    m_isBigEndian = BamTools::SystemIsBigEndian();
+}
+
+// destructor
+BamReaderPrivate::~BamReaderPrivate(void) {
+    Close();
+}
+
+// closes the BAM file
+// (metadata, filename, index & region are always cleared; returns false only
+//  if closing the underlying stream reported an error)
+bool BamReaderPrivate::Close(void) {
+
+    // clear BAM metadata
+    m_references.clear();
+    m_header.Clear();
+
+    // clear filename
+    m_filename.clear();
+
+    // close random access controller
+    m_randomAccessController.Close();
+
+    // if stream is open, attempt close
+    if ( IsOpen() ) {
+        try {
+            m_stream.Close();
+        } catch ( BamException& e ) {
+            const string streamError = e.what();
+            const string message = string("encountered error closing BAM file: \n\t") + streamError;
+            SetErrorString("BamReader::Close", message);
+            return false;
+        }
+    }
+
+    // return success
+    return true;
+}
+
+// creates an index file of requested type on current BAM file
+bool BamReaderPrivate::CreateIndex(const BamIndex::IndexType& type) {
+
+    // skip if BAM file not open
+    if ( !IsOpen() ) {
+        SetErrorString("BamReader::CreateIndex", "cannot create index on unopened BAM file");
+        return false;
+    }
+
+    // attempt to create index
+    if ( m_randomAccessController.CreateIndex(this, type) )
+        return true;
+    else {
+        const string bracError = m_randomAccessController.GetErrorString();
+        const string message = string("could not create index: \n\t") + bracError;
+        SetErrorString("BamReader::CreateIndex", message);
+        return false;
+    }
+}
+
+// return path & filename of current BAM file
+const string BamReaderPrivate::Filename(void) const {
+    return m_filename;
+}
+
+string BamReaderPrivate::GetErrorString(void) const {
+    return m_errorString;
+}
+
+// return header data as std::string
+string BamReaderPrivate::GetHeaderText(void) const {
+    return m_header.ToString();
+}
+
+// return header data as SamHeader object
+SamHeader BamReaderPrivate::GetSamHeader(void) const {
+    return m_header.ToSamHeader();
+}
+
+// get next alignment (with character data fully parsed)
+bool BamReaderPrivate::GetNextAlignment(BamAlignment& alignment) {
+
+    // if valid alignment found
+    if ( GetNextAlignmentCore(alignment) ) {
+
+        // store alignment's "source" filename
+        alignment.Filename = m_filename;
+
+        // return success/failure of parsing char data
+        if ( alignment.BuildCharData() )
+            return true;
+        else {
+            const string alError = alignment.GetErrorString();
+            const string message = string("could not populate alignment data: \n\t") + alError;
+            SetErrorString("BamReader::GetNextAlignment", message);
+            return false;
+        }
+    }
+
+    // no valid alignment found
+    return false;
+}
+
+// retrieves next available alignment core data (returns success/fail)
+// ** DOES NOT populate any character data fields (read name, bases, qualities, tag data, filename)
+//    these can be accessed, if necessary, from the supportData
+// useful for operations requiring ONLY positional or other alignment-related information
+bool BamReaderPrivate::GetNextAlignmentCore(BamAlignment& alignment) {
+
+    // skip if stream not opened
+    if ( !m_stream.IsOpen() )
+        return false;
+
+    try {
+
+        // skip if region is set but has no alignments
+        if ( m_randomAccessController.HasRegion() &&
+             !m_randomAccessController.RegionHasAlignments() )
+        {
+            return false;
+        }
+
+        // if can't read next alignment
+        if ( !LoadNextAlignment(alignment) )
+            return false;
+
+        // check alignment's region-overlap state
+        BamRandomAccessController::RegionState state = m_randomAccessController.AlignmentState(alignment);
+
+        // if alignment starts after region, no need to keep reading
+        if ( state == BamRandomAccessController::AfterRegion )
+            return false;
+
+        // read until overlap is found
+        while ( state != BamRandomAccessController::OverlapsRegion ) {
+
+            // if can't read next alignment
+            if ( !LoadNextAlignment(alignment) )
+                return false;
+
+            // check alignment's region-overlap state
+            state = m_randomAccessController.AlignmentState(alignment);
+
+            // if alignment starts after region, no need to keep reading
+            if ( state == BamRandomAccessController::AfterRegion )
+                return false;
+        }
+
+        // if we get here, we found the next 'valid' alignment
+        // (e.g. overlaps current region if one was set, simply the next alignment if not)
+        alignment.SupportData.HasCoreOnly = true;
+        return true;
+
+    } catch ( BamException& e ) {
+        const string streamError = e.what();
+        const string message = string("encountered error reading BAM alignment: \n\t") + streamError;
+        SetErrorString("BamReader::GetNextAlignmentCore", message);
+        return false;
+    }
+}
+
+int BamReaderPrivate::GetReferenceCount(void) const {
+    return m_references.size();
+}
+
+const RefVector& BamReaderPrivate::GetReferenceData(void) const {
+    return m_references;
+}
+
+// returns RefID for given RefName (returns -1 if not found)
+// (if a name occurs more than once, the first occurrence wins)
+int BamReaderPrivate::GetReferenceID(const string& refName) const {
+
+    // scan the reference list directly - no need to copy all names first
+    int index = 0;
+    RefVector::const_iterator refIter = m_references.begin();
+    RefVector::const_iterator refEnd  = m_references.end();
+    for ( ; refIter != refEnd; ++refIter, ++index ) {
+        if ( (*refIter).RefName == refName )
+            return index;
+    }
+
+    // not found
+    return -1;
+}
+
+bool BamReaderPrivate::HasIndex(void) const {
+    return m_randomAccessController.HasIndex();
+}
+
+bool BamReaderPrivate::IsOpen(void) const {
+    return m_stream.IsOpen();
+}
+
+// load BAM header data
+void BamReaderPrivate::LoadHeaderData(void) {
+    m_header.Load(&m_stream);
+}
+
+// populates BamAlignment with alignment data under file pointer, returns success/fail
+// (fills core fields, SupportData and CigarData; returns false at end of data
+//  and for records whose declared sizes do not fit together)
+bool BamReaderPrivate::LoadNextAlignment(BamAlignment& alignment) {
+
+    // read in the 'block length' value; a short read means there is no
+    // further record (end of data, or a truncated file)
+    char buffer[sizeof(uint32_t)];
+    if ( m_stream.Read(buffer, sizeof(uint32_t)) != sizeof(uint32_t) )
+        return false;
+    alignment.SupportData.BlockLength = BamTools::UnpackUnsignedInt(buffer);
+    if ( m_isBigEndian ) BamTools::SwapEndian_32(alignment.SupportData.BlockLength);
+
+    // a block can never be shorter than its fixed-size core; rejecting such
+    // blocks (zero included) keeps 'dataLength' below from wrapping around
+    if ( alignment.SupportData.BlockLength < Constants::BAM_CORE_SIZE )
+        return false;
+
+    // read in core alignment data, make sure the right size of data was read
+    char x[Constants::BAM_CORE_SIZE];
+    if ( m_stream.Read(x, Constants::BAM_CORE_SIZE) != Constants::BAM_CORE_SIZE )
+        return false;
+
+    // swap core endian-ness if necessary
+    if ( m_isBigEndian ) {
+        for ( unsigned int i = 0; i < Constants::BAM_CORE_SIZE; i+=sizeof(uint32_t) )
+            BamTools::SwapEndian_32p(&x[i]);
+    }
+
+    // set BamAlignment 'core' and 'support' data
+    alignment.RefID    = BamTools::UnpackSignedInt(&x[0]);
+    alignment.Position = BamTools::UnpackSignedInt(&x[4]);
+
+    // bytes 8-11 pack: bin (high 16 bits) | map quality (8 bits) | name length (low 8 bits)
+    unsigned int tempValue = BamTools::UnpackUnsignedInt(&x[8]);
+    alignment.Bin        = tempValue >> 16;
+    alignment.MapQuality = tempValue >> 8 & 0xff;
+    alignment.SupportData.QueryNameLength = tempValue & 0xff;
+
+    // bytes 12-15 pack: flag (high 16 bits) | number of CIGAR ops (low 16 bits)
+    tempValue = BamTools::UnpackUnsignedInt(&x[12]);
+    alignment.AlignmentFlag = tempValue >> 16;
+    alignment.SupportData.NumCigarOperations = tempValue & 0xffff;
+
+    alignment.SupportData.QuerySequenceLength = BamTools::UnpackUnsignedInt(&x[16]);
+    alignment.MateRefID    = BamTools::UnpackSignedInt(&x[20]);
+    alignment.MatePosition = BamTools::UnpackSignedInt(&x[24]);
+    alignment.InsertSize   = BamTools::UnpackSignedInt(&x[28]);
+
+    // set BamAlignment length
+    alignment.Length = alignment.SupportData.QuerySequenceLength;
+
+    // read in character data - make sure proper data size was read
+    const unsigned int dataLength = alignment.SupportData.BlockLength - Constants::BAM_CORE_SIZE;
+    RaiiBuffer allCharData(dataLength);
+    if ( m_stream.Read(allCharData.Buffer, dataLength) != dataLength )
+        return false;
+
+    // the read name and the packed CIGAR ops must fit inside the data just read
+    const unsigned int cigarDataOffset = alignment.SupportData.QueryNameLength;
+    const unsigned int cigarDataLength = alignment.SupportData.NumCigarOperations * sizeof(uint32_t);
+    if ( cigarDataOffset + cigarDataLength > dataLength )
+        return false;
+
+    // store 'allCharData' in supportData structure
+    alignment.SupportData.AllCharData.assign((const char*)allCharData.Buffer, dataLength);
+
+    // save CIGAR ops
+    // need to calculate this here so that BamAlignment::GetEndPosition() performs correctly,
+    // even when GetNextAlignmentCore() is called
+    char* cigarData = allCharData.Buffer + cigarDataOffset;
+    CigarOp op;
+    alignment.CigarData.clear();
+    alignment.CigarData.reserve(alignment.SupportData.NumCigarOperations);
+    for ( unsigned int i = 0; i < alignment.SupportData.NumCigarOperations; ++i ) {
+
+        // go through UnpackUnsignedInt() rather than dereferencing a cast uint32_t*:
+        // the offset (read name length) need not be a multiple of 4
+        uint32_t packedOp = BamTools::UnpackUnsignedInt(cigarData + i*sizeof(uint32_t));
+
+        // swap endian-ness if necessary
+        if ( m_isBigEndian ) BamTools::SwapEndian_32(packedOp);
+
+        // build CigarOp structure
+        op.Length = (packedOp >> Constants::BAM_CIGAR_SHIFT);
+        op.Type   = Constants::BAM_CIGAR_LOOKUP[ (packedOp & Constants::BAM_CIGAR_MASK) ];
+
+        // save CigarOp
+        alignment.CigarData.push_back(op);
+    }
+
+    // return success
+    return true;
+}
+
+// loads reference data from BAM file
+bool BamReaderPrivate::LoadReferenceData(void) {
+
+    // get number of reference sequences
+    char buffer[sizeof(uint32_t)];
+    m_stream.Read(buffer, sizeof(uint32_t));
+    uint32_t numberRefSeqs = BamTools::UnpackUnsignedInt(buffer);
+    if ( m_isBigEndian ) BamTools::SwapEndian_32(numberRefSeqs);
+    m_references.reserve((int)numberRefSeqs);
+
+    // iterate over all references in header
+    for ( unsigned int i = 0; i != numberRefSeqs; ++i ) {
+
+        // get length of reference name
+        m_stream.Read(buffer, sizeof(uint32_t));
+        uint32_t refNameLength = BamTools::UnpackUnsignedInt(buffer);
+        if ( m_isBigEndian ) BamTools::SwapEndian_32(refNameLength);
+        RaiiBuffer refName(refNameLength);
+
+        // get reference name and reference sequence length
+        m_stream.Read(refName.Buffer, refNameLength);
+        m_stream.Read(buffer, sizeof(int32_t));
+        int32_t refLength = BamTools::UnpackSignedInt(buffer);
+        if ( m_isBigEndian ) BamTools::SwapEndian_32(refLength);
+
+        // name ends at the first NUL, but never beyond the bytes actually read
+        // (a name stored without its terminator must not run past the buffer)
+        size_t nameLength = 0;
+        while ( nameLength < refNameLength && refName.Buffer[nameLength] != '\0' )
+            ++nameLength;
+
+        // store data for reference
+        RefData aReference;
+        aReference.RefName.assign(refName.Buffer, nameLength);
+        aReference.RefLength = refLength;
+        m_references.push_back(aReference);
+    }
+
+    // return success
+    return true;
+}
+
+bool BamReaderPrivate::LocateIndex(const BamIndex::IndexType& preferredType) {
+
+    if ( m_randomAccessController.LocateIndex(this, preferredType) )
+        return true;
+    else {
+        const string bracError = m_randomAccessController.GetErrorString();
+        const string message = string("could not locate index: \n\t") + bracError;
+        SetErrorString("BamReader::LocateIndex", message);
+        return false;
+    }
+}
+
+// opens BAM file (and index)
+bool BamReaderPrivate::Open(const string& filename) {
+
+    try {
+
+        // make sure we're starting with fresh state
+        Close();
+
+        // open BgzfStream
+        m_stream.Open(filename, IBamIODevice::ReadOnly);
+
+        // load BAM metadata
+        LoadHeaderData();
+        LoadReferenceData();
+
+        // store filename & offset of first alignment
+        m_filename = filename;
+        m_alignmentsBeginOffset = m_stream.Tell();
+
+        // return success
+        return true;
+
+    } catch ( BamException& e ) {
+        const string error = e.what();
+        const string message = string("could not open file: ") + filename +
+                               "\n\t" + error;
+        SetErrorString("BamReader::Open", message);
+        return false;
+    }
+}
+
+bool BamReaderPrivate::OpenIndex(const std::string& indexFilename) {
+
+    if ( m_randomAccessController.OpenIndex(indexFilename, this) )
+        return true;
+    else {
+        const string bracError = m_randomAccessController.GetErrorString();
+        const string message = string("could not open index: \n\t") + bracError;
+        SetErrorString("BamReader::OpenIndex", message);
+        return false;
+    }
+}
+
+// returns BAM file pointer to beginning of alignment data
+bool BamReaderPrivate::Rewind(void) {
+
+    // reset region
+    m_randomAccessController.ClearRegion();
+
+    // return status of seeking back to first alignment
+    if ( Seek(m_alignmentsBeginOffset) )
+        return true;
+    else {
+        const string currentError = m_errorString;
+        const string message = string("could not rewind: \n\t") + currentError;
+        SetErrorString("BamReader::Rewind", message);
+        return false;
+    }
+}
+
+bool BamReaderPrivate::Seek(const int64_t& position) {
+
+    // skip if BAM file not open
+    if ( !IsOpen() ) {
+        SetErrorString("BamReader::Seek", "cannot seek on unopened BAM file");
+        return false;
+    }
+
+    try {
+        m_stream.Seek(position);
+        return true;
+    }
+    catch ( BamException& e ) {
+        const string streamError = e.what();
+        const string message = string("could not seek in BAM file: \n\t") + streamError;
+        SetErrorString("BamReader::Seek", message);
+        return false;
+    }
+}
+
+void BamReaderPrivate::SetErrorString(const string& where, const string& what) {
+    static const string SEPARATOR = ": ";
+    m_errorString = where + SEPARATOR + what;
+}
+
+void BamReaderPrivate::SetIndex(BamIndex* index) {
+    m_randomAccessController.SetIndex(index);
+}
+
+// sets current region & attempts to jump to it
+// returns success/failure
+bool BamReaderPrivate::SetRegion(const BamRegion& region) {
+
+    if ( m_randomAccessController.SetRegion(region, m_references.size()) )
+        return true;
+    else {
+        const string bracError = m_randomAccessController.GetErrorString();
+        const string message = string("could not set region: \n\t") + bracError;
+        SetErrorString("BamReader::SetRegion", message);
+        return false;
+    }
+}
+
+int64_t BamReaderPrivate::Tell(void) const {
+    return m_stream.Tell();
+}
diff --git a/src/api/internal/bam/BamReader_p.h b/src/api/internal/bam/BamReader_p.h
new file mode 100644
index 0000000..e8db646
--- /dev/null
+++ b/src/api/internal/bam/BamReader_p.h
@@ -0,0 +1,118 @@
+// ***************************************************************************
+// BamReader_p.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for reading BAM files
+// ***************************************************************************
+
+#ifndef BAMREADER_P_H
+#define BAMREADER_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include "api/BamAlignment.h"
+#include "api/BamIndex.h"
+#include "api/BamReader.h"
+#include "api/SamHeader.h"
+#include "api/internal/bam/BamHeader_p.h"
+#include "api/internal/bam/BamRandomAccessController_p.h"
+#include "api/internal/io/BgzfStream_p.h"
+#include <string>
+
+namespace BamTools {
+namespace Internal {
+
+// Private implementation behind BamReader: one BGZF stream plus the header,
+// reference list and random-access state that belong to it.
+class BamReaderPrivate {
+
+    // ctor & dtor
+    public:
+        BamReaderPrivate(BamReader* parent);
+        ~BamReaderPrivate(void);
+
+    // BamReader interface
+    public:
+
+        // file operations
+        bool Close(void);
+        const std::string Filename(void) const;
+        bool IsOpen(void) const;
+        bool Open(const std::string& filename);
+        bool Rewind(void);
+        bool SetRegion(const BamRegion& region);
+
+        // access alignment data
+        bool GetNextAlignment(BamAlignment& alignment);
+        bool GetNextAlignmentCore(BamAlignment& alignment);
+
+        // access auxiliary data
+        std::string GetHeaderText(void) const;
+        SamHeader GetSamHeader(void) const;
+        int GetReferenceCount(void) const;
+        const RefVector& GetReferenceData(void) const;
+        int GetReferenceID(const std::string& refName) const;
+
+        // index operations
+        bool CreateIndex(const BamIndex::IndexType& type);
+        bool HasIndex(void) const;
+        bool LocateIndex(const BamIndex::IndexType& preferredType);
+        bool OpenIndex(const std::string& indexFilename);
+        void SetIndex(BamIndex* index);
+
+        // error handling
+        std::string GetErrorString(void) const;
+        void SetErrorString(const std::string& where, const std::string& what);
+
+    // internal methods, but available as a BamReaderPrivate 'interface'
+    //
+    // these methods should only be used by BamTools::Internal classes
+    // (currently only used by the BamIndex subclasses)
+    public:
+        // retrieves header text from BAM file
+        void LoadHeaderData(void);
+        // retrieves BAM alignment under file pointer
+        // (does no overlap checking or character data parsing)
+        bool LoadNextAlignment(BamAlignment& alignment);
+        // builds reference data structure from BAM file
+        bool LoadReferenceData(void);
+        // seek reader to file position
+        bool Seek(const int64_t& position);
+        // return reader's file position
+        int64_t Tell(void) const;
+
+    // data members
+    public:
+
+        // general BAM file data
+        int64_t     m_alignmentsBeginOffset;
+        std::string m_filename;
+        RefVector   m_references;
+
+        // system data
+        bool m_isBigEndian;
+
+        // parent BamReader
+        BamReader* m_parent;
+
+        // BamReaderPrivate components
+        BamHeader m_header;
+        BamRandomAccessController m_randomAccessController;
+        BgzfStream m_stream;
+
+        // error handling
+        std::string m_errorString;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMREADER_P_H
diff --git a/src/api/internal/bam/BamWriter_p.cpp b/src/api/internal/bam/BamWriter_p.cpp
new file mode 100644
index 0000000..ba4989f
--- /dev/null
+++ b/src/api/internal/bam/BamWriter_p.cpp
@@ -0,0 +1,462 @@
+// ***************************************************************************
+// BamWriter_p.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for producing BAM files
+// ***************************************************************************
+
+#include "api/BamAlignment.h"
+#include "api/BamConstants.h"
+#include "api/IBamIODevice.h"
+#include "api/internal/bam/BamWriter_p.h"
+#include "api/internal/utils/BamException_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+// NOTE(review): the two header names below were lost in extraction and are
+// presumed - confirm against the repository
+#include <cstdlib>
+#include <cstring>
+using namespace std;
+
+// ctor
+BamWriterPrivate::BamWriterPrivate(void)
+    : m_isBigEndian( BamTools::SystemIsBigEndian() )
+{ }
+
+// dtor
+BamWriterPrivate::~BamWriterPrivate(void) { + Close(); +} + +// calculates minimum bin for a BAM alignment interval [begin, end) +uint32_t BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const { + --end; + if ( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14); + if ( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17); + if ( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20); + if ( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23); + if ( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26); + return 0; +} + +// closes the alignment archive +void BamWriterPrivate::Close(void) { + + // skip if file not open + if ( !IsOpen() ) return; + + // close output stream + try { + m_stream.Close(); + } catch ( BamException& e ) { + m_errorString = e.what(); + } +} + +// creates a cigar string from the supplied alignment +void BamWriterPrivate::CreatePackedCigar(const vector& cigarOperations, string& packedCigar) { + + // initialize + const size_t numCigarOperations = cigarOperations.size(); + packedCigar.resize(numCigarOperations * Constants::BAM_SIZEOF_INT); + + // pack the cigar data into the string + unsigned int* pPackedCigar = (unsigned int*)packedCigar.data(); + + // iterate over cigar operations + vector::const_iterator coIter = cigarOperations.begin(); + vector::const_iterator coEnd = cigarOperations.end(); + for ( ; coIter != coEnd; ++coIter ) { + + // store op in packedCigar + uint8_t cigarOp; + switch ( coIter->Type ) { + case (Constants::BAM_CIGAR_MATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_MATCH; break; + case (Constants::BAM_CIGAR_INS_CHAR) : cigarOp = Constants::BAM_CIGAR_INS; break; + case (Constants::BAM_CIGAR_DEL_CHAR) : cigarOp = Constants::BAM_CIGAR_DEL; break; + case (Constants::BAM_CIGAR_REFSKIP_CHAR) : cigarOp = Constants::BAM_CIGAR_REFSKIP; break; + case (Constants::BAM_CIGAR_SOFTCLIP_CHAR) : cigarOp = Constants::BAM_CIGAR_SOFTCLIP; break; + case (Constants::BAM_CIGAR_HARDCLIP_CHAR) : cigarOp = 
Constants::BAM_CIGAR_HARDCLIP; break; + case (Constants::BAM_CIGAR_PAD_CHAR) : cigarOp = Constants::BAM_CIGAR_PAD; break; + case (Constants::BAM_CIGAR_SEQMATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_SEQMATCH; break; + case (Constants::BAM_CIGAR_MISMATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_MISMATCH; break; + default: + const string message = string("invalid CIGAR operation type") + coIter->Type; + throw BamException("BamWriter::CreatePackedCigar", message); + } + + *pPackedCigar = coIter->Length << Constants::BAM_CIGAR_SHIFT | cigarOp; + pPackedCigar++; + } +} + +// encodes the supplied query sequence into 4-bit notation +void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) { + + // prepare the encoded query string + const size_t queryLength = query.size(); + const size_t encodedQueryLength = static_cast((queryLength+1)/2); + encodedQuery.resize(encodedQueryLength); + char* pEncodedQuery = (char*)encodedQuery.data(); + const char* pQuery = (const char*)query.data(); + + // walk through original query sequence, encoding its bases + unsigned char nucleotideCode; + bool useHighWord = true; + while ( *pQuery ) { + switch ( *pQuery ) { + case (Constants::BAM_DNA_EQUAL) : nucleotideCode = Constants::BAM_BASECODE_EQUAL; break; + case (Constants::BAM_DNA_A) : nucleotideCode = Constants::BAM_BASECODE_A; break; + case (Constants::BAM_DNA_C) : nucleotideCode = Constants::BAM_BASECODE_C; break; + case (Constants::BAM_DNA_M) : nucleotideCode = Constants::BAM_BASECODE_M; break; + case (Constants::BAM_DNA_G) : nucleotideCode = Constants::BAM_BASECODE_G; break; + case (Constants::BAM_DNA_R) : nucleotideCode = Constants::BAM_BASECODE_R; break; + case (Constants::BAM_DNA_S) : nucleotideCode = Constants::BAM_BASECODE_S; break; + case (Constants::BAM_DNA_V) : nucleotideCode = Constants::BAM_BASECODE_V; break; + case (Constants::BAM_DNA_T) : nucleotideCode = Constants::BAM_BASECODE_T; break; + case (Constants::BAM_DNA_W) : nucleotideCode = 
Constants::BAM_BASECODE_W; break; + case (Constants::BAM_DNA_Y) : nucleotideCode = Constants::BAM_BASECODE_Y; break; + case (Constants::BAM_DNA_H) : nucleotideCode = Constants::BAM_BASECODE_H; break; + case (Constants::BAM_DNA_K) : nucleotideCode = Constants::BAM_BASECODE_K; break; + case (Constants::BAM_DNA_D) : nucleotideCode = Constants::BAM_BASECODE_D; break; + case (Constants::BAM_DNA_B) : nucleotideCode = Constants::BAM_BASECODE_B; break; + case (Constants::BAM_DNA_N) : nucleotideCode = Constants::BAM_BASECODE_N; break; + default: + const string message = string("invalid base: ") + *pQuery; + throw BamException("BamWriter::EncodeQuerySequence", message); + } + + // pack the nucleotide code + if ( useHighWord ) { + *pEncodedQuery = nucleotideCode << 4; + useHighWord = false; + } else { + *pEncodedQuery |= nucleotideCode; + ++pEncodedQuery; + useHighWord = true; + } + + // increment the query position + ++pQuery; + } +} + +// returns a description of the last error that occurred +std::string BamWriterPrivate::GetErrorString(void) const { + return m_errorString; +} + +// returns whether BAM file is open for writing or not +bool BamWriterPrivate::IsOpen(void) const { + return m_stream.IsOpen(); +} + +// opens the alignment archive +bool BamWriterPrivate::Open(const string& filename, + const string& samHeaderText, + const RefVector& referenceSequences) +{ + try { + + // open the BGZF file for writing + m_stream.Open(filename, IBamIODevice::WriteOnly); + + // write BAM file 'metadata' components + WriteMagicNumber(); + WriteSamHeaderText(samHeaderText); + WriteReferences(referenceSequences); + + // return success + return true; + + } catch ( BamException& e ) { + m_errorString = e.what(); + return false; + } +} + +// saves the alignment to the alignment archive +bool BamWriterPrivate::SaveAlignment(const BamAlignment& al) { + + try { + + // if BamAlignment contains only the core data and a raw char data buffer + // (as a result of BamReader::GetNextAlignmentCore()) 
+ if ( al.SupportData.HasCoreOnly ) + WriteCoreAlignment(al); + + // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc + // (resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code) + else WriteAlignment(al); + + // if we get here, everything OK + return true; + + } catch ( BamException& e ) { + m_errorString = e.what(); + return false; + } +} + +void BamWriterPrivate::SetWriteCompressed(bool ok) { + // modifying compression is not allowed if BAM file is open + if ( !IsOpen() ) + m_stream.SetWriteCompressed(ok); +} + +void BamWriterPrivate::WriteAlignment(const BamAlignment& al) { + + // calculate char lengths + const unsigned int nameLength = al.Name.size() + 1; + const unsigned int numCigarOperations = al.CigarData.size(); + const unsigned int queryLength = al.QueryBases.size(); + const unsigned int tagDataLength = al.TagData.size(); + + // no way to tell if alignment's bin is already defined (there is no default, invalid value) + // so we'll go ahead calculate its bin ID before storing + const uint32_t alignmentBin = CalculateMinimumBin(al.Position, al.GetEndPosition()); + + // create our packed cigar string + string packedCigar; + CreatePackedCigar(al.CigarData, packedCigar); + const unsigned int packedCigarLength = packedCigar.size(); + + // encode the query + string encodedQuery; + EncodeQuerySequence(al.QueryBases, encodedQuery); + const unsigned int encodedQueryLength = encodedQuery.size(); + + // write the block size + const unsigned int dataBlockSize = nameLength + + packedCigarLength + + encodedQueryLength + + queryLength + + tagDataLength; + unsigned int blockSize = Constants::BAM_CORE_SIZE + dataBlockSize; + if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize); + m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT); + + // assign the BAM core data + uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE]; + buffer[0] = al.RefID; + buffer[1] = al.Position; + buffer[2] = 
(alignmentBin << 16) | (al.MapQuality << 8) | nameLength; + buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations; + buffer[4] = queryLength; + buffer[5] = al.MateRefID; + buffer[6] = al.MatePosition; + buffer[7] = al.InsertSize; + + // swap BAM core endian-ness, if necessary + if ( m_isBigEndian ) { + for ( int i = 0; i < 8; ++i ) + BamTools::SwapEndian_32(buffer[i]); + } + + // write the BAM core + m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE); + + // write the query name + m_stream.Write(al.Name.c_str(), nameLength); + + // write the packed cigar + if ( m_isBigEndian ) { + char* cigarData = new char[packedCigarLength](); + memcpy(cigarData, packedCigar.data(), packedCigarLength); + if ( m_isBigEndian ) { + for ( size_t i = 0; i < packedCigarLength; ++i ) + BamTools::SwapEndian_32p(&cigarData[i]); + } + m_stream.Write(cigarData, packedCigarLength); + delete[] cigarData; // TODO: cleanup on Write exception thrown? + } + else + m_stream.Write(packedCigar.data(), packedCigarLength); + + // write the encoded query sequence + m_stream.Write(encodedQuery.data(), encodedQueryLength); + + // write the base qualities + char* pBaseQualities = (char*)al.Qualities.data(); + for ( size_t i = 0; i < queryLength; ++i ) + pBaseQualities[i] -= 33; // FASTQ conversion + m_stream.Write(pBaseQualities, queryLength); + + // write the read group tag + if ( m_isBigEndian ) { + + char* tagData = new char[tagDataLength](); + memcpy(tagData, al.TagData.data(), tagDataLength); + + size_t i = 0; + while ( i < tagDataLength ) { + + i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.) 
+ const char type = tagData[i]; // get tag type at position i + ++i; + + switch ( type ) { + + case(Constants::BAM_TAG_TYPE_ASCII) : + case(Constants::BAM_TAG_TYPE_INT8) : + case(Constants::BAM_TAG_TYPE_UINT8) : + ++i; + break; + + case(Constants::BAM_TAG_TYPE_INT16) : + case(Constants::BAM_TAG_TYPE_UINT16) : + BamTools::SwapEndian_16p(&tagData[i]); + i += sizeof(uint16_t); + break; + + case(Constants::BAM_TAG_TYPE_FLOAT) : + case(Constants::BAM_TAG_TYPE_INT32) : + case(Constants::BAM_TAG_TYPE_UINT32) : + BamTools::SwapEndian_32p(&tagData[i]); + i += sizeof(uint32_t); + break; + + case(Constants::BAM_TAG_TYPE_HEX) : + case(Constants::BAM_TAG_TYPE_STRING) : + // no endian swapping necessary for hex-string/string data + while ( tagData[i] ) + ++i; + // increment one more for null terminator + ++i; + break; + + case(Constants::BAM_TAG_TYPE_ARRAY) : + + { + // read array type + const char arrayType = tagData[i]; + ++i; + + // swap endian-ness of number of elements in place, then retrieve for loop + BamTools::SwapEndian_32p(&tagData[i]); + int32_t numElements; + memcpy(&numElements, &tagData[i], sizeof(uint32_t)); + i += sizeof(uint32_t); + + // swap endian-ness of array elements + for ( int j = 0; j < numElements; ++j ) { + switch (arrayType) { + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : + // no endian-swapping necessary + ++i; + break; + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : + BamTools::SwapEndian_16p(&tagData[i]); + i += sizeof(uint16_t); + break; + case (Constants::BAM_TAG_TYPE_FLOAT) : + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : + BamTools::SwapEndian_32p(&tagData[i]); + i += sizeof(uint32_t); + break; + default: + delete[] tagData; + const string message = string("invalid binary array type: ") + arrayType; + throw BamException("BamWriter::SaveAlignment", message); + } + } + + break; + } + + default : + delete[] tagData; + const string message = 
string("invalid tag type: ") + type; + throw BamException("BamWriter::SaveAlignment", message); + } + } + + m_stream.Write(tagData, tagDataLength); + delete[] tagData; // TODO: cleanup on Write exception thrown? + } + else + m_stream.Write(al.TagData.data(), tagDataLength); +} + +void BamWriterPrivate::WriteCoreAlignment(const BamAlignment& al) { + + // write the block size + unsigned int blockSize = al.SupportData.BlockLength; + if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize); + m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT); + + // re-calculate bin (in case BamAlignment's position has been previously modified) + const uint32_t alignmentBin = CalculateMinimumBin(al.Position, al.GetEndPosition()); + + // assign the BAM core data + uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE]; + buffer[0] = al.RefID; + buffer[1] = al.Position; + buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength; + buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations; + buffer[4] = al.SupportData.QuerySequenceLength; + buffer[5] = al.MateRefID; + buffer[6] = al.MatePosition; + buffer[7] = al.InsertSize; + + // swap BAM core endian-ness, if necessary + if ( m_isBigEndian ) { + for ( int i = 0; i < 8; ++i ) + BamTools::SwapEndian_32(buffer[i]); + } + + // write the BAM core + m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE); + + // write the raw char data + m_stream.Write((char*)al.SupportData.AllCharData.data(), + al.SupportData.BlockLength-Constants::BAM_CORE_SIZE); +} + +void BamWriterPrivate::WriteMagicNumber(void) { + // write BAM file 'magic number' + m_stream.Write(Constants::BAM_HEADER_MAGIC, Constants::BAM_HEADER_MAGIC_LENGTH); +} + +void BamWriterPrivate::WriteReferences(const BamTools::RefVector& referenceSequences) { + + // write the number of reference sequences + uint32_t numReferenceSequences = referenceSequences.size(); + if ( m_isBigEndian ) BamTools::SwapEndian_32(numReferenceSequences); + 
m_stream.Write((char*)&numReferenceSequences, Constants::BAM_SIZEOF_INT); + + // foreach reference sequence + RefVector::const_iterator rsIter = referenceSequences.begin(); + RefVector::const_iterator rsEnd = referenceSequences.end(); + for ( ; rsIter != rsEnd; ++rsIter ) { + + // write the reference sequence name length + uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1; + if ( m_isBigEndian ) BamTools::SwapEndian_32(referenceSequenceNameLen); + m_stream.Write((char*)&referenceSequenceNameLen, Constants::BAM_SIZEOF_INT); + + // write the reference sequence name + m_stream.Write(rsIter->RefName.c_str(), referenceSequenceNameLen); + + // write the reference sequence length + int32_t referenceLength = rsIter->RefLength; + if ( m_isBigEndian ) BamTools::SwapEndian_32(referenceLength); + m_stream.Write((char*)&referenceLength, Constants::BAM_SIZEOF_INT); + } +} + +void BamWriterPrivate::WriteSamHeaderText(const std::string& samHeaderText) { + + // write the SAM header text length + uint32_t samHeaderLen = samHeaderText.size(); + if ( m_isBigEndian ) BamTools::SwapEndian_32(samHeaderLen); + m_stream.Write((char*)&samHeaderLen, Constants::BAM_SIZEOF_INT); + + // write the SAM header text + if ( samHeaderLen > 0 ) + m_stream.Write(samHeaderText.data(), samHeaderLen); +} diff --git a/src/api/internal/bam/BamWriter_p.h b/src/api/internal/bam/BamWriter_p.h new file mode 100644 index 0000000..d5bbe8d --- /dev/null +++ b/src/api/internal/bam/BamWriter_p.h @@ -0,0 +1,73 @@ +// *************************************************************************** +// BamWriter_p.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 25 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides the basic functionality for producing BAM files +// 
*************************************************************************** + +#ifndef BAMWRITER_P_H +#define BAMWRITER_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to +// version without notice, or even be removed. +// +// We mean it. + +#include "api/BamAux.h" +#include "api/internal/io/BgzfStream_p.h" +#include +#include + +namespace BamTools { + +class BamAlignment; + +namespace Internal { + +class BamWriterPrivate { + + // ctor & dtor + public: + BamWriterPrivate(void); + ~BamWriterPrivate(void); + + // interface methods + public: + void Close(void); + std::string GetErrorString(void) const; + bool IsOpen(void) const; + bool Open(const std::string& filename, + const std::string& samHeaderText, + const BamTools::RefVector& referenceSequences); + bool SaveAlignment(const BamAlignment& al); + void SetWriteCompressed(bool ok); + + // 'internal' methods + public: + uint32_t CalculateMinimumBin(const int begin, int end) const; + void CreatePackedCigar(const std::vector& cigarOperations, std::string& packedCigar); + void EncodeQuerySequence(const std::string& query, std::string& encodedQuery); + void WriteAlignment(const BamAlignment& al); + void WriteCoreAlignment(const BamAlignment& al); + void WriteMagicNumber(void); + void WriteReferences(const BamTools::RefVector& referenceSequences); + void WriteSamHeaderText(const std::string& samHeaderText); + + // data members + private: + BgzfStream m_stream; + bool m_isBigEndian; + std::string m_errorString; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMWRITER_P_H diff --git a/src/api/internal/bam/CMakeLists.txt b/src/api/internal/bam/CMakeLists.txt new file mode 100644 index 0000000..64d8534 --- /dev/null +++ b/src/api/internal/bam/CMakeLists.txt @@ -0,0 +1,19 @@ +# ========================== +# BamTools CMakeLists.txt +# (c) 2011 Derek 
Barnett +# +# src/api/internal/bam +# ========================== + +set ( InternalBamDir "${InternalDir}/bam" ) + +set ( InternalBamSources + ${InternalBamDir}/BamHeader_p.cpp + ${InternalBamDir}/BamMultiReader_p.cpp + ${InternalBamDir}/BamRandomAccessController_p.cpp + ${InternalBamDir}/BamReader_p.cpp + ${InternalBamDir}/BamWriter_p.cpp + + PARENT_SCOPE # <-- leave this last +) + diff --git a/src/api/internal/index/BamIndexFactory_p.cpp b/src/api/internal/index/BamIndexFactory_p.cpp new file mode 100644 index 0000000..ab7751f --- /dev/null +++ b/src/api/internal/index/BamIndexFactory_p.cpp @@ -0,0 +1,107 @@ +// *************************************************************************** +// BamIndexFactory_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides interface for generating BamIndex implementations +// *************************************************************************** + +#include "api/internal/index/BamIndexFactory_p.h" +#include "api/internal/index/BamStandardIndex_p.h" +#include "api/internal/index/BamToolsIndex_p.h" +using namespace BamTools; +using namespace BamTools::Internal; +using namespace std; + +// generates index filename from BAM filename (depending on requested type) +// if type is unknown, returns empty string +const string BamIndexFactory::CreateIndexFilename(const string& bamFilename, + const BamIndex::IndexType& type) +{ + switch ( type ) { + case ( BamIndex::STANDARD ) : return ( bamFilename + BamStandardIndex::Extension() ); + case ( BamIndex::BAMTOOLS ) : return ( bamFilename + BamToolsIndex::Extension() ); + default : + return string(); + } +} + +// creates a new BamIndex object, depending on extension of @indexFilename +BamIndex* BamIndexFactory::CreateIndexFromFilename(const string& 
indexFilename, BamReaderPrivate* reader) { + + // get file extension from index filename, including dot (".EXT") + // if can't get file extension, return null index + const string extension = FileExtension(indexFilename); + if ( extension.empty() ) + return 0; + + // create index based on extension + if ( extension == BamStandardIndex::Extension() ) return new BamStandardIndex(reader); + else if ( extension == BamToolsIndex::Extension() ) return new BamToolsIndex(reader); + else + return 0; +} + +// creates a new BamIndex, object of requested @type +BamIndex* BamIndexFactory::CreateIndexOfType(const BamIndex::IndexType& type, + BamReaderPrivate* reader) +{ + switch ( type ) { + case ( BamIndex::STANDARD ) : return new BamStandardIndex(reader); + case ( BamIndex::BAMTOOLS ) : return new BamToolsIndex(reader); + default : + return 0; + } +} + +// retrieves file extension (including '.') +const string BamIndexFactory::FileExtension(const string& filename) { + + // if filename cannot contain valid path + extension, return empty string + if ( filename.empty() || filename.length() <= 4 ) + return string(); + + // look for last dot in filename + const size_t lastDotPosition = filename.find_last_of('.'); + + // if none found, return empty string + if ( lastDotPosition == string::npos ) + return string(); + + // return substring from last dot position + return filename.substr(lastDotPosition); +} + +// returns name of existing index file that corresponds to @bamFilename +// will defer to @preferredType if possible, if not will attempt to load any supported type +// returns empty string if not found +const string BamIndexFactory::FindIndexFilename(const string& bamFilename, + const BamIndex::IndexType& preferredType) +{ + // skip if BAM filename provided is empty + if ( bamFilename.empty() ) + return string(); + + // try to find index of preferred type first + // return index filename if found + string indexFilename = CreateIndexFilename(bamFilename, preferredType); + if ( 
!indexFilename.empty() ) + return indexFilename; + + // couldn't find preferred type, try the other supported types + // return index filename if found + if ( preferredType != BamIndex::STANDARD ) { + indexFilename = CreateIndexFilename(bamFilename, BamIndex::STANDARD); + if ( !indexFilename.empty() ) + return indexFilename; + } + if ( preferredType != BamIndex::BAMTOOLS ) { + indexFilename = CreateIndexFilename(bamFilename, BamIndex::BAMTOOLS); + if ( !indexFilename.empty() ) + return indexFilename; + } + + // otherwise couldn't find any index matching this filename + return string(); +} diff --git a/src/api/internal/index/BamIndexFactory_p.h b/src/api/internal/index/BamIndexFactory_p.h new file mode 100644 index 0000000..4e4f1cf --- /dev/null +++ b/src/api/internal/index/BamIndexFactory_p.h @@ -0,0 +1,49 @@ +// *************************************************************************** +// BamIndexFactory_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides interface for generating BamIndex implementations +// *************************************************************************** + +#ifndef BAMINDEX_FACTORY_P_H +#define BAMINDEX_FACTORY_P_H + +#include "api/BamIndex.h" +#include + +namespace BamTools { +namespace Internal { + +class BamIndexFactory { + + // static interface methods + public: + // creates a new BamIndex object, depending on extension of @indexFilename + static BamIndex* CreateIndexFromFilename(const std::string& indexFilename, + BamReaderPrivate* reader); + // creates a new BamIndex object, of requested @type + static BamIndex* CreateIndexOfType(const BamIndex::IndexType& type, + BamReaderPrivate* reader); + // returns name of existing index file that corresponds to @bamFilename + // will defer to 
@preferredType if possible + // if @preferredType not found, will attempt to load any supported index type + // returns empty string if no index file (of any type) is found + static const std::string FindIndexFilename(const std::string& bamFilename, + const BamIndex::IndexType& preferredType); + + // internal methods + public: + // generates index filename from BAM filename (depending on requested type) + // if type is unknown, returns empty string + static const std::string CreateIndexFilename(const std::string& bamFilename, + const BamIndex::IndexType& type); + // retrieves file extension (including '.') + static const std::string FileExtension(const std::string& filename); +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMINDEX_FACTORY_P_H diff --git a/src/api/internal/index/BamStandardIndex_p.cpp b/src/api/internal/index/BamStandardIndex_p.cpp new file mode 100644 index 0000000..dcdec8d --- /dev/null +++ b/src/api/internal/index/BamStandardIndex_p.cpp @@ -0,0 +1,965 @@ +// *************************************************************************** +// BamStandardIndex.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides index operations for the standardized BAM index format (".bai") +// *************************************************************************** + +#include "api/BamAlignment.h" +#include "api/internal/bam/BamReader_p.h" +#include "api/internal/index/BamStandardIndex_p.h" +#include "api/internal/io/BamDeviceFactory_p.h" +#include "api/internal/utils/BamException_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include +#include +#include +#include +#include +using namespace std; + +// ----------------------------------- +// static BamStandardIndex constants +// 
----------------------------------- + +const int BamStandardIndex::MAX_BIN = 37450; // =(8^6-1)/7+1 +const int BamStandardIndex::BAM_LIDX_SHIFT = 14; +const string BamStandardIndex::BAI_EXTENSION = ".bai"; +const char* const BamStandardIndex::BAI_MAGIC = "BAI\1"; +const int BamStandardIndex::SIZEOF_ALIGNMENTCHUNK = sizeof(uint64_t)*2; +const int BamStandardIndex::SIZEOF_BINCORE = sizeof(uint32_t) + sizeof(int32_t); +const int BamStandardIndex::SIZEOF_LINEAROFFSET = sizeof(uint64_t); + +// ---------------------------- +// RaiiWrapper implementation +// ---------------------------- + +BamStandardIndex::RaiiWrapper::RaiiWrapper(void) + : Device(0) + , Buffer(0) +{ } + +BamStandardIndex::RaiiWrapper::~RaiiWrapper(void) { + + if ( Device ) { + Device->Close(); + delete Device; + Device = 0; + } + + if ( Buffer ) { + delete[] Buffer; + Buffer = 0; + } +} + +// --------------------------------- +// BamStandardIndex implementation +// --------------------------------- + +// ctor +BamStandardIndex::BamStandardIndex(Internal::BamReaderPrivate* reader) + : BamIndex(reader) + , m_bufferLength(0) +{ + m_isBigEndian = BamTools::SystemIsBigEndian(); +} + +// dtor +BamStandardIndex::~BamStandardIndex(void) { + CloseFile(); +} + +void BamStandardIndex::AdjustRegion(const BamRegion& region, uint32_t& begin, uint32_t& end) { + + // retrieve references from reader + const RefVector& references = m_reader->GetReferenceData(); + + // LeftPosition cannot be greater than or equal to reference length + if ( region.LeftPosition >= references.at(region.LeftRefID).RefLength ) + throw BamException("BamStandardIndex::AdjustRegion", "invalid region requested"); + + // set region 'begin' + begin = (unsigned int)region.LeftPosition; + + // if right bound specified AND left&right bounds are on same reference + // OK to use right bound position as region 'end' + if ( region.isRightBoundSpecified() && ( region.LeftRefID == region.RightRefID ) ) + end = (unsigned int)region.RightPosition; + + // 
otherwise, set region 'end' to last reference base + else end = (unsigned int)references.at(region.LeftRefID).RefLength; +} + +// [begin, end) +void BamStandardIndex::CalculateCandidateBins(const uint32_t& begin, + const uint32_t& end, + set& candidateBins) +{ + // initialize list, bin '0' is always a valid bin + candidateBins.insert(0); + + // get rest of bins that contain this region + unsigned int k; + for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { candidateBins.insert(k); } + for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { candidateBins.insert(k); } + for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { candidateBins.insert(k); } + for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { candidateBins.insert(k); } + for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { candidateBins.insert(k); } +} + +void BamStandardIndex::CalculateCandidateOffsets(const BaiReferenceSummary& refSummary, + const uint64_t& minOffset, + set& candidateBins, + vector& offsets) +{ + // seek to first bin + Seek(refSummary.FirstBinFilePosition, SEEK_SET); + + // iterate over reference bins + uint32_t binId; + int32_t numAlignmentChunks; + set::iterator candidateBinIter; + for ( int i = 0; i < refSummary.NumBins; ++i ) { + + // read bin contents (if successful, alignment chunks are now in m_buffer) + ReadBinIntoBuffer(binId, numAlignmentChunks); + + // see if bin is a 'candidate bin' + candidateBinIter = candidateBins.find(binId); + + // if not, move on to next bin + if ( candidateBinIter == candidateBins.end() ) + continue; + + // otherwise, check bin's contents against for overlap + else { + + size_t offset = 0; + uint64_t chunkStart; + uint64_t chunkStop; + + // iterate over alignment chunks + for ( int j = 0; j < numAlignmentChunks; ++j ) { + + // read chunk start & stop from buffer + memcpy((char*)&chunkStart, m_resources.Buffer+offset, sizeof(uint64_t)); + offset += sizeof(uint64_t); + memcpy((char*)&chunkStop, m_resources.Buffer+offset, sizeof(uint64_t)); + 
offset += sizeof(uint64_t); + + // swap endian-ness if necessary + if ( m_isBigEndian ) { + SwapEndian_64(chunkStart); + SwapEndian_64(chunkStop); + } + + // store alignment chunk's start offset + // if its stop offset is larger than our 'minOffset' + if ( chunkStop >= minOffset ) + offsets.push_back(chunkStart); + } + + // 'pop' bin ID from candidate bins set + candidateBins.erase(candidateBinIter); + + // quit if no more candidates + if ( candidateBins.empty() ) + break; + } + } +} + +uint64_t BamStandardIndex::CalculateMinOffset(const BaiReferenceSummary& refSummary, + const uint32_t& begin) +{ + // if no linear offsets exist, return 0 + if ( refSummary.NumLinearOffsets == 0 ) + return 0; + + // if 'begin' starts beyond last linear offset, use the last linear offset as minimum + // else use the offset corresponding to the requested start position + const int shiftedBegin = begin>>BamStandardIndex::BAM_LIDX_SHIFT; + if ( shiftedBegin >= refSummary.NumLinearOffsets ) + return LookupLinearOffset( refSummary, refSummary.NumLinearOffsets-1 ); + else + return LookupLinearOffset( refSummary, shiftedBegin ); +} + +void BamStandardIndex::CheckBufferSize(char*& buffer, + unsigned int& bufferLength, + const unsigned int& requestedBytes) +{ + try { + if ( requestedBytes > bufferLength ) { + bufferLength = requestedBytes + 10; + delete[] buffer; + buffer = new char[bufferLength]; + } + } catch ( std::bad_alloc& ) { + stringstream s(""); + s << "out of memory when allocating " << requestedBytes << " bytes"; + throw BamException("BamStandardIndex::CheckBufferSize", s.str()); + } +} + +void BamStandardIndex::CheckBufferSize(unsigned char*& buffer, + unsigned int& bufferLength, + const unsigned int& requestedBytes) +{ + try { + if ( requestedBytes > bufferLength ) { + bufferLength = requestedBytes + 10; + delete[] buffer; + buffer = new unsigned char[bufferLength]; + } + } catch ( std::bad_alloc& ) { + stringstream s(""); + s << "out of memory when allocating " << requestedBytes 
<< " bytes"; + throw BamException("BamStandardIndex::CheckBufferSize", s.str()); + } +} + +void BamStandardIndex::CheckMagicNumber(void) { + + // check 'magic number' to see if file is BAI index + char magic[4]; + const int64_t numBytesRead = m_resources.Device->Read(magic, sizeof(magic)); + if ( numBytesRead != 4 ) + throw BamException("BamStandardIndex::CheckMagicNumber", "could not read BAI magic number"); + + // compare to expected value + if ( strncmp(magic, BamStandardIndex::BAI_MAGIC, 4) != 0 ) + throw BamException("BamStandardIndex::CheckMagicNumber", "invalid BAI magic number"); +} + +void BamStandardIndex::ClearReferenceEntry(BaiReferenceEntry& refEntry) { + refEntry.ID = -1; + refEntry.Bins.clear(); + refEntry.LinearOffsets.clear(); +} + +void BamStandardIndex::CloseFile(void) { + + // close file stream + if ( IsDeviceOpen() ) { + m_resources.Device->Close(); + delete m_resources.Device; + m_resources.Device = 0; + } + + // clear index file summary data + m_indexFileSummary.clear(); + + // clean up I/O buffer + delete[] m_resources.Buffer; + m_resources.Buffer = 0; + m_bufferLength = 0; +} + +// builds index from associated BAM file & writes out to index file +bool BamStandardIndex::Create(void) { + + // skip if BamReader is invalid or not open + if ( m_reader == 0 || !m_reader->IsOpen() ) { + SetErrorString("BamStandardIndex::Create", "could not create index: reader is not open"); + return false; + } + + // rewind BamReader + if ( !m_reader->Rewind() ) { + const string readerError = m_reader->GetErrorString(); + const string message = "could not create index: \n\t" + readerError; + SetErrorString("BamStandardIndex::Create", message); + return false; + } + + try { + + // open new index file (read & write) + string indexFilename = m_reader->Filename() + Extension(); + OpenFile(indexFilename, IBamIODevice::ReadWrite); + + // initialize BaiFileSummary with number of references + const int& numReferences = m_reader->GetReferenceCount(); + 
ReserveForSummary(numReferences); + + // initialize output file + WriteHeader(); + + // set up bin, ID, offset, & coordinate markers + const uint32_t defaultValue = 0xffffffffu; + uint32_t currentBin = defaultValue; + uint32_t lastBin = defaultValue; + int32_t currentRefID = defaultValue; + int32_t lastRefID = defaultValue; + uint64_t currentOffset = (uint64_t)m_reader->Tell(); + uint64_t lastOffset = currentOffset; + int32_t lastPosition = defaultValue; + + // iterate through alignments in BAM file + BamAlignment al; + BaiReferenceEntry refEntry; + while ( m_reader->LoadNextAlignment(al) ) { + + // changed to new reference + if ( lastRefID != al.RefID ) { + + // if not first reference, save previous reference data + if ( lastRefID != (int32_t)defaultValue ) { + + SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset); + WriteReferenceEntry(refEntry); + ClearReferenceEntry(refEntry); + + // write any empty references between (but *NOT* including) lastRefID & al.RefID + for ( int i = lastRefID+1; i < al.RefID; ++i ) { + BaiReferenceEntry emptyEntry(i); + WriteReferenceEntry(emptyEntry); + } + + // update bin markers + currentOffset = lastOffset; + currentBin = al.Bin; + lastBin = al.Bin; + currentRefID = al.RefID; + } + + // otherwise, this is first pass + // be sure to write any empty references up to (but *NOT* including) current RefID + else { + for ( int i = 0; i < al.RefID; ++i ) { + BaiReferenceEntry emptyEntry(i); + WriteReferenceEntry(emptyEntry); + } + } + + // update reference markers + refEntry.ID = al.RefID; + lastRefID = al.RefID; + lastBin = defaultValue; + } + + // if lastPosition greater than current alignment position - file not sorted properly + else if ( lastPosition > al.Position ) { + stringstream s(""); + s << "BAM file is not properly sorted by coordinate" << endl + << "Current alignment position: " << al.Position + << " < previous alignment position: " << lastPosition + << " on reference ID: " << al.RefID << endl; + 
SetErrorString("BamStandardIndex::Create", s.str()); + return false; + } + + // if alignment's ref ID is valid & its bin is not a 'leaf' + if ( (al.RefID >= 0) && (al.Bin < 4681) ) + SaveLinearOffsetEntry(refEntry.LinearOffsets, al.Position, al.GetEndPosition(), lastOffset); + + // changed to new BAI bin + if ( al.Bin != lastBin ) { + + // if not first bin on reference, save previous bin data + if ( currentBin != defaultValue ) + SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset); + + // update markers + currentOffset = lastOffset; + currentBin = al.Bin; + lastBin = al.Bin; + currentRefID = al.RefID; + + // if invalid RefID, break out + if ( currentRefID < 0 ) + break; + } + + // make sure that current file pointer is beyond lastOffset + if ( m_reader->Tell() <= (int64_t)lastOffset ) { + SetErrorString("BamStandardIndex::Create", "calculating offsets failed"); + return false; + } + + // update lastOffset & lastPosition + lastOffset = m_reader->Tell(); + lastPosition = al.Position; + } + + // after finishing alignments, if any data was read, check: + if ( currentRefID >= 0 ) { + + // store last alignment chunk to its bin, then write last reference entry with data + SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset); + WriteReferenceEntry(refEntry); + + // then write any empty references remaining at end of file + for ( int i = currentRefID+1; i < numReferences; ++i ) { + BaiReferenceEntry emptyEntry(i); + WriteReferenceEntry(emptyEntry); + } + } + + } catch ( BamException& e) { + m_errorString = e.what(); + return false; + } + + // rewind BamReader + if ( !m_reader->Rewind() ) { + const string readerError = m_reader->GetErrorString(); + const string message = "could not create index: \n\t" + readerError; + SetErrorString("BamStandardIndex::Create", message); + return false; + } + + // return success + return true; +} + +// returns format's file extension +const string BamStandardIndex::Extension(void) { + return 
BamStandardIndex::BAI_EXTENSION; +} + +void BamStandardIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) { + + // cannot calculate offsets if unknown/invalid reference ID requested + if ( region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size() ) + throw BamException("BamStandardIndex::GetOffset", "invalid reference ID requested"); + + // retrieve index summary for left bound reference + const BaiReferenceSummary& refSummary = m_indexFileSummary.at(region.LeftRefID); + + // set up region boundaries based on actual BamReader data + uint32_t begin; + uint32_t end; + AdjustRegion(region, begin, end); + + // retrieve all candidate bin IDs for region + set candidateBins; + CalculateCandidateBins(begin, end, candidateBins); + + // use reference's linear offsets to calculate the minimum offset + // that must be considered to find overlap + const uint64_t& minOffset = CalculateMinOffset(refSummary, begin); + + // attempt to use reference summary, minOffset, & candidateBins to calculate offsets + // no data should not be error, just bail + vector offsets; + CalculateCandidateOffsets(refSummary, minOffset, candidateBins, offsets); + if ( offsets.empty() ) + return; + + // ensure that offsets are sorted before processing + sort( offsets.begin(), offsets.end() ); + + // binary search for an overlapping block (may not be first one though) + BamAlignment al; + typedef vector::const_iterator OffsetConstIterator; + OffsetConstIterator offsetFirst = offsets.begin(); + OffsetConstIterator offsetIter = offsetFirst; + OffsetConstIterator offsetLast = offsets.end(); + iterator_traits::difference_type count = distance(offsetFirst, offsetLast); + iterator_traits::difference_type step; + while ( count > 0 ) { + offsetIter = offsetFirst; + step = count/2; + advance(offsetIter, step); + + // attempt seek to candidate offset + const int64_t& candidateOffset = (*offsetIter); + if ( !m_reader->Seek(candidateOffset) ) { + const string 
readerError = m_reader->GetErrorString(); + const string message = "could not seek in BAM file: \n\t" + readerError; + throw BamException("BamToolsIndex::GetOffset", message); + } + + // load first available alignment, setting flag to true if data exists + *hasAlignmentsInRegion = m_reader->LoadNextAlignment(al); + + // check alignment against region + if ( al.GetEndPosition() <= region.LeftPosition ) { + offsetFirst = ++offsetIter; + count -= step+1; + } else count = step; + } + + // step back to the offset before the 'current offset' (to make sure we cover overlaps) + if ( offsetIter != offsets.begin() ) + --offsetIter; + offset = (*offsetIter); +} + +// returns whether reference has alignments or no +bool BamStandardIndex::HasAlignments(const int& referenceID) const { + if ( referenceID < 0 || referenceID >= (int)m_indexFileSummary.size() ) + return false; + const BaiReferenceSummary& refSummary = m_indexFileSummary.at(referenceID); + return ( refSummary.NumBins > 0 ); +} + +bool BamStandardIndex::IsDeviceOpen(void) const { + if ( m_resources.Device == 0 ) + return false; + return m_resources.Device->IsOpen(); +} + +// attempts to use index data to jump to @region, returns success/fail +// a "successful" jump indicates no error, but not whether this region has data +// * thus, the method sets a flag to indicate whether there are alignments +// available after the jump position +bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { + + // clear out flag + *hasAlignmentsInRegion = false; + + // skip if invalid reader or not open + if ( m_reader == 0 || !m_reader->IsOpen() ) { + SetErrorString("BamStandardIndex::Jump", "could not jump: reader is not open"); + return false; + } + + // calculate nearest offset to jump to + int64_t offset; + try { + GetOffset(region, offset, hasAlignmentsInRegion); + } catch ( BamException& e ) { + m_errorString = e.what(); + return false; + } + + // if region has alignments, return success/fail of 
seeking there + if ( *hasAlignmentsInRegion ) + return m_reader->Seek(offset); + + // otherwise, simply return true (but hasAlignmentsInRegion flag has been set to false) + // (this is OK, BamReader will check this flag before trying to load data) + return true; +} + +// loads existing data from file into memory +bool BamStandardIndex::Load(const std::string& filename) { + + try { + + // attempt to open file (read-only) + OpenFile(filename, IBamIODevice::ReadOnly); + + // validate format + CheckMagicNumber(); + + // load in-memory summary of index data + SummarizeIndexFile(); + + // return success + return true; + + } catch ( BamException& e ) { + m_errorString = e.what(); + return false; + } +} + +uint64_t BamStandardIndex::LookupLinearOffset(const BaiReferenceSummary& refSummary, const int& index) { + + // attempt seek to proper index file position + const int64_t linearOffsetFilePosition = (int64_t)refSummary.FirstLinearOffsetFilePosition + + index*BamStandardIndex::SIZEOF_LINEAROFFSET; + Seek(linearOffsetFilePosition, SEEK_SET); + + // read linear offset from BAI file + uint64_t linearOffset; + ReadLinearOffset(linearOffset); + return linearOffset; +} + +void BamStandardIndex::MergeAlignmentChunks(BaiAlignmentChunkVector& chunks) { + + // skip if chunks are empty, nothing to merge + if ( chunks.empty() ) + return; + + // set up merged alignment chunk container + BaiAlignmentChunkVector mergedChunks; + mergedChunks.push_back( chunks[0] ); + + // iterate over chunks + int i = 0; + BaiAlignmentChunkVector::iterator chunkIter = chunks.begin(); + BaiAlignmentChunkVector::iterator chunkEnd = chunks.end(); + for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) { + + // get 'currentMergeChunk' based on numeric index + BaiAlignmentChunk& currentMergeChunk = mergedChunks[i]; + + // get sourceChunk based on source vector iterator + BaiAlignmentChunk& sourceChunk = (*chunkIter); + + // if currentMergeChunk ends where sourceChunk starts, then merge the two + if ( 
currentMergeChunk.Stop>>16 == sourceChunk.Start>>16 ) + currentMergeChunk.Stop = sourceChunk.Stop; + + // otherwise + else { + // append sourceChunk after currentMergeChunk + mergedChunks.push_back(sourceChunk); + + // update i, so the next iteration will consider the + // recently-appended sourceChunk as new mergeChunk candidate + ++i; + } + } + + // saved newly-merged chunks into (parameter) chunks + chunks = mergedChunks; +} + +void BamStandardIndex::OpenFile(const std::string& filename, IBamIODevice::OpenMode mode) { + + // make sure any previous index file is closed + CloseFile(); + + m_resources.Device = BamDeviceFactory::CreateDevice(filename); + if ( m_resources.Device == 0 ) { + const string message = string("could not open file: ") + filename; + throw BamException("BamStandardIndex::OpenFile", message); + } + + // attempt to open file + m_resources.Device->Open(mode); + if ( !IsDeviceOpen() ) { + const string message = string("could not open file: ") + filename; + throw BamException("BamStandardIndex::OpenFile", message); + } +} + +void BamStandardIndex::ReadBinID(uint32_t& binId) { + const int64_t numBytesRead = m_resources.Device->Read((char*)&binId, sizeof(binId)); + if ( m_isBigEndian ) SwapEndian_32(binId); + if ( numBytesRead != sizeof(binId) ) + throw BamException("BamStandardIndex::ReadBinID", "could not read BAI bin ID"); +} + +void BamStandardIndex::ReadBinIntoBuffer(uint32_t& binId, int32_t& numAlignmentChunks) { + + // read bin header + ReadBinID(binId); + ReadNumAlignmentChunks(numAlignmentChunks); + + // read bin contents + const unsigned int bytesRequested = numAlignmentChunks*BamStandardIndex::SIZEOF_ALIGNMENTCHUNK; + ReadIntoBuffer(bytesRequested); +} + +void BamStandardIndex::ReadIntoBuffer(const unsigned int& bytesRequested) { + + // ensure that our buffer is big enough for request + BamStandardIndex::CheckBufferSize(m_resources.Buffer, m_bufferLength, bytesRequested); + + // read from BAI file stream + const int64_t bytesRead = 
m_resources.Device->Read(m_resources.Buffer, bytesRequested); + if ( bytesRead != (int64_t)bytesRequested ) { + stringstream s(""); + s << "expected to read: " << bytesRequested << " bytes, " + << "but instead read: " << bytesRead; + throw BamException("BamStandardIndex::ReadIntoBuffer", s.str()); + } +} + +void BamStandardIndex::ReadLinearOffset(uint64_t& linearOffset) { + const int64_t numBytesRead = m_resources.Device->Read((char*)&linearOffset, sizeof(linearOffset)); + if ( m_isBigEndian ) SwapEndian_64(linearOffset); + if ( numBytesRead != sizeof(linearOffset) ) + throw BamException("BamStandardIndex::ReadLinearOffset", "could not read BAI linear offset"); +} + +void BamStandardIndex::ReadNumAlignmentChunks(int& numAlignmentChunks) { + const int64_t numBytesRead = m_resources.Device->Read((char*)&numAlignmentChunks, sizeof(numAlignmentChunks)); + if ( m_isBigEndian ) SwapEndian_32(numAlignmentChunks); + if ( numBytesRead != sizeof(numAlignmentChunks) ) + throw BamException("BamStandardIndex::ReadNumAlignmentChunks", "could not read BAI chunk count"); +} + +void BamStandardIndex::ReadNumBins(int& numBins) { + const int64_t numBytesRead = m_resources.Device->Read((char*)&numBins, sizeof(numBins)); + if ( m_isBigEndian ) SwapEndian_32(numBins); + if ( numBytesRead != sizeof(numBins) ) + throw BamException("BamStandardIndex::ReadNumBins", "could not read BAI bin count"); +} + +void BamStandardIndex::ReadNumLinearOffsets(int& numLinearOffsets) { + const int64_t numBytesRead = m_resources.Device->Read((char*)&numLinearOffsets, sizeof(numLinearOffsets)); + if ( m_isBigEndian ) SwapEndian_32(numLinearOffsets); + if ( numBytesRead != sizeof(numLinearOffsets) ) + throw BamException("BamStandardIndex::ReadNumAlignmentChunks", "could not read BAI linear offset count"); +} + +void BamStandardIndex::ReadNumReferences(int& numReferences) { + const int64_t numBytesRead = m_resources.Device->Read((char*)&numReferences, sizeof(numReferences)); + if ( m_isBigEndian ) 
SwapEndian_32(numReferences); + if ( numBytesRead != sizeof(numReferences) ) + throw BamException("BamStandardIndex::ReadNumReferences", "could not read reference count"); +} + +void BamStandardIndex::ReserveForSummary(const int& numReferences) { + m_indexFileSummary.clear(); + m_indexFileSummary.assign( numReferences, BaiReferenceSummary() ); +} + +void BamStandardIndex::SaveAlignmentChunkToBin(BaiBinMap& binMap, + const uint32_t& currentBin, + const uint64_t& currentOffset, + const uint64_t& lastOffset) +{ + // create new alignment chunk + BaiAlignmentChunk newChunk(currentOffset, lastOffset); + + // if no entry exists yet for this bin, create one and store alignment chunk + BaiBinMap::iterator binIter = binMap.find(currentBin); + if ( binIter == binMap.end() ) { + BaiAlignmentChunkVector newChunks; + newChunks.push_back(newChunk); + binMap.insert( pair(currentBin, newChunks)); + } + + // otherwise, just append alignment chunk + else { + BaiAlignmentChunkVector& binChunks = (*binIter).second; + binChunks.push_back( newChunk ); + } +} + +void BamStandardIndex::SaveBinsSummary(const int& refId, const int& numBins) { + BaiReferenceSummary& refSummary = m_indexFileSummary.at(refId); + refSummary.NumBins = numBins; + refSummary.FirstBinFilePosition = Tell(); +} + +void BamStandardIndex::SaveLinearOffsetEntry(BaiLinearOffsetVector& offsets, + const int& alignmentStartPosition, + const int& alignmentStopPosition, + const uint64_t& lastOffset) +{ + // get converted offsets + const int beginOffset = alignmentStartPosition >> BamStandardIndex::BAM_LIDX_SHIFT; + const int endOffset = (alignmentStopPosition - 1) >> BamStandardIndex::BAM_LIDX_SHIFT; + + // resize vector if necessary + int oldSize = offsets.size(); + int newSize = endOffset + 1; + if ( oldSize < newSize ) + offsets.resize(newSize, 0); + + // store offset + for( int i = beginOffset + 1; i <= endOffset; ++i ) { + if ( offsets[i] == 0 ) + offsets[i] = lastOffset; + } +} + +void 
BamStandardIndex::SaveLinearOffsetsSummary(const int& refId, const int& numLinearOffsets) { + BaiReferenceSummary& refSummary = m_indexFileSummary.at(refId); + refSummary.NumLinearOffsets = numLinearOffsets; + refSummary.FirstLinearOffsetFilePosition = Tell(); +} + +// seek to position in index file stream +void BamStandardIndex::Seek(const int64_t& position, const int origin) { + if ( !m_resources.Device->Seek(position, origin) ) + throw BamException("BamStandardIndex::Seek", "could not seek in BAI file"); +} + +void BamStandardIndex::SkipBins(const int& numBins) { + uint32_t binId; + int32_t numAlignmentChunks; + for (int i = 0; i < numBins; ++i) + ReadBinIntoBuffer(binId, numAlignmentChunks); // results & buffer ignored +} + +void BamStandardIndex::SkipLinearOffsets(const int& numLinearOffsets) { + const unsigned int bytesRequested = numLinearOffsets*BamStandardIndex::SIZEOF_LINEAROFFSET; + ReadIntoBuffer(bytesRequested); +} + +void BamStandardIndex::SortLinearOffsets(BaiLinearOffsetVector& linearOffsets) { + sort( linearOffsets.begin(), linearOffsets.end() ); +} + +void BamStandardIndex::SummarizeBins(BaiReferenceSummary& refSummary) { + + // load number of bins + int numBins; + ReadNumBins(numBins); + + // store bins summary for this reference + refSummary.NumBins = numBins; + refSummary.FirstBinFilePosition = Tell(); + + // skip this reference's bins + SkipBins(numBins); +} + +void BamStandardIndex::SummarizeIndexFile(void) { + + // load number of reference sequences + int numReferences; + ReadNumReferences(numReferences); + + // initialize file summary data + ReserveForSummary(numReferences); + + // iterate over reference entries + BaiFileSummary::iterator summaryIter = m_indexFileSummary.begin(); + BaiFileSummary::iterator summaryEnd = m_indexFileSummary.end(); + for ( int i = 0; summaryIter != summaryEnd; ++summaryIter, ++i ) + SummarizeReference(*summaryIter); +} + +void BamStandardIndex::SummarizeLinearOffsets(BaiReferenceSummary& refSummary) { + + // 
load number of linear offsets + int numLinearOffsets; + ReadNumLinearOffsets(numLinearOffsets); + + // store bin summary data for this reference + refSummary.NumLinearOffsets = numLinearOffsets; + refSummary.FirstLinearOffsetFilePosition = Tell(); + + // skip linear offsets in index file + SkipLinearOffsets(numLinearOffsets); +} + +void BamStandardIndex::SummarizeReference(BaiReferenceSummary& refSummary) { + SummarizeBins(refSummary); + SummarizeLinearOffsets(refSummary); +} + +// return position of file pointer in index file stream +int64_t BamStandardIndex::Tell(void) const { + return m_resources.Device->Tell(); +} + +void BamStandardIndex::WriteAlignmentChunk(const BaiAlignmentChunk& chunk) { + + // localize alignment chunk offsets + uint64_t start = chunk.Start; + uint64_t stop = chunk.Stop; + + // swap endian-ness if necessary + if ( m_isBigEndian ) { + SwapEndian_64(start); + SwapEndian_64(stop); + } + + // write to index file + int64_t numBytesWritten = 0; + numBytesWritten += m_resources.Device->Write((const char*)&start, sizeof(start)); + numBytesWritten += m_resources.Device->Write((const char*)&stop, sizeof(stop)); + if ( numBytesWritten != (sizeof(start)+sizeof(stop)) ) + throw BamException("BamStandardIndex::WriteAlignmentChunk", "could not write BAI alignment chunk"); +} + +void BamStandardIndex::WriteAlignmentChunks(BaiAlignmentChunkVector& chunks) { + + // make sure chunks are merged (simplified) before writing & saving summary + MergeAlignmentChunks(chunks); + + // write chunks + int32_t chunkCount = chunks.size(); + if ( m_isBigEndian ) SwapEndian_32(chunkCount); + const int64_t numBytesWritten = m_resources.Device->Write((const char*)&chunkCount, sizeof(chunkCount)); + if ( numBytesWritten != sizeof(chunkCount) ) + throw BamException("BamStandardIndex::WriteAlignmentChunks", "could not write BAI chunk count"); + + // iterate over chunks + BaiAlignmentChunkVector::const_iterator chunkIter = chunks.begin(); + 
BaiAlignmentChunkVector::const_iterator chunkEnd = chunks.end(); + for ( ; chunkIter != chunkEnd; ++chunkIter ) + WriteAlignmentChunk( (*chunkIter) ); +} + +void BamStandardIndex::WriteBin(const uint32_t& binId, BaiAlignmentChunkVector& chunks) { + + // write BAM bin ID + uint32_t binKey = binId; + if ( m_isBigEndian ) SwapEndian_32(binKey); + const int64_t numBytesWritten = m_resources.Device->Write((const char*)&binKey, sizeof(binKey)); + if ( numBytesWritten != sizeof(binKey) ) + throw BamException("BamStandardIndex::WriteBin", "could not write bin ID"); + + // write bin's alignment chunks + WriteAlignmentChunks(chunks); +} + +void BamStandardIndex::WriteBins(const int& refId, BaiBinMap& bins) { + + // write number of bins + int32_t binCount = bins.size(); + if ( m_isBigEndian ) SwapEndian_32(binCount); + const int64_t numBytesWritten = m_resources.Device->Write((const char*)&binCount, sizeof(binCount)); + if ( numBytesWritten != sizeof(binCount) ) + throw BamException("BamStandardIndex::WriteBins", "could not write bin count"); + + // save summary for reference's bins + SaveBinsSummary(refId, bins.size()); + + // iterate over bins + BaiBinMap::iterator binIter = bins.begin(); + BaiBinMap::iterator binEnd = bins.end(); + for ( ; binIter != binEnd; ++binIter ) + WriteBin( (*binIter).first, (*binIter).second ); +} + +void BamStandardIndex::WriteHeader(void) { + + int64_t numBytesWritten = 0; + + // write magic number + numBytesWritten += m_resources.Device->Write(BamStandardIndex::BAI_MAGIC, 4); + + // write number of reference sequences + int32_t numReferences = m_indexFileSummary.size(); + if ( m_isBigEndian ) SwapEndian_32(numReferences); + numBytesWritten += m_resources.Device->Write((const char*)&numReferences, sizeof(numReferences)); + + if ( numBytesWritten != sizeof(numReferences)+4 ) + throw BamException("BamStandardIndex::WriteHeader", "could not write BAI header"); +} + +void BamStandardIndex::WriteLinearOffsets(const int& refId, BaiLinearOffsetVector& 
linearOffsets) { + + // make sure linear offsets are sorted before writing & saving summary + SortLinearOffsets(linearOffsets); + + int64_t numBytesWritten = 0; + + // write number of linear offsets + int32_t offsetCount = linearOffsets.size(); + if ( m_isBigEndian ) SwapEndian_32(offsetCount); + numBytesWritten += m_resources.Device->Write((const char*)&offsetCount, sizeof(offsetCount)); + + // save summary for reference's linear offsets + SaveLinearOffsetsSummary(refId, linearOffsets.size()); + + // iterate over linear offsets + BaiLinearOffsetVector::const_iterator offsetIter = linearOffsets.begin(); + BaiLinearOffsetVector::const_iterator offsetEnd = linearOffsets.end(); + for ( ; offsetIter != offsetEnd; ++offsetIter ) { + + // write linear offset + uint64_t linearOffset = (*offsetIter); + if ( m_isBigEndian ) SwapEndian_64(linearOffset); + numBytesWritten += m_resources.Device->Write((const char*)&linearOffset, sizeof(linearOffset)); + } + + if ( numBytesWritten != (sizeof(offsetCount) + linearOffsets.size()*sizeof(uint64_t)) ) + throw BamException("BamStandardIndex::WriteLinearOffsets", "could not write BAI linear offsets"); +} + +void BamStandardIndex::WriteReferenceEntry(BaiReferenceEntry& refEntry) { + WriteBins(refEntry.ID, refEntry.Bins); + WriteLinearOffsets(refEntry.ID, refEntry.LinearOffsets); +} diff --git a/src/api/internal/index/BamStandardIndex_p.h b/src/api/internal/index/BamStandardIndex_p.h new file mode 100644 index 0000000..273d56e --- /dev/null +++ b/src/api/internal/index/BamStandardIndex_p.h @@ -0,0 +1,237 @@ +// *************************************************************************** +// BamStandardIndex.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides index operations for the standardized BAM 
index format (".bai") +// *************************************************************************** + +#ifndef BAM_STANDARD_INDEX_FORMAT_H +#define BAM_STANDARD_INDEX_FORMAT_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to +// version without notice, or even be removed. +// +// We mean it. + +#include "api/BamAux.h" +#include "api/BamIndex.h" +#include "api/IBamIODevice.h" +#include +#include +#include +#include + +namespace BamTools { +namespace Internal { + +// ----------------------------------------------------------------------------- +// BamStandardIndex data structures + +// defines start and end of a contiguous run of alignments +struct BaiAlignmentChunk { + + // data members + uint64_t Start; + uint64_t Stop; + + // constructor + BaiAlignmentChunk(const uint64_t& start = 0, + const uint64_t& stop = 0) + : Start(start) + , Stop(stop) + { } +}; + +// comparison operator (for sorting) +inline +bool operator<(const BaiAlignmentChunk& lhs, const BaiAlignmentChunk& rhs) { + return lhs.Start < rhs.Start; +} + +// convenience typedef for a list of all alignment 'chunks' in a BAI bin +typedef std::vector BaiAlignmentChunkVector; + +// convenience typedef for a map of all BAI bins in a reference (ID => chunks) +typedef std::map BaiBinMap; + +// convenience typedef for a list of all 'linear offsets' in a reference +typedef std::vector BaiLinearOffsetVector; + +// contains all fields necessary for building, loading, & writing +// full BAI index data for a single reference +struct BaiReferenceEntry { + + // data members + int32_t ID; + BaiBinMap Bins; + BaiLinearOffsetVector LinearOffsets; + + // ctor + BaiReferenceEntry(const int32_t& id = -1) + : ID(id) + { } +}; + +// provides (persistent) summary of BaiReferenceEntry's index data +struct BaiReferenceSummary { + + // data members + int NumBins; + int 
NumLinearOffsets; + uint64_t FirstBinFilePosition; + uint64_t FirstLinearOffsetFilePosition; + + // ctor + BaiReferenceSummary(void) + : NumBins(0) + , NumLinearOffsets(0) + , FirstBinFilePosition(0) + , FirstLinearOffsetFilePosition(0) + { } +}; + +// convenience typedef for describing a full BAI index file summary +typedef std::vector BaiFileSummary; + +// end BamStandardIndex data structures +// ----------------------------------------------------------------------------- + +class BamStandardIndex : public BamIndex { + + // ctor & dtor + public: + BamStandardIndex(Internal::BamReaderPrivate* reader); + ~BamStandardIndex(void); + + // BamIndex implementation + public: + // builds index from associated BAM file & writes out to index file + bool Create(void); + // returns whether reference has alignments or no + bool HasAlignments(const int& referenceID) const; + // attempts to use index data to jump to @region, returns success/fail + // a "successful" jump indicates no error, but not whether this region has data + // * thus, the method sets a flag to indicate whether there are alignments + // available after the jump position + bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); + // loads existing data from file into memory + bool Load(const std::string& filename); + BamIndex::IndexType Type(void) const { return BamIndex::STANDARD; } + public: + // returns format's file extension + static const std::string Extension(void); + + // internal methods + private: + + // index file ops + void CheckMagicNumber(void); + void CloseFile(void); + bool IsDeviceOpen(void) const; + void OpenFile(const std::string& filename, IBamIODevice::OpenMode mode); + void Seek(const int64_t& position, const int origin); + int64_t Tell(void) const; + + // BAI index building methods + void ClearReferenceEntry(BaiReferenceEntry& refEntry); + void SaveAlignmentChunkToBin(BaiBinMap& binMap, + const uint32_t& currentBin, + const uint64_t& currentOffset, + const uint64_t& 
lastOffset); + void SaveLinearOffsetEntry(BaiLinearOffsetVector& offsets, + const int& alignmentStartPosition, + const int& alignmentStopPosition, + const uint64_t& lastOffset); + + // random-access methods + void AdjustRegion(const BamRegion& region, uint32_t& begin, uint32_t& end); + void CalculateCandidateBins(const uint32_t& begin, + const uint32_t& end, + std::set& candidateBins); + void CalculateCandidateOffsets(const BaiReferenceSummary& refSummary, + const uint64_t& minOffset, + std::set& candidateBins, + std::vector& offsets); + uint64_t CalculateMinOffset(const BaiReferenceSummary& refSummary, const uint32_t& begin); + void GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion); + uint64_t LookupLinearOffset(const BaiReferenceSummary& refSummary, const int& index); + + // BAI summary (create/load) methods + void ReserveForSummary(const int& numReferences); + void SaveBinsSummary(const int& refId, const int& numBins); + void SaveLinearOffsetsSummary(const int& refId, const int& numLinearOffsets); + void SkipBins(const int& numBins); + void SkipLinearOffsets(const int& numLinearOffsets); + void SummarizeBins(BaiReferenceSummary& refSummary); + void SummarizeIndexFile(void); + void SummarizeLinearOffsets(BaiReferenceSummary& refSummary); + void SummarizeReference(BaiReferenceSummary& refSummary); + + // BAI full index input methods + void ReadBinID(uint32_t& binId); + void ReadBinIntoBuffer(uint32_t& binId, int32_t& numAlignmentChunks); + void ReadIntoBuffer(const unsigned int& bytesRequested); + void ReadLinearOffset(uint64_t& linearOffset); + void ReadNumAlignmentChunks(int& numAlignmentChunks); + void ReadNumBins(int& numBins); + void ReadNumLinearOffsets(int& numLinearOffsets); + void ReadNumReferences(int& numReferences); + + // BAI full index output methods + void MergeAlignmentChunks(BaiAlignmentChunkVector& chunks); + void SortLinearOffsets(BaiLinearOffsetVector& linearOffsets); + void WriteAlignmentChunk(const 
BaiAlignmentChunk& chunk); + void WriteAlignmentChunks(BaiAlignmentChunkVector& chunks); + void WriteBin(const uint32_t& binId, BaiAlignmentChunkVector& chunks); + void WriteBins(const int& refId, BaiBinMap& bins); + void WriteHeader(void); + void WriteLinearOffsets(const int& refId, BaiLinearOffsetVector& linearOffsets); + void WriteReferenceEntry(BaiReferenceEntry& refEntry); + + // data members + private: + bool m_isBigEndian; + BaiFileSummary m_indexFileSummary; + + // our input buffer + unsigned int m_bufferLength; + struct RaiiWrapper { + IBamIODevice* Device; + char* Buffer; + RaiiWrapper(void); + ~RaiiWrapper(void); + }; + RaiiWrapper m_resources; + + // static methods + private: + // checks if the buffer is large enough to accomodate the requested size + static void CheckBufferSize(char*& buffer, + unsigned int& bufferLength, + const unsigned int& requestedBytes); + // checks if the buffer is large enough to accomodate the requested size + static void CheckBufferSize(unsigned char*& buffer, + unsigned int& bufferLength, + const unsigned int& requestedBytes); + // static constants + private: + static const int MAX_BIN; + static const int BAM_LIDX_SHIFT; + static const std::string BAI_EXTENSION; + static const char* const BAI_MAGIC; + static const int SIZEOF_ALIGNMENTCHUNK; + static const int SIZEOF_BINCORE; + static const int SIZEOF_LINEAROFFSET; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAM_STANDARD_INDEX_FORMAT_H diff --git a/src/api/internal/index/BamToolsIndex_p.cpp b/src/api/internal/index/BamToolsIndex_p.cpp new file mode 100644 index 0000000..bb09bc9 --- /dev/null +++ b/src/api/internal/index/BamToolsIndex_p.cpp @@ -0,0 +1,642 @@ +// *************************************************************************** +// BamToolsIndex.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) 
+// --------------------------------------------------------------------------- +// Provides index operations for the BamTools index format (".bti") +// *************************************************************************** + +#include "api/BamAlignment.h" +#include "api/internal/bam/BamReader_p.h" +#include "api/internal/index/BamToolsIndex_p.h" +#include "api/internal/io/BamDeviceFactory_p.h" +#include "api/internal/io/BgzfStream_p.h" +#include "api/internal/utils/BamException_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include +#include +#include +#include +#include +#include +#include +using namespace std; + +// -------------------------------- +// static BamToolsIndex constants +// -------------------------------- + +const uint32_t BamToolsIndex::DEFAULT_BLOCK_LENGTH = 1000; +const string BamToolsIndex::BTI_EXTENSION = ".bti"; +const char* const BamToolsIndex::BTI_MAGIC = "BTI\1"; +const int BamToolsIndex::SIZEOF_BLOCK = sizeof(int32_t)*2 + sizeof(int64_t); + +// ---------------------------- +// RaiiWrapper implementation +// ---------------------------- + +BamToolsIndex::RaiiWrapper::RaiiWrapper(void) + : Device(0) +{ } + +BamToolsIndex::RaiiWrapper::~RaiiWrapper(void) { + if ( Device ) { + Device->Close(); + delete Device; + Device = 0; + } +} + +// ------------------------------ +// BamToolsIndex implementation +// ------------------------------ + +// ctor +BamToolsIndex::BamToolsIndex(Internal::BamReaderPrivate* reader) + : BamIndex(reader) + , m_blockSize(BamToolsIndex::DEFAULT_BLOCK_LENGTH) + , m_inputVersion(0) + , m_outputVersion(BTI_2_0) // latest version - used for writing new index files +{ + m_isBigEndian = BamTools::SystemIsBigEndian(); +} + +// dtor +BamToolsIndex::~BamToolsIndex(void) { + CloseFile(); +} + +void BamToolsIndex::CheckMagicNumber(void) { + + // read magic number + char magic[4]; + const int64_t numBytesRead = m_resources.Device->Read(magic, 4); + if ( numBytesRead != 4 ) + throw 
BamException("BamToolsIndex::CheckMagicNumber", "could not read BTI magic number"); + + // validate expected magic number + if ( strncmp(magic, BamToolsIndex::BTI_MAGIC, 4) != 0 ) + throw BamException("BamToolsIndex::CheckMagicNumber", "invalid BTI magic number"); +} + +// check index file version, return true if OK +void BamToolsIndex::CheckVersion(void) { + + // read version from file + const int64_t numBytesRead = m_resources.Device->Read((char*)&m_inputVersion, sizeof(m_inputVersion)); + if ( numBytesRead != sizeof(m_inputVersion) ) + throw BamException("BamToolsIndex::CheckVersion", "could not read format version"); + if ( m_isBigEndian ) SwapEndian_32(m_inputVersion); + + // if version is negative, or zero + if ( m_inputVersion <= 0 ) + throw BamException("BamToolsIndex::CheckVersion", "invalid format version"); + + // if version is newer than can be supported by this version of bamtools + else if ( m_inputVersion > m_outputVersion ) { + const string message = "unsupported format: this index was created by a newer version of BamTools. " + "Update your local version of BamTools to use the index file."; + throw BamException("BamToolsIndex::CheckVersion", message); + } + + // ------------------------------------------------------------------ + // check for deprecated, unsupported versions + // (the format had to be modified to accomodate a particular bug fix) + + // Version 2.0: introduced support for half-open intervals, instead of the old closed intervals + // respondBy: throwing exception - we're not going to try to handle the old BTI files. + else if ( (Version)m_inputVersion < BamToolsIndex::BTI_2_0 ) { + const string message = "unsupported format: this version of the index may not properly handle " + "coordinate intervals. 
Please run 'bamtools index -bti -in yourData.bam' " + "to generate an up-to-date, fixed BTI file."; + throw BamException("BamToolsIndex::CheckVersion", message); + } +} + +void BamToolsIndex::ClearReferenceEntry(BtiReferenceEntry& refEntry) { + refEntry.ID = -1; + refEntry.Blocks.clear(); +} + +void BamToolsIndex::CloseFile(void) { + if ( IsDeviceOpen() ) { + m_resources.Device->Close(); + delete m_resources.Device; + m_resources.Device = 0; + } + m_indexFileSummary.clear(); +} + +// builds index from associated BAM file & writes out to index file +bool BamToolsIndex::Create(void) { + + // skip if BamReader is invalid or not open + if ( m_reader == 0 || !m_reader->IsOpen() ) { + SetErrorString("BamToolsIndex::Create", "could not create index: reader is not open"); + return false; + } + + // rewind BamReader + if ( !m_reader->Rewind() ) { + const string readerError = m_reader->GetErrorString(); + const string message = "could not create index: \n\t" + readerError; + SetErrorString("BamToolsIndex::Create", message); + return false; + } + + try { + // open new index file (read & write) + const string indexFilename = m_reader->Filename() + Extension(); + OpenFile(indexFilename, IBamIODevice::ReadWrite); + + // initialize BtiFileSummary with number of references + const int& numReferences = m_reader->GetReferenceCount(); + InitializeFileSummary(numReferences); + + // intialize output file header + WriteHeader(); + + // index building markers + uint32_t currentBlockCount = 0; + int64_t currentAlignmentOffset = m_reader->Tell(); + int32_t blockRefId = -1; + int32_t blockMaxEndPosition = -1; + int64_t blockStartOffset = currentAlignmentOffset; + int32_t blockStartPosition = -1; + + // plow through alignments, storing index entries + BamAlignment al; + BtiReferenceEntry refEntry; + while ( m_reader->LoadNextAlignment(al) ) { + + // if moved to new reference + if ( al.RefID != blockRefId ) { + + // if first pass, check: + if ( currentBlockCount == 0 ) { + + // write any 
empty references up to (but not including) al.RefID + for ( int i = 0; i < al.RefID; ++i ) + WriteReferenceEntry( BtiReferenceEntry(i) ); + } + + // not first pass: + else { + + // store previous BTI block data in reference entry + const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition); + refEntry.Blocks.push_back(block); + + // write reference entry, then clear + WriteReferenceEntry(refEntry); + ClearReferenceEntry(refEntry); + + // write any empty references between (but not including) + // the last blockRefID and current al.RefID + for ( int i = blockRefId+1; i < al.RefID; ++i ) + WriteReferenceEntry( BtiReferenceEntry(i) ); + + // reset block count + currentBlockCount = 0; + } + + // set ID for new reference entry + refEntry.ID = al.RefID; + } + + // if beginning of block, update counters + if ( currentBlockCount == 0 ) { + blockRefId = al.RefID; + blockStartOffset = currentAlignmentOffset; + blockStartPosition = al.Position; + blockMaxEndPosition = al.GetEndPosition(); + } + + // increment block counter + ++currentBlockCount; + + // check end position + const int32_t alignmentEndPosition = al.GetEndPosition(); + if ( alignmentEndPosition > blockMaxEndPosition ) + blockMaxEndPosition = alignmentEndPosition; + + // if block is full, get offset for next block, reset currentBlockCount + if ( currentBlockCount == m_blockSize ) { + + // store previous block data in reference entry + const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition); + refEntry.Blocks.push_back(block); + + // update markers + blockStartOffset = m_reader->Tell(); + currentBlockCount = 0; + } + + // not the best name, but for the next iteration, this value will be the offset of the + // *current* alignment. 
this is necessary because we won't know if this next alignment + // is on a new reference until we actually read it + currentAlignmentOffset = m_reader->Tell(); + } + + // after finishing alignments, if any data was read, check: + if ( blockRefId >= 0 ) { + + // store last BTI block data in reference entry + const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition); + refEntry.Blocks.push_back(block); + + // write last reference entry, then clear + WriteReferenceEntry(refEntry); + ClearReferenceEntry(refEntry); + + // then write any empty references remaining at end of file + for ( int i = blockRefId+1; i < numReferences; ++i ) + WriteReferenceEntry( BtiReferenceEntry(i) ); + } + + } catch ( BamException& e ) { + m_errorString = e.what(); + return false; + } + + // rewind BamReader + if ( !m_reader->Rewind() ) { + const string readerError = m_reader->GetErrorString(); + const string message = "could not create index: \n\t" + readerError; + SetErrorString("BamToolsIndex::Create", message); + return false; + } + + // return success + return true; +} + +// returns format's file extension +const std::string BamToolsIndex::Extension(void) { + return BamToolsIndex::BTI_EXTENSION; +} + +void BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) { + + // return false ref ID is not a valid index in file summary data + if ( region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size() ) + throw BamException("BamToolsIndex::GetOffset", "invalid region requested"); + + // retrieve reference index data for left bound reference + BtiReferenceEntry refEntry(region.LeftRefID); + ReadReferenceEntry(refEntry); + + // binary search for an overlapping block (may not be first one though) + bool found = false; + typedef BtiBlockVector::const_iterator BtiBlockConstIterator; + BtiBlockConstIterator blockFirst = refEntry.Blocks.begin(); + BtiBlockConstIterator blockIter = blockFirst; + BtiBlockConstIterator 
blockLast = refEntry.Blocks.end(); + iterator_traits<BtiBlockConstIterator>::difference_type count = distance(blockFirst, blockLast); + iterator_traits<BtiBlockConstIterator>::difference_type step; + while ( count > 0 ) { + blockIter = blockFirst; + step = count/2; + advance(blockIter, step); + + const BtiBlock& block = (*blockIter); + if ( block.StartPosition <= region.RightPosition ) { + if ( block.MaxEndPosition > region.LeftPosition ) { + offset = block.StartOffset; + break; + } + blockFirst = ++blockIter; + count -= step+1; + } + else count = step; + } + + // if we didn't search "off the end" of the blocks + if ( blockIter != blockLast ) { + + // "walk back" until we've gone too far + while ( blockIter != blockFirst ) { + const BtiBlock& currentBlock = (*blockIter); + + --blockIter; + const BtiBlock& previousBlock = (*blockIter); + if ( previousBlock.MaxEndPosition <= region.LeftPosition ) { + offset = currentBlock.StartOffset; + found = true; + break; + } + } + + // if we walked all the way to first block, just return that and let the reader's + // region overlap parsing do the rest + if ( blockIter == blockFirst ) { + const BtiBlock& block = (*blockIter); + offset = block.StartOffset; + found = true; + } + } + + + // sets to false if blocks container is empty, or if no matching block could be found + // (in that case @offset has not been assigned) + *hasAlignmentsInRegion = found; +} + +// returns whether reference has alignments or no +bool BamToolsIndex::HasAlignments(const int& referenceID) const { + if ( referenceID < 0 || referenceID >= (int)m_indexFileSummary.size() ) + return false; + const BtiReferenceSummary& refSummary = m_indexFileSummary.at(referenceID); + return ( refSummary.NumBlocks > 0 ); +} + +// pre-allocates space for each reference's summary data +void BamToolsIndex::InitializeFileSummary(const int& numReferences) { + m_indexFileSummary.clear(); + for ( int i = 0; i < numReferences; ++i ) + m_indexFileSummary.push_back( BtiReferenceSummary() ); +} + +// returns true if the index stream is open +bool BamToolsIndex::IsDeviceOpen(void) 
const { + if ( m_resources.Device == 0 ) + return false; + return m_resources.Device->IsOpen(); +} + +// attempts to use index data to jump to @region, returns success/fail +// a "successful" jump indicates no error, but not whether this region has data +// * thus, the method sets a flag to indicate whether there are alignments +// available after the jump position +bool BamToolsIndex::Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion) { + + // clear flag + *hasAlignmentsInRegion = false; + + // skip if invalid reader or not open + if ( m_reader == 0 || !m_reader->IsOpen() ) { + SetErrorString("BamToolsIndex::Jump", "could not jump: reader is not open"); + return false; + } + + // make sure left-bound position is valid + const RefVector& references = m_reader->GetReferenceData(); + if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) { + SetErrorString("BamToolsIndex::Jump", "could not create index: invalid region requested"); + return false; + } + + // calculate nearest offset to jump to + int64_t offset; + try { + GetOffset(region, offset, hasAlignmentsInRegion); + } catch ( BamException& e ) { + m_errorString = e.what(); + return false; + } + + // return success/failure of seek + return m_reader->Seek(offset); +} + +// loads existing data from file into memory +bool BamToolsIndex::Load(const std::string& filename) { + + try { + + // attempt to open file (read-only) + OpenFile(filename, IBamIODevice::ReadOnly); + + // load metadata & generate in-memory summary + LoadHeader(); + LoadFileSummary(); + + // return success + return true; + + } catch ( BamException& e ) { + m_errorString = e.what(); + return false; + } +} + +void BamToolsIndex::LoadFileSummary(void) { + + // load number of reference sequences + int numReferences; + LoadNumReferences(numReferences); + + // initialize file summary data + InitializeFileSummary(numReferences); + + // load summary for each reference + BtiFileSummary::iterator summaryIter = 
m_indexFileSummary.begin(); + BtiFileSummary::iterator summaryEnd = m_indexFileSummary.end(); + for ( ; summaryIter != summaryEnd; ++summaryIter ) + LoadReferenceSummary(*summaryIter); +} + +void BamToolsIndex::LoadHeader(void) { + + // check BTI file metadata + CheckMagicNumber(); + CheckVersion(); + + // use file's BTI block size to set member variable + const int64_t numBytesRead = m_resources.Device->Read((char*)&m_blockSize, sizeof(m_blockSize)); + if ( m_isBigEndian ) SwapEndian_32(m_blockSize); + if ( numBytesRead != sizeof(m_blockSize) ) + throw BamException("BamToolsIndex::LoadHeader", "could not read BTI block size"); +} + +void BamToolsIndex::LoadNumBlocks(int& numBlocks) { + const int64_t numBytesRead = m_resources.Device->Read((char*)&numBlocks, sizeof(numBlocks)); + if ( m_isBigEndian ) SwapEndian_32(numBlocks); + if ( numBytesRead != sizeof(numBlocks) ) + throw BamException("BamToolsIndex::LoadNumBlocks", "could not read number of BTI blocks"); +} + +void BamToolsIndex::LoadNumReferences(int& numReferences) { + const int64_t numBytesRead = m_resources.Device->Read((char*)&numReferences, sizeof(numReferences)); + if ( m_isBigEndian ) SwapEndian_32(numReferences); + if ( numBytesRead != sizeof(numReferences) ) + throw BamException("BamToolsIndex::LoadNumReferences", "could not read number of references"); +} + +void BamToolsIndex::LoadReferenceSummary(BtiReferenceSummary& refSummary) { + + // load number of blocks + int numBlocks; + LoadNumBlocks(numBlocks); + + // store block summary data for this reference + refSummary.NumBlocks = numBlocks; + refSummary.FirstBlockFilePosition = Tell(); + + // skip reference's blocks + SkipBlocks(numBlocks); +} + +void BamToolsIndex::OpenFile(const std::string& filename, IBamIODevice::OpenMode mode) { + + // make sure any previous index file is closed + CloseFile(); + + m_resources.Device = BamDeviceFactory::CreateDevice(filename); + if ( m_resources.Device == 0 ) { + const string message = string("could not open 
file: ") + filename; + throw BamException("BamToolsIndex::OpenFile", message); + } + + // attempt to open file + m_resources.Device->Open(mode); + if ( !IsDeviceOpen() ) { + const string message = string("could not open file: ") + filename; + throw BamException("BamToolsIndex::OpenFile", message); + } +} + +void BamToolsIndex::ReadBlock(BtiBlock& block) { + + // read in block data members + int64_t numBytesRead = 0; + numBytesRead += m_resources.Device->Read((char*)&block.MaxEndPosition, sizeof(block.MaxEndPosition)); + numBytesRead += m_resources.Device->Read((char*)&block.StartOffset, sizeof(block.StartOffset)); + numBytesRead += m_resources.Device->Read((char*)&block.StartPosition, sizeof(block.StartPosition)); + + // swap endian-ness if necessary + if ( m_isBigEndian ) { + SwapEndian_32(block.MaxEndPosition); + SwapEndian_64(block.StartOffset); + SwapEndian_32(block.StartPosition); + } + + // check block read ok + const int expectedBytes = sizeof(block.MaxEndPosition) + + sizeof(block.StartOffset) + + sizeof(block.StartPosition); + if ( numBytesRead != expectedBytes ) + throw BamException("BamToolsIndex::ReadBlock", "could not read block"); +} + +void BamToolsIndex::ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks) { + + // prep blocks container + blocks.clear(); + blocks.reserve(refSummary.NumBlocks); + + // skip to first block entry + Seek( refSummary.FirstBlockFilePosition, SEEK_SET ); + + // read & store block entries + BtiBlock block; + for ( int i = 0; i < refSummary.NumBlocks; ++i ) { + ReadBlock(block); + blocks.push_back(block); + } +} + +void BamToolsIndex::ReadReferenceEntry(BtiReferenceEntry& refEntry) { + + // throw if refId is not a valid index in file summary structure + if ( refEntry.ID < 0 || refEntry.ID >= (int)m_indexFileSummary.size() ) + throw BamException("BamToolsIndex::ReadReferenceEntry", "invalid reference requested"); + + // use index summary to assist reading the reference's BTI blocks + const 
BtiReferenceSummary& refSummary = m_indexFileSummary.at(refEntry.ID); + ReadBlocks(refSummary, refEntry.Blocks); +} + +// repositions the index device; throws BamException if the device reports failure +void BamToolsIndex::Seek(const int64_t& position, const int origin) { + if ( !m_resources.Device->Seek(position, origin) ) + throw BamException("BamToolsIndex::Seek", "could not seek in BTI file"); +} + +// advances the index device past @numBlocks serialized BTI blocks +void BamToolsIndex::SkipBlocks(const int& numBlocks) { + // widen before multiplying so the byte count is computed in 64 bits + Seek( static_cast<int64_t>(numBlocks)*BamToolsIndex::SIZEOF_BLOCK, SEEK_CUR ); +} + +int64_t BamToolsIndex::Tell(void) const { + return m_resources.Device->Tell(); +} + +void BamToolsIndex::WriteBlock(const BtiBlock& block) { + + // copy entry data + int32_t maxEndPosition = block.MaxEndPosition; + int64_t startOffset = block.StartOffset; + int32_t startPosition = block.StartPosition; + + // swap endian-ness if necessary + if ( m_isBigEndian ) { + SwapEndian_32(maxEndPosition); + SwapEndian_64(startOffset); + SwapEndian_32(startPosition); + } + + // write the reference index entry + int64_t numBytesWritten = 0; + numBytesWritten += m_resources.Device->Write((const char*)&maxEndPosition, sizeof(maxEndPosition)); + numBytesWritten += m_resources.Device->Write((const char*)&startOffset, sizeof(startOffset)); + numBytesWritten += m_resources.Device->Write((const char*)&startPosition, sizeof(startPosition)); + + // check block written ok + const int expectedBytes = sizeof(maxEndPosition) + + sizeof(startOffset) + + sizeof(startPosition); + if ( numBytesWritten != expectedBytes ) + throw BamException("BamToolsIndex::WriteBlock", "could not write BTI block"); +} + +void BamToolsIndex::WriteBlocks(const BtiBlockVector& blocks) { + BtiBlockVector::const_iterator blockIter = blocks.begin(); + BtiBlockVector::const_iterator blockEnd = blocks.end(); + for ( ; blockIter != blockEnd; ++blockIter ) + WriteBlock(*blockIter); +} + +void BamToolsIndex::WriteHeader(void) { + + int64_t numBytesWritten = 0 ; + + // write BTI index format 'magic number' + numBytesWritten += m_resources.Device->Write(BamToolsIndex::BTI_MAGIC, 4); + + // 
write BTI index format version + int32_t currentVersion = (int32_t)m_outputVersion; + if ( m_isBigEndian ) SwapEndian_32(currentVersion); + numBytesWritten += m_resources.Device->Write((const char*)&currentVersion, sizeof(currentVersion)); + + // write block size + uint32_t blockSize = m_blockSize; + if ( m_isBigEndian ) SwapEndian_32(blockSize); + numBytesWritten += m_resources.Device->Write((const char*)&blockSize, sizeof(blockSize)); + + // write number of references + int32_t numReferences = m_indexFileSummary.size(); + if ( m_isBigEndian ) SwapEndian_32(numReferences); + numBytesWritten += m_resources.Device->Write((const char*)&numReferences, sizeof(numReferences)); + + // check header written ok + const int expectedBytes = 4 + + sizeof(currentVersion) + + sizeof(blockSize) + + sizeof(numReferences); + if ( numBytesWritten != expectedBytes ) + throw BamException("BamToolsIndex::WriteHeader", "could not write BTI header"); +} + +void BamToolsIndex::WriteReferenceEntry(const BtiReferenceEntry& refEntry) { + + // write number of blocks this reference + uint32_t numBlocks = refEntry.Blocks.size(); + if ( m_isBigEndian ) SwapEndian_32(numBlocks); + const int64_t numBytesWritten = m_resources.Device->Write((const char*)&numBlocks, sizeof(numBlocks)); + if ( numBytesWritten != sizeof(numBlocks) ) + throw BamException("BamToolsIndex::WriteReferenceEntry", "could not write number of blocks"); + + // write actual block entries + WriteBlocks(refEntry.Blocks); +} diff --git a/src/api/internal/index/BamToolsIndex_p.h b/src/api/internal/index/BamToolsIndex_p.h new file mode 100644 index 0000000..c1e1aa0 --- /dev/null +++ b/src/api/internal/index/BamToolsIndex_p.h @@ -0,0 +1,186 @@ +// *************************************************************************** +// BamToolsIndex.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// 
--------------------------------------------------------------------------- +// Provides index operations for the BamTools index format (".bti") +// *************************************************************************** + +#ifndef BAMTOOLS_INDEX_FORMAT_H +#define BAMTOOLS_INDEX_FORMAT_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to +// version without notice, or even be removed. +// +// We mean it. + +#include "api/BamAux.h" +#include "api/BamIndex.h" +#include "api/IBamIODevice.h" +#include +#include +#include + +namespace BamTools { +namespace Internal { + +// contains data for each 'block' in a BTI index +struct BtiBlock { + + // data members + int32_t MaxEndPosition; + int64_t StartOffset; + int32_t StartPosition; + + // ctor + BtiBlock(const int32_t& maxEndPosition = 0, + const int64_t& startOffset = 0, + const int32_t& startPosition = 0) + : MaxEndPosition(maxEndPosition) + , StartOffset(startOffset) + , StartPosition(startPosition) + { } +}; + +// convenience typedef for describing a a list of BTI blocks on a reference +typedef std::vector BtiBlockVector; + +// contains all fields necessary for building, loading, & writing +// full BTI index data for a single reference +struct BtiReferenceEntry { + + // data members + int32_t ID; + BtiBlockVector Blocks; + + // ctor + BtiReferenceEntry(const int& id = -1) + : ID(id) + { } +}; + +// provides (persistent) summary of BtiReferenceEntry's index data +struct BtiReferenceSummary { + + // data members + int NumBlocks; + uint64_t FirstBlockFilePosition; + + // ctor + BtiReferenceSummary(void) + : NumBlocks(0) + , FirstBlockFilePosition(0) + { } +}; + +// convenience typedef for describing a full BTI index file summary +typedef std::vector BtiFileSummary; + +class BamToolsIndex : public BamIndex { + + // keep a list of any supported versions here + // (might be 
useful later to handle any 'legacy' versions if the format changes) + // listed for example like: BTI_1_0 = 1, BTI_1_1 = 2, BTI_1_2 = 3, BTI_2_0 = 4, and so on + // + // so a change introduced in BTI_1_2 may be handled from then on by: + // + // if ( indexVersion >= BTI_1_2 ) + // do something new + // else + // do the old thing + enum Version { BTI_1_0 = 1 + , BTI_1_1 + , BTI_1_2 + , BTI_2_0 + }; + + // ctor & dtor + public: + BamToolsIndex(Internal::BamReaderPrivate* reader); + ~BamToolsIndex(void); + + // BamIndex implementation + public: + // builds index from associated BAM file & writes out to index file + bool Create(void); + // returns whether reference has alignments or no + bool HasAlignments(const int& referenceID) const; + // attempts to use index data to jump to @region, returns success/fail + // a "successful" jump indicates no error, but not whether this region has data + // * thus, the method sets a flag to indicate whether there are alignments + // available after the jump position + bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); + // loads existing data from file into memory + bool Load(const std::string& filename); + BamIndex::IndexType Type(void) const { return BamIndex::BAMTOOLS; } + public: + // returns format's file extension + static const std::string Extension(void); + + // internal methods + private: + + // index file ops + void CheckMagicNumber(void); + void CheckVersion(void); + void CloseFile(void); + bool IsDeviceOpen(void) const; + void OpenFile(const std::string& filename, IBamIODevice::OpenMode mode); + void Seek(const int64_t& position, const int origin); + int64_t Tell(void) const; + + // index-creation methods + void ClearReferenceEntry(BtiReferenceEntry& refEntry); + void WriteBlock(const BtiBlock& block); + void WriteBlocks(const BtiBlockVector& blocks); + void WriteHeader(void); + void WriteReferenceEntry(const BtiReferenceEntry& refEntry); + + // random-access methods + void GetOffset(const 
BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion); + void ReadBlock(BtiBlock& block); + void ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks); + void ReadReferenceEntry(BtiReferenceEntry& refEntry); + + // BTI summary data methods + void InitializeFileSummary(const int& numReferences); + void LoadFileSummary(void); + void LoadHeader(void); + void LoadNumBlocks(int& numBlocks); + void LoadNumReferences(int& numReferences); + void LoadReferenceSummary(BtiReferenceSummary& refSummary); + void SkipBlocks(const int& numBlocks); + + // data members + private: + bool m_isBigEndian; + BtiFileSummary m_indexFileSummary; + uint32_t m_blockSize; + int32_t m_inputVersion; // Version is serialized as int + Version m_outputVersion; + + struct RaiiWrapper { + IBamIODevice* Device; + RaiiWrapper(void); + ~RaiiWrapper(void); + }; + RaiiWrapper m_resources; + + // static constants + private: + static const uint32_t DEFAULT_BLOCK_LENGTH; + static const std::string BTI_EXTENSION; + static const char* const BTI_MAGIC; + static const int SIZEOF_BLOCK; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMTOOLS_INDEX_FORMAT_H diff --git a/src/api/internal/index/CMakeLists.txt b/src/api/internal/index/CMakeLists.txt new file mode 100644 index 0000000..1c78cb9 --- /dev/null +++ b/src/api/internal/index/CMakeLists.txt @@ -0,0 +1,17 @@ +# ========================== +# BamTools CMakeLists.txt +# (c) 2011 Derek Barnett +# +# src/api/internal/index +# ========================== + +set ( InternalIndexDir "${InternalDir}/index" ) + +set ( InternalIndexSources + ${InternalIndexDir}/BamIndexFactory_p.cpp + ${InternalIndexDir}/BamStandardIndex_p.cpp + ${InternalIndexDir}/BamToolsIndex_p.cpp + + PARENT_SCOPE # <-- leave this last +) + diff --git a/src/api/internal/io/BamDeviceFactory_p.cpp b/src/api/internal/io/BamDeviceFactory_p.cpp new file mode 100644 index 0000000..f9c7694 --- /dev/null +++ b/src/api/internal/io/BamDeviceFactory_p.cpp @@ 
-0,0 +1,37 @@ +// *************************************************************************** +// BamDeviceFactory_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 25 September 2011 (DB) +// --------------------------------------------------------------------------- +// Creates built-in concrete implementations of IBamIODevices +// *************************************************************************** + +#include "api/internal/io/BamDeviceFactory_p.h" +#include "api/internal/io/BamFile_p.h" +#include "api/internal/io/BamFtp_p.h" +#include "api/internal/io/BamHttp_p.h" +#include "api/internal/io/BamPipe_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include +using namespace std; + +IBamIODevice* BamDeviceFactory::CreateDevice(const string& source) { + + // check for requested pipe + if ( source == "-" || source == "stdin" || source == "stdout" ) + return new BamPipe; + + // check for HTTP prefix + if ( source.find("http://") == 0 ) + return new BamHttp(source); + + // check for FTP prefix + if ( source.find("ftp://") == 0 ) + return new BamFtp(source); + + // otherwise assume a "normal" file + return new BamFile(source); +} diff --git a/src/api/internal/io/BamDeviceFactory_p.h b/src/api/internal/io/BamDeviceFactory_p.h new file mode 100644 index 0000000..1d48533 --- /dev/null +++ b/src/api/internal/io/BamDeviceFactory_p.h @@ -0,0 +1,37 @@ +// *************************************************************************** +// BamDeviceFactory_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 October 2011 (DB) +// --------------------------------------------------------------------------- +// Creates built-in concrete implementations of IBamIODevices +// 
*************************************************************************** + +#ifndef BAMDEVICEFACTORY_P_H +#define BAMDEVICEFACTORY_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. + +#include "api/IBamIODevice.h" +#include + +namespace BamTools { +namespace Internal { + +class BamDeviceFactory { + public: + static IBamIODevice* CreateDevice(const std::string& source); +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMDEVICEFACTORY_P_H diff --git a/src/api/internal/io/BamFile_p.cpp b/src/api/internal/io/BamFile_p.cpp new file mode 100644 index 0000000..990d9bf --- /dev/null +++ b/src/api/internal/io/BamFile_p.cpp @@ -0,0 +1,69 @@ +// *************************************************************************** +// BamFile_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides BAM file-specific IO behavior +// *************************************************************************** + +#include "api/internal/io/BamFile_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include +#include +using namespace std; + +BamFile::BamFile(const string& filename) + : ILocalIODevice() + , m_filename(filename) +{ } + +BamFile::~BamFile(void) { } + +void BamFile::Close(void) { + if ( IsOpen() ) { + m_filename.clear(); + ILocalIODevice::Close(); + } +} + +bool BamFile::IsRandomAccess(void) const { + return true; +} + +bool BamFile::Open(const IBamIODevice::OpenMode mode) { + + // make sure we're starting with a fresh file stream + Close(); + + // attempt to open FILE* depending on 
requested openmode + if ( mode == IBamIODevice::ReadOnly ) + m_stream = fopen(m_filename.c_str(), "rb"); + else if ( mode == IBamIODevice::WriteOnly ) + m_stream = fopen(m_filename.c_str(), "wb"); + else if ( mode == IBamIODevice::ReadWrite ) + m_stream = fopen(m_filename.c_str(), "w+b"); + else { + SetErrorString("BamFile::Open", "unknown open mode requested"); + return false; + } + + // check that we obtained a valid FILE* + if ( m_stream == 0 ) { + const string message_base = string("could not open file handle for "); + const string message = message_base + ( (m_filename.empty()) ? "empty filename" : m_filename ); + SetErrorString("BamFile::Open", message); + return false; + } + + // store current IO mode & return success + m_mode = mode; + return true; +} + +bool BamFile::Seek(const int64_t& position, const int origin) { + BT_ASSERT_X( m_stream, "BamFile::Seek() - null stream" ); + return ( fseek64(m_stream, position, origin) == 0 ); +} diff --git a/src/api/internal/io/BamFile_p.h b/src/api/internal/io/BamFile_p.h new file mode 100644 index 0000000..ed61813 --- /dev/null +++ b/src/api/internal/io/BamFile_p.h @@ -0,0 +1,51 @@ +// *************************************************************************** +// BamFile_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides BAM file-specific IO behavior +// *************************************************************************** + +#ifndef BAMFILE_P_H +#define BAMFILE_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. 
+ +#include "api/internal/io/ILocalIODevice_p.h" +#include + +namespace BamTools { +namespace Internal { + +class BamFile : public ILocalIODevice { + + // ctor & dtor + public: + BamFile(const std::string& filename); + ~BamFile(void); + + // ILocalIODevice implementation + public: + void Close(void); + bool IsRandomAccess(void) const; + bool Open(const IBamIODevice::OpenMode mode); + bool Seek(const int64_t& position, const int origin = SEEK_SET); + + // data members + private: + std::string m_filename; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMFILE_P_H diff --git a/src/api/internal/io/BamFtp_p.cpp b/src/api/internal/io/BamFtp_p.cpp new file mode 100644 index 0000000..d9f933c --- /dev/null +++ b/src/api/internal/io/BamFtp_p.cpp @@ -0,0 +1,498 @@ +// *************************************************************************** +// BamFtp_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides reading/writing of BAM files on FTP server +// *************************************************************************** + +#include "api/BamAux.h" +#include "api/internal/io/BamFtp_p.h" +#include "api/internal/io/TcpSocket_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include +#include +#include +#include +using namespace std; + +namespace BamTools { +namespace Internal { + +// ----------- +// constants +// ----------- + +static const uint16_t FTP_PORT = 21; +static const string FTP_PREFIX = "ftp://"; +static const size_t FTP_PREFIX_LENGTH = 6; +static const string FTP_NEWLINE = "\r\n"; + +static const string DEFAULT_USER = "anonymous"; +static const string DEFAULT_PASS = "anonymous@"; + +static const string ABOR_CMD = "ABOR"; +static const string USER_CMD = "USER"; +static const 
// -----------------
// utility methods
// -----------------

// Splits @source at every @delim character and returns the pieces in order.
// Empty fields between two adjacent delimiters are kept; an empty @source
// yields no fields, and a trailing delimiter does not add an empty field.
static inline
std::vector<std::string> split(const std::string& source, const char delim) {

    std::stringstream ss(source);
    std::string field;
    std::vector<std::string> fields;

    while ( std::getline(ss, field, delim) )
        fields.push_back(field);
    return fields;
}

// Returns true if @source begins with @pattern.
// compare() only looks at the leading pattern.size() characters, so a
// mismatch is detected without scanning the rest of @source.
static inline
bool startsWith(const std::string& source, const std::string& pattern) {
    return ( source.compare(0, pattern.size(), pattern) == 0 );
}

// Returns a lower-cased copy of @s (the argument itself is not modified,
// so callers must use the return value).
static inline
std::string toLower(const std::string& s) {

    // start from a full-size copy: writing through operator[] into a string
    // that was only reserve()'d is out of bounds and leaves its size at zero
    std::string out(s);
    const std::size_t sSize = out.size();
    for ( std::size_t i = 0; i < sSize; ++i ) {
        // tolower() needs a value representable as unsigned char
        out[i] = static_cast<char>( std::tolower(static_cast<unsigned char>(out[i])) );
    }
    return out;
}
+ m_isUrlParsed = false; + m_filePosition = -1; + m_username = DEFAULT_USER; + m_password = DEFAULT_PASS; + m_dataHostname.clear(); + m_dataPort = 0; +} + +bool BamFtp::ConnectCommandSocket(void) { + + BT_ASSERT_X(m_commandSocket, "null command socket?"); + + // connect to FTP server + if ( !m_commandSocket->ConnectToHost(m_hostname, m_port, m_mode) ) { + SetErrorString("BamFtp::ConnectCommandSocket", "could not connect to host"); + return false; + } + + // receive initial reply from host + if ( !ReceiveReply() ) { + Close(); + return false; + } + + // send USER command + string userCommand = USER_CMD + CMD_SEPARATOR + m_username + FTP_NEWLINE; + if ( !SendCommand(userCommand, true) ) { + Close(); + return false; + } + + // send PASS command + string passwordCommand = PASS_CMD + CMD_SEPARATOR + m_password + FTP_NEWLINE; + if ( !SendCommand(passwordCommand, true) ) { + Close(); + return false; + } + + // send TYPE command + string typeCommand = TYPE_CMD + CMD_SEPARATOR + 'I' + FTP_NEWLINE; + if ( !SendCommand(typeCommand, true) ) { + Close(); + return false; + } + + // return success + return true; +} + +bool BamFtp::ConnectDataSocket(void) { + + // failure if can't connect to command socket first + if ( !m_commandSocket->IsConnected() ) { + if ( !ConnectCommandSocket() ) + return false; + } + + // make sure we're starting with a fresh data channel + if ( m_dataSocket->IsConnected() ) + m_dataSocket->DisconnectFromHost(); + + // send passive connection command + const string passiveCommand = PASV_CMD + FTP_NEWLINE; + if ( !SendCommand(passiveCommand, true) ) { + // TODO: set error string + return false; + } + + // retrieve passive connection port + if ( !ParsePassiveResponse() ) { + // TODO: set error string + return false; + } + + // set up restart command (tell server where to start fetching bytes from) + if ( m_filePosition >= 0 ) { + + stringstream fpStream(""); + fpStream << m_filePosition; + string restartCommand = REST_CMD + CMD_SEPARATOR + fpStream.str() + 
FTP_NEWLINE; + if ( !SendCommand(restartCommand, true) ) { + // TODO: set error string + return false; + } + } + + // main file retrieval request + string retrieveCommand = RETR_CMD + CMD_SEPARATOR + m_filename + FTP_NEWLINE; + if ( !SendCommand(retrieveCommand, false) ) { + // TODO: set error string + return false; + } + + // make data channel connection + if ( !m_dataSocket->ConnectToHost(m_dataHostname, m_dataPort) ) { + // TODO: set error string + return false; + } + + // fetch intial reply from server + if ( !ReceiveReply() ) { + // TODO: set error string + m_dataSocket->DisconnectFromHost(); + return false; + } + + // make sure we have reply code 150 (all good) + if ( !startsWith(m_response, "150") ) { + // TODO: set error string + m_dataSocket->DisconnectFromHost(); + return false; + } + + // return success + return true; +} + +bool BamFtp::IsOpen(void) const { + return IBamIODevice::IsOpen() && m_isUrlParsed; +} + +bool BamFtp::IsRandomAccess(void) const { + return true; +} + +bool BamFtp::Open(const IBamIODevice::OpenMode mode) { + + // BamFtp only supports read-only access + if ( mode != IBamIODevice::ReadOnly ) { + SetErrorString("BamFtp::Open", "writing on this device is not supported"); + return false; + } + + // initialize basic valid state + m_mode = mode; + m_filePosition = 0; + + // attempt connection to command & data sockets + return ( ConnectCommandSocket() && ConnectDataSocket() ); +} + +bool BamFtp::ParsePassiveResponse(void) { + + // fail if empty + if ( m_response.empty() ) + return false; + + // find parentheses + const size_t leftParenFound = m_response.find(PASV_REPLY_PREFIX); + const size_t rightParenFound = m_response.find(PASV_REPLY_SUFFIX); + if ( leftParenFound == string::npos || rightParenFound == string::npos ) + return false; + + // grab everything between ( should be "h1,h2,h3,h4,p1,p2" ) + string::const_iterator responseBegin = m_response.begin(); + const string hostAndPort(responseBegin+leftParenFound+1, 
responseBegin+rightParenFound); + + // parse into string fields + vector fields = split(hostAndPort, PASV_REPLY_SEPARATOR); + if ( fields.size() != 6 ) + return false; + + // fetch passive connection IP + m_dataHostname = fields[0] + IP_SEPARATOR + + fields[1] + IP_SEPARATOR + + fields[2] + IP_SEPARATOR + + fields[3]; + + // fetch passive connection port + const uint8_t portUpper = static_cast(atoi(fields[4].c_str())); + const uint8_t portLower = static_cast(atoi(fields[5].c_str())); + m_dataPort = ( portUpper<<8 ) + portLower; + + // return success + return true; +} + +void BamFtp::ParseUrl(const string& url) { + + // clear flag to start + m_isUrlParsed = false; + + // make sure url starts with "ftp://", case-insensitive + string tempUrl(url); + toLower(tempUrl); + const size_t prefixFound = tempUrl.find(FTP_PREFIX); + if ( prefixFound == string::npos ) + return; + + // find end of host name portion (first '/' hit after the prefix) + const size_t firstSlashFound = tempUrl.find(HOST_SEPARATOR, FTP_PREFIX_LENGTH); + if ( firstSlashFound == string::npos ) { + ; // no slash found... no filename given along with host? 
+ } + + // fetch hostname + string hostname = tempUrl.substr(FTP_PREFIX_LENGTH, (firstSlashFound - FTP_PREFIX_LENGTH)); + m_hostname = hostname; + m_port = FTP_PORT; + + // store remainder of URL as filename (must be non-empty) + string filename = tempUrl.substr(firstSlashFound); + if ( filename.empty() ) + return; + m_filename = filename; + + // set parsed OK flag + m_isUrlParsed = true; +} + +int64_t BamFtp::Read(char* data, const unsigned int numBytes) { + + // if BamHttp not in a valid state + if ( !IsOpen() ) + return -1; + + // read until hit desired @numBytes + int64_t bytesReadSoFar = 0; + while ( bytesReadSoFar < numBytes ) { + + // calculate number of bytes we're going to try to read this iteration + const size_t remainingBytes = ( numBytes - bytesReadSoFar ); + + // if either disconnected somehow, or (more likely) we have seeked since last read + if ( !m_dataSocket->IsConnected() ) { + if ( !ConnectDataSocket() ) { + // TODO: set error string + return -1; + } + } + + // read bytes from data socket + const int64_t socketBytesRead = ReadDataSocket(data+bytesReadSoFar, remainingBytes); + if ( socketBytesRead < 0 ) + return -1; + bytesReadSoFar += socketBytesRead; + m_filePosition += socketBytesRead; + } + + // return actual number bytes successfully read + return bytesReadSoFar; +} + +int64_t BamFtp::ReadCommandSocket(char* data, const unsigned int maxNumBytes) { + + // try to read 'remainingBytes' from socket + const int64_t numBytesRead = m_commandSocket->Read(data, maxNumBytes); + if ( numBytesRead < 0 ) + return -1; + return numBytesRead; +} + +int64_t BamFtp::ReadDataSocket(char* data, const unsigned int maxNumBytes) { + + // try to read 'remainingBytes' from socket + const int64_t numBytesRead = m_dataSocket->Read(data, maxNumBytes); + if ( numBytesRead < 0 ) + return -1; + return numBytesRead; +} + +bool BamFtp::ReceiveReply(void) { + + // failure if not connected + if ( !m_commandSocket->IsConnected() ) { + SetErrorString("BamFtp::ReceiveReply()", 
"command socket not connected"); + return false; + } + + m_response.clear(); + + // read header data (& discard for now) + bool headerEnd = false; + while ( !headerEnd ) { + + const string headerLine = m_commandSocket->ReadLine(); + m_response += headerLine; + + // if line is of form 'xyz ', quit reading lines + if ( (headerLine.length() >= 4 ) && + isdigit(headerLine[0]) && + isdigit(headerLine[1]) && + isdigit(headerLine[2]) && + ( headerLine[3] != MULTILINE_CONTINUE ) + ) + { + headerEnd = true; + } + } + + // return success, depending on response + if ( m_response.empty() ) { + SetErrorString("BamFtp::ReceiveReply", "error reading server reply"); + return false; + } + return true; +} + +bool BamFtp::Seek(const int64_t& position, const int origin) { + + // if FTP device not in a valid state + if ( !IsOpen() ) { + // TODO: set error string + return false; + } + + // ---------------------- + // UGLY !! but works?? + // ---------------------- + // disconnect from server + m_dataSocket->DisconnectFromHost(); + m_commandSocket->DisconnectFromHost(); + + // update file position & return success + if ( origin == SEEK_CUR ) + m_filePosition += position; + else if ( origin == SEEK_SET) + m_filePosition = position; + else { + // TODO: set error string + return false; + } + return true; +} + +bool BamFtp::SendCommand(const string& command, bool waitForReply) { + + // failure if not connected + if ( !m_commandSocket->IsConnected() ) { + SetErrorString("BamFtp::SendCommand", "command socket not connected"); + return false; + } + + // write command to 'command socket' + if ( WriteCommandSocket(command.c_str(), command.length()) == -1 ) { + SetErrorString("BamFtp::SendCommand", "error writing to socket"); + // get actual error from command socket?? + return false; + } + + // if we sent a command that receives a response + if ( waitForReply ) + return ReceiveReply(); + + // return success + return true; +} + +int64_t BamFtp::Tell(void) const { + return ( IsOpen() ? 
m_filePosition : -1 ); +} + +int64_t BamFtp::Write(const char* data, const unsigned int numBytes) { + (void)data; + (void)numBytes; + BT_ASSERT_X(false, "BamFtp::Write : write-mode not supported on this device"); + SetErrorString("BamFtp::Write", "write-mode not supported on this device"); + return -1; +} + +int64_t BamFtp::WriteCommandSocket(const char* data, const unsigned int numBytes) { + if ( !m_commandSocket->IsConnected() ) + return -1; + m_commandSocket->ClearBuffer(); + return m_commandSocket->Write(data, numBytes); +} + +int64_t BamFtp::WriteDataSocket(const char* data, const unsigned int numBytes) { + (void)data; + (void)numBytes; + BT_ASSERT_X(false, "BamFtp::WriteDataSocket: write-mode not supported on this device"); + SetErrorString("BamFtp::Write", "write-mode not supported on this device"); + return -1; +} diff --git a/src/api/internal/io/BamFtp_p.h b/src/api/internal/io/BamFtp_p.h new file mode 100644 index 0000000..11f549c --- /dev/null +++ b/src/api/internal/io/BamFtp_p.h @@ -0,0 +1,91 @@ +// *************************************************************************** +// BamFtp_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides reading/writing of BAM files on FTP server +// *************************************************************************** + +#ifndef BAMFTP_P_H +#define BAMFTP_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. 
+ +#include "api/IBamIODevice.h" +#include + +namespace BamTools { +namespace Internal { + +class TcpSocket; + +class BamFtp : public IBamIODevice { + + // ctor & dtor + public: + BamFtp(const std::string& url); + ~BamFtp(void); + + // IBamIODevice implementation + public: + void Close(void); + bool IsOpen(void) const; + bool IsRandomAccess(void) const; + bool Open(const IBamIODevice::OpenMode mode); + int64_t Read(char* data, const unsigned int numBytes); + bool Seek(const int64_t& position, const int origin = SEEK_SET); + int64_t Tell(void) const; + int64_t Write(const char* data, const unsigned int numBytes); + + // internal methods + private: + bool ConnectCommandSocket(void); + bool ConnectDataSocket(void); + bool ParsePassiveResponse(void); + void ParseUrl(const std::string& url); + int64_t ReadCommandSocket(char* data, const unsigned int numBytes); + int64_t ReadDataSocket(char* data, const unsigned int numBytes); + bool ReceiveReply(void); + bool SendCommand(const std::string& command, bool waitForReply); + int64_t WriteCommandSocket(const char* data, const unsigned int numBytes); + int64_t WriteDataSocket(const char* data, const unsigned int numBytes); + + // data members + private: + + // our main sockets + TcpSocket* m_commandSocket; + TcpSocket* m_dataSocket; + + // our connection data + std::string m_hostname; + uint16_t m_port; + std::string m_dataHostname; + uint16_t m_dataPort; + std::string m_filename; + + std::string m_username; + std::string m_password; + + std::string m_response; + + // internal state flags + bool m_isUrlParsed; + + // file position + int64_t m_filePosition; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMFTP_P_H diff --git a/src/api/internal/io/BamHttp_p.cpp b/src/api/internal/io/BamHttp_p.cpp new file mode 100644 index 0000000..e2ade70 --- /dev/null +++ b/src/api/internal/io/BamHttp_p.cpp @@ -0,0 +1,411 @@ +// *************************************************************************** +// BamHttp_p.cpp 
// -----------------
// utility methods
// -----------------

// Returns true if the last pattern.size() characters of @source equal
// @pattern. A @pattern longer than @source never matches.
static inline
bool endsWith(const std::string& source, const std::string& pattern) {

    // guard first: the subtraction below is unsigned and would wrap around
    if ( pattern.size() > source.size() )
        return false;

    // compare the tail directly; looking at the FIRST find() hit gives the
    // wrong answer whenever @pattern also occurs earlier in @source
    const std::size_t tailBegin = source.size() - pattern.size();
    return ( source.compare(tailBegin, pattern.size(), pattern) == 0 );
}

// Returns a lower-cased copy of @s (the argument itself is not modified,
// so callers must use the return value).
static inline
std::string toLower(const std::string& s) {

    // start from a full-size copy: writing through operator[] into a string
    // that was only reserve()'d is out of bounds and leaves its size at zero
    std::string out(s);
    const std::size_t sSize = out.size();
    for ( std::size_t i = 0; i < sSize; ++i ) {
        // tolower() needs a value representable as unsigned char
        out[i] = static_cast<char>( std::tolower(static_cast<unsigned char>(out[i])) );
    }
    return out;
}
m_endRangeFilePosition(-1) +{ + ParseUrl(url); +} + +BamHttp::~BamHttp(void) { + + // close connection & clean up + Close(); + if ( m_socket ) + delete m_socket; +} + +void BamHttp::Close(void) { + + // disconnect socket + m_socket->DisconnectFromHost(); + + // clean up request & response + if ( m_request ) { + delete m_request; + m_request = 0; + } + if ( m_response ) { + delete m_response; + m_response = 0; + } + + // reset state - necessary?? + m_isUrlParsed = false; + m_filePosition = -1; + m_endRangeFilePosition = -1; +} + +bool BamHttp::ConnectSocket(void) { + + BT_ASSERT_X(m_socket, "null socket?"); + + // any state checks, etc? + if ( !m_socket->ConnectToHost(m_hostname, m_port, m_mode) ) { + // TODO: set error string + return false; + } + + // attempt initial request + m_filePosition = 0; + m_endRangeFilePosition = -1; + if ( !SendRequest() ) { + // TODO: set error string + Close(); + return false; + } + + // wait for response from server + if ( !ReceiveResponse() ) { + // TODO: set error string + Close(); + return false; + } + + // return success + return true; +} + +bool BamHttp::EnsureSocketConnection(void) { + if ( m_socket->IsConnected() ) + return true; + else return ConnectSocket(); +} + +bool BamHttp::IsOpen(void) const { + return IBamIODevice::IsOpen() && m_isUrlParsed; +} + +bool BamHttp::IsRandomAccess(void) const { + return true; +} + +bool BamHttp::Open(const IBamIODevice::OpenMode mode) { + + // BamHttp only supports read-only access + if ( mode != IBamIODevice::ReadOnly ) { + SetErrorString("BamHttp::Open", "writing on this device is not supported"); + return false; + } + m_mode = mode; + + // attempt connection to socket + if ( !ConnectSocket() ) { + SetErrorString("BamHttp::Open", m_socket->GetErrorString()); + return false; + } + + // return success + return true; +} + +void BamHttp::ParseUrl(const string& url) { + + // clear flag to start + m_isUrlParsed = false; + + // make sure url starts with "http://", case-insensitive + string 
tempUrl(url); + toLower(tempUrl); + const size_t prefixFound = tempUrl.find(HTTP_PREFIX); + if ( prefixFound == string::npos ) + return; + + // find end of host name portion (first '/' hit after the prefix) + const size_t firstSlashFound = tempUrl.find(HOST_SEPARATOR, HTTP_PREFIX_LENGTH); + if ( firstSlashFound == string::npos ) { + ; // no slash found... no filename given along with host? + } + + // fetch hostname (check for proxy port) + string hostname = tempUrl.substr(HTTP_PREFIX_LENGTH, (firstSlashFound - HTTP_PREFIX_LENGTH)); + const size_t colonFound = hostname.find(PROXY_SEPARATOR); + if ( colonFound != string::npos ) { + ; // TODO: handle proxy port (later, just skip for now) + } else { + m_hostname = hostname; + m_port = HTTP_PORT; + } + + // store remainder of URL as filename (must be non-empty) + string filename = tempUrl.substr(firstSlashFound); + if ( filename.empty() ) + return; + m_filename = filename; + + // set parsed OK flag + m_isUrlParsed = true; +} + +int64_t BamHttp::Read(char* data, const unsigned int numBytes) { + + // if BamHttp not in a valid state + if ( !IsOpen() ) + return -1; + + // read until hit desired @numBytes + int64_t bytesReadSoFar = 0; + while ( bytesReadSoFar < numBytes ) { + + // calculate number of bytes we're going to try to read this iteration + const size_t remainingBytes = ( numBytes - bytesReadSoFar ); + + // if socket has access to entire file contents + // i.e. we received response with full data (status code == 200) + if ( m_endRangeFilePosition < 0 ) { + + // try to read 'remainingBytes' from socket + const int64_t socketBytesRead = ReadFromSocket(data+bytesReadSoFar, remainingBytes); + if ( socketBytesRead < 0 ) + return -1; + bytesReadSoFar += socketBytesRead; + m_filePosition += socketBytesRead; + } + + // socket has access to a range of data (might already be in buffer) + // i.e. 
we received response with partial data (status code == 206) + else { + + // there is data left from last request + if ( m_endRangeFilePosition > m_filePosition ) { + + // try to read either the total 'remainingBytes' or + // whatever we have remaining from last request range + const size_t rangeRemainingBytes = m_endRangeFilePosition - m_filePosition; + const size_t bytesToRead = std::min(remainingBytes, rangeRemainingBytes); + const int64_t socketBytesRead = ReadFromSocket(data+bytesReadSoFar, bytesToRead); + if ( socketBytesRead < 0 ) + return -1; + bytesReadSoFar += socketBytesRead; + m_filePosition += socketBytesRead; + } + + // otherwise, this is a 1st-time read or + // we already read everything from the last GET request + else { + + // request for next range + if ( !SendRequest(remainingBytes) || !ReceiveResponse() ) { + Close(); + return -1; + } + } + } + } + + // return actual number bytes successfully read + return bytesReadSoFar; +} + +int64_t BamHttp::ReadFromSocket(char* data, const unsigned int maxNumBytes) { + + // try to read 'remainingBytes' from socket + const int64_t numBytesRead = m_socket->Read(data, maxNumBytes); + if ( numBytesRead < 0 ) + return -1; + return numBytesRead; +} + +bool BamHttp::ReceiveResponse(void) { + + // clear any prior response + if ( m_response ) + delete m_response; + + // make sure we're connected + if ( !EnsureSocketConnection() ) + return false; + + // fetch header, up until double new line + string responseHeader; + do { + // read line & append to full header + const string headerLine = m_socket->ReadLine(); + responseHeader += headerLine; + + } while ( !endsWith(responseHeader, DOUBLE_NEWLINE) ); + + // sanity check + if ( responseHeader.empty() ) { + // TODO: set error string + Close(); + return false; + } + + // create response from header text + m_response = new HttpResponseHeader(responseHeader); + if ( !m_response->IsValid() ) { + // TODO: set error string + Close(); + return false; + } + + // if we got range 
response as requested + if ( m_response->GetStatusCode() == 206 ) + return true; + + // if we got the full file contents instead of range + else if ( m_response->GetStatusCode() == 200 ) { + + // skip up to current file position + RaiiBuffer tmp(0x8000); + int64_t numBytesRead = 0; + while ( numBytesRead < m_filePosition ) { + int64_t result = ReadFromSocket(tmp.Buffer, 0x8000); + if ( result < 0 ) { + Close(); + return false; + } + numBytesRead += result; + } + + // return success + return true; + } + + // on any other reponse status + // TODO: set error string + Close(); + return false; +} + +bool BamHttp::Seek(const int64_t& position, const int origin) { + + // if HTTP device not in a valid state + if ( !IsOpen() ) { + // TODO: set error string + return false; + } + + // discard socket's buffer contents, update positions, & return success + m_socket->ClearBuffer(); + + if ( origin == SEEK_CUR ) + m_filePosition += position; + else if ( origin == SEEK_SET ) + m_filePosition = position; + else { + // TODO: set error string + return false; + } + m_endRangeFilePosition = m_filePosition; + return true; +} + +bool BamHttp::SendRequest(const size_t numBytes) { + + // remove any currently active request + if ( m_request ) + delete m_request; + + // create range string + m_endRangeFilePosition = m_filePosition + numBytes; + stringstream range(""); + range << BYTES_PREFIX << m_filePosition << '-' << m_endRangeFilePosition; + + // make sure we're connected + if ( !EnsureSocketConnection() ) + return false; + + // create request + m_request = new HttpRequestHeader(GET_METHOD, m_filename); + m_request->SetField(HOST_HEADER, m_hostname); + m_request->SetField(RANGE_HEADER, range.str()); + + // write request to socket + const string requestHeader = m_request->ToString(); + const size_t headerSize = requestHeader.size(); + return ( WriteToSocket(requestHeader.c_str(), headerSize) == headerSize ); +} + +int64_t BamHttp::Tell(void) const { + return ( IsOpen() ? 
m_filePosition : -1 ); +} + +int64_t BamHttp::Write(const char* data, const unsigned int numBytes) { + (void)data; + (void)numBytes; + BT_ASSERT_X(false, "BamHttp::Write : write-mode not supported on this device"); + SetErrorString("BamHttp::Write", "write-mode not supported on this device"); + return -1; +} + +int64_t BamHttp::WriteToSocket(const char* data, const unsigned int numBytes) { + if ( !m_socket->IsConnected() ) + return -1; + m_socket->ClearBuffer(); + return m_socket->Write(data, numBytes); +} diff --git a/src/api/internal/io/BamHttp_p.h b/src/api/internal/io/BamHttp_p.h new file mode 100644 index 0000000..371ccce --- /dev/null +++ b/src/api/internal/io/BamHttp_p.h @@ -0,0 +1,87 @@ +// *************************************************************************** +// BamHttp_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides reading/writing of BAM files on HTTP server +// *************************************************************************** + +#ifndef BAMHTTP_P_H +#define BAMHTTP_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. 
+ +#include "api/IBamIODevice.h" +#include + +namespace BamTools { +namespace Internal { + +class HttpRequestHeader; +class HttpResponseHeader; +class TcpSocket; + +class BamHttp : public IBamIODevice { + + // ctor & dtor + public: + BamHttp(const std::string& url); + ~BamHttp(void); + + // IBamIODevice implementation + public: + void Close(void); + bool IsOpen(void) const; + bool IsRandomAccess(void) const; + bool Open(const IBamIODevice::OpenMode mode); + int64_t Read(char* data, const unsigned int numBytes); + bool Seek(const int64_t& position, const int origin = SEEK_SET); + int64_t Tell(void) const; + int64_t Write(const char* data, const unsigned int numBytes); + + // internal methods + private: + bool ConnectSocket(void); + bool EnsureSocketConnection(void); + void ParseUrl(const std::string& url); + int64_t ReadFromSocket(char* data, const unsigned int numBytes); + bool ReceiveResponse(void); + bool SendRequest(const size_t numBytes = 0); + int64_t WriteToSocket(const char* data, const unsigned int numBytes); + + // data members + private: + + // our main socket + TcpSocket* m_socket; + + // our connection data + std::string m_hostname; + std::string m_port; + std::string m_filename; + + // our last (active) request & response info + HttpRequestHeader* m_request; + HttpResponseHeader* m_response; + + // internal state flags + bool m_isUrlParsed; + + // file position + int64_t m_filePosition; + int64_t m_endRangeFilePosition; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMHTTP_P_H diff --git a/src/api/internal/io/BamPipe_p.cpp b/src/api/internal/io/BamPipe_p.cpp new file mode 100644 index 0000000..2d571fd --- /dev/null +++ b/src/api/internal/io/BamPipe_p.cpp @@ -0,0 +1,61 @@ +// *************************************************************************** +// BamPipe_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last 
modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides BAM pipe-specific IO behavior +// *************************************************************************** + +#include "api/internal/io/BamPipe_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include +#include +using namespace std; + +BamPipe::BamPipe(void) : ILocalIODevice() { } + +BamPipe::~BamPipe(void) { } + +bool BamPipe::IsRandomAccess(void) const { + return false; +} + +bool BamPipe::Open(const IBamIODevice::OpenMode mode) { + + // make sure we're starting with a fresh pipe + Close(); + + // open stdin/stdout depending on requested openmode + if ( mode == IBamIODevice::ReadOnly ) + m_stream = freopen(0, "rb", stdin); + else if ( mode == IBamIODevice::WriteOnly ) + m_stream = freopen(0, "wb", stdout); + else { + const string errorType = string( (mode == IBamIODevice::ReadWrite) ? "unsupported" + : "unknown" ); + const string message = errorType + " open mode requested"; + SetErrorString("BamPipe::Open", message); + return false; + } + + // check that we obtained a valid FILE* + if ( m_stream == 0 ) { + const string message_base = string("could not open handle on "); + const string message = message_base + ( (mode == IBamIODevice::ReadOnly) ? 
"stdin" + : "stdout" ); + SetErrorString("BamPipe::Open", message); + return false; + } + + // store current IO mode & return success + m_mode = mode; + return true; +} + +bool BamPipe::Seek(const int64_t&, const int) { + SetErrorString("BamPipe::Seek", "random access not allowed in FIFO pipe"); + return false; +} diff --git a/src/api/internal/io/BamPipe_p.h b/src/api/internal/io/BamPipe_p.h new file mode 100644 index 0000000..1a95cc7 --- /dev/null +++ b/src/api/internal/io/BamPipe_p.h @@ -0,0 +1,46 @@ +// *************************************************************************** +// BamPipe_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides BAM pipe-specific IO behavior +// *************************************************************************** + +#ifndef BAMPIPE_P_H +#define BAMPIPE_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. 
+
+#include "api/internal/io/ILocalIODevice_p.h"
+#include <cstdio>
+
+namespace BamTools {
+namespace Internal {
+
+class BamPipe : public ILocalIODevice {
+
+    // ctor & dtor
+    public:
+        BamPipe(void);
+        ~BamPipe(void);
+
+    // IBamIODevice implementation (stdin/stdout only; Seek() always fails)
+    public:
+        bool IsRandomAccess(void) const;
+        bool Open(const IBamIODevice::OpenMode mode);
+        bool Seek(const int64_t& position, const int origin = SEEK_SET);
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMPIPE_P_H
diff --git a/src/api/internal/io/BgzfStream_p.cpp b/src/api/internal/io/BgzfStream_p.cpp
new file mode 100644
index 0000000..7f73d67
--- /dev/null
+++ b/src/api/internal/io/BgzfStream_p.cpp
@@ -0,0 +1,470 @@
+// ***************************************************************************
+// BgzfStream_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011(DB)
+// ---------------------------------------------------------------------------
+// Based on BGZF routines developed at the Broad Institute.
+// Provides the basic functionality for reading & writing BGZF files
+// Replaces the old BGZF.* files to avoid clashing with other toolkits
+// ***************************************************************************
+
+#include "api/BamAux.h"
+#include "api/BamConstants.h"
+#include "api/internal/io/BamDeviceFactory_p.h"
+#include "api/internal/io/BgzfStream_p.h"
+#include "api/internal/utils/BamException_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include "zlib.h"
+
+#include <algorithm>
+#include <cstring>
+#include <sstream>
+#include <string>
+using namespace std;
+
+// ---------------------------
+// BgzfStream implementation
+// ---------------------------
+
+// constructor (no device attached; both block buffers are allocated up front)
+BgzfStream::BgzfStream(void)
+    : m_blockLength(0)
+    , m_blockOffset(0)
+    , m_blockAddress(0)
+    , m_isWriteCompressed(true)
+    , m_device(0)
+    , m_uncompressedBlock(Constants::BGZF_DEFAULT_BLOCK_SIZE)
+    , m_compressedBlock(Constants::BGZF_MAX_BLOCK_SIZE)
+{ }
+
+// destructor (NOTE: Close() flushes pending output and can throw BamException on failure)
+BgzfStream::~BgzfStream(void) {
+    Close();
+}
+
+// checks BGZF block header (gzip magic, deflate method, FEXTRA flag & the BGZF extra subfield)
+bool BgzfStream::CheckBlockHeader(char* header) {
+    return (header[0] == Constants::GZIP_ID1 &&
+            header[1] == Constants::GZIP_ID2 &&
+            header[2] == Z_DEFLATED &&
+            (header[3] & Constants::FLG_FEXTRA) != 0 &&
+            BamTools::UnpackUnsignedShort(&header[10]) == Constants::BGZF_XLEN &&
+            header[12] == Constants::BGZF_ID1 &&
+            header[13] == Constants::BGZF_ID2 &&
+            BamTools::UnpackUnsignedShort(&header[14]) == Constants::BGZF_LEN );
+}
+
+// closes BGZF file (flushes pending output, releases the device, resets all stream state)
+void BgzfStream::Close(void) {
+
+    // skip if no device open
+    if ( m_device == 0 ) return;
+
+    // if writing to file, flush the current BGZF block,
+    // then write an empty block (as EOF marker)
+    if ( m_device->IsOpen() && (m_device->Mode() == IBamIODevice::WriteOnly) ) {
+        FlushBlock();
+        const size_t blockLength = DeflateBlock();   // m_blockOffset is 0 here => empty block
+        m_device->Write(m_compressedBlock.Buffer, blockLength);
+    }
+
+    // close device
+    m_device->Close();
+    delete m_device;
+    m_device = 0;
+
+    // ensure our buffers are cleared out
+    m_uncompressedBlock.Clear();
+    m_compressedBlock.Clear();
+
+    // reset state
+    m_blockLength = 0;
+    m_blockOffset = 0;
+    m_blockAddress = 0;
+    m_isWriteCompressed = true;
+}
+
+// compresses the current block into m_compressedBlock; returns total block size (header + data + footer)
+size_t BgzfStream::DeflateBlock(void) {
+
+    // initialize the gzip header
+    char* buffer = m_compressedBlock.Buffer;
+    memset(buffer, 0, 18);
+    buffer[0]  = Constants::GZIP_ID1;
+    buffer[1]  = Constants::GZIP_ID2;
+    buffer[2]  = Constants::CM_DEFLATE;
+    buffer[3]  = Constants::FLG_FEXTRA;
+    buffer[9]  = Constants::OS_UNKNOWN;
+    buffer[10] = Constants::BGZF_XLEN;
+    buffer[12] = Constants::BGZF_ID1;
+    buffer[13] = Constants::BGZF_ID2;
+    buffer[14] = Constants::BGZF_LEN;
+
+    // set compression level (level 0 still emits a valid deflate stream, just uncompressed)
+    const int compressionLevel = ( m_isWriteCompressed ? Z_DEFAULT_COMPRESSION : 0 );
+
+    // loop to retry for blocks that do not compress enough
+    int inputLength = m_blockOffset;
+    size_t compressedLength = 0;
+    const unsigned int bufferSize = Constants::BGZF_MAX_BLOCK_SIZE;
+
+    while ( true ) {
+
+        // initialize zstream values (value-init so 'opaque' is null too, as zlib requires)
+        z_stream zs = z_stream();
+        zs.zalloc    = NULL;
+        zs.zfree     = NULL;
+        zs.next_in   = (Bytef*)m_uncompressedBlock.Buffer;
+        zs.avail_in  = inputLength;
+        zs.next_out  = (Bytef*)&buffer[Constants::BGZF_BLOCK_HEADER_LENGTH];
+        zs.avail_out = bufferSize -
+                       Constants::BGZF_BLOCK_HEADER_LENGTH -
+                       Constants::BGZF_BLOCK_FOOTER_LENGTH;
+
+        // initialize the zlib compression algorithm
+        int status = deflateInit2(&zs,
+                                  compressionLevel,
+                                  Z_DEFLATED,
+                                  Constants::GZIP_WINDOW_BITS,
+                                  Constants::Z_DEFAULT_MEM_LEVEL,
+                                  Z_DEFAULT_STRATEGY);
+        if ( status != Z_OK )
+            throw BamException("BgzfStream::DeflateBlock", "zlib deflateInit2 failed");
+
+        // compress the data
+        status = deflate(&zs, Z_FINISH);
+
+        // if not at stream end
+        if ( status != Z_STREAM_END ) {
+
+            deflateEnd(&zs);
+
+            // there was not enough space available in buffer
+            // try to reduce the input length & re-start loop
+            if ( status == Z_OK ) {
+                inputLength -= 1024;
+                if ( inputLength < 0 )
+                    throw BamException("BgzfStream::DeflateBlock", "input reduction failed");
+                continue;
+            }
+
+            throw BamException("BgzfStream::DeflateBlock", "zlib deflate failed");
+        }
+
+        // finalize the compression routine
+        status = deflateEnd(&zs);
+        if ( status != Z_OK )
+            throw BamException("BgzfStream::DeflateBlock", "zlib deflateEnd failed");
+
+        // update compressedLength
+        compressedLength = zs.total_out +
+                           Constants::BGZF_BLOCK_HEADER_LENGTH +
+                           Constants::BGZF_BLOCK_FOOTER_LENGTH;
+        if ( compressedLength > Constants::BGZF_MAX_BLOCK_SIZE )
+            throw BamException("BgzfStream::DeflateBlock", "deflate overflow");
+
+        // quit while loop
+        break;
+    }
+
+    // store the compressed length (header field holds total block size minus one)
+    BamTools::PackUnsignedShort(&buffer[16], static_cast<uint16_t>(compressedLength - 1));
+
+    // store the CRC32 checksum & uncompressed size in the 8-byte footer
+    uint32_t crc = crc32(0, NULL, 0);
+    crc = crc32(crc, (Bytef*)m_uncompressedBlock.Buffer, inputLength);
+    BamTools::PackUnsignedInt(&buffer[compressedLength - 8], crc);
+    BamTools::PackUnsignedInt(&buffer[compressedLength - 4], inputLength);
+
+    // ensure that we have less than a block of data left
+    int remaining = m_blockOffset - inputLength;
+    if ( remaining > 0 ) {
+        if ( remaining > inputLength )
+            throw BamException("BgzfStream::DeflateBlock", "after deflate, remainder too large");
+        memcpy(m_uncompressedBlock.Buffer, m_uncompressedBlock.Buffer + inputLength, remaining);
+    }
+
+    // update block data (any bytes that did not fit stay queued at the buffer start)
+    m_blockOffset = remaining;
+
+    // return result
+    return compressedLength;
+}
+
+// flushes the data in the BGZF block (throws BamException on device error or short write)
+void BgzfStream::FlushBlock(void) {
+
+    BT_ASSERT_X( m_device, "BgzfStream::FlushBlock() - attempting to flush to null device" );
+
+    // flush all of the remaining blocks
+    while ( m_blockOffset > 0 ) {
+
+        // compress the data block
+        const size_t blockLength = DeflateBlock();
+
+        // flush the data to our output device
+        const int64_t numBytesWritten = m_device->Write(m_compressedBlock.Buffer, blockLength);
+
+        // check for device error
+        if ( numBytesWritten < 0 ) {
+            const string message = string("device error: ") + m_device->GetErrorString();
+            throw BamException("BgzfStream::FlushBlock", message);
+        }
+
+        // check that we wrote expected numBytes
+        if ( numBytesWritten != static_cast<int64_t>(blockLength) ) {
+            stringstream s("");
+            s << "expected to write " << blockLength
+              << " bytes during flushing, but wrote " << numBytesWritten;
+            throw BamException("BgzfStream::FlushBlock", s.str());
+        }
+
+        // update block data
+        m_blockAddress += blockLength;
+    }
+}
+
+// decompresses the current block; 'blockLength' is the full on-disk block size; returns inflated size
+size_t BgzfStream::InflateBlock(const size_t& blockLength) {
+
+    // setup zlib stream object (value-init so 'opaque' is null too, as zlib requires)
+    z_stream zs = z_stream();
+    zs.zalloc    = NULL;
+    zs.zfree     = NULL;
+    zs.next_in   = (Bytef*)m_compressedBlock.Buffer + 18;
+    zs.avail_in  = blockLength - 16;
+    zs.next_out  = (Bytef*)m_uncompressedBlock.Buffer;
+    zs.avail_out = Constants::BGZF_DEFAULT_BLOCK_SIZE;
+
+    // initialize
+    int status = inflateInit2(&zs, Constants::GZIP_WINDOW_BITS);
+    if ( status != Z_OK )
+        throw BamException("BgzfStream::InflateBlock", "zlib inflateInit failed");
+
+    // decompress
+    status = inflate(&zs, Z_FINISH);
+    if ( status != Z_STREAM_END ) {
+        inflateEnd(&zs);
+        throw BamException("BgzfStream::InflateBlock", "zlib inflate failed");
+    }
+
+    // finalize
+    status = inflateEnd(&zs);
+    if ( status != Z_OK ) {
+        // inflateEnd() was already attempted just above; do not call it a second time
+        throw BamException("BgzfStream::InflateBlock", "zlib inflateEnd failed");
+    }
+
+    // return result
+    return zs.total_out;
+}
+
+bool BgzfStream::IsOpen(void) const {
+    if ( m_device == 0 )
+        return false;
+    return m_device->IsOpen();
+}
+
+void BgzfStream::Open(const string& filename, const IBamIODevice::OpenMode mode) {
+
+    // close current device if necessary
+    Close();
+    BT_ASSERT_X( (m_device == 0), "BgzfStream::Open() - unable to properly close previous IO device" );
+
+    // retrieve new IO device depending on filename
+    m_device = BamDeviceFactory::CreateDevice(filename);
+    BT_ASSERT_X( m_device, "BgzfStream::Open() - unable to create IO device from filename" );
+
+    // if device fails to open
+    if ( !m_device->Open(mode) ) {
+        const string deviceError = m_device->GetErrorString();
+        const string message = string("could not open BGZF stream: \n\t") + deviceError;
+        throw BamException("BgzfStream::Open", message);
+    }
+}
+
+// reads BGZF data into a byte buffer; returns bytes copied (fewer than requested at end of data)
+size_t BgzfStream::Read(char* data, const size_t dataLength) {
+
+    if ( dataLength == 0 )
+        return 0;
+
+    // if stream not open for reading
+    BT_ASSERT_X( m_device, "BgzfStream::Read() - trying to read from null device");
+    if ( !m_device->IsOpen() || (m_device->Mode() != IBamIODevice::ReadOnly) )
+        return 0;
+
+    // read blocks as needed until desired data length is retrieved
+    char* output = data;
+    size_t numBytesRead = 0;
+    while ( numBytesRead < dataLength ) {
+
+        // determine bytes available in current block
+        int bytesAvailable = m_blockLength - m_blockOffset;
+
+        // read (and decompress) next block if needed
+        if ( bytesAvailable <= 0 ) {
+            ReadBlock();
+            bytesAvailable = m_blockLength - m_blockOffset;
+            if ( bytesAvailable <= 0 )
+                break;
+        }
+
+        // copy data from uncompressed source buffer into data destination buffer
+        const size_t copyLength = min( (dataLength-numBytesRead), (size_t)bytesAvailable );
+        memcpy(output, m_uncompressedBlock.Buffer + m_blockOffset, copyLength);
+
+        // update counters
+        m_blockOffset += copyLength;
+        output        += copyLength;
+        numBytesRead  += copyLength;
+    }
+
+    // update block data (block fully consumed: point at the next block's start)
+    if ( m_blockOffset == m_blockLength ) {
+        m_blockAddress = m_device->Tell();
+        m_blockOffset  = 0;
+        m_blockLength  = 0;
+
+    }
+
+    // return actual number of bytes read
+    return numBytesRead;
+}
+
+// reads a BGZF block from the device & inflates it (sets m_blockLength to 0 at end of data)
+void BgzfStream::ReadBlock(void) {
+
+    BT_ASSERT_X( m_device, "BgzfStream::ReadBlock() - trying to read from null IO device");
+
+    // store block's starting address
+    int64_t blockAddress = m_device->Tell();
+
+    // read block header from file
+    char header[Constants::BGZF_BLOCK_HEADER_LENGTH];
+    int64_t numBytesRead = m_device->Read(header, Constants::BGZF_BLOCK_HEADER_LENGTH);
+
+    // check for device error
+    if ( numBytesRead < 0 ) {
+        const string message = string("device error: ") + m_device->GetErrorString();
+        throw BamException("BgzfStream::ReadBlock", message);
+    }
+
+    // if block header empty
+    if ( numBytesRead == 0 ) {
+        m_blockLength = 0;
+        return;
+    }
+
+    // if block header invalid size
+    if ( numBytesRead != static_cast<int64_t>(Constants::BGZF_BLOCK_HEADER_LENGTH) )
+        throw BamException("BgzfStream::ReadBlock", "invalid block header size");
+
+    // validate block header contents, incl. the advertised block size (file data is untrusted)
+    const size_t blockLength = BamTools::UnpackUnsignedShort(&header[16]) + 1;
+    if ( !BgzfStream::CheckBlockHeader(header) ||
+         blockLength < static_cast<size_t>(Constants::BGZF_BLOCK_HEADER_LENGTH + Constants::BGZF_BLOCK_FOOTER_LENGTH) )
+        throw BamException("BgzfStream::ReadBlock", "invalid block header contents");
+    // copy header contents to compressed buffer
+    memcpy(m_compressedBlock.Buffer, header, Constants::BGZF_BLOCK_HEADER_LENGTH);
+
+    // read remainder of block (cannot underflow: blockLength was bounds-checked above)
+    const size_t remaining = blockLength - Constants::BGZF_BLOCK_HEADER_LENGTH;
+    numBytesRead = m_device->Read(&m_compressedBlock.Buffer[Constants::BGZF_BLOCK_HEADER_LENGTH], remaining);
+
+    // check for device error
+    if ( numBytesRead < 0 ) {
+        const string message = string("device error: ") + m_device->GetErrorString();
+        throw BamException("BgzfStream::ReadBlock", message);
+    }
+
+    // check that we read in expected numBytes
+    if ( numBytesRead != static_cast<int64_t>(remaining) )
+        throw BamException("BgzfStream::ReadBlock", "could not read data from block");
+
+    // decompress block data
+    const size_t newBlockLength = InflateBlock(blockLength);
+
+    // update block data (offset is kept when m_blockLength is 0, i.e. right after a Seek())
+    if ( m_blockLength != 0 )
+        m_blockOffset = 0;
+    m_blockAddress = blockAddress;
+    m_blockLength  = newBlockLength;
+}
+
+// seek to position in BGZF file ('position' packs block address << 16 | offset within block)
+void BgzfStream::Seek(const int64_t& position) {
+
+    BT_ASSERT_X( m_device, "BgzfStream::Seek() - trying to seek on null IO device");
+
+    // skip if device is not open
+    if ( !IsOpen() ) return;
+
+    // determine adjusted offset & address
+    int     blockOffset  = (position & 0xFFFF);
+    int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
+
+    // attempt seek in file
+    if ( m_device->IsRandomAccess() && m_device->Seek(blockAddress) ) {
+
+        // update block data & return success
+        m_blockLength  = 0;
+        m_blockAddress = blockAddress;
+        m_blockOffset  = blockOffset;
+    }
+    else {
+        stringstream s("");
+        s << "unable to seek to position: " << position;
+        throw BamException("BgzfStream::Seek", s.str());
+    }
+}
+
+void BgzfStream::SetWriteCompressed(bool ok) {
+    m_isWriteCompressed = ok;
+}
+
+// get file position in BGZF file (same packed encoding that Seek() accepts; 0 if not open)
+int64_t BgzfStream::Tell(void) const {
+    if ( !IsOpen() )
+        return 0;
+    return ( (m_blockAddress << 16) | (m_blockOffset & 0xFFFF) );
+}
+
+// writes the supplied data into the BGZF buffer, flushing each time the buffer fills
+size_t BgzfStream::Write(const char* data, const size_t dataLength) {
+
+    BT_ASSERT_X( m_device, "BgzfStream::Write() - trying to write to null IO device");
+    BT_ASSERT_X( (m_device->Mode() == IBamIODevice::WriteOnly),
+                 "BgzfStream::Write() - trying to write to non-writable IO device");
+
+    // skip if file not open for writing
+    if ( !IsOpen() )
+        return 0;
+
+    // write blocks as needed til all data is written
+    size_t numBytesWritten = 0;
+    const char* input = data;
+    const size_t blockLength = Constants::BGZF_DEFAULT_BLOCK_SIZE;
+    while ( numBytesWritten < dataLength ) {
+
+        // copy data contents to uncompressed output buffer
+        unsigned int copyLength = min(blockLength - m_blockOffset, dataLength - numBytesWritten);
+        char* buffer = m_uncompressedBlock.Buffer;
+        memcpy(buffer + m_blockOffset, input, copyLength);
+
+        // update counter
+        m_blockOffset   += copyLength;
+        input           += copyLength;
+        numBytesWritten += copyLength;
+
+        // flush (& compress) output buffer when full
+        if ( m_blockOffset == blockLength )
+            FlushBlock();
+    }
+
+    // return actual number of bytes written
+    return numBytesWritten;
+}
diff --git a/src/api/internal/io/BgzfStream_p.h b/src/api/internal/io/BgzfStream_p.h
new
file mode 100644
index 0000000..47b3609
--- /dev/null
+++ b/src/api/internal/io/BgzfStream_p.h
@@ -0,0 +1,93 @@
+// ***************************************************************************
+// BgzfStream_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011(DB)
+// ---------------------------------------------------------------------------
+// Based on BGZF routines developed at the Broad Institute.
+// Provides the basic functionality for reading & writing BGZF files
+// Replaces the old BGZF.* files to avoid clashing with other toolkits
+// ***************************************************************************
+
+#ifndef BGZFSTREAM_P_H
+#define BGZFSTREAM_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include "api/api_global.h"
+#include "api/BamAux.h"
+#include "api/IBamIODevice.h"
+#include <string>
+
+namespace BamTools {
+namespace Internal {
+
+class BgzfStream {
+
+    // constructor & destructor (NOTE: owns & deletes m_device; instances must not be copied)
+    public:
+        BgzfStream(void);
+        ~BgzfStream(void);
+
+    // main interface methods
+    public:
+        // closes BGZF file (flushes pending output & writes the empty EOF block when writing)
+        void Close(void);
+        // returns true if BgzfStream open for IO
+        bool IsOpen(void) const;
+        // opens the BGZF file (throws BamException if the device cannot be opened)
+        void Open(const std::string& filename, const IBamIODevice::OpenMode mode);
+        // reads BGZF data into a byte buffer; returns number of bytes actually read
+        size_t Read(char* data, const size_t dataLength);
+        // seek to position in BGZF file (position = block address << 16 | offset in block)
+        void Seek(const int64_t& position);
+        // sets IO device (closes previous, if any, but does not attempt to open)
+        void SetIODevice(IBamIODevice* device);
+        // enable/disable compressed output
+        void SetWriteCompressed(bool ok);
+        // get file position in BGZF file (same encoding that Seek() accepts)
+        int64_t Tell(void) const;
+        // writes the supplied data into the BGZF buffer; returns number of bytes accepted
+        size_t Write(const char* data, const size_t dataLength);
+
+    // internal methods
+    private:
+        // compresses the current block; returns total compressed block size
+        size_t DeflateBlock(void);
+        // flushes the data in the BGZF block
+        void FlushBlock(void);
+        // de-compresses the current block; returns number of inflated bytes
+        size_t InflateBlock(const size_t& blockLength);
+        // reads a BGZF block
+        void ReadBlock(void);
+
+    // static 'utility' methods
+    public:
+        // checks BGZF block header
+        static bool CheckBlockHeader(char* header);
+
+    // data members
+    public:
+        unsigned int m_blockLength;     // inflated size of the current block (read mode)
+        unsigned int m_blockOffset;     // cursor within the uncompressed block
+        uint64_t     m_blockAddress;    // device offset of the current block
+
+        bool m_isWriteCompressed;
+        IBamIODevice* m_device;
+
+        RaiiBuffer m_uncompressedBlock;
+        RaiiBuffer m_compressedBlock;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BGZFSTREAM_P_H
diff --git a/src/api/internal/io/ByteArray_p.cpp b/src/api/internal/io/ByteArray_p.cpp
new file mode 100644
index 0000000..5f54c83
--- /dev/null
+++ b/src/api/internal/io/ByteArray_p.cpp
@@ -0,0 +1,111 @@
+// 
***************************************************************************
+// ByteArray_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 November 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides a dynamic, variable-length byte buffer
+// ***************************************************************************
+
+#include "api/internal/io/ByteArray_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cstring>
+#include <vector>
+using namespace std;
+
+// --------------------------
+// ByteArray implementation
+// --------------------------
+
+ByteArray::ByteArray(void)
+    : m_data()
+{ }
+
+ByteArray::ByteArray(const string& value)
+    : m_data(value.begin(), value.end())
+{ }
+
+ByteArray::ByteArray(const vector<char>& value)
+    : m_data(value)
+{ }
+
+ByteArray::ByteArray(const char* value, size_t n) {   // copies exactly n bytes (embedded NULs kept)
+    const string s(value, n);
+    m_data.assign(s.begin(), s.end());
+}
+
+ByteArray::ByteArray(const ByteArray& other)
+    : m_data(other.m_data)
+{ }
+
+ByteArray::~ByteArray(void) { }
+
+ByteArray& ByteArray::operator=(const ByteArray& other) {
+    m_data = other.m_data;
+    return *this;
+}
+
+void ByteArray::Clear(void) {
+    m_data.clear();
+}
+
+const char* ByteArray::ConstData(void) const {
+    return ( m_data.empty() ? 0 : &m_data[0] );   // indexing an empty vector is undefined
+}
+
+char* ByteArray::Data(void) {
+    return ( m_data.empty() ? 0 : &m_data[0] );   // indexing an empty vector is undefined
+}
+
+const char& ByteArray::operator[](size_t i) const {
+    return m_data[i];   // no bounds checking
+}
+
+char& ByteArray::operator[](size_t i) {
+    return m_data[i];   // no bounds checking
+}
+
+size_t ByteArray::IndexOf(const char c, const size_t from, const size_t to) const {   // Size() if absent
+    const size_t size = ( (to == 0 || to > m_data.size()) ? m_data.size() : to );   // 0 => search to end
+    for ( size_t i = from; i < size; ++i ) {
+        if ( m_data.at(i) == c )
+            return i;
+    }
+    return m_data.size();
+}
+
+ByteArray& ByteArray::Remove(size_t from, size_t n) {
+
+    // if 'from' outside range, just return
+    const size_t originalSize = m_data.size();
+    if ( from >= originalSize )
+        return *this;
+
+    // if asked to clip from 'from' to end (or beyond), simply resize ('from + n' could wrap, so compare this way)
+    if ( n >= originalSize - from )
+        Resize(from);
+
+    // otherwise, shift data & resize
+    else {
+        memmove( &m_data[from], &m_data[from+n], (originalSize-from-n) );
+        Resize(originalSize - n);
+    }
+
+    // return reference to modified byte array
+    return *this;
+}
+
+void ByteArray::Resize(size_t n) {
+    m_data.resize(n, 0);   // new bytes (if growing) are zero-filled
+}
+
+size_t ByteArray::Size(void) const {
+    return m_data.size();
+}
+
+void ByteArray::Squeeze(void) {   // copy-and-swap so the new storage is allocated for the current contents only
+    vector<char> t(m_data);
+    t.swap(m_data);
+}
diff --git a/src/api/internal/io/ByteArray_p.h b/src/api/internal/io/ByteArray_p.h
new file mode 100644
index 0000000..7e95f6e
--- /dev/null
+++ b/src/api/internal/io/ByteArray_p.h
@@ -0,0 +1,69 @@
+// ***************************************************************************
+// ByteArray_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 November 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides a dynamic, variable-length byte buffer
+// ***************************************************************************
+
+#ifndef BYTEARRAY_P_H
+#define BYTEARRAY_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include "api/api_global.h"
+#include <string>
+#include <vector>
+
+namespace BamTools {
+namespace Internal {
+
+// provides a wrapper around a byte vector
+class ByteArray {
+
+    // ctors & dtor
+    public:
+        ByteArray(void);
+        ByteArray(const std::string& value);
+        ByteArray(const std::vector<char>& value);
+        ByteArray(const char* value, size_t n);
+        ByteArray(const ByteArray& other);
+        ~ByteArray(void);
+
+        ByteArray& operator=(const ByteArray& other);
+
+    // ByteArray interface
+    public:
+
+        // data access
+        const char* ConstData(void) const;
+        char* Data(void);
+        const char& operator[](size_t i) const;
+        char& operator[](size_t i);
+
+        // byte array manipulation
+        void Clear(void);
+        size_t IndexOf(const char c, const size_t from = 0, const size_t to = 0) const;
+        ByteArray& Remove(size_t from, size_t n);
+        void Resize(size_t n);
+        size_t Size(void) const;
+        void Squeeze(void);
+
+    // data members
+    private:
+        std::vector<char> m_data;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BYTEARRAY_P_H
diff --git a/src/api/internal/io/CMakeLists.txt b/src/api/internal/io/CMakeLists.txt
new file mode 100644
index 0000000..d9da416
--- /dev/null
+++ b/src/api/internal/io/CMakeLists.txt
@@ -0,0 +1,52 @@
+# ==========================
+# BamTools CMakeLists.txt
+# (c) 2011 Derek Barnett
+#
+# src/api/internal/io
+# ==========================
+
+set ( InternalIODir "${InternalDir}/io" )
+
+#--------------------------
+# platform-independent IO
+#--------------------------
+set ( CommonIOSources
+        ${InternalIODir}/BamDeviceFactory_p.cpp
+        ${InternalIODir}/BamFile_p.cpp
+        ${InternalIODir}/BamFtp_p.cpp
+        ${InternalIODir}/BamHttp_p.cpp
+        ${InternalIODir}/BamPipe_p.cpp
+        ${InternalIODir}/BgzfStream_p.cpp
+        ${InternalIODir}/ByteArray_p.cpp
+        ${InternalIODir}/HostAddress_p.cpp
+        ${InternalIODir}/HostInfo_p.cpp
+        ${InternalIODir}/HttpHeader_p.cpp
+        ${InternalIODir}/ILocalIODevice_p.cpp
+        ${InternalIODir}/RollingBuffer_p.cpp
+        ${InternalIODir}/TcpSocket_p.cpp
+        ${InternalIODir}/TcpSocketEngine_p.cpp
+)
+
+#------------------------
+# platform-dependent IO (WIN32 is the CMake variable; _WIN32 is only a compiler macro)
+#------------------------
+if ( WIN32 )
+    set ( PlatformIOSources
+            ${InternalIODir}/TcpSocketEngine_win_p.cpp
+    )
+else ( WIN32 )
+    set ( PlatformIOSources
+            ${InternalIODir}/TcpSocketEngine_unix_p.cpp
+    )
+endif ( WIN32 )
+
+#---------------------------
+# make build-specific list
+#---------------------------
+set ( InternalIOSources
+        ${CommonIOSources}
+        ${PlatformIOSources}
+
+        PARENT_SCOPE # <-- leave this last
+)
+
diff --git a/src/api/internal/io/HostAddress_p.cpp b/src/api/internal/io/HostAddress_p.cpp
new file mode 100644
index 0000000..873087b
--- /dev/null
+++ b/src/api/internal/io/HostAddress_p.cpp
@@ -0,0 +1,396 @@
+// ***************************************************************************
+// HostAddress_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 November 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides a generic IP address container
+// ***************************************************************************
+
+#include "api/internal/io/HostAddress_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cctype>
+#include <cstdlib>
+#include <sstream>
+#include <vector>
+using namespace std;
+
+// ------------------------
+// static utility methods
+// ------------------------
+
+namespace BamTools {
+namespace Internal {
+
+// split a string into fields, on delimiter character (a trailing delimiter yields no empty last field)
+static inline
+vector<string> Split(const string& source, char delim) {
+    stringstream ss(source);
+    string field;
+    vector<string> fields;
+    while ( getline(ss, field, delim) )
+        fields.push_back(field);
+    return fields;
+}
+
+// return number of occurrences of @pattern in @source (overlapping matches are counted)
+static inline
+uint8_t CountHits(const string& source, const string& pattern) {
+
+    uint8_t count(0);
+    size_t found = source.find(pattern);
+    while ( found != string::npos ) {
+        ++count;
+        found = source.find(pattern, found+1);
+    }
+    return count;
+}
+
+static
+bool ParseIp4(const string& address, uint32_t& maybeIp4 ) {   // dotted-quad -> host-order uint32
+
+    // split IP address into string fields
+    vector<string> addressFields = Split(address, '.');
+    if ( addressFields.size() != 4 )
+        return false;
+
+    // convert each field to integer value
+    uint32_t ipv4(0);
+    for ( uint8_t i = 0; i < 4; ++i ) {
+
+        const string& field = addressFields.at(i);
+        const size_t fieldSize = field.size();
+        for ( size_t j = 0; j < fieldSize; ++j ) {
+            if ( !isdigit(static_cast<unsigned char>(field[j])) )   // plain char may be negative
+                return false;
+        }
+
+        const int value = ( (fieldSize == 0 || fieldSize > 3) ? -1 : atoi(field.c_str()) );   // reject "" & overlong octets
+        if ( value < 0 || value > 255 )
+            return false;
+
+        // append byte value
+        ipv4 <<= 8;
+        ipv4 += value;
+    }
+
+    // store 32-bit IP address & return success
+    maybeIp4 = ipv4;
+    return true;
+}
+
+static
+bool ParseIp6(const string& address, uint8_t* maybeIp6 ) {   // fills 16 bytes, back to front
+
+    string tmp = address;
+
+    // look for '%' char (if found, lop off that part of address)
+    // we're going to ignore any link-local zone index, for now at least
+    const size_t percentFound = tmp.rfind('%');
+    if ( percentFound != string::npos )
+        tmp = tmp.substr(0, percentFound);
+
+    // split IP address into string fields
+    vector<string> fields = Split(tmp, ':');
+    const uint8_t numFields = fields.size();
+    if ( numFields < 3 || numFields > 8 )
+        return false;
+
+    // get number of '::' separators
+    const uint8_t numColonColons = CountHits(tmp, "::");
+    if ( numFields == 8 && numColonColons > 1 )
+        return false;
+
+    // check valid IPv6 'compression'
+    // must be valid 'pure' IPv6 or mixed IPv4/6 notation
+    const size_t dotFound = tmp.find('.');
+    const bool isMixed = ( dotFound != string::npos );
+    if ( numColonColons != 1 && (numFields < (isMixed ? 7 : 8)) )
+        return false;
+
+    // iterate over provided fields
+    size_t index = 16;
+    size_t fillCount = 9 - numFields;
+    for ( int8_t i = numFields - 1; i >= 0; --i ) {
+        if ( index == 0 )
+            return false;
+        const string& field = fields.at(i);
+
+        // if field empty
+        if ( field.empty() ) {
+
+            // if last field empty
+            if ( i == numFields - 1 ) {
+                const string& previousField = fields.at(i-1);
+                if ( previousField.empty() )
+                    return false;
+                maybeIp6[--index] = 0;
+                maybeIp6[--index] = 0;
+            }
+
+            // if first field empty
+            else if ( i == 0 ) {
+                // a leading ':' is only legal as part of '::' (e.g. "::1"), so the next field must be empty too
+                const string& nextField = fields.at(i+1);
+                if ( !nextField.empty() ) return false;
+                maybeIp6[--index] = 0;
+                maybeIp6[--index] = 0;
+            }
+
+            // fill in 'compressed' 0s
+            else {
+                for ( uint8_t j = 0; j < fillCount; ++j ) {
+                    if ( index == 0 ) return false;
+                    maybeIp6[--index] = 0;
+                    maybeIp6[--index] = 0;
+                }
+            }
+        }
+
+        // field has data
+        else {
+            uint32_t value = static_cast<uint32_t>( strtoul(field.c_str(), 0, 16) );
+
+            if ( field.find('.') == string::npos && value <= 0xffff ) {   // a '.' marks an embedded IPv4 part
+                maybeIp6[--index] =  value       & 0xff;
+                maybeIp6[--index] = (value >> 8) & 0xff;
+            }
+
+            // possible mixed IPv4/6 notation
+            else {
+
+                // mixed field must be last
+                if ( i != numFields - 1 )
+                    return false;
+
+                // parse the IPv4 section
+                uint32_t maybeIp4;
+                if ( !ParseIp4(field, maybeIp4) )
+                    return false;
+
+                // store IPv4 fields in IPv6 container
+                maybeIp6[--index] =  maybeIp4        & 0xff;
+                maybeIp6[--index] = (maybeIp4 >> 8)  & 0xff;
+                maybeIp6[--index] = (maybeIp4 >> 16) & 0xff;
+                maybeIp6[--index] = (maybeIp4 >> 24) & 0xff;
+                --fillCount;
+            }
+        }
+    }
+
+    // should have parsed OK, return success
+    return true;
+}
+
+} // namespace Internal
+} // namespace BamTools
+
+// ----------------------------
+// HostAddress implementation
+// ----------------------------
+
+HostAddress::HostAddress(void)
+    : m_protocol(HostAddress::UnknownNetworkProtocol)
+    , m_ip4Address(0)
+    , m_hasIpAddress(true)
+{ }
+
+HostAddress::HostAddress(const uint32_t ip4Address)
+    : m_protocol(HostAddress::UnknownNetworkProtocol)
+    , m_ip4Address(0)
+    , m_hasIpAddress(true)
+{
+    SetAddress(ip4Address);
+}
+
+HostAddress::HostAddress(const uint8_t* ip6Address)
+    : m_protocol(HostAddress::UnknownNetworkProtocol)
+    , m_ip4Address(0)
+    , m_hasIpAddress(true)
+{
+    SetAddress(ip6Address);
+}
+
+HostAddress::HostAddress(const IPv6Address& ip6Address)
+    : m_protocol(HostAddress::UnknownNetworkProtocol)
+    , m_ip4Address(0)
+    , m_hasIpAddress(true)
+{
+    SetAddress(ip6Address);
+}
+
+HostAddress::HostAddress(const std::string& address)
+    : m_protocol(HostAddress::UnknownNetworkProtocol)
+    , m_ip4Address(0)
+{
+    SetAddress(address);   // also assigns m_hasIpAddress
+}
+
+HostAddress::HostAddress(const HostAddress& other)
+    : m_protocol(other.m_protocol)
+    , m_ip4Address(other.m_ip4Address)
+    , m_ip6Address(other.m_ip6Address)
+    , m_ipString(other.m_ipString)
+    , m_hasIpAddress(other.m_hasIpAddress)
+{ }
+
+HostAddress::~HostAddress(void) { }
+
+bool HostAddress::operator==(const HostAddress& other) const {
+
+    // if self is IPv4
+    if ( m_protocol == HostAddress::IPv4Protocol ) {
+        return ( other.m_protocol == HostAddress::IPv4Protocol &&
+                 m_ip4Address == other.m_ip4Address
+               );
+    }
+
+    // if self is IPv6
+    else if ( m_protocol == HostAddress::IPv6Protocol ) {
+        return ( other.m_protocol == HostAddress::IPv6Protocol &&
+                 memcmp(&m_ip6Address, &other.m_ip6Address, sizeof(IPv6Address)) == 0
+               );
+    }
+
+    // otherwise compare protocols
+    else return m_protocol == other.m_protocol;
+}
+
+bool HostAddress::operator<(const HostAddress& other) const {
+
+    // if self is IPv4
+    if ( m_protocol == HostAddress::IPv4Protocol ) {
+        if ( other.m_protocol == HostAddress::IPv4Protocol )
+            return m_ip4Address < other.m_ip4Address;   // compare against 'other', not against ourselves
+    }
+
+    // if self is IPv6
+    else if ( m_protocol == HostAddress::IPv6Protocol ) {
+        if ( other.m_protocol == HostAddress::IPv6Protocol )
+            return (memcmp(&m_ip6Address, &other.m_ip6Address, sizeof(IPv6Address)) < 0);
+    }
+
+    // otherwise compare protocol types
+    return m_protocol < other.m_protocol;
+}
+
+void HostAddress::Clear(void) {
+
+    m_protocol = HostAddress::UnknownNetworkProtocol;
+    m_ip4Address = 0;
+    memset(&m_ip6Address, 0, sizeof(IPv6Address));
+    m_ipString.clear();
+
+    // this may feel funny, but cleared IP (equivalent to '0.0.0.0') is technically valid
+    // and that's not really what this flag is checking anyway
+    //
+    // this flag is false *iff* the string passed in is a 'plain-text' hostname (www.foo.bar)
+    m_hasIpAddress = true;
+}
+
+bool HostAddress::HasIPAddress(void) const {
+    return m_hasIpAddress;
+}
+
+bool HostAddress::IsNull(void) const {
+    return m_protocol == HostAddress::UnknownNetworkProtocol;
+}
+
+uint32_t HostAddress::GetIPv4Address(void) const {
+    return m_ip4Address;
+}
+
+IPv6Address HostAddress::GetIPv6Address(void) const {
+    return m_ip6Address;
+}
+
+std::string HostAddress::GetIPString(void) const {
+
+    stringstream ss("");
+
+    // IPv4 format
+    if ( m_protocol == HostAddress::IPv4Protocol ) {
+        ss << ( (m_ip4Address>>24) & 0xff ) << '.'
+           << ( (m_ip4Address>>16) & 0xff ) << '.'
+           << ( (m_ip4Address>> 8) & 0xff ) << '.'
+           << (  m_ip4Address      & 0xff );
+
+    }
+
+    // IPv6 format (eight uncompressed hex groups; no '::' shortening is applied)
+    else if ( m_protocol == HostAddress::IPv6Protocol ) {
+        for ( uint8_t i = 0; i < 8; ++i ) {
+            if ( i != 0 )
+                ss << ':';
+            ss << hex << ( (uint16_t(m_ip6Address[2*i]) << 8) |
+                           (uint16_t(m_ip6Address[2*i+1]))
+                         );
+        }
+    }
+
+    // return result (empty string if unknown protocol)
+    return ss.str();
+}
+
+HostAddress::NetworkProtocol HostAddress::GetProtocol(void) const {
+    return m_protocol;
+}
+
+bool HostAddress::ParseAddress(void) {
+
+    // all IPv6 addresses should have a ':'
+    string s = m_ipString;
+    size_t found = s.find(':');
+    if ( found != string::npos ) {
+        // try parse IP6 address
+        uint8_t maybeIp6[16];
+        if ( ParseIp6(s, maybeIp6) ) {
+            SetAddress(maybeIp6);
+            m_protocol = HostAddress::IPv6Protocol;
+            return true;
+        }
+    }
+
+    // all IPv4 addresses should have a '.'
+    found = s.find('.');
+    if ( found != string::npos ) {
+        uint32_t maybeIp4(0);
+        if ( ParseIp4(s, maybeIp4) ) {
+            SetAddress(maybeIp4);
+            m_protocol = HostAddress::IPv4Protocol;
+            return true;
+        }
+    }
+
+    // else likely just a plain-text host name "www.foo.bar"
+    // will need to look up IP address info later
+    m_protocol = HostAddress::UnknownNetworkProtocol;
+    return false;
+}
+
+void HostAddress::SetAddress(const uint32_t ip4Address) {
+    m_ip4Address = ip4Address;
+    m_protocol = HostAddress::IPv4Protocol;
+    m_hasIpAddress = true;
+}
+
+void HostAddress::SetAddress(const uint8_t* ip6Address) {   // reads exactly 16 bytes
+    for ( uint8_t i = 0; i < 16; ++i )
+        m_ip6Address[i] = ip6Address[i];
+    m_protocol = HostAddress::IPv6Protocol;
+    m_hasIpAddress = true;
+}
+
+void HostAddress::SetAddress(const IPv6Address& ip6Address) {
+    m_ip6Address = ip6Address;
+    m_ip4Address = 0;
+    m_protocol = HostAddress::IPv6Protocol;
+    m_hasIpAddress = true;
+}
+
+void HostAddress::SetAddress(const std::string& address) {
+    m_ipString = address;
+    m_hasIpAddress = ParseAddress();   // false => plain-text hostname that still needs a lookup
+}
diff --git a/src/api/internal/io/HostAddress_p.h b/src/api/internal/io/HostAddress_p.h
new file mode 100644
index 0000000..4c1b360
--- /dev/null
+++ b/src/api/internal/io/HostAddress_p.h
@@ -0,0 +1,100 @@
+// ***************************************************************************
+// HostAddress_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 November 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides a generic IP address container
+// ***************************************************************************
+
+#ifndef HOSTADDRESS_P_H
+#define HOSTADDRESS_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail.
This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. + +#include "api/api_global.h" +#include +#include + +namespace BamTools { +namespace Internal { + +struct IPv6Address { + + // ctor + inline IPv6Address(void) { memset(&data, 0, sizeof(uint8_t)*16); } + + // data access (no bounds checking) + inline uint8_t& operator[](size_t index) { return data[index]; } + inline uint8_t operator[](size_t index) const { return data[index]; } + + // data + uint8_t data[16]; +}; + +class HostAddress { + + // enums + public: + enum NetworkProtocol { UnknownNetworkProtocol = -1 + , IPv4Protocol = 0 + , IPv6Protocol + }; + + // ctors & dtor + public: + HostAddress(void); + explicit HostAddress(const uint32_t ip4Address); + explicit HostAddress(const uint8_t* ip6Address); + explicit HostAddress(const IPv6Address& ip6Address); + explicit HostAddress(const std::string& address); + HostAddress(const HostAddress& other); + ~HostAddress(void); + + // HostAddress interface + public: + void Clear(void); + bool HasIPAddress(void) const; // returns whether string address could be converted to IP address + bool IsNull(void) const; + + uint32_t GetIPv4Address(void) const; + IPv6Address GetIPv6Address(void) const; + std::string GetIPString(void) const; + HostAddress::NetworkProtocol GetProtocol(void) const; + + void SetAddress(const uint32_t ip4Address); + void SetAddress(const uint8_t* ip6Address); + void SetAddress(const IPv6Address& ip6Address); + void SetAddress(const std::string& address); + + // HostAddress comparison operators + public: + bool operator==(const HostAddress& other) const; + bool operator!=(const HostAddress& other) const { return !( operator==(other) ); } + bool operator<(const HostAddress& other) const; + + // internal methods + private: + bool ParseAddress(void); + + // data members + private: + HostAddress::NetworkProtocol m_protocol; + uint32_t m_ip4Address; + IPv6Address m_ip6Address; + std::string m_ipString; 
+ bool m_hasIpAddress; // true until string passed in, then signifies whether string was an IP +}; + +} // namespace Internal +} // namespace BamTools + +#endif // HOSTADDRESS_P_H diff --git a/src/api/internal/io/HostInfo_p.cpp b/src/api/internal/io/HostInfo_p.cpp new file mode 100644 index 0000000..80343f1 --- /dev/null +++ b/src/api/internal/io/HostInfo_p.cpp @@ -0,0 +1,223 @@ +// *************************************************************************** +// HostInfo_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides DNS lookup functionality for hostname & its discovered addresses +// *************************************************************************** + +#include "api/internal/io/HostInfo_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +// platorm-specifics +#ifdef _WIN32 +# include "api/internal/io/NetWin_p.h" +#else +# include "api/internal/io/NetUnix_p.h" +#endif + +// standard C++ includes +#include +#include +#include +using namespace std; + +// ------------------------- +// HostInfo implementation +// ------------------------- + +HostInfo::HostInfo(void) + : m_error(HostInfo::NoError) +{ } + +HostInfo::HostInfo(const HostInfo& other) + : m_hostName(other.m_hostName) + , m_addresses(other.m_addresses) + , m_error(other.m_error) + , m_errorString(other.m_errorString) +{ } + +HostInfo::~HostInfo(void) { } + +vector HostInfo::Addresses(void) const { + return m_addresses; +} + +HostInfo::ErrorType HostInfo::GetError(void) const { + return m_error; +} + +string HostInfo::GetErrorString(void) const { + return m_errorString; +} + +string HostInfo::HostName(void) const { + return m_hostName; +} + +void HostInfo::SetAddresses(const std::vector& addresses) { + m_addresses = addresses; +} + +void 
HostInfo::SetError(const HostInfo::ErrorType error) { + m_error = error; +} + +void HostInfo::SetErrorString(const std::string& errorString) { + m_errorString = errorString; +} + +void HostInfo::SetHostName(const string& name) { + m_hostName = name; +} + +// --------------------------------- +// HostInfo::Lookup(host, port) +// - the real "heavy-lifter" here +// --------------------------------- + +HostInfo HostInfo::Lookup(const string& hostname, const string& port) { + + HostInfo result; + set uniqueAddresses; + +#ifdef _WIN32 + WindowsSockInit init; +#endif + + HostAddress address; + address.SetAddress(hostname); + + // if hostname is an IP string ('0.0.0.0' or IPv6 format) + // do reverse lookup for host domain name + // + // TODO: might just remove this... not sure if proper 'hostname' from IP string is needed + // + // so far, haven't been able to successfully fetch a domain name with reverse DNS + // getnameinfo() on test sites just returns original IP string. BUT this is likely a rare + // case that client code tries to use an IP string and the connection should work fine + // anyway. GetHostName() just won't quite show what I was hoping for. 
:( + if ( address.HasIPAddress() ) { + + const uint16_t portNum = static_cast( atoi(port.c_str()) ); + + sockaddr_in sa4; + sockaddr_in6 sa6; + sockaddr* sa = 0; + BT_SOCKLEN_T saSize = 0; + + // IPv4 + if ( address.GetProtocol() == HostAddress::IPv4Protocol ) { + sa = (sockaddr*)&sa4; + saSize = sizeof(sa4); + memset(&sa4, 0, sizeof(sa4)); + sa4.sin_family = AF_INET; + sa4.sin_addr.s_addr = htonl(address.GetIPv4Address()); + sa4.sin_port = htons(portNum); + } + + // IPv6 + else if ( address.GetProtocol() == HostAddress::IPv4Protocol ){ + sa = (sockaddr*)&sa6; + saSize = sizeof(sa6); + memset(&sa6, 0, sizeof(sa6)); + sa6.sin6_family = AF_INET6; + memcpy(sa6.sin6_addr.s6_addr, address.GetIPv6Address().data, sizeof(sa6.sin6_addr.s6_addr)); + sa6.sin6_port = htons(portNum); + } + + // unknown (should be unreachable) + else BT_ASSERT_X(false, "HostInfo::Lookup: unknown network protocol"); + + // lookup name for IP + char hbuf[NI_MAXHOST]; + char serv[NI_MAXSERV]; + if ( sa && (getnameinfo(sa, saSize, hbuf, sizeof(hbuf), serv, sizeof(serv), 0) == 0) ) + result.SetHostName(string(hbuf)); + + // if no domain name found, just use the original address's IP string + if ( result.HostName().empty() ) + result.SetHostName(address.GetIPString()); + + // store address in HostInfo + uniqueAddresses.insert(address); + } + + // otherwise, hostname is a domain name ('www.foo.bar') + // do 'normal' lookup + else { + + // setup address lookup 'hints' + addrinfo hints; + memset(&hints, 0, sizeof(hints)); + hints.ai_family = AF_UNSPEC; // allow either IPv4 or IPv6 + hints.ai_socktype = SOCK_STREAM; // for TCP + hints.ai_protocol = IPPROTO_TCP; + + // fetch addresses for requested hostname/port + addrinfo* res; + int status = getaddrinfo(hostname.c_str(), port.c_str(), &hints, &res ); + + // if everything OK + if ( status == 0 ) { + + // iterate over all IP addresses found + addrinfo* p = res; + for ( ; p != NULL; p = p->ai_next ) { + + // IPv4 + if ( p->ai_family == AF_INET ) { + 
sockaddr_in* ipv4 = (sockaddr_in*)p->ai_addr; + HostAddress a( ntohl(ipv4->sin_addr.s_addr) ); + uniqueAddresses.insert(a); + } + + // IPv6 + else if ( p->ai_family == AF_INET6 ) { + sockaddr_in6* ipv6 = (sockaddr_in6*)p->ai_addr; + HostAddress a(ipv6->sin6_addr.s6_addr); + uniqueAddresses.insert(a); + } + } + + // if we iterated, but no addresses were stored + if ( uniqueAddresses.empty() && (p == NULL) ) { + result.SetError(HostInfo::UnknownError); + result.SetErrorString("HostInfo: unknown address types found"); + } + } + + // handle error cases + else if ( +#ifndef _WIN32 + status == EAI_NONAME + || status == EAI_FAIL +# ifdef EAI_NODATA + || status == EAI_NODATA // officially deprecated, but just in case we happen to hit it +# endif // EAI_NODATA + +#else // _WIN32 + WSAGetLastError() == WSAHOST_NOT_FOUND + || WSAGetLastError() == WSANO_DATA + || WSAGetLastError() == WSANO_RECOVERY +#endif // _WIN32 + ) + { + result.SetError(HostInfo::HostNotFound); + result.SetErrorString("HostInfo: host not found"); + } + else { + result.SetError(HostInfo::UnknownError); + result.SetErrorString("HostInfo: unknown error encountered"); + } + + // cleanup + freeaddrinfo(res); + } + + // store fetched addresses (converting set -> vector) in result & return + result.SetAddresses( vector(uniqueAddresses.begin(), uniqueAddresses.end()) ); + return result; +} diff --git a/src/api/internal/io/HostInfo_p.h b/src/api/internal/io/HostInfo_p.h new file mode 100644 index 0000000..ad03d37 --- /dev/null +++ b/src/api/internal/io/HostInfo_p.h @@ -0,0 +1,76 @@ +// *************************************************************************** +// HostInfo_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides DNS lookup functionality for hostname/IP addresses 
+// *************************************************************************** + +#ifndef HOSTINFO_P_H +#define HOSTINFO_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. + +#include "api/internal/io/HostAddress_p.h" +#include +#include + +namespace BamTools { +namespace Internal { + +class HostInfo { + + public: + enum ErrorType { NoError = 0 + , HostNotFound + , UnknownError + }; + + // ctors & dtor + public: + HostInfo(void); + HostInfo(const HostInfo& other); + ~HostInfo(void); + + // HostInfo interface + public: + std::string HostName(void) const; + void SetHostName(const std::string& name); + + std::vector Addresses(void) const; + void SetAddresses(const std::vector& addresses); + + HostInfo::ErrorType GetError(void) const; + std::string GetErrorString(void) const; + + // internal methods + private: + void SetError(const HostInfo::ErrorType error); + void SetErrorString(const std::string& errorString); + + // static methods + public: + static HostInfo Lookup(const std::string& hostname, + const std::string& port); + + // data members + private: + std::string m_hostName; + std::vector m_addresses; + HostInfo::ErrorType m_error; + std::string m_errorString; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // HOSTINFO_P_H diff --git a/src/api/internal/io/HttpHeader_p.cpp b/src/api/internal/io/HttpHeader_p.cpp new file mode 100644 index 0000000..c4f78b6 --- /dev/null +++ b/src/api/internal/io/HttpHeader_p.cpp @@ -0,0 +1,395 @@ +// *************************************************************************** +// HttpHeader_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// 
--------------------------------------------------------------------------- +// Provides a generic interface for parsing/generating HTTP headers, along +// with specialized request & response header types +// *************************************************************************** + +#include "api/internal/io/HttpHeader_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include +#include +#include +using namespace std; + +namespace BamTools { + +// ----------- +// constants +// ----------- + +namespace Constants { + +static const char CAR_RET_CHAR = '\r'; +static const char COLON_CHAR = ':'; +static const char DOT_CHAR = '.'; +static const char NEWLINE_CHAR = '\n'; +static const char SPACE_CHAR = ' '; +static const char TAB_CHAR = '\t'; + +static const string FIELD_NEWLINE = "\r\n"; +static const string FIELD_SEPARATOR = ": "; +static const string HTTP_STRING = "HTTP/"; + +} // namespace Constants + +// ------------------------ +// static utility methods +// ------------------------ + +namespace Internal { + +static inline +bool IsSpace(const char c) { + const int n = static_cast(c); + return ( n== 0 || (n <= 13 && n >= 9) ); +} + +// split on hitting single char delim +static vector Split(const string& source, const char delim) { + stringstream ss(source); + string field; + vector fields; + while ( getline(ss, field, delim) ) + fields.push_back(field); + return fields; +} + +static string Trim(const string& source) { + + // skip if empty string + if ( source.empty() ) + return source; + + // fetch string data + const char* s = source.data(); // ignoring null-term on purpose + const size_t size = source.size(); + size_t start = 0; + size_t end = size-1; + + // skip if no spaces at start or end + if ( !IsSpace(s[start]) && !IsSpace( s[end] ) ) + return source; + + // remove leading whitespace + while ( (start != end) && IsSpace(s[start]) ) + ++start; + + // remove trailing whitespace + if ( start <= end ) { + while ( end && IsSpace(s[end]) ) 
+ --end; + } + + // return result + return string(s + start, (end-start) + 1); +} + +} // namespace Internal +} // namespace BamTools + +// --------------------------- +// HttpHeader implementation +// --------------------------- + +HttpHeader::HttpHeader(void) + : m_isValid(true) + , m_majorVersion(1) + , m_minorVersion(1) +{ } + +HttpHeader::HttpHeader(const string& s) + : m_isValid(true) + , m_majorVersion(1) + , m_minorVersion(1) +{ + Parse(s); +} + +HttpHeader::~HttpHeader(void) { } + +bool HttpHeader::ContainsKey(const string& key) const { + return ( m_fields.find(key) != m_fields.end() ); +} + +int HttpHeader::GetMajorVersion(void) const { + return m_majorVersion; +} + +int HttpHeader::GetMinorVersion(void) const { + return m_minorVersion; +} + +string HttpHeader::GetValue(const string& key) const { + if ( ContainsKey(key) ) + return m_fields.at(key); + else return string(); +} + +bool HttpHeader::IsValid(void) const { + return m_isValid; +} + +void HttpHeader::Parse(const string& s) { + + // trim whitespace from input string + const string trimmed = Trim(s); + + // split into list of header lines + vector rawFields = Split(trimmed, Constants::NEWLINE_CHAR); + + // prep our 'cleaned' fields container + vector cleanFields; + cleanFields.reserve(rawFields.size()); + + // remove any empty fields and clean any trailing windows-style carriage returns ('\r') + vector::iterator rawFieldIter = rawFields.begin(); + vector::iterator rawFieldEnd = rawFields.end(); + for ( ; rawFieldIter != rawFieldEnd; ++rawFieldIter ) { + string& field = (*rawFieldIter); + + // skip empty fields + if ( field.empty() ) + continue; + + // remove carriage returns + const size_t fieldSize = field.size(); + if ( field[fieldSize-1] == Constants::CAR_RET_CHAR ) + field.resize(fieldSize-1); + + // store cleaned field + cleanFields.push_back(field); + } + + // skip add'l processing if nothing here + if ( cleanFields.empty() ) + return; + + // parse header lines + int lineNumber = 0; + 
vector::const_iterator fieldIter = cleanFields.begin(); + vector::const_iterator fieldEnd = cleanFields.end(); + for ( ; fieldIter != fieldEnd; ++fieldIter, ++lineNumber ) { + if ( !ParseLine( (*fieldIter), lineNumber ) ) { + m_isValid = false; + return; + } + } +} + +bool HttpHeader::ParseLine(const string& line, int) { + + // find colon position, return failure if not found + const size_t colonFound = line.find(Constants::COLON_CHAR); + if ( colonFound == string::npos ) + return false; + + // store key/value (without leading/trailing whitespace) & return success + const string key = Trim(line.substr(0, colonFound)); + const string value = Trim(line.substr(colonFound+1)); + m_fields[key] = value; + return true; +} + +void HttpHeader::RemoveField(const string& key) { + m_fields.erase(key); +} + +void HttpHeader::SetField(const string& key, const string& value) { + m_fields[key] = value; +} + +void HttpHeader::SetValid(bool ok) { + m_isValid = ok; +} + +void HttpHeader::SetVersion(int major, int minor) { + m_majorVersion = major; + m_minorVersion = minor; +} + +string HttpHeader::ToString(void) const { + string result(""); + if ( m_isValid ) { + map::const_iterator fieldIter = m_fields.begin(); + map::const_iterator fieldEnd = m_fields.end(); + for ( ; fieldIter != fieldEnd; ++fieldIter ) { + const string& key = (*fieldIter).first; + const string& value = (*fieldIter).second; + const string& line = key + Constants::FIELD_SEPARATOR + + value + Constants::FIELD_NEWLINE; + result += line; + } + } + return result; +} + +// ---------------------------------- +// HttpRequestHeader implementation +// ---------------------------------- + +HttpRequestHeader::HttpRequestHeader(const string& method, + const string& resource, + int majorVersion, + int minorVersion) + : HttpHeader() + , m_method(method) + , m_resource(resource) +{ + SetVersion(majorVersion, minorVersion); +} + +HttpRequestHeader::~HttpRequestHeader(void) { } + +string HttpRequestHeader::GetMethod(void) const { + 
return m_method; +} + +string HttpRequestHeader::GetResource(void) const { + return m_resource; +} + +bool HttpRequestHeader::ParseLine(const string& line, int lineNumber) { + + // if not 'request line', just let base class parse + if ( lineNumber != 0 ) + return HttpHeader::ParseLine(line, lineNumber); + + // fail if empty line + if ( line.empty() ) + return false; + + // walk through request line, storing positions + // GET /path/to/resource HTTP/1.1 + // ^ ^^ ^^ + const size_t foundMethod = line.find_first_not_of(Constants::SPACE_CHAR); // skip any leading whitespace + if ( foundMethod == string::npos ) return false; + const size_t foundFirstSpace = line.find(Constants::SPACE_CHAR, foundMethod+1); + if ( foundFirstSpace == string::npos ) return false; + const size_t foundResource = line.find_first_not_of(Constants::SPACE_CHAR, foundFirstSpace+1); + if ( foundResource == string::npos ) return false; + const size_t foundSecondSpace = line.find(Constants::SPACE_CHAR, foundResource+1); + if ( foundSecondSpace == string::npos ) return false; + const size_t foundVersion= line.find_first_not_of(Constants::SPACE_CHAR, foundSecondSpace+1); + if ( foundVersion == string::npos ) return false; + + // parse out method & resource + m_method = line.substr(foundMethod, foundFirstSpace - foundMethod); + m_resource = line.substr(foundResource, foundSecondSpace - foundResource); + + // parse out version numbers + const string temp = line.substr(foundVersion); + if ( (temp.find(Constants::HTTP_STRING) != 0) || (temp.size() != 8) ) + return false; + const int major = static_cast(temp.at(5) - '0'); + const int minor = static_cast(temp.at(7) - '0'); + SetVersion(major, minor); + + // if we get here, return success + return true; +} + +string HttpRequestHeader::ToString(void) const { + stringstream request(""); + request << m_method << Constants::SPACE_CHAR + << m_resource << Constants::SPACE_CHAR + << Constants::HTTP_STRING << GetMajorVersion() << Constants::DOT_CHAR << 
GetMinorVersion() + << Constants::FIELD_NEWLINE + << HttpHeader::ToString() + << Constants::FIELD_NEWLINE; + return request.str(); +} + +// ----------------------------------- +// HttpResponseHeader implementation +// ----------------------------------- + +HttpResponseHeader::HttpResponseHeader(const int statusCode, + const string& reason, + int majorVersion, + int minorVersion) + + : HttpHeader() + , m_statusCode(statusCode) + , m_reason(reason) +{ + SetVersion(majorVersion, minorVersion); +} + +HttpResponseHeader::HttpResponseHeader(const string& s) + : HttpHeader() + , m_statusCode(0) +{ + Parse(s); +} + +HttpResponseHeader::~HttpResponseHeader(void) { } + +string HttpResponseHeader::GetReason(void) const { + return m_reason; +} + +int HttpResponseHeader::GetStatusCode(void) const { + return m_statusCode; +} + +bool HttpResponseHeader::ParseLine(const string& line, int lineNumber) { + + // if not 'status line', just let base class + if ( lineNumber != 0 ) + return HttpHeader::ParseLine(line, lineNumber); + + // fail if empty line + if ( line.empty() ) + return false; + + // walk through status line, storing positions + // HTTP/1.1 200 OK + // ^ ^^ ^^ + + const size_t foundVersion = line.find_first_not_of(Constants::SPACE_CHAR); // skip any leading whitespace + if ( foundVersion == string::npos ) return false; + const size_t foundFirstSpace = line.find(Constants::SPACE_CHAR, foundVersion+1); + if ( foundFirstSpace == string::npos ) return false; + const size_t foundStatusCode = line.find_first_not_of(Constants::SPACE_CHAR, foundFirstSpace+1); + if ( foundStatusCode == string::npos ) return false; + const size_t foundSecondSpace = line.find(Constants::SPACE_CHAR, foundStatusCode+1); + if ( foundSecondSpace == string::npos ) return false; + const size_t foundReason= line.find_first_not_of(Constants::SPACE_CHAR, foundSecondSpace+1); + if ( foundReason == string::npos ) return false; + + // parse version numbers + string temp = line.substr(foundVersion, 
foundFirstSpace - foundVersion); + if ( (temp.find(Constants::HTTP_STRING) != 0) || (temp.size() != 8) ) + return false; + const int major = static_cast(temp.at(5) - '0'); + const int minor = static_cast(temp.at(7) - '0'); + SetVersion(major, minor); + + // parse status code + temp = line.substr(foundStatusCode, foundSecondSpace - foundStatusCode); + if ( temp.size() != 3 ) return false; + m_statusCode = atoi( temp.c_str() ); + + // reason phrase should be everything else left + m_reason = line.substr(foundReason); + + // if we get here, return success + return true; +} + +string HttpResponseHeader::ToString(void) const { + stringstream response(""); + response << Constants::HTTP_STRING << GetMajorVersion() << Constants::DOT_CHAR << GetMinorVersion() + << Constants::SPACE_CHAR << m_statusCode + << Constants::SPACE_CHAR << m_reason + << Constants::FIELD_NEWLINE + << HttpHeader::ToString() + << Constants::FIELD_NEWLINE; + return response.str(); +} diff --git a/src/api/internal/io/HttpHeader_p.h b/src/api/internal/io/HttpHeader_p.h new file mode 100644 index 0000000..6330235 --- /dev/null +++ b/src/api/internal/io/HttpHeader_p.h @@ -0,0 +1,132 @@ +// *************************************************************************** +// HttpHeader_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides a generic interface for parsing/generating HTTP headers, along +// with specialized request & response header types +// *************************************************************************** + +#ifndef HTTP_HEADER_P_H +#define HTTP_HEADER_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. 
This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. + +#include "api/api_global.h" +#include +#include + +namespace BamTools { +namespace Internal { + +class HttpHeader { + + // ctors & dtor + public: + HttpHeader(void); + HttpHeader(const std::string& s); + virtual ~HttpHeader(void); + + // HttpHeader interface + public: + + // header field=>value access + bool ContainsKey(const std::string& key) const; + std::string GetValue(const std::string& key) const; + void RemoveField(const std::string& key); + void SetField(const std::string& key, const std::string& value); + + // get formatted header string + virtual std::string ToString(void) const; + + // query HTTP version used + int GetMajorVersion(void) const; + int GetMinorVersion(void) const; + + // see if header was parsed OK + bool IsValid(void) const; + + // internal methods + protected: + void Parse(const std::string& s); + virtual bool ParseLine(const std::string& line, int lineNumber); + void SetValid(bool ok); + void SetVersion(int major, int minor); + + // data members + private: + std::map m_fields; + + bool m_isValid; // should usually be true, only false if error processing a header line + int m_majorVersion; + int m_minorVersion; +}; + +class HttpRequestHeader : public HttpHeader { + + // ctor & dtor + public: + HttpRequestHeader(const std::string& method, // "GET", "PUT", etc + const std::string& resource, // filename + int majorVersion = 1, // version info + int minorVersion = 1); + ~HttpRequestHeader(void); + + // HttpRequestHeader interface + public: + std::string GetMethod(void) const; + std::string GetResource(void) const; + + // HttpHeader implementation + public: + std::string ToString(void) const; + protected: + bool ParseLine(const std::string& line, int lineNumber); + + // data members + private: + std::string m_method; + std::string m_resource; +}; + +class HttpResponseHeader : public HttpHeader { + + // ctor & dtor + public: + 
HttpResponseHeader(const int statusCode, // 200, 404, etc + const std::string& reason = std::string(), // 'reason phrase' for code + int majorVersion = 1, // version info + int minorVersion = 1); + HttpResponseHeader(const std::string& s); + ~HttpResponseHeader(void); + + // HttpRequestHeader interface + public: + std::string GetReason(void) const; + int GetStatusCode(void) const; + + // HttpHeader implementation + public: + std::string ToString(void) const; + protected: + bool ParseLine(const std::string& line, int lineNumber); + + // data members + private: + int m_statusCode; + std::string m_reason; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // HTTP_HEADER_P_H diff --git a/src/api/internal/io/ILocalIODevice_p.cpp b/src/api/internal/io/ILocalIODevice_p.cpp new file mode 100644 index 0000000..19cc1da --- /dev/null +++ b/src/api/internal/io/ILocalIODevice_p.cpp @@ -0,0 +1,56 @@ +// *************************************************************************** +// ILocalIODevice_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 25 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides shared behavior for files & pipes +// *************************************************************************** + +#include "api/internal/io/ILocalIODevice_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include +using namespace std; + +ILocalIODevice::ILocalIODevice(void) + : IBamIODevice() + , m_stream(0) +{ } + +ILocalIODevice::~ILocalIODevice(void) { + Close(); +} + +void ILocalIODevice::Close(void) { + + // skip if not open + if ( !IsOpen() ) + return; + + // flush & close FILE* + fflush(m_stream); + fclose(m_stream); + m_stream = 0; + + // reset other device state + m_mode = IBamIODevice::NotOpen; +} + +int64_t ILocalIODevice::Read(char* data, const 
unsigned int numBytes) { + BT_ASSERT_X( m_stream, "ILocalIODevice::Read: trying to read from null stream" ); + BT_ASSERT_X( (m_mode == IBamIODevice::ReadOnly), "ILocalIODevice::Read: device not in read-only mode"); + return static_cast( fread(data, sizeof(char), numBytes, m_stream) ); +} + +int64_t ILocalIODevice::Tell(void) const { + BT_ASSERT_X( m_stream, "ILocalIODevice::Tell: trying to get file position fromnull stream" ); + return ftell64(m_stream); +} + +int64_t ILocalIODevice::Write(const char* data, const unsigned int numBytes) { + BT_ASSERT_X( m_stream, "ILocalIODevice::Write: tryint to write to null stream" ); + BT_ASSERT_X( (m_mode == IBamIODevice::WriteOnly), "ILocalIODevice::Write: device not in write-only mode" ); + return static_cast( fwrite(data, sizeof(char), numBytes, m_stream) ); +} diff --git a/src/api/internal/io/ILocalIODevice_p.h b/src/api/internal/io/ILocalIODevice_p.h new file mode 100644 index 0000000..cf01f90 --- /dev/null +++ b/src/api/internal/io/ILocalIODevice_p.h @@ -0,0 +1,50 @@ +// *************************************************************************** +// ILocalIODevice_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 October 2011 (DB) +// --------------------------------------------------------------------------- +// Provides shared behavior for files & pipes +// *************************************************************************** + +#ifndef ILOCALIODEVICE_P_H +#define ILOCALIODEVICE_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. 
+ +#include "api/IBamIODevice.h" + +namespace BamTools { +namespace Internal { + +// Base class for FILE*-backed devices: declares the IBamIODevice operations +// (Close/Read/Tell/Write) that are shared by files & pipes. +class ILocalIODevice : public IBamIODevice { + + // ctor & dtor + public: + ILocalIODevice(void); + virtual ~ILocalIODevice(void); + + // IBamIODevice implementation + public: + virtual void Close(void); + virtual int64_t Read(char* data, const unsigned int numBytes); + virtual int64_t Tell(void) const; + virtual int64_t Write(const char* data, const unsigned int numBytes); + + // data members + protected: + FILE* m_stream; // underlying C stdio stream used by Close/Read/Tell/Write +}; + +} // namespace Internal +} // namespace BamTools + +#endif // ILOCALIODEVICE_P_H diff --git a/src/api/internal/io/NetUnix_p.h b/src/api/internal/io/NetUnix_p.h new file mode 100644 index 0000000..8cf75f8 --- /dev/null +++ b/src/api/internal/io/NetUnix_p.h @@ -0,0 +1,39 @@ +// *************************************************************************** +// NetUnix_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides common networking-related includes, etc. for all UNIX-like systems +// *************************************************************************** + +#ifndef NETUNIX_P_H +#define NETUNIX_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it.
+ +#ifndef _WIN32 // <-- source files only include the proper Net*_p.h, but this is a double-check + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef BT_SOCKLEN_T +# define BT_SOCKLEN_T socklen_t +#endif + +#endif // _WIN32 +#endif // NETUNIX_P_H diff --git a/src/api/internal/io/NetWin_p.h b/src/api/internal/io/NetWin_p.h new file mode 100644 index 0000000..bcef955 --- /dev/null +++ b/src/api/internal/io/NetWin_p.h @@ -0,0 +1,60 @@ +// *************************************************************************** +// NetWin_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides common networking-related includes, etc. for Windows systems +// +// Note: only supports XP and later +// *************************************************************************** + +#ifndef NETWIN_P_H +#define NETWIN_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. + +#ifdef _WIN32 // <-- source files only include the proper Net*_p.h, but this is a double-check + +#include // <-- should bring 'windows.h' along with it +#include + +#ifndef BT_SOCKLEN_T +# define BT_SOCKLEN_T int +#endif + +#ifdef _MSC_VER +# pragma comment(lib, "ws2_32.lib") +#endif + +namespace BamTools { +namespace Internal { + +// use RAII to ensure WSA is started (WSAStartup) on construction & released (WSACleanup) on destruction +class WindowsSockInit { + public: + WindowsSockInit(void) { + WSAData wsadata; + WSAStartup(MAKEWORD(2,2), &wsadata); // catch error ?
+ } + + ~WindowsSockInit(void) { + WSACleanup(); + } +}; + +} // namespace Internal +} // namespace BamTools + +#endif // _WIN32 + +#endif // NETWIN_P_H + diff --git a/src/api/internal/io/RollingBuffer_p.cpp b/src/api/internal/io/RollingBuffer_p.cpp new file mode 100644 index 0000000..c3f709d --- /dev/null +++ b/src/api/internal/io/RollingBuffer_p.cpp @@ -0,0 +1,310 @@ +// *************************************************************************** +// RollingBuffer_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides a dynamic I/O FIFO byte queue, which removes bytes as they are +// read from the front of the buffer and grows to accept bytes being written +// to buffer end. +// +// implementation note: basically a 'smart' wrapper around 1..* ByteArrays +// *************************************************************************** + +#include "api/internal/io/RollingBuffer_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include +#include +#include +#include +using namespace std; + +// ------------------------------ +// RollingBuffer implementation +// ------------------------------ + +RollingBuffer::RollingBuffer(size_t growth) + : m_bufferGrowth(growth) +{ + // buffer always contains at least 1 (maybe empty) byte array + m_data.push_back( ByteArray() ); + + // set cleared state + Clear(); +} + +RollingBuffer::~RollingBuffer(void) { } + +size_t RollingBuffer::BlockSize(void) const { + + // if only one byte array in buffer <- needed? 
+ if ( m_tailBufferIndex == 0 ) + return m_tail - m_head; + + // otherwise return remaining num bytes in first array + const ByteArray& first = m_data.front(); + return first.Size() - m_head; +} + +bool RollingBuffer::CanReadLine(void) const { + return IndexOf('\n') != string::npos; +} + +// removes @n bytes from the end (tail) of the buffer, dropping trailing byte arrays as they are emptied +void RollingBuffer::Chop(size_t n) { + + // update buffer size + if ( n > m_totalBufferSize ) + m_totalBufferSize = 0; + else + m_totalBufferSize -= n; + + // loop until target case hit + for ( ; ; ) { + + // if only one array, decrement tail + if ( m_tailBufferIndex == 0 ) { + + // if all data chopped + // (compare before subtracting: m_tail is unsigned, so 'm_tail -= n' would wrap around when n > m_tail) + if ( (m_tail <= m_head) || (n >= (m_tail - m_head)) ) { + m_head = 0; + m_tail = 0; + } + else + m_tail -= n; + return; + } + + // if there's room in last byte array to 'chop', just decrement tail + if ( n <= m_tail ) { + m_tail -= n; + return; + } + + // otherwise we're going to overlap our internal byte arrays + // reduce our chop amount by the amount of data in the last byte array + n -= m_tail; + + // remove last byte array & set tail to its end + m_data.pop_back(); + --m_tailBufferIndex; + m_tail = m_data.at(m_tailBufferIndex).Size(); + } + + // NOTE: not reached - the loop above only exits via 'return' + // if buffer is now empty, reset state & clear up memory + if ( IsEmpty() ) + Clear(); +} + +void RollingBuffer::Clear(void) { + + // remove all byte arrays (except first) + m_data.erase( m_data.begin()+1, m_data.end() ); + + // clear out first byte array + m_data[0].Resize(0); + m_data[0].Squeeze(); + + // reset index & size markers + m_head = 0; + m_tail = 0; + m_tailBufferIndex = 0; + m_totalBufferSize = 0; +} + +void RollingBuffer::Free(size_t n) { + + // update buffer size + if ( n > m_totalBufferSize ) + m_totalBufferSize = 0; + else + m_totalBufferSize -= n; + + // loop until target case hit + for ( ; ; ) { + + const size_t blockSize = BlockSize(); + + // if there's room in current array + if ( n < blockSize ) { + + // shift 'head' over @n bytes + m_head += n; + + // check for emptied, single byte array + if ( m_head == m_tail && m_tailBufferIndex == 0 ) { + m_head
= 0; + m_tail = 0; + } + + break; + } + + // otherwise we need to check next byte array + // first update amount to remove + n -= blockSize; + + // special case - there was only 1 array + if ( m_data.size() == 1 ) { + if ( m_data.at(0).Size() != m_bufferGrowth ) + m_data[0].Resize(m_bufferGrowth); + m_head = 0; + m_tail = 0; + m_tailBufferIndex = 0; + break; + } + + // otherwise, remove first array and move to next iteration + m_data.pop_front(); + --m_tailBufferIndex; + m_head = 0; + } + + // if buffer is now empty, reset state & clear up memory + if ( IsEmpty() ) + Clear(); +} + +size_t RollingBuffer::IndexOf(char c) const { + + size_t index(0); + + // iterate over byte arrays + const size_t numBuffers = m_data.size(); + for ( size_t i = 0; i < numBuffers; ++i ) { + const ByteArray& current = m_data.at(i); + + // if on first array, use head; else 0 + const size_t start = ( (i==0) ? m_head : 0 ); + + // if on last array, set end; else use current byte array size + const size_t end = ( (i==m_tailBufferIndex) ? 
m_tail : current.Size()); + + // look through this iteration's byte array for @c + const char* p = current.ConstData()+start; + for ( size_t j = start; j < end; ++j ) { + if ( *p++ == c ) + return index; + ++index; + } + } + + // no match found + return string::npos; +} + +bool RollingBuffer::IsEmpty(void) const { + return (m_tailBufferIndex == 0) && (m_tail == 0); +} + +// copies up to @max bytes into @dest (bytes are simply discarded if @dest is null), +// freeing them from the front of the buffer; returns number of bytes consumed +size_t RollingBuffer::Read(char* dest, size_t max) { + + size_t bytesToRead = std::min(Size(), max); + size_t bytesReadSoFar = 0; + + while ( bytesReadSoFar < bytesToRead ) { + const char* readPtr = ReadPointer(); + size_t blockBytes = std::min( (bytesToRead - bytesReadSoFar), BlockSize() ); + if ( dest ) + memcpy(dest+bytesReadSoFar, readPtr, blockBytes); + bytesReadSoFar += blockBytes; + Free(blockBytes); + } + + return bytesReadSoFar; +} + +// copies bytes up to & including the next newline (at most @max-1 bytes) into @dest, +// null-terminates @dest, and returns number of bytes consumed (0 if no newline is buffered or @max is 0) +size_t RollingBuffer::ReadLine(char* dest, size_t max) { + + // if we can't read line or if max is 0 + if ( !CanReadLine() || max == 0 ) + return 0; + + // locate the newline once, relative to the current read position + // (Free() below advances that position, so re-querying inside the loop would + // compare a shifted index against the cumulative byte count) + const size_t index = IndexOf('\n'); + + // otherwise, read until we hit newline + size_t bytesReadSoFar = 0; + bool finished = false; + while ( !finished ) { + + const char* readPtr = ReadPointer(); + size_t bytesToRead = std::min( (index+1)-bytesReadSoFar, BlockSize() ); + bytesToRead = std::min( bytesToRead, (max-1)-bytesReadSoFar ); + memcpy(dest+bytesReadSoFar, readPtr, bytesToRead); + bytesReadSoFar += bytesToRead; + Free(bytesToRead); + + if ( !((bytesReadSoFar < index+1)&&(bytesReadSoFar < max-1)) ) + finished = true; + } + + // null terminate 'dest' & return numBytesRead + dest[bytesReadSoFar] = '\0'; + return bytesReadSoFar; +} + +const char* RollingBuffer::ReadPointer(void) const { + + // return null if empty buffer + if ( m_data.empty() ) + return 0; + + // otherwise return pointer to current position + const ByteArray& first = m_data.front(); + return first.ConstData() + m_head; +} + +char* RollingBuffer::Reserve(size_t n) { + + // if empty buffer + if ( m_totalBufferSize == 0 ) { +
m_data[0].Resize( std::max(m_bufferGrowth, n) ); + m_totalBufferSize += n; + m_tail = n; + return m_data[m_tailBufferIndex].Data(); + } + + // increment buffer's byte count + m_totalBufferSize += n; + + // if buffer already contains enough space to fit @n more bytes + if ( (m_tail + n) <= m_data.at(m_tailBufferIndex).Size() ) { + + // fetch write pointer at current 'tail', increment tail by @n & return + char* ptr = m_data[m_tailBufferIndex].Data() + m_tail; + m_tail += n; + return ptr; + } + + // if last byte array isn't half full + if ( m_tail < m_data.at(m_tailBufferIndex).Size()/2 ) { + + // we'll allow simple resize + m_data[m_tailBufferIndex].Resize(m_tail + n); + + // fetch write pointer at current 'tail', increment tail by @n & return + char* ptr = m_data[m_tailBufferIndex].Data() + m_tail; + m_tail += n; + return ptr; + } + + // otherwise, shrink last byte array to current used size + m_data[m_tailBufferIndex].Resize(m_tail); + + // then append new byte array + m_data.push_back( ByteArray() ); + ++m_tailBufferIndex; + m_data[m_tailBufferIndex].Resize( std::max(m_bufferGrowth, n) ); + m_tail = n; + + // return write-able pointer on new array + return m_data[m_tailBufferIndex].Data(); +} + +size_t RollingBuffer::Size(void) const { + return m_totalBufferSize; +} + +void RollingBuffer::Write(const char* src, size_t n) { + char* writePtr = Reserve(n); + memcpy(writePtr, src, n); +} diff --git a/src/api/internal/io/RollingBuffer_p.h b/src/api/internal/io/RollingBuffer_p.h new file mode 100644 index 0000000..e995f26 --- /dev/null +++ b/src/api/internal/io/RollingBuffer_p.h @@ -0,0 +1,84 @@ +// *************************************************************************** +// RollingBuffer_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- 
+// Provides a dynamic I/O FIFO byte queue, which removes bytes as they are +// read from the front of the buffer and grows to accept bytes being written +// to buffer end. +// +// implementation note: basically a 'smart' wrapper around 1..* ByteArrays +// *************************************************************************** + +#ifndef ROLLINGBUFFER_P_H +#define ROLLINGBUFFER_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. + +#include "api/api_global.h" +#include "api/internal/io/ByteArray_p.h" +#include +#include + +namespace BamTools { +namespace Internal { + +class RollingBuffer { + + // ctors & dtor + public: + RollingBuffer(size_t growth); + ~RollingBuffer(void); + + // RollingBuffer interface + public: + + // returns number of bytes readable from the first byte array (starting at the current read position) + size_t BlockSize(void) const; + // checks buffer for new line + bool CanReadLine(void) const; + // frees @n bytes from end of buffer + void Chop(size_t n); + // clears entire buffer structure + void Clear(void); + // frees @n bytes from front of buffer + void Free(size_t n); + // returns offset of first @c from the current read position, or std::string::npos if not found + size_t IndexOf(char c) const; + // returns true if buffer holds no data + bool IsEmpty(void) const; + // reads up to @max bytes into @dest + // returns exactly how many bytes were read from buffer + size_t Read(char* dest, size_t max); + // reads until newline (or up to @max - 1 bytes), then null-terminates @dest + // returns exactly how many bytes were read from buffer + size_t ReadLine(char* dest, size_t max); + + const char* ReadPointer(void) const; // returns a C-fxn compatible char* to byte data + char* Reserve(size_t n); // ensures that buffer contains space for @n incoming bytes, returns write-able char* + size_t Size(void) const; // returns current number of bytes stored in buffer + void Write(const char* src, size_t
n); // reserves space for @n bytes, then appends contents of @src to buffer + + // data members + private: + size_t m_head; // index into current data (next char) + size_t m_tail; // index into last data position + size_t m_tailBufferIndex; // m_data::size() - 1 + size_t m_totalBufferSize; // total buffer size + size_t m_bufferGrowth; // new buffers are typically initialized with this size + std::deque<ByteArray> m_data; // basic 'buffer of buffers' +}; + +} // namespace Internal +} // namespace BamTools + +#endif // ROLLINGBUFFER_P_H diff --git a/src/api/internal/io/TcpSocketEngine_p.cpp b/src/api/internal/io/TcpSocketEngine_p.cpp new file mode 100644 index 0000000..467eaeb --- /dev/null +++ b/src/api/internal/io/TcpSocketEngine_p.cpp @@ -0,0 +1,195 @@ +// *************************************************************************** +// TcpSocketEngine_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides low-level implementation of TCP I/O +// *************************************************************************** + +// N.B. - this file contains the top-level, platform-independent logic. "Native" methods +// are called as needed from the TcpSocketEngine_{unix,win}_p.cpp files. Selection of the proper +// native method file should have been handled at build-time by CMake.
+ +#include "api/internal/io/HostInfo_p.h" +#include "api/internal/io/TcpSocketEngine_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +// default ctor: starts with no socket (descriptor -1) in the unconnected state +TcpSocketEngine::TcpSocketEngine(void) + : m_socketDescriptor(-1) +// , m_localPort(0) + , m_remotePort(0) + , m_socketError(TcpSocket::UnknownSocketError) + , m_socketState(TcpSocket::UnconnectedState) +{ } + +// NOTE(review): the copy shares the raw descriptor with @other, and each object's dtor calls Close(), +// so the same FD gets passed to nativeClose() twice - confirm copies are never both left holding an open socket +TcpSocketEngine::TcpSocketEngine(const TcpSocketEngine& other) + : m_socketDescriptor(other.m_socketDescriptor) +// , m_localAddress(other.m_localAddress) + , m_remoteAddress(other.m_remoteAddress) +// , m_localPort(other.m_localPort) + , m_remotePort(other.m_remotePort) + , m_socketError(other.m_socketError) + , m_socketState(other.m_socketState) + , m_errorString(other.m_errorString) +{ } + +TcpSocketEngine::~TcpSocketEngine(void) { + Close(); +} + +void TcpSocketEngine::Close(void) { + + // close socket if we have valid FD + if ( m_socketDescriptor != -1 ) { + nativeClose(); + m_socketDescriptor = -1; + } + + // reset state + m_socketState = TcpSocket::UnconnectedState; +// m_localAddress.Clear(); + m_remoteAddress.Clear(); +// m_localPort = 0; + m_remotePort = 0; +} + +bool TcpSocketEngine::Connect(const HostAddress& address, const uint16_t port) { + + // return failure if invalid FD or already connected + if ( !IsValid() || (m_socketState == TcpSocket::ConnectedState) ) { + // TODO: set error string + return false; + } + + // attempt to connect to host address on requested port + if ( !nativeConnect(address, port) ) { + // TODO: set error string + return false; + } + + // if successful, store remote host address port & return success + // TODO: (later) fetch proxied remote & local host/port here + m_remoteAddress = address; + m_remotePort = port; + return true; +} + +std::string TcpSocketEngine::GetErrorString(void) const { + return m_errorString; +} + +//HostAddress TcpSocketEngine::GetLocalAddress(void) const { +// return m_localAddress; +//} + +//uint16_t TcpSocketEngine::GetLocalPort(void)
const { +// return m_localPort; +//} + +HostAddress TcpSocketEngine::GetRemoteAddress(void) const { + return m_remoteAddress; +} + +uint16_t TcpSocketEngine::GetRemotePort(void) const { + return m_remotePort; +} + +int TcpSocketEngine::GetSocketDescriptor(void) const { + return m_socketDescriptor; +} + +TcpSocket::SocketError TcpSocketEngine::GetSocketError(void) { + return m_socketError; +} + +TcpSocket::SocketState TcpSocketEngine::GetSocketState(void) { + return m_socketState; +} + +bool TcpSocketEngine::Initialize(HostAddress::NetworkProtocol protocol) { + + // close current socket if we have one open + if ( IsValid() ) + Close(); + + // attempt to create new socket + return nativeCreateSocket(protocol); +} + +bool TcpSocketEngine::IsValid(void) const { + return (m_socketDescriptor != -1); +} + +int64_t TcpSocketEngine::NumBytesAvailable(void) const { + + // return -1 (failure) if socket FD is invalid + if ( !IsValid() ) { + // TODO: set error string + return -1; + } + + // otherwise check socket to see how much is ready + return nativeNumBytesAvailable(); +} + +int64_t TcpSocketEngine::Read(char* dest, size_t max) { + + // return failure if can't read + if ( !IsValid() || (m_socketState != TcpSocket::ConnectedState) ) + return -1; + + // otherwise return number of bytes read + return nativeRead(dest, max); +} + +// waits up to @msec for the socket to become readable +// NOTE: @timedOut must be non-null - it is written unconditionally +bool TcpSocketEngine::WaitForRead(int msec, bool* timedOut) { + + // reset timedOut flag + *timedOut = false; + + // need to wait for our socket to be ready to read + int ret = nativeSelect(msec, true); + + // if timed out + if ( ret == 0 ) { + *timedOut = true; + m_socketError = TcpSocket::SocketTimeoutError; + m_errorString = "socket timed out"; + } + + // return if any sockets available for reading + return ( ret > 0 ); +} + +// waits up to @msec for the socket to become writable +// NOTE: @timedOut must be non-null - it is written unconditionally +bool TcpSocketEngine::WaitForWrite(int msec, bool* timedOut) { + + // reset timedOut flag + *timedOut = false; + + // need to wait for our socket to be ready to write + int ret = nativeSelect(msec, false); + + // if timed out + if ( ret
== 0 ) { + *timedOut = true; + m_socketError = TcpSocket::SocketTimeoutError; + m_errorString = "socket timed out"; + } + + // return if any sockets available for writing + return ( ret > 0 ); +} + +// forwards to nativeWrite(); returns -1 without writing if there is no valid, connected socket +int64_t TcpSocketEngine::Write(const char* data, size_t length) { + + // return failure if can't write + if ( !IsValid() || (m_socketState != TcpSocket::ConnectedState) ) { + // TODO: set error string + return -1; + } + + // otherwise return number of bytes written + return nativeWrite(data, length); +} diff --git a/src/api/internal/io/TcpSocketEngine_p.h b/src/api/internal/io/TcpSocketEngine_p.h new file mode 100644 index 0000000..1a1a944 --- /dev/null +++ b/src/api/internal/io/TcpSocketEngine_p.h @@ -0,0 +1,95 @@ +// *************************************************************************** +// TcpSocketEngine_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides low-level implementation of TCP I/O +// *************************************************************************** + +#ifndef TCPSOCKETENGINE_P_H +#define TCPSOCKETENGINE_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it.
+ +#include "api/internal/io/HostAddress_p.h" +#include "api/internal/io/TcpSocket_p.h" + +namespace BamTools { +namespace Internal { + +struct TcpSocketEngine { + + // ctors & dtor + public: + TcpSocketEngine(void); + TcpSocketEngine(const TcpSocketEngine& other); + ~TcpSocketEngine(void); + + // TcpSocketEngine interface + public: + + // connection-related methods + void Close(void); + bool Connect(const HostAddress& address, const uint16_t port); + bool Initialize(HostAddress::NetworkProtocol protocol); + bool IsValid(void) const; + + // IO-related methods + int64_t NumBytesAvailable(void) const; + int64_t Read(char* dest, size_t max); + int64_t Write(const char* data, size_t length); + + bool WaitForRead(int msec, bool* timedOut); + bool WaitForWrite(int msec, bool* timedOut); + + // query connection state +// HostAddress GetLocalAddress(void) const; +// uint16_t GetLocalPort(void) const; + HostAddress GetRemoteAddress(void) const; + uint16_t GetRemotePort(void) const; + + int GetSocketDescriptor(void) const; + TcpSocket::SocketError GetSocketError(void); + TcpSocket::SocketState GetSocketState(void); + + std::string GetErrorString(void) const; + + // platform-dependent internal methods + // provided in the corresponding TcpSocketEngine__p.cpp + private: + void nativeClose(void); + bool nativeConnect(const HostAddress& address, const uint16_t port); + bool nativeCreateSocket(HostAddress::NetworkProtocol protocol); + void nativeDisconnect(void); + int64_t nativeNumBytesAvailable(void) const; + int64_t nativeRead(char* dest, size_t max); + int nativeSelect(int msecs, bool isRead) const; + int64_t nativeWrite(const char* data, size_t length); + + // data members + private: + int m_socketDescriptor; + +// HostAddress m_localAddress; + HostAddress m_remoteAddress; +// uint16_t m_localPort; + uint16_t m_remotePort; + + TcpSocket::SocketError m_socketError; + TcpSocket::SocketState m_socketState; + std::string m_errorString; +}; + +} // namespace Internal +} // 
namespace BamTools + +#endif // TCPSOCKETENGINE_P_H diff --git a/src/api/internal/io/TcpSocketEngine_unix_p.cpp b/src/api/internal/io/TcpSocketEngine_unix_p.cpp new file mode 100644 index 0000000..efcdf8d --- /dev/null +++ b/src/api/internal/io/TcpSocketEngine_unix_p.cpp @@ -0,0 +1,247 @@ +// *************************************************************************** +// TcpSocketEngine_unix_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 15 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides low-level implementation of TCP I/O for all UNIX-like systems +// *************************************************************************** + +#include "api/internal/io/TcpSocketEngine_p.h" +#include "api/internal/io/NetUnix_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include +#include +#include +using namespace std; + +// ------------------------ +// static utility methods +// ------------------------ + +namespace BamTools { +namespace Internal { + +} // namespace Internal +} // namespace BamTools + +// -------------------------------- +// TcpSocketEngine implementation +// -------------------------------- + +void TcpSocketEngine::nativeClose(void) { + close(m_socketDescriptor); +} + +bool TcpSocketEngine::nativeConnect(const HostAddress& address, const uint16_t port) { + + // setup connection parameters from address/port + sockaddr_in sockAddrIPv4; + sockaddr_in6 sockAddrIPv6; + sockaddr* sockAddrPtr = 0; + BT_SOCKLEN_T sockAddrSize = 0; + + // IPv6 + if ( address.GetProtocol() == HostAddress::IPv6Protocol ) { + + memset(&sockAddrIPv6, 0, sizeof(sockAddrIPv6)); + sockAddrIPv6.sin6_family = AF_INET6; + sockAddrIPv6.sin6_port = htons(port); + + IPv6Address ip6 = address.GetIPv6Address(); + memcpy(&sockAddrIPv6.sin6_addr.s6_addr, &ip6, sizeof(ip6)); + + 
sockAddrSize = sizeof(sockAddrIPv6); + sockAddrPtr = (sockaddr*)&sockAddrIPv6; + } + + // IPv4 + else if ( address.GetProtocol() == HostAddress::IPv4Protocol ) { + + memset(&sockAddrIPv4, 0, sizeof(sockAddrIPv4)); + sockAddrIPv4.sin_family = AF_INET; + sockAddrIPv4.sin_port = htons(port); + sockAddrIPv4.sin_addr.s_addr = htonl(address.GetIPv4Address()); + + sockAddrSize = sizeof(sockAddrIPv4); + sockAddrPtr = (sockaddr*)&sockAddrIPv4; + } + + // unknown (should be unreachable) + else BT_ASSERT_X(false, "TcpSocketEngine::nativeConnect() : unknown network protocol"); + + // attempt connection + int connectResult = connect(m_socketDescriptor, sockAddrPtr, sockAddrSize); + + // if failed, handle error + if ( connectResult == -1 ) { + + // ensure state is set before checking errno + m_socketState = TcpSocket::UnconnectedState; + + // set error type/message depending on errno + switch ( errno ) { // <-- potential thread issues later? but can't get error type from connectResult + + case EISCONN: + m_socketState = TcpSocket::ConnectedState; // socket was already connected + break; + case ECONNREFUSED: + case EINVAL: + m_socketError = TcpSocket::ConnectionRefusedError; + m_errorString = "connection refused"; + break; + case ETIMEDOUT: + m_socketError = TcpSocket::NetworkError; + m_errorString = "connection timed out"; + break; + case EHOSTUNREACH: + m_socketError = TcpSocket::NetworkError; + m_errorString = "host unreachable"; + break; + case ENETUNREACH: + m_socketError = TcpSocket::NetworkError; + m_errorString = "network unreachable"; + break; + case EADDRINUSE: + m_socketError = TcpSocket::SocketResourceError; + m_errorString = "address already in use"; + break; + case EACCES: + case EPERM: + m_socketError = TcpSocket::SocketAccessError; + m_errorString = "permission denied"; + break; + default: + break; + } + + // double check that we're not in 'connected' state; if so, return failure + if ( m_socketState != TcpSocket::ConnectedState ) + return false; + } + + // 
otherwise, we should be good + // update state & return success + m_socketState = TcpSocket::ConnectedState; + return true; +} + +bool TcpSocketEngine::nativeCreateSocket(HostAddress::NetworkProtocol protocol) { + + // get protocol value for requested protocol type + const int protocolNum = ( (protocol == HostAddress::IPv6Protocol) ? AF_INET6 + : AF_INET ); + + // attempt to create socket + int socketFd = socket(protocolNum, SOCK_STREAM, IPPROTO_TCP); + + // if we fetched an invalid socket descriptor + if ( socketFd <= 0 ) { + + // see what error we got + switch ( errno ) { + case EPROTONOSUPPORT: + case EAFNOSUPPORT: + case EINVAL: + m_socketError = TcpSocket::UnsupportedSocketOperationError; + m_errorString = "protocol not supported"; + break; + case ENFILE: + case EMFILE: + case ENOBUFS: + case ENOMEM: + m_socketError = TcpSocket::SocketResourceError; + m_errorString = "out of resources"; + break; + case EACCES: + m_socketError = TcpSocket::SocketAccessError; + m_errorString = "permission denied"; + break; + default: + break; + } + + // return failure + return false; + } + + // otherwise, store our socket FD & return success + m_socketDescriptor = socketFd; + return true; +} + +int64_t TcpSocketEngine::nativeNumBytesAvailable(void) const { + + // fetch number of bytes, return 0 on error + int numBytes(0); + if ( ioctl(m_socketDescriptor, FIONREAD, (char*)&numBytes) < 0 ) + return -1; + return static_cast(numBytes); +} + +int64_t TcpSocketEngine::nativeRead(char* dest, size_t max) { + + if ( !IsValid() ) + return -1; + + ssize_t ret = read(m_socketDescriptor, dest, max); + if ( ret < 0 ) { + ret = -1; + switch ( errno ) { + case EAGAIN : + // No data was available for reading + ret = -2; + break; + case ECONNRESET : + ret = 0; + break; + default: + break; + } + } + return static_cast(ret); +} + +// negative value for msecs will block (forever) until ready +int TcpSocketEngine::nativeSelect(int msecs, bool isRead) const { + + // set up FD set + fd_set fds; + 
FD_ZERO(&fds); + FD_SET(m_socketDescriptor, &fds); + + // setup our timeout + timeval tv; + tv.tv_sec = msecs / 1000; + tv.tv_usec = (msecs % 1000) * 1000; + + // do 'select' (FD set is passed as the read set or the write set, depending on @isRead) + if ( isRead ) + return select(m_socketDescriptor + 1, &fds, 0, 0, (msecs < 0 ? 0 : &tv)); + else + return select(m_socketDescriptor + 1, 0, &fds, 0, (msecs < 0 ? 0 : &tv)); +} + +// returns number of bytes written; 0 on EAGAIN; -1 (after closing the socket) on EPIPE/ECONNRESET; +// any other failure passes write()'s negative result through unchanged +int64_t TcpSocketEngine::nativeWrite(const char* data, size_t length) { + + ssize_t writtenBytes = write(m_socketDescriptor, data, length); + if ( writtenBytes < 0 ) { + switch (errno) { + case EPIPE: + case ECONNRESET: + writtenBytes = -1; + m_socketError = TcpSocket::RemoteHostClosedError; + m_errorString = "remote host closed connection"; + Close(); + break; + case EAGAIN: + writtenBytes = 0; + break; + default: + break; + } + } + return static_cast<int64_t>(writtenBytes); +} diff --git a/src/api/internal/io/TcpSocketEngine_win_p.cpp b/src/api/internal/io/TcpSocketEngine_win_p.cpp new file mode 100644 index 0000000..d1691ac --- /dev/null +++ b/src/api/internal/io/TcpSocketEngine_win_p.cpp @@ -0,0 +1,275 @@ +// *************************************************************************** +// TcpSocketEngine_win_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 15 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides low-level implementation of TCP I/O for all Windows systems +// *************************************************************************** + +#include "api/internal/io/TcpSocketEngine_p.h" +#include "api/internal/io/NetWin_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include +#include +using namespace std; + +// ------------------------ +// static utility methods +// ------------------------ + +namespace BamTools { +namespace Internal { + + +} // namespace Internal +} // namespace BamTools + +//
-------------------------------- +// TcpSocketEngine implementation +// -------------------------------- + +void TcpSocketEngine::nativeClose(void) { + closesocket(m_socketDescriptor); +} + +bool TcpSocketEngine::nativeConnect(const HostAddress& address, const uint16_t port) { + + // setup connection parameters from address/port + sockaddr_in sockAddrIPv4; + sockaddr_in6 sockAddrIPv6; + sockaddr* sockAddrPtr = 0; + BT_SOCKLEN_T sockAddrSize = 0; + + // IPv6 + if ( address.GetProtocol() == HostAddress::IPv6Protocol ) { + + memset(&sockAddrIPv6, 0, sizeof(sockAddrIPv6)); + sockAddrIPv6.sin6_family = AF_INET6; + sockAddrIPv6.sin6_port = htons(port); + + IPv6Address ip6 = address.GetIPv6Address(); + memcpy(&sockAddrIPv6.sin6_addr.s6_addr, &ip6, sizeof(ip6)); + + sockAddrSize = sizeof(sockAddrIPv6); + sockAddrPtr = (sockaddr*)&sockAddrIPv6; + } + + // IPv4 + else if ( address.GetProtocol() == HostAddress::IPv4Protocol ) { + + memset(&sockAddrIPv4, 0, sizeof(sockAddrIPv4)); + sockAddrIPv4.sin_family = AF_INET; + sockAddrIPv4.sin_port = htons(port); + sockAddrIPv4.sin_addr.s_addr = htonl(address.GetIPv4Address()); + + sockAddrSize = sizeof(sockAddrIPv4); + sockAddrPtr = (sockaddr*)&sockAddrIPv4; + } + + // unknown (should be unreachable) + else BT_ASSERT_X(false, "TcpSocketEngine::nativeConnect() : unknown network protocol"); + + // attempt conenction + const int connectResult = WSAConnect(m_socketDescriptor, sockAddrPtr, sockAddrSize, 0, 0, 0, 0); + + // if failed, handle error + if ( connectResult == SOCKET_ERROR ) { + + // ensure state is set before checking error code + m_socketState = TcpSocket::UnconnectedState; + + // set error type/message depending on errorCode + const int errorCode = WSAGetLastError(); + switch ( errorCode ) { + case WSANOTINITIALISED: + m_socketError = TcpSocket::UnknownSocketError; + m_errorString = "Windows socket functionality not properly initialized"; + break; + case WSAEISCONN: + m_socketState = TcpSocket::ConnectedState; // socket 
already connected + break; + case WSAECONNREFUSED: + case WSAEINVAL: + m_socketError = TcpSocket::ConnectionRefusedError; + m_errorString = "connection refused"; + break; + case WSAETIMEDOUT: + m_socketError = TcpSocket::NetworkError; + m_errorString = "connection timed out"; + break; + case WSAEHOSTUNREACH: + m_socketError = TcpSocket::NetworkError; + m_errorString = "host unreachable"; + break; + case WSAENETUNREACH: + m_socketError = TcpSocket::NetworkError; + m_errorString = "network unreachable"; + break; + case WSAEADDRINUSE: + m_socketError = TcpSocket::SocketResourceError; + m_errorString = "address already in use"; + break; + case WSAEACCES: + m_socketError = TcpSocket::SocketAccessError; + m_errorString = "permission denied"; + break; + default: + break; + } + + // double check that we're not in 'connected' state; if so, return failure + if ( m_socketState != TcpSocket::ConnectedState ) + return false; + } + + // otherwise, we should be good + // update state & return success + m_socketState = TcpSocket::ConnectedState; + return true; +} + +bool TcpSocketEngine::nativeCreateSocket(HostAddress::NetworkProtocol protocol) { + + // get protocol value for requested protocol type + const int protocolNum = ( (protocol == HostAddress::IPv6Protocol) ? 
AF_INET6 : AF_INET ); + + // attempt to create socket + SOCKET socketFd = WSASocket(protocolNum, SOCK_STREAM, IPPROTO_TCP, 0, 0, WSA_FLAG_OVERLAPPED); + + // if we fetched an invalid socket descriptor + if ( socketFd == INVALID_SOCKET ) { + + // set error type/message depending on error code + const int errorCode = WSAGetLastError(); + switch ( errorCode ) { + case WSANOTINITIALISED: + m_socketError = TcpSocket::UnknownSocketError; + m_errorString = "Windows socket functionality not properly initialized"; + break; + case WSAEAFNOSUPPORT: + case WSAESOCKTNOSUPPORT: + case WSAEPROTOTYPE: + case WSAEINVAL: + m_socketError = TcpSocket::UnsupportedSocketOperationError; + m_errorString = "protocol not supported"; + break; + case WSAEMFILE: + case WSAENOBUFS: + m_socketError = TcpSocket::SocketResourceError; + m_errorString = "out of resources"; + break; + default: + break; + } + + // return failure + return false; + } + + // otherwise, store our socket FD & return success + m_socketDescriptor = static_cast(socketFd); + return true; +} + +int64_t TcpSocketEngine::nativeNumBytesAvailable(void) const { + + int64_t numBytes(0); + int64_t dummy(0); + DWORD bytesWritten(0); + + const int ioctlResult = WSAIoctl( m_socketDescriptor, FIONREAD + , &dummy, sizeof(dummy) + , &numBytes, sizeof(numBytes) + , &bytesWritten, 0, 0 + ); + return ( ioctlResult == SOCKET_ERROR ? 
-1 : numBytes ); +} + +int64_t TcpSocketEngine::nativeRead(char* dest, size_t max) { + + // skip if invalid socket + if ( !IsValid() ) + return -1; + + // set up our WSA output buffer + WSABUF buf; + buf.buf = dest; + buf.len = max; + + // attempt to read bytes + DWORD flags = 0; + DWORD bytesRead = 0; + const int readResult = WSARecv(m_socketDescriptor, &buf, 1, &bytesRead, &flags, 0, 0); + + // if error encountered + if ( readResult == SOCKET_ERROR ) { + const int errorCode = WSAGetLastError(); + switch ( errorCode ) { + case WSAEWOULDBLOCK: // nothing read this time, but more coming later + return -2; + default: + return -1; // on any other errors + } + } + + // check if nothing was read this time, but more is coming + if ( WSAGetLastError() == WSAEWOULDBLOCK ) + return -2; + + // otherwise return number of bytes read + return static_cast(bytesRead); +} + +// negative value for msecs will block (forever) until +int TcpSocketEngine::nativeSelect(int msecs, bool isRead) const { + + fd_set fds; + FD_ZERO(&fds); + FD_SET(m_socketDescriptor, &fds); + + timeval tv; + tv.tv_sec = msecs / 1000; + tv.tv_usec = (msecs % 1000) * 1000; + + // do 'select' + if ( isRead ) + return select(0, &fds, 0, 0, (msecs < 0 ? 0 : &tv)); + else + return select(0, 0, &fds, 0, (msecs < 0 ? 
0 : &tv)); +} + +int64_t TcpSocketEngine::nativeWrite(const char* data, size_t length) { + + // setup our WSA write buffer + WSABUF buf; + buf.buf = (char*)data; + buf.len = length; + + // attempt to write bytes + DWORD flags = 0; + DWORD bytesWritten = 0; + const int writeResult = WSASend(m_socketDescriptor, &buf, 1, &bytesWritten, flags, 0, 0); + + // error encountered + if ( writeResult == SOCKET_ERROR ) { + + const int errorCode = WSAGetLastError(); + switch ( errorCode ) { + case WSAEWOULDBLOCK: + return 0; + case WSAECONNRESET: + case WSAECONNABORTED: + m_socketError = TcpSocket::NetworkError; + m_errorString = "connection reset or aborted"; + return -1; + default: + return -1; + } + } + + // otherwise return number of bytes written + return static_cast(bytesWritten); +} diff --git a/src/api/internal/io/TcpSocket_p.cpp b/src/api/internal/io/TcpSocket_p.cpp new file mode 100644 index 0000000..4ff53a8 --- /dev/null +++ b/src/api/internal/io/TcpSocket_p.cpp @@ -0,0 +1,432 @@ +// *************************************************************************** +// TcpSocket_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides basic TCP I/O interface +// *************************************************************************** + +#include "api/internal/io/ByteArray_p.h" +#include "api/internal/io/TcpSocket_p.h" +#include "api/internal/io/TcpSocketEngine_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include +#include +#include +using namespace std; + +// ------------------------------------ +// static utility methods & constants +// ------------------------------------ + +namespace BamTools { +namespace Internal { + +// constants +static const size_t DEFAULT_BUFFER_SIZE = 0x4000; + +} // namespace 
Internal +} // namespace BamTools + +// -------------------------- +// TcpSocket implementation +// -------------------------- + +TcpSocket::TcpSocket(void) + : m_mode(IBamIODevice::NotOpen) +// , m_localPort(0) + , m_remotePort(0) + , m_engine(0) + , m_cachedSocketDescriptor(-1) + , m_readBuffer(DEFAULT_BUFFER_SIZE) + , m_error(TcpSocket::UnknownSocketError) + , m_state(TcpSocket::UnconnectedState) +{ } + +TcpSocket::~TcpSocket(void) { + if ( m_state == TcpSocket::ConnectedState ) + DisconnectFromHost(); +} + +size_t TcpSocket::BufferBytesAvailable(void) const { + return m_readBuffer.Size(); +} + +bool TcpSocket::CanReadLine(void) const { + return m_readBuffer.CanReadLine(); +} + +void TcpSocket::ClearBuffer(void) { + m_readBuffer.Clear(); +} + +bool TcpSocket::ConnectImpl(const HostInfo& hostInfo, + const std::string& port, + IBamIODevice::OpenMode mode) +{ + // skip if we're already connected + if ( m_state == TcpSocket::ConnectedState ) { + m_error = TcpSocket::SocketResourceError; + m_errorString = "socket already connected"; + return false; + } + + // reset socket state + m_hostName = hostInfo.HostName(); + m_mode = mode; + m_state = TcpSocket::UnconnectedState; + m_error = TcpSocket::UnknownSocketError; +// m_localPort = 0; + m_remotePort = 0; +// m_localAddress.Clear(); + m_remoteAddress.Clear(); + m_readBuffer.Clear(); + + // fetch candidate addresses for requested host + vector addresses = hostInfo.Addresses(); + if ( addresses.empty() ) { + m_error = TcpSocket::HostNotFoundError; + m_errorString = "no IP addresses found for host"; + return false; + } + + // convert port string to integer + stringstream ss(port); + uint16_t portNumber(0); + ss >> portNumber; + + // iterate through adddresses + vector::const_iterator addrIter = addresses.begin(); + vector::const_iterator addrEnd = addresses.end(); + for ( ; addrIter != addrEnd; ++addrIter) { + const HostAddress& addr = (*addrIter); + + // try to initialize socket engine with this address + if ( 
!InitializeSocketEngine(addr.GetProtocol()) ) { + // failure to initialize is OK here + // we'll just try the next available address + continue; + } + + // attempt actual connection + if ( m_engine->Connect(addr, portNumber) ) { + + // if connection successful, update our state & return true + m_mode = mode; +// m_localAddress = m_engine->GetLocalAddress(); +// m_localPort = m_engine->GetLocalPort(); + m_remoteAddress = m_engine->GetRemoteAddress(); + m_remotePort = m_engine->GetRemotePort(); + m_cachedSocketDescriptor = m_engine->GetSocketDescriptor(); + m_state = TcpSocket::ConnectedState; + return true; + } + } + + // if we get here, no connection could be made + m_error = TcpSocket::HostNotFoundError; + m_errorString = "could not connect to any host addresses"; + return false; +} + +bool TcpSocket::ConnectToHost(const string& hostName, + uint16_t port, + IBamIODevice::OpenMode mode) +{ + stringstream ss(""); + ss << port; + return ConnectToHost(hostName, ss.str(), mode); + +} + +bool TcpSocket::ConnectToHost(const string& hostName, + const string& port, + IBamIODevice::OpenMode mode) +{ + // create new address object with requested host name + HostAddress hostAddress; + hostAddress.SetAddress(hostName); + + HostInfo info; + // if host name was IP address ("x.x.x.x" or IPv6 format) + // otherwise host name was 'plain-text' ("www.foo.bar") + // we need to look up IP address(es) + if ( hostAddress.HasIPAddress() ) + info.SetAddresses( vector(1, hostAddress) ); + else + info = HostInfo::Lookup(hostName, port); + + // attempt connection on requested port + return ConnectImpl(info, port, mode); +} + +void TcpSocket::DisconnectFromHost(void) { + + // close socket engine & delete + if ( m_state == TcpSocket::ConnectedState ) + ResetSocketEngine(); + + // reset connection state +// m_localPort = 0; + m_remotePort = 0; +// m_localAddress.Clear(); + m_remoteAddress.Clear(); + m_hostName.clear(); + m_cachedSocketDescriptor = -1; + + // for future, make sure there's 
outgoing data that needs to be flushed + m_readBuffer.Clear(); +} + +TcpSocket::SocketError TcpSocket::GetError(void) const { + return m_error; +} + +std::string TcpSocket::GetErrorString(void) const { + return m_errorString; +} + +std::string TcpSocket::GetHostName(void) const { + return m_hostName; +} + +//HostAddress TcpSocket::GetLocalAddress(void) const { +// return m_localAddress; +//} + +//uint16_t TcpSocket::GetLocalPort(void) const { +// return m_localPort; +//} + +HostAddress TcpSocket::GetRemoteAddress(void) const { + return m_remoteAddress; +} + +uint16_t TcpSocket::GetRemotePort(void) const { + return m_remotePort; +} + +TcpSocket::SocketState TcpSocket::GetState(void) const { + return m_state; +} + +bool TcpSocket::InitializeSocketEngine(HostAddress::NetworkProtocol protocol) { + ResetSocketEngine(); + m_engine = new TcpSocketEngine; + return m_engine->Initialize(protocol); +} + +bool TcpSocket::IsConnected(void) const { + if ( m_engine == 0 ) + return false; + return ( m_engine->IsValid() && (m_state == TcpSocket::ConnectedState) ); +} + +// may be read in a look until desired data amount has been read +// returns: number of bytes read, or -1 if error +int64_t TcpSocket::Read(char* data, const unsigned int numBytes) { + + // if we have data in buffer, just return it + if ( !m_readBuffer.IsEmpty() ) { + const size_t bytesRead = m_readBuffer.Read(data, numBytes); + return static_cast(bytesRead); + } + + // otherwise, we'll need to fetch data from socket + // first make sure we have a valid socket engine + if ( m_engine == 0 ) { + // TODO: set error string/state? + return -1; + } + + // fetch data from socket, return 0 for success, -1 for failure + // since this should be called in a loop, we'll pull the actual bytes on next iteration + return ( ReadFromSocket() ? 
0 : -1 ); +} + +bool TcpSocket::ReadFromSocket(void) { + + // check for any socket engine errors + if ( !m_engine->IsValid() ) { + m_errorString = "TcpSocket::ReadFromSocket - socket disconnected"; + ResetSocketEngine(); + return false; + } + + // wait for ready read + bool timedOut; + bool isReadyRead = m_engine->WaitForRead(5000, &timedOut); + + // if not ready + if ( !isReadyRead ) { + + // if we simply timed out + if ( timedOut ) { + m_errorString = "TcpSocket::ReadFromSocket - timed out waiting for ready read"; + // get error from engine ? + return false; + } + + // otherwise, there was an error + else { + m_errorString = "TcpSocket::ReadFromSocket - encountered error while waiting for ready read"; + // get error from engine ? + return false; + } + } + + // ######################################################################### + // clean this up - smells funky, but it's a key step so it has to be right + // ######################################################################### + + // get number of bytes available from socket + // (if 0, still try to read some data so we don't trigger any OS event behavior + // that respond to repeated access to a remote closed socket) + int64_t bytesToRead = m_engine->NumBytesAvailable(); + if ( bytesToRead < 0 ) { + m_errorString = "TcpSocket::ReadFromSocket - encountered error while determining numBytesAvailable"; + // get error from engine ? + return false; + } + else if ( bytesToRead == 0 ) + bytesToRead = 4096; + + // make space in buffer & read from socket + char* buffer = m_readBuffer.Reserve(bytesToRead); + int64_t numBytesRead = m_engine->Read(buffer, bytesToRead); + + // if error while reading + if ( numBytesRead == -1 ) { + m_errorString = "TcpSocket::ReadFromSocket - encountered error while reading bytes"; + // get error from engine ? 
+ return false; + } + + // handle special case (no data, but not error) + if ( numBytesRead == -2 ) + m_readBuffer.Chop(bytesToRead); + + // return success + return true; +} + +string TcpSocket::ReadLine(int64_t max) { + + // prep result byte buffer + ByteArray result; + + size_t bufferMax = ((max > static_cast(string::npos)) ? string::npos : static_cast(max)); + result.Resize(bufferMax); + + // read data + int64_t readBytes(0); + if ( result.Size() == 0 ) { + + if ( bufferMax == 0 ) + bufferMax = string::npos; + + result.Resize(1); + + int64_t readResult; + do { + result.Resize( static_cast(std::min(bufferMax, result.Size() + DEFAULT_BUFFER_SIZE)) ); + readResult = ReadLine(result.Data()+readBytes, result.Size()-readBytes); + if ( readResult > 0 || readBytes == 0 ) + readBytes += readResult; + } while ( readResult == DEFAULT_BUFFER_SIZE && result[static_cast(readBytes-1)] != '\n' ); + + } else + readBytes = ReadLine(result.Data(), result.Size()); + + // clean up byte buffer + if ( readBytes <= 0 ) + result.Clear(); + else + result.Resize(static_cast(readBytes)); + + // return byte buffer as string + return string( result.ConstData(), result.Size() ); +} + +int64_t TcpSocket::ReadLine(char* dest, size_t max) { + + // wait for buffer to contain line contents + if ( !WaitForReadLine() ) { + m_errorString = "TcpSocket::ReadLine - error waiting for read line"; + return -1; + } + + // leave room for null term + if ( max < 2 ) + return -1; + --max; + + // read from buffer, handle newlines + int64_t readSoFar = m_readBuffer.ReadLine(dest, max); + if ( readSoFar && dest[readSoFar-1] == '\n' ) { + + // adjust for windows-style '\r\n' + if ( readSoFar > 1 && dest[readSoFar-2] == '\r') { + --readSoFar; + dest[readSoFar-1] = '\n'; + } + } + + // null terminate & return number of bytes read + dest[readSoFar] = '\0'; + return readSoFar; +} + +void TcpSocket::ResetSocketEngine(void) { + + // shut down socket engine + if ( m_engine ) { + m_engine->Close(); + delete m_engine; + 
m_engine = 0; + } + + // reset our state & cached socket handle + m_state = TcpSocket::UnconnectedState; + m_cachedSocketDescriptor = -1; +} + +bool TcpSocket::WaitForReadLine(void) { + + // wait until we can read a line (will return immediately if already capable) + while ( !CanReadLine() ) { + if ( !ReadFromSocket() ) + return false; + } + + // if we get here, success + return true; +} + +int64_t TcpSocket::Write(const char* data, const unsigned int numBytes) { + + // single-shot attempt at write (not buffered, just try to shove the data through socket) + // this method purely exists to send 'small' HTTP requests/FTP commands from client to server + + int64_t bytesWritten(0); + + // wait for our socket to be write-able + bool timedOut; + bool isReadyWrite = m_engine->WaitForWrite(3000, &timedOut); + if ( isReadyWrite ) + bytesWritten = m_engine->Write(data, numBytes); + else { + // timeout is OK (with current setup), we'll just return 0 & try again + // but we need to report if engine encountered some other error + if ( !timedOut ) { + // TODO: set error string + bytesWritten = -1; + } + } + + // return actual number of bytes written to socket + return bytesWritten; +} diff --git a/src/api/internal/io/TcpSocket_p.h b/src/api/internal/io/TcpSocket_p.h new file mode 100644 index 0000000..3c5f2fc --- /dev/null +++ b/src/api/internal/io/TcpSocket_p.h @@ -0,0 +1,124 @@ +// *************************************************************************** +// TcpSocket_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 10 November 2011 (DB) +// --------------------------------------------------------------------------- +// Provides basic TCP I/O interface +// *************************************************************************** + +#ifndef TCPSOCKET_P_H +#define TCPSOCKET_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This 
file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. + +#include "api/IBamIODevice.h" +#include "api/internal/io/HostInfo_p.h" +#include "api/internal/io/RollingBuffer_p.h" +#include + +namespace BamTools { +namespace Internal { + +class TcpSocketEngine; + +class TcpSocket { + + // enums + public: + enum SocketError { UnknownSocketError = -1 + , ConnectionRefusedError = 0 + , RemoteHostClosedError + , HostNotFoundError + , SocketAccessError + , SocketResourceError + , SocketTimeoutError + , NetworkError + , UnsupportedSocketOperationError + }; + + enum SocketState { UnconnectedState = 0 + , ConnectedState + }; + + // ctor & dtor + public: + TcpSocket(void); + ~TcpSocket(void); + + // TcpSocket interface + public: + + // connection methods + bool ConnectToHost(const std::string& hostName, + const uint16_t port, // Connect("host", 80) + IBamIODevice::OpenMode mode = IBamIODevice::ReadOnly); + bool ConnectToHost(const std::string& hostName, + const std::string& port, // Connect("host", "80") + IBamIODevice::OpenMode mode = IBamIODevice::ReadOnly); + void DisconnectFromHost(void); + bool IsConnected(void) const; + + // I/O methods + size_t BufferBytesAvailable(void) const; + bool CanReadLine(void) const; + void ClearBuffer(void); // force buffer to clear (not a 'flush', just a 'discard') + int64_t Read(char* data, const unsigned int numBytes); + std::string ReadLine(int64_t max = 0); + int64_t ReadLine(char* dest, size_t max); + bool WaitForReadLine(void); + int64_t Write(const char* data, const unsigned int numBytes); + + // connection values + std::string GetHostName(void) const; +// HostAddress GetLocalAddress(void) const; +// uint16_t GetLocalPort(void) const; + HostAddress GetRemoteAddress(void) const; + uint16_t GetRemotePort(void) const; + + // connection status + TcpSocket::SocketError GetError(void) const; + 
TcpSocket::SocketState GetState(void) const; + std::string GetErrorString(void) const; + + // internal methods + private: + bool ConnectImpl(const HostInfo& hostInfo, + const std::string& port, + IBamIODevice::OpenMode mode); + bool InitializeSocketEngine(HostAddress::NetworkProtocol protocol); + bool ReadFromSocket(void); + void ResetSocketEngine(void); + + // data members + private: + IBamIODevice::OpenMode m_mode; + + std::string m_hostName; +// uint16_t m_localPort; + uint16_t m_remotePort; +// HostAddress m_localAddress; + HostAddress m_remoteAddress; + + TcpSocketEngine* m_engine; + int m_cachedSocketDescriptor; + + RollingBuffer m_readBuffer; + + TcpSocket::SocketError m_error; + TcpSocket::SocketState m_state; + std::string m_errorString; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // TCPSOCKET_P_H diff --git a/src/api/internal/sam/CMakeLists.txt b/src/api/internal/sam/CMakeLists.txt new file mode 100644 index 0000000..4b2bce2 --- /dev/null +++ b/src/api/internal/sam/CMakeLists.txt @@ -0,0 +1,17 @@ +# ========================== +# BamTools CMakeLists.txt +# (c) 2011 Derek Barnett +# +# src/api/internal/sam +# ========================== + +set ( InternalSamDir "${InternalDir}/sam" ) + +set ( InternalSamSources + ${InternalSamDir}/SamFormatParser_p.cpp + ${InternalSamDir}/SamFormatPrinter_p.cpp + ${InternalSamDir}/SamHeaderValidator_p.cpp + + PARENT_SCOPE # <-- leave this last +) + diff --git a/src/api/internal/sam/SamFormatParser_p.cpp b/src/api/internal/sam/SamFormatParser_p.cpp new file mode 100644 index 0000000..74c1fed --- /dev/null +++ b/src/api/internal/sam/SamFormatParser_p.cpp @@ -0,0 +1,222 @@ +// *************************************************************************** +// SamFormatParser.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// --------------------------------------------------------------------------- +// Last modified: 25 October 2011 (DB) +// 
--------------------------------------------------------------------------- +// Provides functionality for parsing SAM header text into SamHeader object +// *************************************************************************** + +#include "api/SamConstants.h" +#include "api/SamHeader.h" +#include "api/internal/sam/SamFormatParser_p.h" +#include "api/internal/utils/BamException_p.h" +using namespace BamTools; +using namespace BamTools::Internal; + +#include +#include +#include +using namespace std; + +SamFormatParser::SamFormatParser(SamHeader& header) + : m_header(header) +{ } + +SamFormatParser::~SamFormatParser(void) { } + +void SamFormatParser::Parse(const string& headerText) { + + // clear header's prior contents + m_header.Clear(); + + // empty header is OK, but skip processing + if ( headerText.empty() ) + return; + + // other wise parse SAM lines + istringstream headerStream(headerText); + string headerLine(""); + while ( getline(headerStream, headerLine) ) + ParseSamLine(headerLine); +} + +void SamFormatParser::ParseSamLine(const string& line) { + + // skip if line is not long enough to contain true values + if ( line.length() < 5 ) return; + + // determine token at beginning of line + const string firstToken = line.substr(0,3); + string restOfLine = line.substr(4); + if ( firstToken == Constants::SAM_HD_BEGIN_TOKEN) ParseHDLine(restOfLine); + else if ( firstToken == Constants::SAM_SQ_BEGIN_TOKEN) ParseSQLine(restOfLine); + else if ( firstToken == Constants::SAM_RG_BEGIN_TOKEN) ParseRGLine(restOfLine); + else if ( firstToken == Constants::SAM_PG_BEGIN_TOKEN) ParsePGLine(restOfLine); + else if ( firstToken == Constants::SAM_CO_BEGIN_TOKEN) ParseCOLine(restOfLine); + else { + const string message = string("unknown token: ") + firstToken; + throw BamException("SamFormatParser::ParseSamLine", message); + } +} + +void SamFormatParser::ParseHDLine(const string& line) { + + // split HD lines into tokens + vector tokens = Split(line, Constants::SAM_TAB); + + 
// iterate over tokens + vector::const_iterator tokenIter = tokens.begin(); + vector::const_iterator tokenEnd = tokens.end(); + for ( ; tokenIter != tokenEnd; ++tokenIter ) { + + // get tag/value + const string tokenTag = (*tokenIter).substr(0,2); + const string tokenValue = (*tokenIter).substr(3); + + // set header contents + if ( tokenTag == Constants::SAM_HD_VERSION_TAG ) m_header.Version = tokenValue; + else if ( tokenTag == Constants::SAM_HD_SORTORDER_TAG ) m_header.SortOrder = tokenValue; + else if ( tokenTag == Constants::SAM_HD_GROUPORDER_TAG ) m_header.GroupOrder = tokenValue; + else { + const string message = string("unknown HD tag: ") + tokenTag; + throw BamException("SamFormatParser::ParseHDLine", message); + } + } + + // check for required tags + if ( !m_header.HasVersion() ) + throw BamException("SamFormatParser::ParseHDLine", "@HD line is missing VN tag"); +} + +void SamFormatParser::ParseSQLine(const string& line) { + + SamSequence seq; + + // split SQ line into tokens + vector tokens = Split(line, Constants::SAM_TAB); + + // iterate over tokens + vector::const_iterator tokenIter = tokens.begin(); + vector::const_iterator tokenEnd = tokens.end(); + for ( ; tokenIter != tokenEnd; ++tokenIter ) { + + // get tag/value + const string tokenTag = (*tokenIter).substr(0,2); + const string tokenValue = (*tokenIter).substr(3); + + // set sequence contents + if ( tokenTag == Constants::SAM_SQ_NAME_TAG ) seq.Name = tokenValue; + else if ( tokenTag == Constants::SAM_SQ_LENGTH_TAG ) seq.Length = tokenValue; + else if ( tokenTag == Constants::SAM_SQ_ASSEMBLYID_TAG ) seq.AssemblyID = tokenValue; + else if ( tokenTag == Constants::SAM_SQ_CHECKSUM_TAG ) seq.Checksum = tokenValue; + else if ( tokenTag == Constants::SAM_SQ_SPECIES_TAG ) seq.Species = tokenValue; + else if ( tokenTag == Constants::SAM_SQ_URI_TAG ) seq.URI = tokenValue; + else { + const string message = string("unknown SQ tag: ") + tokenTag; + throw BamException("SamFormatParser::ParseSQLine", message); 
+ } + } + + // check for required tags + if ( !seq.HasName() ) + throw BamException("SamFormatParser::ParseSQLine", "@SQ line is missing SN tag"); + if ( !seq.HasLength() ) + throw BamException("SamFormatParser::ParseSQLine", "@SQ line is missing LN tag"); + + // store SAM sequence entry + m_header.Sequences.Add(seq); +} + +void SamFormatParser::ParseRGLine(const string& line) { + + SamReadGroup rg; + + // split string into tokens + vector tokens = Split(line, Constants::SAM_TAB); + + // iterate over tokens + vector::const_iterator tokenIter = tokens.begin(); + vector::const_iterator tokenEnd = tokens.end(); + for ( ; tokenIter != tokenEnd; ++tokenIter ) { + + // get token tag/value + const string tokenTag = (*tokenIter).substr(0,2); + const string tokenValue = (*tokenIter).substr(3); + + // set read group contents + if ( tokenTag == Constants::SAM_RG_ID_TAG ) rg.ID = tokenValue; + else if ( tokenTag == Constants::SAM_RG_DESCRIPTION_TAG ) rg.Description = tokenValue; + else if ( tokenTag == Constants::SAM_RG_FLOWORDER_TAG ) rg.FlowOrder = tokenValue; + else if ( tokenTag == Constants::SAM_RG_KEYSEQUENCE_TAG ) rg.KeySequence = tokenValue; + else if ( tokenTag == Constants::SAM_RG_LIBRARY_TAG ) rg.Library = tokenValue; + else if ( tokenTag == Constants::SAM_RG_PLATFORMUNIT_TAG ) rg.PlatformUnit = tokenValue; + else if ( tokenTag == Constants::SAM_RG_PREDICTEDINSERTSIZE_TAG ) rg.PredictedInsertSize = tokenValue; + else if ( tokenTag == Constants::SAM_RG_PRODUCTIONDATE_TAG ) rg.ProductionDate = tokenValue; + else if ( tokenTag == Constants::SAM_RG_PROGRAM_TAG ) rg.Program = tokenValue; + else if ( tokenTag == Constants::SAM_RG_SAMPLE_TAG ) rg.Sample = tokenValue; + else if ( tokenTag == Constants::SAM_RG_SEQCENTER_TAG ) rg.SequencingCenter = tokenValue; + else if ( tokenTag == Constants::SAM_RG_SEQTECHNOLOGY_TAG ) rg.SequencingTechnology = tokenValue; + else { + const string message = string("unknown RG tag: ") + tokenTag; + throw 
BamException("SamFormatParser::ParseRGLine", message); + } + } + + // check for required tags + if ( !rg.HasID() ) + throw BamException("SamFormatParser::ParseRGLine", "@RG line is missing ID tag"); + + // store SAM read group entry + m_header.ReadGroups.Add(rg); +} + +void SamFormatParser::ParsePGLine(const string& line) { + + SamProgram pg; + + // split string into tokens + vector tokens = Split(line, Constants::SAM_TAB); + + // iterate over tokens + vector::const_iterator tokenIter = tokens.begin(); + vector::const_iterator tokenEnd = tokens.end(); + for ( ; tokenIter != tokenEnd; ++tokenIter ) { + + // get token tag/value + const string tokenTag = (*tokenIter).substr(0,2); + const string tokenValue = (*tokenIter).substr(3); + + // set program record contents + if ( tokenTag == Constants::SAM_PG_ID_TAG ) pg.ID = tokenValue; + else if ( tokenTag == Constants::SAM_PG_NAME_TAG ) pg.Name = tokenValue; + else if ( tokenTag == Constants::SAM_PG_COMMANDLINE_TAG ) pg.CommandLine = tokenValue; + else if ( tokenTag == Constants::SAM_PG_PREVIOUSPROGRAM_TAG ) pg.PreviousProgramID = tokenValue; + else if ( tokenTag == Constants::SAM_PG_VERSION_TAG ) pg.Version = tokenValue; + else { + const string message = string("unknown PG tag: ") + tokenTag; + throw BamException("SamFormatParser::ParsePGLine", message); + } + } + + // check for required tags + if ( !pg.HasID() ) + throw BamException("SamFormatParser::ParsePGLine", "@PG line is missing ID tag"); + + // store SAM program entry + m_header.Programs.Add(pg); +} + +void SamFormatParser::ParseCOLine(const string& line) { + // simply add line to comments list + m_header.Comments.push_back(line); +} + +const vector SamFormatParser::Split(const string& line, const char delim) { + vector tokens; + stringstream lineStream(line); + string token; + while ( getline(lineStream, token, delim) ) + tokens.push_back(token); + return tokens; +} diff --git a/src/api/internal/sam/SamFormatParser_p.h b/src/api/internal/sam/SamFormatParser_p.h 
// new file mode 100644
// index 0000000..cf6d54c
// --- /dev/null
// +++ b/src/api/internal/sam/SamFormatParser_p.h
// @@ -0,0 +1,61 @@

// ***************************************************************************
// SamFormatParser.h (c) 2010 Derek Barnett
// Marth Lab, Department of Biology, Boston College
// ---------------------------------------------------------------------------
// Last modified: 23 December 2010 (DB)
// ---------------------------------------------------------------------------
// Provides functionality for parsing SAM header text into SamHeader object
// ***************************************************************************

#ifndef SAM_FORMAT_PARSER_H
#define SAM_FORMAT_PARSER_H

// -------------
// W A R N I N G
// -------------
//
// This file is not part of the BamTools API. It exists purely as an
// implementation detail. This header file may change from version to version
// without notice, or even be removed.
//
// We mean it.

// NOTE(review): the names of these two standard headers were lost when this
// patch was extracted; <string> and <vector> are assumed from the types used
// below - confirm against the repository.
#include <string>
#include <vector>

namespace BamTools {

class SamHeader;

namespace Internal {

// Parses SAM header text and stores the result in the SamHeader it was
// constructed with (held by reference).
class SamFormatParser {

    public:
        // ctor & dtor
        SamFormatParser(BamTools::SamHeader& header);
        ~SamFormatParser(void);

        // parse text & populate header data
        void Parse(const std::string& headerText);

    private:
        // internal methods: one parser per header record type,
        // dispatched from ParseSamLine()
        void ParseSamLine(const std::string& line);
        void ParseHDLine(const std::string& line);
        void ParseSQLine(const std::string& line);
        void ParseRGLine(const std::string& line);
        void ParsePGLine(const std::string& line);
        void ParseCOLine(const std::string& line);

        // breaks 'line' into the pieces separated by 'delim'
        const std::vector<std::string> Split(const std::string& line, const char delim);

        // data members
        SamHeader& m_header;
};

} // namespace Internal
} // namespace BamTools

#endif // SAM_FORMAT_PARSER_H

// diff --git a/src/api/internal/sam/SamFormatPrinter_p.cpp b/src/api/internal/sam/SamFormatPrinter_p.cpp
// new file mode 100644
// index 0000000..5a51a2f
// --- /dev/null
// +++
b/src/api/internal/sam/SamFormatPrinter_p.cpp
@@ -0,0 +1,219 @@
+// ***************************************************************************
+// SamFormatPrinter.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for printing formatted SAM header to string
+// ***************************************************************************
+
+#include "api/SamConstants.h"
+#include "api/SamHeader.h"
+#include "api/internal/sam/SamFormatPrinter_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <iostream>
+#include <sstream>
+#include <vector>
+using namespace std;
+
+// ------------------------
+// static utility methods
+// ------------------------
+
+static inline
+const string FormatTag(const string& tag, const string& value) {
+    return string(Constants::SAM_TAB + tag + Constants::SAM_COLON + value);
+}
+
+// ---------------------------------
+// SamFormatPrinter implementation
+// ---------------------------------
+
+SamFormatPrinter::SamFormatPrinter(const SamHeader& header)
+    : m_header(header)
+{ }
+
+SamFormatPrinter::~SamFormatPrinter(void) { }
+
+const string SamFormatPrinter::ToString(void) const {
+
+    // clear out stream
+    stringstream out("");
+
+    // generate formatted header text
+    PrintHD(out);
+    PrintSQ(out);
+    PrintRG(out);
+    PrintPG(out);
+    PrintCO(out);
+
+    // return result
+    return out.str();
+}
+
+void SamFormatPrinter::PrintHD(std::stringstream& out) const {
+
+    // if header has @HD data
+    if ( m_header.HasVersion() ) {
+
+        // @HD VN:<Version>
+        out << Constants::SAM_HD_BEGIN_TOKEN
+            << FormatTag(Constants::SAM_HD_VERSION_TAG, m_header.Version);
+
+        // SO:<SortOrder>
+        if ( m_header.HasSortOrder() )
+            out << FormatTag(Constants::SAM_HD_SORTORDER_TAG, m_header.SortOrder);
+
+        // GO:<GroupOrder>
+        if ( m_header.HasGroupOrder() )
+            out << FormatTag(Constants::SAM_HD_GROUPORDER_TAG, m_header.GroupOrder);
+
+        // newline
+        out << endl;
+    }
+}
+
+void SamFormatPrinter::PrintSQ(std::stringstream& out) const {
+
+    // iterate over sequence entries
+    SamSequenceConstIterator seqIter = m_header.Sequences.ConstBegin();
+    SamSequenceConstIterator seqEnd  = m_header.Sequences.ConstEnd();
+    for ( ; seqIter != seqEnd; ++seqIter ) {
+        const SamSequence& seq = (*seqIter);
+
+        // @SQ SN:<Name> LN:<Length>
+        out << Constants::SAM_SQ_BEGIN_TOKEN
+            << FormatTag(Constants::SAM_SQ_NAME_TAG, seq.Name)
+            << FormatTag(Constants::SAM_SQ_LENGTH_TAG, seq.Length);
+
+        // AS:<AssemblyID>
+        if ( seq.HasAssemblyID() )
+            out << FormatTag(Constants::SAM_SQ_ASSEMBLYID_TAG, seq.AssemblyID);
+
+        // M5:<Checksum>
+        if ( seq.HasChecksum() )
+            out << FormatTag(Constants::SAM_SQ_CHECKSUM_TAG, seq.Checksum);
+
+        // SP:<Species>
+        if ( seq.HasSpecies() )
+            out << FormatTag(Constants::SAM_SQ_SPECIES_TAG, seq.Species);
+
+        // UR:<URI>
+        if ( seq.HasURI() )
+            out << FormatTag(Constants::SAM_SQ_URI_TAG, seq.URI);
+
+        // newline
+        out << endl;
+    }
+}
+
+void SamFormatPrinter::PrintRG(std::stringstream& out) const {
+
+    // iterate over read group entries
+    SamReadGroupConstIterator rgIter = m_header.ReadGroups.ConstBegin();
+    SamReadGroupConstIterator rgEnd  = m_header.ReadGroups.ConstEnd();
+    for ( ; rgIter != rgEnd; ++rgIter ) {
+        const SamReadGroup& rg = (*rgIter);
+
+        // @RG ID:<ID>
+        out << Constants::SAM_RG_BEGIN_TOKEN
+            << FormatTag(Constants::SAM_RG_ID_TAG, rg.ID);
+
+        // CN:<SequencingCenter>
+        if ( rg.HasSequencingCenter() )
+            out << FormatTag(Constants::SAM_RG_SEQCENTER_TAG, rg.SequencingCenter);
+
+        // DS:<Description>
+        if ( rg.HasDescription() )
+            out << FormatTag(Constants::SAM_RG_DESCRIPTION_TAG, rg.Description);
+
+        // DT:<ProductionDate>
+        if ( rg.HasProductionDate() )
+            out << FormatTag(Constants::SAM_RG_PRODUCTIONDATE_TAG, rg.ProductionDate);
+
+        // FO:<FlowOrder>
+        if ( rg.HasFlowOrder() )
+            out << FormatTag(Constants::SAM_RG_FLOWORDER_TAG, rg.FlowOrder);
+
+        // KS:<KeySequence>
+        if ( rg.HasKeySequence() )
+            out << FormatTag(Constants::SAM_RG_KEYSEQUENCE_TAG, rg.KeySequence);
+
+        // LB:<Library>
+        if ( rg.HasLibrary() )
+            out << FormatTag(Constants::SAM_RG_LIBRARY_TAG, rg.Library);
+
+        // PG:<Program>
+        if ( rg.HasProgram() )
+            out << FormatTag(Constants::SAM_RG_PROGRAM_TAG, rg.Program);
+
+        // PI:<PredictedInsertSize>
+        if ( rg.HasPredictedInsertSize() )
+            out << FormatTag(Constants::SAM_RG_PREDICTEDINSERTSIZE_TAG, rg.PredictedInsertSize);
+
+        // PL:<SequencingTechnology>
+        if ( rg.HasSequencingTechnology() )
+            out << FormatTag(Constants::SAM_RG_SEQTECHNOLOGY_TAG, rg.SequencingTechnology);
+
+        // PU:<PlatformUnit>
+        if ( rg.HasPlatformUnit() )
+            out << FormatTag(Constants::SAM_RG_PLATFORMUNIT_TAG, rg.PlatformUnit);
+
+        // SM:<Sample>
+        if ( rg.HasSample() )
+            out << FormatTag(Constants::SAM_RG_SAMPLE_TAG, rg.Sample);
+
+        // newline
+        out << endl;
+    }
+}
+
+void SamFormatPrinter::PrintPG(std::stringstream& out) const {
+
+    // iterate over program record entries
+    SamProgramConstIterator pgIter = m_header.Programs.ConstBegin();
+    SamProgramConstIterator pgEnd  = m_header.Programs.ConstEnd();
+    for ( ; pgIter != pgEnd; ++pgIter ) {
+        const SamProgram& pg = (*pgIter);
+
+        // @PG ID:<ID>
+        out << Constants::SAM_PG_BEGIN_TOKEN
+            << FormatTag(Constants::SAM_PG_ID_TAG, pg.ID);
+
+        // PN:<Name>
+        if ( pg.HasName() )
+            out << FormatTag(Constants::SAM_PG_NAME_TAG, pg.Name);
+
+        // CL:<CommandLine>
+        if ( pg.HasCommandLine() )
+            out << FormatTag(Constants::SAM_PG_COMMANDLINE_TAG, pg.CommandLine);
+
+        // PP:<PreviousProgramID>
+        if ( pg.HasPreviousProgramID() )
+            out << FormatTag(Constants::SAM_PG_PREVIOUSPROGRAM_TAG, pg.PreviousProgramID);
+
+        // VN:<Version>
+        if ( pg.HasVersion() )
+            out << FormatTag(Constants::SAM_PG_VERSION_TAG, pg.Version);
+
+        // newline
+        out << endl;
+    }
+}
+
+void SamFormatPrinter::PrintCO(std::stringstream& out) const {
+
+    // iterate over comments
+    vector<string>::const_iterator commentIter = m_header.Comments.begin();
+    vector<string>::const_iterator commentEnd  = m_header.Comments.end();
+    for ( ; commentIter != commentEnd; ++commentIter ) {
+
+        // @CO <Comment>
+        out << Constants::SAM_CO_BEGIN_TOKEN
+            << Constants::SAM_TAB
+            << (*commentIter)
+            << endl;
+    }
+}
diff --git a/src/api/internal/sam/SamFormatPrinter_p.h b/src/api/internal/sam/SamFormatPrinter_p.h
new file mode 100644
index 0000000..ea29181
--- /dev/null
+++ b/src/api/internal/sam/SamFormatPrinter_p.h
@@ -0,0 +1,59 @@
+// ***************************************************************************
+// SamFormatPrinter.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 6 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for printing formatted SAM header to string
+// ***************************************************************************
+
+#ifndef SAM_FORMAT_PRINTER_H
+#define SAM_FORMAT_PRINTER_H
+
+//  -------------
+//  W A R N I N G
+//  -------------
+//
+// This file is not part of the BamTools API.  It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <sstream>
+#include <string>
+
+namespace BamTools {
+
+class SamHeader;
+
+namespace Internal {
+
+class SamFormatPrinter {
+
+    // ctor & dtor
+    public:
+        SamFormatPrinter(const BamTools::SamHeader& header);
+        ~SamFormatPrinter(void);
+
+    // generates SAM-formatted string from header data
+    public:
+        const std::string ToString(void) const;
+
+    // internal methods
+    private:
+        void PrintHD(std::stringstream& out) const;
+        void PrintSQ(std::stringstream& out) const;
+        void PrintRG(std::stringstream& out) const;
+        void PrintPG(std::stringstream& out) const;
+        void PrintCO(std::stringstream& out) const;
+
+    // data members
+    private:
+        const SamHeader& m_header;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // SAM_FORMAT_PRINTER_H
diff --git a/src/api/internal/sam/SamHeaderValidator_p.cpp b/src/api/internal/sam/SamHeaderValidator_p.cpp
new file mode 100644
index 0000000..6bcb8a9
--- /dev/null
+++ b/src/api/internal/sam/SamHeaderValidator_p.cpp
@@ -0,0 +1,524 @@
+// ***************************************************************************
+// SamHeaderValidator.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for validating SamHeader data
+// ***************************************************************************
+
+#include "api/SamConstants.h"
+#include "api/SamHeader.h"
+#include "api/internal/sam/SamHeaderValidator_p.h"
+#include "api/internal/sam/SamHeaderVersion_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cctype>
+#include <set>
+#include <sstream>
+using namespace std;
+
+// ------------------------
+// static utility methods
+// -------------------------
+
+static
+bool caseInsensitiveCompare(const string& lhs, const string& rhs) {
+
+    // can omit checking chars if lengths not equal
+    const size_t lhsLength = lhs.length();
+    const size_t rhsLength = rhs.length();
+    if ( lhsLength != rhsLength )
+        return false;
+
+    // do *basic* toupper checks on each char (via unsigned char: toupper is undefined for negative values)
+    for ( size_t i = 0; i < lhsLength; ++i ) {
+        if ( toupper( (unsigned char)lhs.at(i)) != toupper( (unsigned char)rhs.at(i)) )
+            return false;
+    }
+
+    // otherwise OK
+    return true;
+}
+
+// ------------------------------------------------------------------------
+// Allow validation rules to vary, as needed, between SAM header versions
+//
+// use SAM_VERSION_X_Y to tag important changes
+//
+// Together, they will allow for comparisons like:
+// if ( m_version < SAM_VERSION_2_0 ) {
+//     // use some older rule
+// else
+//     // use rule introduced with version 2.0
+
+static const SamHeaderVersion SAM_VERSION_1_0 = SamHeaderVersion(1,0);
+static const SamHeaderVersion SAM_VERSION_1_1 = SamHeaderVersion(1,1);
+static const SamHeaderVersion SAM_VERSION_1_2 = SamHeaderVersion(1,2);
+static const SamHeaderVersion SAM_VERSION_1_3 = SamHeaderVersion(1,3);
+static const SamHeaderVersion SAM_VERSION_1_4 = SamHeaderVersion(1,4);
+
+// TODO: This functionality is currently unused.
+//       Make validation "version-aware."
+//
+// ------------------------------------------------------------------------
+
+const string SamHeaderValidator::ERROR_PREFIX = "ERROR: ";
+const string SamHeaderValidator::WARN_PREFIX  = "WARNING: ";
+const string SamHeaderValidator::NEWLINE      = "\n";
+
+SamHeaderValidator::SamHeaderValidator(const SamHeader& header)
+    : m_header(header)
+{ }
+
+SamHeaderValidator::~SamHeaderValidator(void) { }
+
+void SamHeaderValidator::AddError(const string& message) {
+    m_errorMessages.push_back(ERROR_PREFIX + message + NEWLINE);
+}
+
+void SamHeaderValidator::AddWarning(const string& message) {
+    m_warningMessages.push_back(WARN_PREFIX + message + NEWLINE);
+}
+
+void SamHeaderValidator::PrintErrorMessages(ostream& stream) {
+
+    // skip if no error messages
+    if ( m_errorMessages.empty() )
+        return;
+
+    // print error header line
+    stream << "* SAM header has " << m_errorMessages.size() << " errors:" << endl;
+
+    // print each error message
+    vector<string>::const_iterator errorIter = m_errorMessages.begin();
+    vector<string>::const_iterator errorEnd  = m_errorMessages.end();
+    for ( ; errorIter != errorEnd; ++errorIter )
+        stream << (*errorIter);
+}
+
+void SamHeaderValidator::PrintMessages(ostream& stream) {
+    PrintErrorMessages(stream);
+    PrintWarningMessages(stream);
+}
+
+void SamHeaderValidator::PrintWarningMessages(ostream& stream) {
+
+    // skip if no warning messages
+    if ( m_warningMessages.empty() )
+        return;
+
+    // print warning header line
+    stream << "* SAM header has " << m_warningMessages.size() << " warnings:" << endl;
+
+    // print each warning message
+    vector<string>::const_iterator warnIter = m_warningMessages.begin();
+    vector<string>::const_iterator warnEnd  = m_warningMessages.end();
+    for ( ; warnIter != warnEnd; ++warnIter )
+        stream << (*warnIter);
+}
+
+// entry point for validation
+bool SamHeaderValidator::Validate(void) {
+    bool isValid = true;
+    isValid &= ValidateMetadata();
+    isValid &= ValidateSequenceDictionary();
+    isValid &= ValidateReadGroupDictionary();
+    isValid &= ValidateProgramChain();
+    return isValid;
+}
+
+// check all SAM header 'metadata'
+bool SamHeaderValidator::ValidateMetadata(void) {
+    bool isValid = true;
+    isValid &= ValidateVersion();
+    isValid &= ValidateSortOrder();
+    isValid &= ValidateGroupOrder();
+    return isValid;
+}
+
+// check SAM header version tag
+bool SamHeaderValidator::ValidateVersion(void) {
+
+    const string& version = m_header.Version;
+
+    // warn if version not present
+    if ( version.empty() ) {
+        AddWarning("Version (VN) missing. Not required, but strongly recommended");
+        return true;
+    }
+
+    // invalid if version does not contain a period
+    const size_t periodFound = version.find(Constants::SAM_PERIOD);
+    if ( periodFound == string::npos ) {
+        AddError("Invalid version (VN) format: " + version);
+        return false;
+    }
+
+    // invalid if major version is empty or contains non-digits
+    const string majorVersion = version.substr(0, periodFound);
+    if ( majorVersion.empty() || !ContainsOnlyDigits(majorVersion) ) {
+        AddError("Invalid version (VN) format: " + version);
+        return false;
+    }
+
+    // invalid if minor version is empty or contains non-digits
+    const string minorVersion = version.substr(periodFound + 1);
+    if ( minorVersion.empty() || !ContainsOnlyDigits(minorVersion) ) {
+        AddError("Invalid version (VN) format: " + version);
+        return false;
+    }
+
+    // TODO: check if version is not just syntactically OK,
+    // but is also a valid SAM version ( 1.0 .. CURRENT )
+
+    // all checked out this far, then version is OK
+    return true;
+}
+
+// assumes non-empty input string
+bool SamHeaderValidator::ContainsOnlyDigits(const string& s) {
+    const size_t nonDigitPosition = s.find_first_not_of(Constants::SAM_DIGITS);
+    return ( nonDigitPosition == string::npos ) ;
+}
+
+// validate SAM header sort order tag
+bool SamHeaderValidator::ValidateSortOrder(void) {
+
+    const string& sortOrder = m_header.SortOrder;
+
+    // warn if sort order not present
+    if ( sortOrder.empty() ) {
+        AddWarning("Sort order (SO) missing. Not required, but strongly recommended");
+        return true;
+    }
+
+    // if sort order is valid keyword
+    if ( sortOrder == Constants::SAM_HD_SORTORDER_COORDINATE ||
+         sortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME  ||
+         sortOrder == Constants::SAM_HD_SORTORDER_UNSORTED
+       )
+    {
+        return true;
+    }
+
+    // otherwise
+    AddError("Invalid sort order (SO): " + sortOrder);
+    return false;
+}
+
+// validate SAM header group order tag
+bool SamHeaderValidator::ValidateGroupOrder(void) {
+
+    const string& groupOrder = m_header.GroupOrder;
+
+    // if no group order, no problem, just return OK
+    if ( groupOrder.empty() )
+        return true;
+
+    // if group order is valid keyword
+    if ( groupOrder == Constants::SAM_HD_GROUPORDER_NONE  ||
+         groupOrder == Constants::SAM_HD_GROUPORDER_QUERY ||
+         groupOrder == Constants::SAM_HD_GROUPORDER_REFERENCE
+       )
+    {
+        return true;
+    }
+
+    // otherwise
+    AddError("Invalid group order (GO): " + groupOrder);
+    return false;
+}
+
+// validate SAM header sequence dictionary
+bool SamHeaderValidator::ValidateSequenceDictionary(void) {
+
+    bool isValid = true;
+
+    // check for unique sequence names
+    isValid &= ContainsUniqueSequenceNames();
+
+    // iterate over sequences
+    const SamSequenceDictionary& sequences = m_header.Sequences;
+    SamSequenceConstIterator seqIter = sequences.ConstBegin();
+    SamSequenceConstIterator seqEnd  = sequences.ConstEnd();
+    for ( ; seqIter != seqEnd; ++seqIter ) {
+        const SamSequence& seq = (*seqIter);
+        isValid &= ValidateSequence(seq);
+    }
+
+    // return validation state
+    return isValid;
+}
+
+// make sure all SQ names are unique
+bool SamHeaderValidator::ContainsUniqueSequenceNames(void) {
+
+    bool isValid = true;
+    set<string> sequenceNames;
+    set<string>::iterator nameIter;
+
+    // iterate over sequences
+    const SamSequenceDictionary& sequences = m_header.Sequences;
+    SamSequenceConstIterator seqIter = sequences.ConstBegin();
+    SamSequenceConstIterator seqEnd  = sequences.ConstEnd();
+    for ( ; seqIter != seqEnd; ++seqIter ) {
+        const SamSequence& seq = (*seqIter);
+
+        // lookup sequence name
+        const string& name = seq.Name;
+        nameIter = sequenceNames.find(name);
+
+        // error if found (duplicate entry)
+        if ( nameIter != sequenceNames.end() ) {
+            AddError("Sequence name (SN): " + name + " is not unique");
+            isValid = false;
+        }
+
+        // otherwise ok, store name
+        sequenceNames.insert(name);
+    }
+
+    // return validation state
+    return isValid;
+}
+
+// validate SAM header sequence entry
+bool SamHeaderValidator::ValidateSequence(const SamSequence& seq) {
+    bool isValid = true;
+    isValid &= CheckNameFormat(seq.Name);
+    isValid &= CheckLengthInRange(seq.Length);
+    return isValid;
+}
+
+// check sequence name is valid format
+bool SamHeaderValidator::CheckNameFormat(const string& name) {
+
+    // invalid if name is empty
+    if ( name.empty() ) {
+        AddError("Sequence entry (@SQ) is missing SN tag");
+        return false;
+    }
+
+    // invalid if first character is a reserved char
+    const char firstChar = name.at(0);
+    if ( firstChar == Constants::SAM_EQUAL || firstChar == Constants::SAM_STAR ) {
+        AddError("Invalid sequence name (SN): " + name);
+        return false;
+    }
+    // otherwise OK
+    return true;
+}
+
+// check that sequence length is within accepted range
+bool SamHeaderValidator::CheckLengthInRange(const string& length) {
+
+    // invalid if empty
+    if ( length.empty() ) {
+        AddError("Sequence entry (@SQ) is missing LN tag");
+        return false;
+    }
+
+    // convert string length to numeric
+    stringstream lengthStream(length);
+    unsigned int sequenceLength = 0;
+    const bool isNumeric = !(lengthStream >> sequenceLength).fail();
+
+    // invalid if length is not numeric or is outside accepted range
+    if ( !isNumeric || sequenceLength < Constants::SAM_SQ_LENGTH_MIN || sequenceLength > Constants::SAM_SQ_LENGTH_MAX ) {
+        AddError("Sequence length (LN): " + length + " out of range");
+        return false;
+    }
+
+    // otherwise OK
+    return true;
+}
+
+// validate SAM header read group dictionary
+bool SamHeaderValidator::ValidateReadGroupDictionary(void) {
+
+    bool isValid = true;
+
+    // check for unique read group IDs & platform units
+    isValid &= ContainsUniqueIDsAndPlatformUnits();
+
+    // iterate over read groups
+    const SamReadGroupDictionary& readGroups = m_header.ReadGroups;
+    SamReadGroupConstIterator rgIter = readGroups.ConstBegin();
+    SamReadGroupConstIterator rgEnd  = readGroups.ConstEnd();
+    for ( ; rgIter != rgEnd; ++rgIter ) {
+        const SamReadGroup& rg = (*rgIter);
+        isValid &= ValidateReadGroup(rg);
+    }
+
+    // return validation state
+    return isValid;
+}
+
+// make sure RG IDs and platform units are unique
+bool SamHeaderValidator::ContainsUniqueIDsAndPlatformUnits(void) {
+
+    bool isValid = true;
+    set<string> readGroupIds;
+    set<string> platformUnits;
+    set<string>::iterator idIter;
+    set<string>::iterator puIter;
+
+    // iterate over read groups
+    const SamReadGroupDictionary& readGroups = m_header.ReadGroups;
+    SamReadGroupConstIterator rgIter = readGroups.ConstBegin();
+    SamReadGroupConstIterator rgEnd  = readGroups.ConstEnd();
+    for ( ; rgIter != rgEnd; ++rgIter ) {
+        const SamReadGroup& rg = (*rgIter);
+
+        // --------------------------------
+        // check for unique ID
+
+        // lookup read group ID
+        const string& id = rg.ID;
+        idIter = readGroupIds.find(id);
+
+        // error if found (duplicate entry)
+        if ( idIter != readGroupIds.end() ) {
+            AddError("Read group ID (ID): " + id + " is not unique");
+            isValid = false;
+        }
+
+        // otherwise ok, store id
+        readGroupIds.insert(id);
+
+        // --------------------------------
+        // check for unique platform unit
+
+        // lookup platform unit
+        const string& pu = rg.PlatformUnit;
+        puIter = platformUnits.find(pu);
+
+        // error if found (duplicate entry); an absent (empty) PU is never a duplicate
+        if ( !pu.empty() && puIter != platformUnits.end() ) {
+            AddError("Platform unit (PU): " + pu + " is not unique");
+            isValid = false;
+        }
+
+        // otherwise ok, store platform unit
+        platformUnits.insert(pu);
+    }
+
+    // return validation state
+    return isValid;
+}
+
+// validate SAM header read group entry
+bool SamHeaderValidator::ValidateReadGroup(const SamReadGroup& rg) {
+    bool isValid = true;
+    isValid &= CheckReadGroupID(rg.ID);
+    isValid &= CheckSequencingTechnology(rg.SequencingTechnology);
+    return isValid;
+}
+
+// make sure RG ID exists
+bool SamHeaderValidator::CheckReadGroupID(const string& id) {
+
+    // invalid if empty
+    if ( id.empty() ) {
+        AddError("Read group entry (@RG) is missing ID tag");
+        return false;
+    }
+
+    // otherwise OK
+    return true;
+}
+
+// make sure RG sequencing tech is one of the accepted keywords
+bool SamHeaderValidator::CheckSequencingTechnology(const string& technology) {
+
+    // if no technology provided, no problem, just return OK
+    if ( technology.empty() )
+        return true;
+
+    // if technology is valid keyword
+    if ( caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_CAPILLARY)  ||
+         caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_HELICOS)    ||
+         caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA)   ||
+         caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_IONTORRENT) ||
+         caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_LS454)      ||
+         caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_PACBIO)     ||
+         caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_SOLID)
+       )
+    {
+        return true;
+    }
+
+    // otherwise
+    AddError("Invalid read group sequencing platform (PL): " + technology);
+    return false;
+}
+
+// validate the SAM header "program chain"
+bool SamHeaderValidator::ValidateProgramChain(void) {
+    bool isValid = true;
+    isValid &= ContainsUniqueProgramIds();
+    isValid &= ValidatePreviousProgramIds();
+    return isValid;
+}
+
+// make sure all PG IDs are unique
+bool SamHeaderValidator::ContainsUniqueProgramIds(void) {
+
+    bool isValid = true;
+    set<string> programIds;
+    set<string>::iterator pgIdIter;
+
+    // iterate over program records
+    const SamProgramChain& programs = m_header.Programs;
+    SamProgramConstIterator pgIter = programs.ConstBegin();
+    SamProgramConstIterator pgEnd  = programs.ConstEnd();
+    for ( ; pgIter != pgEnd; ++pgIter ) {
+        const SamProgram& pg = (*pgIter);
+
+        // lookup program ID
+        const string& pgId = pg.ID;
+        pgIdIter = programIds.find(pgId);
+
+        // error if found (duplicate entry)
+        if ( pgIdIter != programIds.end() ) {
+            AddError("Program ID (ID): " + pgId + " is not unique");
+            isValid = false;
+        }
+
+        // otherwise ok, store ID
+        programIds.insert(pgId);
+    }
+
+    // return validation state
+    return isValid;
+}
+
+// make sure that any PP tags present point to existing @PG IDs
+bool SamHeaderValidator::ValidatePreviousProgramIds(void) {
+
+    bool isValid = true;
+
+    // iterate over program records
+    const SamProgramChain& programs = m_header.Programs;
+    SamProgramConstIterator pgIter = programs.ConstBegin();
+    SamProgramConstIterator pgEnd  = programs.ConstEnd();
+    for ( ; pgIter != pgEnd; ++pgIter ) {
+        const SamProgram& pg = (*pgIter);
+
+        // ignore record for validation if PreviousProgramID is empty
+        const string& ppId = pg.PreviousProgramID;
+        if ( ppId.empty() )
+            continue;
+
+        // see if program "chain" contains an entry for ppId
+        if ( !programs.Contains(ppId) ) {
+            AddError("PreviousProgramID (PP): " + ppId + " is not a known ID");
+            isValid = false;
+        }
+    }
+
+    // return validation state
+    return isValid;
+}
diff --git a/src/api/internal/sam/SamHeaderValidator_p.h b/src/api/internal/sam/SamHeaderValidator_p.h
new file mode 100644
index 0000000..7d0c60a
--- /dev/null
+++ 
b/src/api/internal/sam/SamHeaderValidator_p.h
@@ -0,0 +1,105 @@
+// ***************************************************************************
+// SamHeaderValidator.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 6 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for validating SamHeader data
+// ***************************************************************************
+
+#ifndef SAM_HEADER_VALIDATOR_P_H
+#define SAM_HEADER_VALIDATOR_P_H
+
+//  -------------
+//  W A R N I N G
+//  -------------
+//
+// This file is not part of the BamTools API.  It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+class SamHeader;
+class SamReadGroup;
+class SamSequence;
+
+namespace Internal {
+
+class SamHeaderValidator {
+
+    // ctor & dtor
+    public:
+        SamHeaderValidator(const SamHeader& header);
+        ~SamHeaderValidator(void);
+
+    // SamHeaderValidator interface
+    public:
+
+        // prints error & warning messages
+        void PrintMessages(std::ostream& stream);
+
+        // validates SamHeader data, returns true/false accordingly
+        bool Validate(void);
+
+    // internal methods
+    private:
+
+        // validate header metadata
+        bool ValidateMetadata(void);
+        bool ValidateVersion(void);
+        bool ContainsOnlyDigits(const std::string& s);
+        bool ValidateSortOrder(void);
+        bool ValidateGroupOrder(void);
+
+        // validate sequence dictionary
+        bool ValidateSequenceDictionary(void);
+        bool ContainsUniqueSequenceNames(void);
+        bool CheckNameFormat(const std::string& name);
+        bool ValidateSequence(const SamSequence& seq);
+        bool CheckLengthInRange(const std::string& length);
+
+        // validate read group dictionary
+        bool ValidateReadGroupDictionary(void);
+        bool ContainsUniqueIDsAndPlatformUnits(void);
+        bool ValidateReadGroup(const SamReadGroup& rg);
+        bool CheckReadGroupID(const std::string& id);
+        bool CheckSequencingTechnology(const std::string& technology);
+
+        // validate program data
+        bool ValidateProgramChain(void);
+        bool ContainsUniqueProgramIds(void);
+        bool ValidatePreviousProgramIds(void);
+
+        // error reporting
+        void AddError(const std::string& message);
+        void AddWarning(const std::string& message);
+        void PrintErrorMessages(std::ostream& stream);
+        void PrintWarningMessages(std::ostream& stream);
+
+    // data members
+    private:
+
+        // SamHeader being validated
+        const SamHeader& m_header;
+
+        // error reporting helpers
+        static const std::string ERROR_PREFIX;
+        static const std::string WARN_PREFIX;
+        static const std::string NEWLINE;
+
+        // error reporting messages
+        std::vector<std::string> m_errorMessages;
+        std::vector<std::string> m_warningMessages;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // SAM_HEADER_VALIDATOR_P_H
diff --git a/src/api/internal/sam/SamHeaderVersion_p.h b/src/api/internal/sam/SamHeaderVersion_p.h
new file mode 100644
index 0000000..4f85df0
--- /dev/null
+++ b/src/api/internal/sam/SamHeaderVersion_p.h
@@ -0,0 +1,134 @@
+// ***************************************************************************
+// SamHeaderVersion.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for comparing SAM header versions
+// *************************************************************************
+
+#ifndef SAM_HEADERVERSION_P_H
+#define SAM_HEADERVERSION_P_H
+
+//  -------------
+//  W A R N I N G
+//  -------------
+//
+// This file is not part of the BamTools API.  It exists purely as an
+// implementation detail. 
This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include "api/SamConstants.h"
+#include <sstream>
+#include <string>
+
+namespace BamTools {
+namespace Internal {
+
+class SamHeaderVersion {
+
+    // ctors & dtor
+    public:
+        SamHeaderVersion(void)
+            : m_majorVersion(0)
+            , m_minorVersion(0)
+        { }
+
+        explicit SamHeaderVersion(const std::string& version)
+            : m_majorVersion(0)
+            , m_minorVersion(0)
+        {
+            SetVersion(version);
+        }
+
+        SamHeaderVersion(const unsigned int& major, const unsigned int& minor)
+            : m_majorVersion(major)
+            , m_minorVersion(minor)
+        { }
+
+        ~SamHeaderVersion(void) {
+            m_majorVersion = 0;
+            m_minorVersion = 0;
+        }
+
+    // access data
+    public:
+        unsigned int MajorVersion(void) const { return m_majorVersion; }
+        unsigned int MinorVersion(void) const { return m_minorVersion; }
+
+        void SetVersion(const std::string& version);
+        std::string ToString(void) const;
+
+    // data members
+    private:
+        unsigned int m_majorVersion;
+        unsigned int m_minorVersion;
+};
+
+// parses "<major>.<minor>"; a component that is empty or contains non-digits
+// leaves the corresponding member unchanged
+inline
+void SamHeaderVersion::SetVersion(const std::string& version) {
+
+    // do nothing if version is empty
+    if ( !version.empty() ) {
+
+        std::stringstream versionStream("");
+
+        // do nothing if period not found
+        const size_t periodFound = version.find(Constants::SAM_PERIOD);
+        if ( periodFound != std::string::npos ) {
+
+            // store major version if non-empty and contains only digits
+            const std::string& majorVersion = version.substr(0, periodFound);
+            versionStream.str(majorVersion);
+            if ( !majorVersion.empty() ) {
+                const size_t nonDigitFound = majorVersion.find_first_not_of(Constants::SAM_DIGITS);
+                if ( nonDigitFound == std::string::npos )
+                    versionStream >> m_majorVersion;
+            }
+
+            // reading the major number ran the stream to EOF; str() replaces the
+            // buffer but keeps eofbit, so reset the state or the next read fails
+            versionStream.clear();
+
+            // store minor version if non-empty and contains only digits
+            const std::string& minorVersion = version.substr(periodFound + 1);
+            versionStream.str(minorVersion);
+            if ( !minorVersion.empty() ) {
+                const size_t nonDigitFound = minorVersion.find_first_not_of(Constants::SAM_DIGITS);
+                if ( nonDigitFound == std::string::npos )
+                    versionStream >> m_minorVersion;
+            }
+        }
+    }
+}
+
+// -----------------------------------------------------
+// printing
+
+inline std::string SamHeaderVersion::ToString(void) const {
+    std::stringstream version;
+    version << m_majorVersion << Constants::SAM_PERIOD << m_minorVersion;
+    return version.str();
+}
+
+// -----------------------------------------------------
+// comparison operators
+
+inline bool operator==(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) {
+    return (lhs.MajorVersion() == rhs.MajorVersion()) &&
+           (lhs.MinorVersion() == rhs.MinorVersion());
+}
+
+inline bool operator<(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) {
+    if ( lhs.MajorVersion() == rhs.MajorVersion() )
+        return lhs.MinorVersion() < rhs.MinorVersion();
+    else
+        return lhs.MajorVersion() < rhs.MajorVersion();
+}
+
+inline bool operator> (const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) { return rhs < lhs;  }
+inline bool operator<=(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) { return !(lhs>rhs); }
+inline bool operator>=(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) { return !(lhs<rhs); }
[... text lost in extraction here: presumably the remainder of SamHeaderVersion_p.h, the diff for src/api/internal/utils/BamException_p.cpp, and the head of src/api/internal/utils/BamException_p.h up to its #include lines -- TODO restore from the original commit ...]
+#include <exception>
+#include <string>
+
+namespace BamTools {
+namespace Internal {
+
+class BamException : public std::exception {
+
+    public:
+        inline BamException(const std::string& where, const std::string& message)
+            : std::exception()
+            , m_errorString(where + SEPARATOR + message)
+        { }
+
+        inline ~BamException(void) throw() { }
+
+        inline const char* what(void) const throw() {
+            return m_errorString.c_str();
+        }
+
+    private:
+        std::string m_errorString;
+        static const std::string SEPARATOR;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMEXCEPTION_P_H
diff --git a/src/api/internal/utils/CMakeLists.txt b/src/api/internal/utils/CMakeLists.txt
new file mode 100644
index 0000000..38a6957
--- /dev/null
+++ b/src/api/internal/utils/CMakeLists.txt
@@ 
-0,0 +1,15 @@ +# ========================== +# BamTools CMakeLists.txt +# (c) 2011 Derek Barnett +# +# src/api/internal/utils +# ========================== + +set ( InternalUtilsDir "${InternalDir}/utils" ) + +set ( InternalUtilsSources + ${InternalUtilsDir}/BamException_p.cpp + + PARENT_SCOPE # <-- leave this last +) +