// BamMultiReader.cpp (c) 2010 Erik Garrison, Derek Barnett
// Marth Lab, Department of Biology, Boston College
// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
+// Last modified: 25 October 2011 (DB)
// ---------------------------------------------------------------------------
// Convenience class for reading multiple BAM files.
//
// ***************************************************************************
#include "api/BamMultiReader.h"
-#include "api/internal/BamMultiReader_p.h"
+#include "api/internal/bam/BamMultiReader_p.h"
using namespace BamTools;
#include <string>
// BamMultiReader.h (c) 2010 Erik Garrison, Derek Barnett
// Marth Lab, Department of Biology, Boston College
// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
+// Last modified: 25 October 2011 (DB)
// ---------------------------------------------------------------------------
// Convenience class for reading multiple BAM files.
// ***************************************************************************
// BamReader.cpp (c) 2009 Derek Barnett, Michael Strömberg
// Marth Lab, Department of Biology, Boston College
// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
+// Last modified: 25 October 2011 (DB)
// ---------------------------------------------------------------------------
// Provides read access to BAM files.
// ***************************************************************************
#include "api/BamReader.h"
-#include "api/internal/BamReader_p.h"
+#include "api/internal/bam/BamReader_p.h"
using namespace BamTools;
using namespace BamTools::Internal;
// BamWriter.cpp (c) 2009 Michael Strömberg, Derek Barnett\r
// Marth Lab, Department of Biology, Boston College\r
// ---------------------------------------------------------------------------\r
-// Last modified: 10 October 2011 (DB)\r
+// Last modified: 25 October 2011 (DB)\r
// ---------------------------------------------------------------------------\r
// Provides the basic functionality for producing BAM files\r
// ***************************************************************************\r
#include "api/BamAlignment.h"\r
#include "api/BamWriter.h"\r
#include "api/SamHeader.h"\r
-#include "api/internal/BamWriter_p.h"\r
+#include "api/internal/bam/BamWriter_p.h"\r
using namespace BamTools;\r
using namespace BamTools::Internal;\r
using namespace std;\r
SamReadGroupDictionary.cpp
SamSequence.cpp
SamSequenceDictionary.cpp
- internal/BamDeviceFactory_p.cpp
- internal/BamException_p.cpp
- internal/BamFile_p.cpp
- internal/BamFtp_p.cpp
- internal/BamHeader_p.cpp
- internal/BamHttp_p.cpp
- internal/BamIndexFactory_p.cpp
- internal/BamMultiReader_p.cpp
- internal/BamPipe_p.cpp
- internal/BamRandomAccessController_p.cpp
- internal/BamReader_p.cpp
- internal/BamStandardIndex_p.cpp
- internal/BamToolsIndex_p.cpp
- internal/BamWriter_p.cpp
- internal/BgzfStream_p.cpp
- internal/ILocalIODevice_p.cpp
- internal/IRemoteIODevice_p.cpp
- internal/SamFormatParser_p.cpp
- internal/SamFormatPrinter_p.cpp
- internal/SamHeaderValidator_p.cpp
+ internal/bam/BamHeader_p.cpp
+ internal/bam/BamMultiReader_p.cpp
+ internal/bam/BamRandomAccessController_p.cpp
+ internal/bam/BamReader_p.cpp
+ internal/bam/BamWriter_p.cpp
+ internal/index/BamIndexFactory_p.cpp
+ internal/index/BamStandardIndex_p.cpp
+ internal/index/BamToolsIndex_p.cpp
+ internal/io/BamDeviceFactory_p.cpp
+ internal/io/BamFile_p.cpp
+ internal/io/BamFtp_p.cpp
+ internal/io/BamHttp_p.cpp
+ internal/io/BamPipe_p.cpp
+ internal/io/BgzfStream_p.cpp
+ internal/io/ILocalIODevice_p.cpp
+ internal/io/IRemoteIODevice_p.cpp
+ internal/sam/SamFormatParser_p.cpp
+ internal/sam/SamFormatPrinter_p.cpp
+ internal/sam/SamHeaderValidator_p.cpp
+ internal/utils/BamException_p.cpp
)
# create main BamTools API shared library
// SamHeader.cpp (c) 2010 Derek Barnett
// Marth Lab, Department of Biology, Boston College
// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
+// Last modified: 25 October 2011 (DB)
// ---------------------------------------------------------------------------
// Provides direct read/write access to the SAM header data fields.
// ***************************************************************************
#include "api/SamConstants.h"
#include "api/SamHeader.h"
-#include "api/internal/BamException_p.h"
-#include "api/internal/SamFormatParser_p.h"
-#include "api/internal/SamFormatPrinter_p.h"
-#include "api/internal/SamHeaderValidator_p.h"
+#include "api/internal/utils/BamException_p.h"
+#include "api/internal/sam/SamFormatParser_p.h"
+#include "api/internal/sam/SamFormatPrinter_p.h"
+#include "api/internal/sam/SamHeaderValidator_p.h"
using namespace BamTools;
using namespace BamTools::Internal;
using namespace std;
+++ /dev/null
-// ***************************************************************************
-// BamDeviceFactory_p.cpp (c) 2011 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 September 2011 (DB)
-// ---------------------------------------------------------------------------
-// Creates built-in concrete implementations of IBamIODevices
-// ***************************************************************************
-
-#include "api/internal/BamDeviceFactory_p.h"
-#include "api/internal/BamFile_p.h"
-#include "api/internal/BamFtp_p.h"
-#include "api/internal/BamHttp_p.h"
-#include "api/internal/BamPipe_p.h"
-using namespace BamTools;
-using namespace BamTools::Internal;
-
-#include <iostream>
-using namespace std;
-
-// Selects and allocates the built-in IBamIODevice matching @source:
-// a pipe for "-"/"stdin"/"stdout", an HTTP or FTP device when @source
-// starts with that URL scheme, and a plain file device for anything else.
-// The device is allocated with 'new' and handed back as a raw pointer.
-IBamIODevice* BamDeviceFactory::CreateDevice(const string& source) {
-
-    // pipe names take priority over every other interpretation
-    const bool isPipeName = ( source == "-" || source == "stdin" || source == "stdout" );
-    if ( isPipeName )
-        return new BamPipe;
-
-    // remote sources: only a scheme at the very start of @source counts
-    const string httpScheme("http://");
-    if ( source.compare(0, httpScheme.length(), httpScheme) == 0 )
-        return new BamHttp(source);
-
-    const string ftpScheme("ftp://");
-    if ( source.compare(0, ftpScheme.length(), ftpScheme) == 0 )
-        return new BamFtp(source);
-
-    // anything else is treated as a "normal" file
-    return new BamFile(source);
-}
+++ /dev/null
-// ***************************************************************************
-// BamDeviceFactory_p.h (c) 2011 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Creates built-in concrete implementations of IBamIODevices
-// ***************************************************************************
-
-#ifndef BAMDEVICEFACTORY_P_H
-#define BAMDEVICEFACTORY_P_H
-
-// -------------
-// W A R N I N G
-// -------------
-//
-// This file is not part of the BamTools API. It exists purely as an
-// implementation detail. This header file may change from version to version
-// without notice, or even be removed.
-//
-// We mean it.
-
-#include "api/IBamIODevice.h"
-#include <string>
-
-namespace BamTools {
-namespace Internal {
-
-// Static-only factory for IBamIODevice objects.
-class BamDeviceFactory {
- public:
- // returns a pointer to an IBamIODevice created for the given @source string
- static IBamIODevice* CreateDevice(const std::string& source);
-};
-
-} // namespace Internal
-} // namespace BamTools
-
-#endif // BAMDEVICEFACTORY_P_H
+++ /dev/null
-// ***************************************************************************
-// BamException_p.cpp (c) 2011 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides a basic exception class for BamTools internals
-// ***************************************************************************
-
-#include "api/internal/BamException_p.h"
-using namespace BamTools;
-using namespace BamTools::Internal;
-using namespace std;
-
-// out-of-class definition of the static separator string declared in BamException
-const string BamException::SEPARATOR = ": ";
+++ /dev/null
-// ***************************************************************************
-// BamException_p.h (c) 2011 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 6 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides a basic exception class for BamTools internals
-// ***************************************************************************
-
-#ifndef BAMEXCEPTION_P_H
-#define BAMEXCEPTION_P_H
-
-// -------------
-// W A R N I N G
-// -------------
-//
-// This file is not part of the BamTools API. It exists purely as an
-// implementation detail. This header file may change from version to version
-// without notice, or even be removed.
-//
-// We mean it.
-
-#include <exception>
-#include <string>
-
-namespace BamTools {
-namespace Internal {
-
-// Exception type for BamTools internals. The text reported by what() is
-// <where> + SEPARATOR + <message>, assembled once at construction.
-class BamException : public std::exception {
-
-    public:
-        // @where   - location that raised the error
-        // @message - description of what went wrong
-        BamException(const std::string& where, const std::string& message)
-            : std::exception()
-            , m_errorString(where + SEPARATOR + message)
-        { }
-
-        ~BamException() throw() { }
-
-        // returns the stored error string (owned by this object)
-        const char* what() const throw() {
-            return m_errorString.c_str();
-        }
-
-    private:
-        std::string m_errorString;
-        static const std::string SEPARATOR;
-};
-
-} // namespace Internal
-} // namespace BamTools
-
-#endif // BAMEXCEPTION_P_H
+++ /dev/null
-// ***************************************************************************
-// BamFile_p.cpp (c) 2011 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides BAM file-specific IO behavior
-// ***************************************************************************
-
-#include "api/internal/BamFile_p.h"
-using namespace BamTools;
-using namespace BamTools::Internal;
-
-#include <cstdio>
-#include <iostream>
-using namespace std;
-
-// Stores @filename for a later Open(); the constructor body itself is empty.
-BamFile::BamFile(const string& filename) : ILocalIODevice(), m_filename(filename) { }
-
-// Empty destructor body.
-BamFile::~BamFile() { }
-
-// Closes the device if it is open; the stored filename is cleared first.
-void BamFile::Close() {
-
-    // nothing to do for a device that is not open
-    if ( !IsOpen() )
-        return;
-
-    m_filename.clear();
-    ILocalIODevice::Close();
-}
-
-// Always reports random access as available.
-bool BamFile::IsRandomAccess() const { return true; }
-
-// Opens m_filename with fopen(): "rb" for ReadOnly, "wb" for WriteOnly.
-// Returns false, with the error string set, for any other mode or when
-// fopen() yields no stream.
-// NOTE(review): Close() clears m_filename when the device is open, so calling
-// Open() on an already-open device reaches fopen() with an empty name - confirm intended.
-bool BamFile::Open(const IBamIODevice::OpenMode mode) {
-
-    // always start from a closed device
-    Close();
-
-    // translate the requested open mode into an fopen() mode string
-    const char* fopenMode = 0;
-    if ( mode == IBamIODevice::ReadOnly )
-        fopenMode = "rb";
-    else if ( mode == IBamIODevice::WriteOnly )
-        fopenMode = "wb";
-
-    if ( fopenMode == 0 ) {
-        SetErrorString("BamFile::Open", "unknown open mode requested");
-        return false;
-    }
-
-    // attempt to obtain the FILE*
-    m_stream = fopen(m_filename.c_str(), fopenMode);
-    if ( m_stream == 0 ) {
-        string message("could not open file handle for ");
-        if ( m_filename.empty() )
-            message += "empty filename";
-        else
-            message += m_filename;
-        SetErrorString("BamFile::Open", message);
-        return false;
-    }
-
-    // remember the IO mode that is now active
-    m_mode = mode;
-    return true;
-}
-
-// Moves the stream to absolute offset @position (SEEK_SET).
-// Returns true when fseek64() reports success (0).
-bool BamFile::Seek(const int64_t& position) {
-    BT_ASSERT_X( m_stream, "BamFile::Seek() - null stream" );
-    const bool seekSucceeded = ( fseek64(m_stream, position, SEEK_SET) == 0 );
-    return seekSucceeded;
-}
+++ /dev/null
-// ***************************************************************************
-// BamFile_p.h (c) 2011 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides BAM file-specific IO behavior
-// ***************************************************************************
-
-#ifndef BAMFILE_P_H
-#define BAMFILE_P_H
-
-// -------------
-// W A R N I N G
-// -------------
-//
-// This file is not part of the BamTools API. It exists purely as an
-// implementation detail. This header file may change from version to version
-// without notice, or even be removed.
-//
-// We mean it.
-
-#include "api/internal/ILocalIODevice_p.h"
-#include <string>
-
-namespace BamTools {
-namespace Internal {
-
-// ILocalIODevice subclass that works on a file identified by name.
-class BamFile : public ILocalIODevice {
-
- // ctor & dtor
- public:
- // stores @filename; opening happens in Open()
- BamFile(const std::string& filename);
- ~BamFile(void);
-
- // ILocalIODevice implementation
- public:
- // closes the device if open (also clears the stored filename)
- void Close(void);
- // always true for this device
- bool IsRandomAccess(void) const;
- // opens the stored filename for reading or writing; false on failure
- bool Open(const IBamIODevice::OpenMode mode);
- // seeks to absolute offset @position; true on success
- bool Seek(const int64_t& position);
-
- // data members
- private:
- // name of the file passed to fopen() by Open()
- std::string m_filename;
-};
-
-} // namespace Internal
-} // namespace BamTools
-
-#endif // BAMFILE_P_H
+++ /dev/null
-// ***************************************************************************
-// BamFtp_p.cpp (c) 2011 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides reading/writing of BAM files on FTP server
-// ***************************************************************************
-
-#include "api/internal/BamFtp_p.h"
-using namespace BamTools;
-using namespace BamTools::Internal;
-
-using namespace std;
-
-// Placeholder constructor: FTP support is not implemented, so this
-// trips BT_ASSERT_X unconditionally.
-BamFtp::BamFtp(const string& url)
-    : IBamIODevice()
-{
-    // @url is not used yet; void it the same way the other stub methods
-    // void their parameters, to avoid an unused-parameter warning
-    (void)url;
-    BT_ASSERT_X(false, "BamFtp not yet implemented");
-}
-
-// Empty destructor body.
-BamFtp::~BamFtp() { }
-
-// Stub: intentionally does nothing.
-void BamFtp::Close() { }
-
-// Stub: always reports random access as available.
-bool BamFtp::IsRandomAccess() const { return true; }
-
-// Stub: ignores @mode and reports success.
-bool BamFtp::Open(const IBamIODevice::OpenMode mode) {
-    (void)mode;
-    const bool opened = true;
-    return opened;
-}
-
-// Stub: touches neither @data nor @numBytes; always returns 0.
-size_t BamFtp::Read(char* data, const unsigned int numBytes) {
-    (void)data;
-    (void)numBytes;
-    const size_t numBytesRead = 0;
-    return numBytesRead;
-}
-
-// Stub: ignores @position and reports success.
-bool BamFtp::Seek(const int64_t& position) {
-    (void)position;
-    const bool seekSucceeded = true;
-    return seekSucceeded;
-}
-
-// Stub: always returns -1.
-int64_t BamFtp::Tell() const { return -1; }
-
-// Stub: touches neither @data nor @numBytes; always returns 0.
-size_t BamFtp::Write(const char* data, const unsigned int numBytes) {
-    (void)data;
-    (void)numBytes;
-    const size_t numBytesWritten = 0;
-    return numBytesWritten;
-}
+++ /dev/null
-// ***************************************************************************
-// BamFtp_p.h (c) 2011 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides reading/writing of BAM files on FTP server
-// ***************************************************************************
-
-#ifndef BAMFTP_P_H
-#define BAMFTP_P_H
-
-// -------------
-// W A R N I N G
-// -------------
-//
-// This file is not part of the BamTools API. It exists purely as an
-// implementation detail. This header file may change from version to version
-// without notice, or even be removed.
-//
-// We mean it.
-
-#include "api/IBamIODevice.h"
-#include <string>
-
-namespace BamTools {
-namespace Internal {
-
-// IBamIODevice subclass intended for BAM data on an FTP server.
-// Currently a skeleton: it declares no data members or internal methods.
-class BamFtp : public IBamIODevice {
-
- // ctor & dtor
- public:
- BamFtp(const std::string& url);
- ~BamFtp(void);
-
- // IBamIODevice implementation
- public:
- void Close(void);
- bool IsRandomAccess(void) const;
- bool Open(const IBamIODevice::OpenMode mode);
- // returns a byte count as size_t
- size_t Read(char* data, const unsigned int numBytes);
- bool Seek(const int64_t& position);
- int64_t Tell(void) const;
- // returns a byte count as size_t
- size_t Write(const char* data, const unsigned int numBytes);
-
- // internal methods
- private:
-
- // data members
- private:
-};
-
-} // namespace Internal
-} // namespace BamTools
-
-#endif // BAMFTP_P_H
+++ /dev/null
-// ***************************************************************************
-// BamHeader_p.cpp (c) 2010 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides the basic functionality for handling BAM headers.
-// ***************************************************************************
-
-#include "api/BamAux.h"
-#include "api/BamConstants.h"
-#include "api/internal/BamException_p.h"
-#include "api/internal/BamHeader_p.h"
-#include "api/internal/BgzfStream_p.h"
-using namespace BamTools;
-using namespace BamTools::Internal;
-
-#include <cstdlib>
-#include <cstring>
-using namespace std;
-
-// ------------------------
-// static utility methods
-// ------------------------
-
-// true when strncmp() finds the first BAM_HEADER_MAGIC_LENGTH characters
-// of @buffer equal to BAM_HEADER_MAGIC
-static inline
-bool isValidMagicNumber(const char* buffer) {
-    const int comparison = strncmp(buffer,
-                                   Constants::BAM_HEADER_MAGIC,
-                                   Constants::BAM_HEADER_MAGIC_LENGTH);
-    return ( comparison == 0 );
-}
-
-// --------------------------
-// BamHeader implementation
-// --------------------------
-
-// ctor - empty body
-BamHeader::BamHeader() { }
-
-// dtor - empty body
-BamHeader::~BamHeader() { }
-
-// reads magic number from BGZF stream
-// returns nothing: throws BamException if the magic number cannot be read
-// in full, or if the bytes read do not match the expected magic number
-void BamHeader::CheckMagicNumber(BgzfStream* stream) {
-
-    // try to read magic number
-    char buffer[Constants::BAM_HEADER_MAGIC_LENGTH];
-    const size_t numBytesRead = stream->Read(buffer, Constants::BAM_HEADER_MAGIC_LENGTH);
-
-    // compare in size_t so that both operands are unsigned
-    if ( numBytesRead != static_cast<size_t>(Constants::BAM_HEADER_MAGIC_LENGTH) )
-        throw BamException("BamHeader::CheckMagicNumber", "could not read magic number");
-
-    // validate magic number
-    if ( !isValidMagicNumber(buffer) )
-        throw BamException("BamHeader::CheckMagicNumber", "invalid magic number");
-}
-
-// forwards to SamHeader::Clear() on the stored header
-void BamHeader::Clear() { m_header.Clear(); }
-
-// reports whatever SamHeader::IsValid() says about the stored header
-bool BamHeader::IsValid() const {
-    const bool headerIsValid = m_header.IsValid();
-    return headerIsValid;
-}
-
-// load BAM header from BGZF stream, in stream order:
-// magic number, then header text length, then the header text itself
-void BamHeader::Load(BgzfStream* stream) {
-
-    CheckMagicNumber(stream);
-
-    uint32_t textLength = 0;
-    ReadHeaderLength(stream, textLength);
-    ReadHeaderText(stream, textLength);
-}
-
-// reads the SAM header text length from BGZF stream into @length
-// throws BamException if sizeof(uint32_t) bytes could not be read
-void BamHeader::ReadHeaderLength(BgzfStream* stream, uint32_t& length) {
-
-    // pull the raw length field off the stream
-    const size_t lengthFieldSize = sizeof(uint32_t);
-    char rawLength[sizeof(uint32_t)];
-    if ( stream->Read(rawLength, lengthFieldSize) != lengthFieldSize )
-        throw BamException("BamHeader::ReadHeaderLength", "could not read header length");
-
-    // unpack it, byte-swapping on big-endian systems
-    length = BamTools::UnpackUnsignedInt(rawLength);
-    if ( BamTools::SystemIsBigEndian() )
-        BamTools::SwapEndian_32(length);
-}
-
-// reads SAM header text from BGZF stream, stores in SamHeader object
-// throws BamException if fewer than @length bytes could be read
-void BamHeader::ReadHeaderText(BgzfStream* stream, const uint32_t& length) {
-
-    // zero-filled buffer with one spare byte, so the text is always NUL-terminated.
-    // The size is computed in size_t so that, where size_t is wider than 32 bits,
-    // length + 1 cannot wrap around to 0. The string owns the memory, so there is
-    // no unchecked allocation result and nothing to free on any exit path.
-    string headerText(static_cast<size_t>(length) + 1, '\0');
-    const size_t bytesRead = stream->Read(&headerText[0], length);
-
-    // short read: report it (the buffer cleans itself up)
-    if ( bytesRead != length )
-        throw BamException("BamHeader::ReadHeaderText", "could not read header text");
-
-    // store the text up to the first NUL byte
-    m_header.SetHeaderText( string(headerText.c_str()) );
-}
-
-// hands back the stored SamHeader by value, i.e. as a copy
-SamHeader BamHeader::ToSamHeader() const { return m_header; }
-
-// returns the result of SamHeader::ToString() on the stored header
-string BamHeader::ToString() const {
-    const string headerText = m_header.ToString();
-    return headerText;
-}
+++ /dev/null
-// ***************************************************************************
-// BamHeader_p.h (c) 2010 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides the basic functionality for handling BAM headers.
-// ***************************************************************************
-
-#ifndef BAMHEADER_P_H
-#define BAMHEADER_P_H
-
-// -------------
-// W A R N I N G
-// -------------
-//
-// This file is not part of the BamTools API. It exists purely as an
-// implementation detail. This header file may change from version to version
-// without notice, or even be removed.
-//
-// We mean it.
-
-#include "api/SamHeader.h"
-#include <string>
-
-namespace BamTools {
-namespace Internal {
-
-class BgzfStream;
-
-// Wraps a SamHeader and knows how to load it from a BGZF stream.
-class BamHeader {
-
- // ctor & dtor
- public:
- BamHeader(void);
- ~BamHeader(void);
-
- // BamHeader interface
- public:
- // clear SamHeader data
- void Clear(void);
- // return true if SamHeader data is valid
- bool IsValid(void) const;
- // load BAM header ('magic number' and SAM header text) from BGZF stream
- // returns nothing; the read helpers it calls throw BamException on failure
- void Load(BgzfStream* stream);
- // returns (editable) copy of SamHeader data object
- SamHeader ToSamHeader(void) const;
- // returns SAM-formatted string of header data
- std::string ToString(void) const;
-
- // internal methods
- private:
- // reads magic number from BGZF stream
- void CheckMagicNumber(BgzfStream* stream);
- // reads SAM header length from BGZF stream, stores it in @length
- void ReadHeaderLength(BgzfStream* stream, uint32_t& length);
- // reads SAM header text from BGZF stream, stores in SamHeader object
- void ReadHeaderText(BgzfStream* stream, const uint32_t& length);
-
- // data members
- private:
- // header data filled in by ReadHeaderText()
- SamHeader m_header;
-};
-
-} // namespace Internal
-} // namespace BamTools
-
-#endif // BAMHEADER_P_H
+++ /dev/null
-// ***************************************************************************
-// BamHttp_p.cpp (c) 2011 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides reading/writing of BAM files on HTTP server
-// ***************************************************************************
-
-#include "api/internal/BamHttp_p.h"
-using namespace BamTools;
-using namespace BamTools::Internal;
-
-using namespace std;
-
-// Placeholder constructor: HTTP support is not implemented, so this
-// trips BT_ASSERT_X unconditionally.
-BamHttp::BamHttp(const string& url)
-    : IBamIODevice()
-{
-    // @url is not used yet; void it the same way the other stub methods
-    // void their parameters, to avoid an unused-parameter warning
-    (void)url;
-    BT_ASSERT_X(false, "BamHttp not yet implemented");
-}
-
-// Empty destructor body.
-BamHttp::~BamHttp() { }
-
-// Stub: intentionally does nothing.
-void BamHttp::Close() { }
-
-// Stub: always reports random access as available.
-bool BamHttp::IsRandomAccess() const { return true; }
-
-// Stub: ignores @mode and reports success.
-bool BamHttp::Open(const IBamIODevice::OpenMode mode) {
-    (void)mode;
-    const bool opened = true;
-    return opened;
-}
-
-// Stub: touches neither @data nor @numBytes; always returns 0.
-size_t BamHttp::Read(char* data, const unsigned int numBytes) {
-    (void)data;
-    (void)numBytes;
-    const size_t numBytesRead = 0;
-    return numBytesRead;
-}
-
-// Stub: ignores @position and reports success.
-bool BamHttp::Seek(const int64_t& position) {
-    (void)position;
-    const bool seekSucceeded = true;
-    return seekSucceeded;
-}
-
-// Stub: always returns -1.
-int64_t BamHttp::Tell() const { return -1; }
-
-// Stub: touches neither @data nor @numBytes; always returns 0.
-size_t BamHttp::Write(const char* data, const unsigned int numBytes) {
-    (void)data;
-    (void)numBytes;
-    const size_t numBytesWritten = 0;
-    return numBytesWritten;
-}
+++ /dev/null
-// ***************************************************************************
-// BamHttp_p.h (c) 2011 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides reading/writing of BAM files on HTTP server
-// ***************************************************************************
-
-#ifndef BAMHTTP_P_H
-#define BAMHTTP_P_H
-
-// -------------
-// W A R N I N G
-// -------------
-//
-// This file is not part of the BamTools API. It exists purely as an
-// implementation detail. This header file may change from version to version
-// without notice, or even be removed.
-//
-// We mean it.
-
-#include "api/IBamIODevice.h"
-#include <string>
-
-namespace BamTools {
-namespace Internal {
-
-// IBamIODevice subclass intended for BAM data on an HTTP server.
-// Currently a skeleton: it declares no data members or internal methods.
-class BamHttp : public IBamIODevice {
-
- // ctor & dtor
- public:
- BamHttp(const std::string& url);
- ~BamHttp(void);
-
- // IBamIODevice implementation
- public:
- void Close(void);
- bool IsRandomAccess(void) const;
- bool Open(const IBamIODevice::OpenMode mode);
- // returns a byte count as size_t
- size_t Read(char* data, const unsigned int numBytes);
- bool Seek(const int64_t& position);
- int64_t Tell(void) const;
- // returns a byte count as size_t
- size_t Write(const char* data, const unsigned int numBytes);
-
- // internal methods
- private:
-
- // data members
- private:
-};
-
-} // namespace Internal
-} // namespace BamTools
-
-#endif // BAMHTTP_P_H
+++ /dev/null
-// ***************************************************************************
-// BamIndexFactory_p.cpp (c) 2011 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides interface for generating BamIndex implementations
-// ***************************************************************************
-
-#include "api/BamAux.h"
-#include "api/internal/BamIndexFactory_p.h"
-#include "api/internal/BamStandardIndex_p.h"
-#include "api/internal/BamToolsIndex_p.h"
-using namespace BamTools;
-using namespace BamTools::Internal;
-using namespace std;
-
-// builds an index filename by appending the extension of the requested
-// index @type to @bamFilename; an unrecognized type yields an empty string
-const string BamIndexFactory::CreateIndexFilename(const string& bamFilename,
-                                                  const BamIndex::IndexType& type)
-{
-    if ( type == BamIndex::STANDARD )
-        return ( bamFilename + BamStandardIndex::Extension() );
-
-    if ( type == BamIndex::BAMTOOLS )
-        return ( bamFilename + BamToolsIndex::Extension() );
-
-    // unknown index type
-    return string();
-}
-
-// creates a new BamIndex object whose concrete type follows the extension
-// of @indexFilename; returns null if the file does not exist, has no usable
-// extension, or the extension is not a recognized index extension
-BamIndex* BamIndexFactory::CreateIndexFromFilename(const string& indexFilename, BamReaderPrivate* reader) {
-
-    // a missing file cannot be an index
-    if ( !BamTools::FileExists(indexFilename) )
-        return 0;
-
-    // extension includes the dot (".EXT"); empty means none could be extracted
-    const string extension = FileExtension(indexFilename);
-    if ( extension.empty() )
-        return 0;
-
-    // map extension -> index implementation (stays null when nothing matches)
-    BamIndex* index = 0;
-    if ( extension == BamStandardIndex::Extension() )
-        index = new BamStandardIndex(reader);
-    else if ( extension == BamToolsIndex::Extension() )
-        index = new BamToolsIndex(reader);
-    return index;
-}
-
-// creates a new BamIndex object of the requested @type, bound to @reader;
-// returns null for an unrecognized type
-BamIndex* BamIndexFactory::CreateIndexOfType(const BamIndex::IndexType& type,
-                                             BamReaderPrivate* reader)
-{
-    if ( type == BamIndex::STANDARD )
-        return new BamStandardIndex(reader);
-
-    if ( type == BamIndex::BAMTOOLS )
-        return new BamToolsIndex(reader);
-
-    return 0;
-}
-
-// retrieves file extension (including '.'): everything from the last dot
-// in @filename onward. Names of 4 characters or fewer, and names without
-// a dot, produce an empty string.
-const string BamIndexFactory::FileExtension(const string& filename) {
-
-    // too short to hold a path plus extension (this also covers the empty name)
-    if ( filename.length() <= 4 )
-        return string();
-
-    // locate the last dot, if there is one
-    const size_t dotPosition = filename.rfind('.');
-    return ( dotPosition == string::npos ) ? string()
-                                           : filename.substr(dotPosition);
-}
-
-// returns the name of an existing index file for @bamFilename
-// the @preferredType name is tried first, then the remaining supported
-// types (STANDARD, then BAMTOOLS); an empty string means nothing was found
-const string BamIndexFactory::FindIndexFilename(const string& bamFilename,
-                                                const BamIndex::IndexType& preferredType)
-{
-    // no BAM filename, no index filename
-    if ( bamFilename.empty() )
-        return string();
-
-    // the preferred type gets the first chance
-    const string preferredName = CreateIndexFilename(bamFilename, preferredType);
-    if ( !preferredName.empty() && BamTools::FileExists(preferredName) )
-        return preferredName;
-
-    // then every other supported type, in fixed order
-    const BamIndex::IndexType supportedTypes[] = { BamIndex::STANDARD, BamIndex::BAMTOOLS };
-    const size_t numSupportedTypes = sizeof(supportedTypes) / sizeof(supportedTypes[0]);
-    for ( size_t i = 0; i < numSupportedTypes; ++i ) {
-
-        // already tried above
-        if ( supportedTypes[i] == preferredType )
-            continue;
-
-        const string candidate = CreateIndexFilename(bamFilename, supportedTypes[i]);
-        if ( !candidate.empty() && BamTools::FileExists(candidate) )
-            return candidate;
-    }
-
-    // no index of any supported type exists for this filename
-    return string();
-}
+++ /dev/null
-// ***************************************************************************
-// BamIndexFactory_p.h (c) 2011 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides interface for generating BamIndex implementations
-// ***************************************************************************
-
-#ifndef BAMINDEX_FACTORY_P_H
-#define BAMINDEX_FACTORY_P_H
-
-#include "api/BamIndex.h"
-#include <string>
-
-namespace BamTools {
-namespace Internal {
-
-// Static-only helper that creates BamIndex objects and derives or locates
-// index filenames for a BAM file.
-class BamIndexFactory {
-
- // static interface methods
- public:
- // creates a new BamIndex object, depending on extension of @indexFilename
- // (returns null if no index can be created for that filename)
- static BamIndex* CreateIndexFromFilename(const std::string& indexFilename,
- BamReaderPrivate* reader);
- // creates a new BamIndex object, of requested @type
- // (returns null if @type is not recognized)
- static BamIndex* CreateIndexOfType(const BamIndex::IndexType& type,
- BamReaderPrivate* reader);
- // returns name of existing index file that corresponds to @bamFilename
- // will defer to @preferredType if possible
- // if @preferredType not found, will attempt to load any supported index type
- // returns empty string if no index file (of any type) is found
- static const std::string FindIndexFilename(const std::string& bamFilename,
- const BamIndex::IndexType& preferredType);
-
- // internal methods
- // NOTE(review): labelled internal but declared public - confirm whether outside callers rely on these
- public:
- // generates index filename from BAM filename (depending on requested type)
- // if type is unknown, returns empty string
- static const std::string CreateIndexFilename(const std::string& bamFilename,
- const BamIndex::IndexType& type);
- // retrieves file extension (including '.')
- static const std::string FileExtension(const std::string& filename);
-};
-
-} // namespace Internal
-} // namespace BamTools
-
-#endif // BAMINDEX_FACTORY_P_H
+++ /dev/null
-// ***************************************************************************
-// BamMultiMerger_p.h (c) 2010 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides merging functionality for BamMultiReader. At this point, supports
-// sorting results by (refId, position) or by read name.
-// ***************************************************************************
-
-#ifndef BAMMULTIMERGER_P_H
-#define BAMMULTIMERGER_P_H
-
-// -------------
-// W A R N I N G
-// -------------
-//
-// This file is not part of the BamTools API. It exists purely as an
-// implementation detail. This header file may change from version to version
-// without notice, or even be removed.
-//
-// We mean it.
-
-#include "api/BamAlignment.h"
-#include "api/BamReader.h"
-#include "api/algorithms/Sort.h"
-#include <deque>
-#include <functional>
-#include <set>
-#include <string>
-
-namespace BamTools {
-namespace Internal {
-
-// Bundles a BamReader pointer with a BamAlignment pointer. Both are plain
-// pointers: this struct neither allocates nor frees what they point to.
-struct MergeItem {
-
-    // data members
-    BamReader* Reader;
-    BamAlignment* Alignment;
-
-    // ctors & dtor
-    MergeItem(BamReader* reader = 0, BamAlignment* alignment = 0)
-        : Reader(reader), Alignment(alignment)
-    { }
-
-    // copies both pointers (shallow copy)
-    MergeItem(const MergeItem& other)
-        : Reader(other.Reader), Alignment(other.Alignment)
-    { }
-
-    ~MergeItem() { }
-};
-
-template<typename Compare>
-struct MergeItemSorter : public std::binary_function<MergeItem, MergeItem, bool> {
-
- public:
- MergeItemSorter(const Compare& comp = Compare())
- : m_comp(comp)
- { }
-
- bool operator()(const MergeItem& lhs, const MergeItem& rhs) {
- const BamAlignment& l = *lhs.Alignment;
- const BamAlignment& r = *rhs.Alignment;
- return m_comp(l,r);
- }
-
- private:
- Compare m_comp;
-};
-
-// pure ABC so we can just work polymorphically with any specific merger implementation
-class IMultiMerger {
-
- public:
- IMultiMerger(void) { }
- virtual ~IMultiMerger(void) { }
- public:
- virtual void Add(MergeItem item) =0;
- virtual void Clear(void) =0;
- virtual const MergeItem& First(void) const =0;
- virtual bool IsEmpty(void) const =0;
- virtual void Remove(BamReader* reader) =0;
- virtual int Size(void) const =0;
- virtual MergeItem TakeFirst(void) =0;
-};
-
-// general merger
-template<typename Compare>
-class MultiMerger : public IMultiMerger {
-
- public:
- typedef Compare CompareType;
- typedef MergeItemSorter<CompareType> MergeType;
-
- public:
- explicit MultiMerger(const Compare& comp = Compare())
- : IMultiMerger()
- , m_data( MergeType(comp) )
- { }
- ~MultiMerger(void) { }
-
- public:
- void Add(MergeItem item);
- void Clear(void);
- const MergeItem& First(void) const;
- bool IsEmpty(void) const;
- void Remove(BamReader* reader);
- int Size(void) const;
- MergeItem TakeFirst(void);
-
- private:
- typedef MergeItem ValueType;
- typedef std::multiset<ValueType, MergeType> ContainerType;
- typedef typename ContainerType::iterator DataIterator;
- typedef typename ContainerType::const_iterator DataConstIterator;
- ContainerType m_data;
-};
-
-template <typename Compare>
-inline void MultiMerger<Compare>::Add(MergeItem item) {
-
- // N.B. - any future custom Compare types must define this method
- // see algorithms/Sort.h
-
- if ( CompareType::UsesCharData() )
- item.Alignment->BuildCharData();
- m_data.insert(item);
-}
-
-template <typename Compare>
-inline void MultiMerger<Compare>::Clear(void) {
- m_data.clear();
-}
-
-template <typename Compare>
-inline const MergeItem& MultiMerger<Compare>::First(void) const {
- const ValueType& entry = (*m_data.begin());
- return entry;
-}
-
-template <typename Compare>
-inline bool MultiMerger<Compare>::IsEmpty(void) const {
- return m_data.empty();
-}
-template <typename Compare>
-inline void MultiMerger<Compare>::Remove(BamReader* reader) {
-
- if ( reader == 0 ) return;
- const std::string& filenameToRemove = reader->GetFilename();
-
- // iterate over readers in cache
- DataIterator dataIter = m_data.begin();
- DataIterator dataEnd = m_data.end();
- for ( ; dataIter != dataEnd; ++dataIter ) {
- const MergeItem& item = (*dataIter);
- const BamReader* itemReader = item.Reader;
- if ( itemReader == 0 ) continue;
-
- // remove iterator on match
- if ( itemReader->GetFilename() == filenameToRemove ) {
- m_data.erase(dataIter);
- return;
- }
- }
-}
-template <typename Compare>
-inline int MultiMerger<Compare>::Size(void) const {
- return m_data.size();
-}
-
-template <typename Compare>
-inline MergeItem MultiMerger<Compare>::TakeFirst(void) {
- DataIterator firstIter = m_data.begin();
- MergeItem firstItem = (*firstIter);
- m_data.erase(firstIter);
- return firstItem;
-}
-
-// unsorted "merger"
-template<>
-class MultiMerger<Algorithms::Sort::Unsorted> : public IMultiMerger {
-
- public:
- explicit MultiMerger(const Algorithms::Sort::Unsorted& comp = Algorithms::Sort::Unsorted())
- : IMultiMerger()
- { }
- ~MultiMerger(void) { }
-
- public:
- void Add(MergeItem item);
- void Clear(void);
- const MergeItem& First(void) const;
- bool IsEmpty(void) const;
- void Remove(BamReader* reader);
- int Size(void) const;
- MergeItem TakeFirst(void);
-
- private:
- typedef MergeItem ValueType;
- typedef std::deque<ValueType> ContainerType;
- typedef ContainerType::iterator DataIterator;
- typedef ContainerType::const_iterator DataConstIterator;
- ContainerType m_data;
-};
-
-inline
-void MultiMerger<Algorithms::Sort::Unsorted>::Add(MergeItem item) {
- m_data.push_back(item);
-}
-
-inline
-void MultiMerger<Algorithms::Sort::Unsorted>::Clear(void) {
- m_data.clear();
-}
-
-inline
-const MergeItem& MultiMerger<Algorithms::Sort::Unsorted>::First(void) const {
- return m_data.front();
-}
-
-inline
-bool MultiMerger<Algorithms::Sort::Unsorted>::IsEmpty(void) const {
- return m_data.empty();
-}
-
-inline
-void MultiMerger<Algorithms::Sort::Unsorted>::Remove(BamReader* reader) {
-
- if ( reader == 0 ) return;
- const std::string filenameToRemove = reader->GetFilename();
-
- // iterate over readers in cache
- DataIterator dataIter = m_data.begin();
- DataIterator dataEnd = m_data.end();
- for ( ; dataIter != dataEnd; ++dataIter ) {
- const MergeItem& item = (*dataIter);
- const BamReader* itemReader = item.Reader;
- if ( itemReader == 0 ) continue;
-
- // remove iterator on match
- if ( itemReader->GetFilename() == filenameToRemove ) {
- m_data.erase(dataIter);
- return;
- }
- }
-}
-
-inline
-int MultiMerger<Algorithms::Sort::Unsorted>::Size(void) const {
- return m_data.size();
-}
-
-inline
-MergeItem MultiMerger<Algorithms::Sort::Unsorted>::TakeFirst(void) {
- MergeItem firstItem = m_data.front();
- m_data.pop_front();
- return firstItem;
-}
-
-} // namespace Internal
-} // namespace BamTools
-
-#endif // BAMMULTIMERGER_P_H
+++ /dev/null
-// ***************************************************************************
-// BamMultiReader_p.cpp (c) 2010 Derek Barnett, Erik Garrison
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 14 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Functionality for simultaneously reading multiple BAM files
-// *************************************************************************
-
-#include "api/BamAlignment.h"
-#include "api/BamMultiReader.h"
-#include "api/SamConstants.h"
-#include "api/algorithms/Sort.h"
-#include "api/internal/BamMultiReader_p.h"
-using namespace BamTools;
-using namespace BamTools::Internal;
-
-#include <algorithm>
-#include <fstream>
-#include <iostream>
-#include <iterator>
-#include <sstream>
-using namespace std;
-
-// ctor
-BamMultiReaderPrivate::BamMultiReaderPrivate(void)
- : m_alignmentCache(0)
-{ }
-
-// dtor
-BamMultiReaderPrivate::~BamMultiReaderPrivate(void) {
- Close();
-}
-
-// close all BAM files
-bool BamMultiReaderPrivate::Close(void) {
-
- m_errorString.clear();
-
- if ( CloseFiles(Filenames()) )
- return true;
- else {
- const string currentError = m_errorString;
- const string message = string("error encountered while closing all files: \n\t") + currentError;
- SetErrorString("BamMultiReader::Close", message);
- return false;
- }
-}
-
-// close requested BAM file
-bool BamMultiReaderPrivate::CloseFile(const string& filename) {
-
- m_errorString.clear();
-
- vector<string> filenames(1, filename);
- if ( CloseFiles(filenames) )
- return true;
- else {
- const string currentError = m_errorString;
- const string message = string("error while closing file: ") + filename + "\n" + currentError;
- SetErrorString("BamMultiReader::CloseFile", message);
- return false;
- }
-}
-
-// close requested BAM files
-bool BamMultiReaderPrivate::CloseFiles(const vector<string>& filenames) {
-
- bool errorsEncountered = false;
- m_errorString.clear();
-
- // iterate over filenames
- vector<string>::const_iterator filesIter = filenames.begin();
- vector<string>::const_iterator filesEnd = filenames.end();
- for ( ; filesIter != filesEnd; ++filesIter ) {
- const string& filename = (*filesIter);
- if ( filename.empty() ) continue;
-
- // iterate over readers
- vector<MergeItem>::iterator readerIter = m_readers.begin();
- vector<MergeItem>::iterator readerEnd = m_readers.end();
- for ( ; readerIter != readerEnd; ++readerIter ) {
- MergeItem& item = (*readerIter);
- BamReader* reader = item.Reader;
- if ( reader == 0 ) continue;
-
- // if reader matches requested filename
- if ( reader->GetFilename() == filename ) {
-
- // remove reader's entry from alignment cache
- m_alignmentCache->Remove(reader);
-
- // clean up reader & its alignment
- if ( !reader->Close() ) {
- m_errorString.append(1, '\t');
- m_errorString.append(reader->GetErrorString());
- m_errorString.append(1, '\n');
- errorsEncountered = true;
- }
- delete reader;
- reader = 0;
-
- // delete reader's alignment entry
- BamAlignment* alignment = item.Alignment;
- delete alignment;
- alignment = 0;
-
- // remove reader from reader list
- m_readers.erase(readerIter);
-
- // on match, just go on to next filename
- // (no need to keep looking and item iterator is invalid now anyway)
- break;
- }
- }
- }
-
- // make sure alignment cache is cleaned up if all readers closed
- if ( m_readers.empty() && m_alignmentCache ) {
- m_alignmentCache->Clear();
- delete m_alignmentCache;
- m_alignmentCache = 0;
- }
-
- // return whether all readers closed OK
- return !errorsEncountered;
-}
-
-// creates index files for BAM files that don't have them
-bool BamMultiReaderPrivate::CreateIndexes(const BamIndex::IndexType& type) {
-
- bool errorsEncountered = false;
- m_errorString.clear();
-
- // iterate over readers
- vector<MergeItem>::iterator itemIter = m_readers.begin();
- vector<MergeItem>::iterator itemEnd = m_readers.end();
- for ( ; itemIter != itemEnd; ++itemIter ) {
- MergeItem& item = (*itemIter);
- BamReader* reader = item.Reader;
- if ( reader == 0 ) continue;
-
- // if reader doesn't have an index, create one
- if ( !reader->HasIndex() ) {
- if ( !reader->CreateIndex(type) ) {
- m_errorString.append(1, '\t');
- m_errorString.append(reader->GetErrorString());
- m_errorString.append(1, '\n');
- errorsEncountered = true;
- }
- }
- }
-
- // check for errors encountered before returning success/fail
- if ( errorsEncountered ) {
- const string currentError = m_errorString;
- const string message = string("error while creating index files: ") + "\n" + currentError;
- SetErrorString("BamMultiReader::CreateIndexes", message);
- return false;
- } else
- return true;
-}
-
-IMultiMerger* BamMultiReaderPrivate::CreateAlignmentCache(void) const {
-
- // fetch SamHeader
- SamHeader header = GetHeader();
-
- // if BAM files are sorted by position
- if ( header.SortOrder == Constants::SAM_HD_SORTORDER_COORDINATE )
- return new MultiMerger<Algorithms::Sort::ByPosition>();
-
- // if BAM files are sorted by read name
- if ( header.SortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME )
- return new MultiMerger<Algorithms::Sort::ByName>();
-
- // otherwise "unknown" or "unsorted", use unsorted merger and just read in
- return new MultiMerger<Algorithms::Sort::Unsorted>();
-}
-
-const vector<string> BamMultiReaderPrivate::Filenames(void) const {
-
- // init filename container
- vector<string> filenames;
- filenames.reserve( m_readers.size() );
-
- // iterate over readers
- vector<MergeItem>::const_iterator itemIter = m_readers.begin();
- vector<MergeItem>::const_iterator itemEnd = m_readers.end();
- for ( ; itemIter != itemEnd; ++itemIter ) {
- const MergeItem& item = (*itemIter);
- const BamReader* reader = item.Reader;
- if ( reader == 0 ) continue;
-
- // store filename if not empty
- const string& filename = reader->GetFilename();
- if ( !filename.empty() )
- filenames.push_back(filename);
- }
-
- // return result
- return filenames;
-}
-
-string BamMultiReaderPrivate::GetErrorString(void) const {
- return m_errorString;
-}
-
-SamHeader BamMultiReaderPrivate::GetHeader(void) const {
- const string& text = GetHeaderText();
- return SamHeader(text);
-}
-
-// makes a virtual, unified header for all the bam files in the multireader
-string BamMultiReaderPrivate::GetHeaderText(void) const {
-
- // N.B. - right now, simply copies all header data from first BAM,
- // and then appends RG's from other BAM files
- // TODO: make this more intelligent wrt other header lines/fields
-
- // if no readers open
- const size_t numReaders = m_readers.size();
- if ( numReaders == 0 ) return string();
-
- // retrieve first reader's header
- const MergeItem& firstItem = m_readers.front();
- const BamReader* reader = firstItem.Reader;
- if ( reader == 0 ) return string();
- SamHeader mergedHeader = reader->GetHeader();
-
- // iterate over any remaining readers (skipping the first)
- for ( size_t i = 1; i < numReaders; ++i ) {
- const MergeItem& item = m_readers.at(i);
- const BamReader* reader = item.Reader;
- if ( reader == 0 ) continue;
-
- // retrieve current reader's header
- const SamHeader currentHeader = reader->GetHeader();
-
- // append current reader's RG entries to merged header
- // N.B. - SamReadGroupDictionary handles duplicate-checking
- mergedHeader.ReadGroups.Add(currentHeader.ReadGroups);
-
- // TODO: merge anything else??
- }
-
- // return stringified header
- return mergedHeader.ToString();
-}
-
-// get next alignment among all files
-bool BamMultiReaderPrivate::GetNextAlignment(BamAlignment& al) {
- return PopNextCachedAlignment(al, true);
-}
-
-// get next alignment among all files without parsing character data from alignments
-bool BamMultiReaderPrivate::GetNextAlignmentCore(BamAlignment& al) {
- return PopNextCachedAlignment(al, false);
-}
-
-// ---------------------------------------------------------------------------------------
-//
-// NB: The following GetReferenceX() functions assume that we have identical
-// references for all BAM files. We enforce this by invoking the
-// ValidateReaders() method to verify that our reference data is the same
-// across all files on Open - so we will not encounter a situation in which
-// there is a mismatch and we are still live.
-//
-// ---------------------------------------------------------------------------------------
-
-// returns the number of reference sequences
-int BamMultiReaderPrivate::GetReferenceCount(void) const {
-
- // handle empty multireader
- if ( m_readers.empty() ) return 0;
-
- // return reference count from first reader
- const MergeItem& item = m_readers.front();
- const BamReader* reader = item.Reader;
- if ( reader == 0 ) return 0;
- else
- return reader->GetReferenceCount();
-}
-
-// returns vector of reference objects
-const RefVector BamMultiReaderPrivate::GetReferenceData(void) const {
-
- // handle empty multireader
- if ( m_readers.empty() ) return RefVector();
-
- // return reference data from first BamReader
- const MergeItem& item = m_readers.front();
- const BamReader* reader = item.Reader;
- if ( reader == 0 ) return RefVector();
- else
- return reader->GetReferenceData();
-}
-
-// returns refID from reference name
-int BamMultiReaderPrivate::GetReferenceID(const string& refName) const {
-
- // handle empty multireader
- if ( m_readers.empty() ) return -1;
-
- // return reference ID from first BamReader
- const MergeItem& item = m_readers.front();
- const BamReader* reader = item.Reader;
- if ( reader == 0 ) return -1;
- else
- return reader->GetReferenceID(refName);
-}
-// ---------------------------------------------------------------------------------------
-
-// returns true if all readers have index data available
-// this is useful to indicate whether Jump() or SetRegion() are possible
-bool BamMultiReaderPrivate::HasIndexes(void) const {
-
- // handle empty multireader
- if ( m_readers.empty() )
- return false;
-
- bool result = true;
-
- // iterate over readers
- vector<MergeItem>::const_iterator readerIter = m_readers.begin();
- vector<MergeItem>::const_iterator readerEnd = m_readers.end();
- for ( ; readerIter != readerEnd; ++readerIter ) {
- const MergeItem& item = (*readerIter);
- const BamReader* reader = item.Reader;
- if ( reader == 0 ) continue;
-
- // see if current reader has index data
- result &= reader->HasIndex();
- }
-
- return result;
-}
-
-// returns true if multireader has open readers
-bool BamMultiReaderPrivate::HasOpenReaders(void) {
-
- // iterate over readers
- vector<MergeItem>::const_iterator readerIter = m_readers.begin();
- vector<MergeItem>::const_iterator readerEnd = m_readers.end();
- for ( ; readerIter != readerEnd; ++readerIter ) {
- const MergeItem& item = (*readerIter);
- const BamReader* reader = item.Reader;
- if ( reader == 0 ) continue;
-
- // return true whenever an open reader is found
- if ( reader->IsOpen() ) return true;
- }
-
- // no readers open
- return false;
-}
-
-// performs random-access jump using (refID, position) as a left-bound
-bool BamMultiReaderPrivate::Jump(int refID, int position) {
-
- // NB: While it may make sense to track readers in which we can
- // successfully Jump, in practice a failure of Jump means "no
- // alignments here." It makes sense to simply accept the failure,
- // UpdateAlignments(), and continue.
-
- // iterate over readers
- vector<MergeItem>::iterator readerIter = m_readers.begin();
- vector<MergeItem>::iterator readerEnd = m_readers.end();
- for ( ; readerIter != readerEnd; ++readerIter ) {
- MergeItem& item = (*readerIter);
- BamReader* reader = item.Reader;
- if ( reader == 0 ) continue;
-
- // jump in each BamReader to position of interest
- reader->Jump(refID, position);
- }
-
- // returns status of cache update
- return UpdateAlignmentCache();
-}
-
-// locate (& load) index files for BAM readers that don't already have one loaded
-bool BamMultiReaderPrivate::LocateIndexes(const BamIndex::IndexType& preferredType) {
-
- bool errorsEncountered = false;
- m_errorString.clear();
-
- // iterate over readers
- vector<MergeItem>::iterator readerIter = m_readers.begin();
- vector<MergeItem>::iterator readerEnd = m_readers.end();
- for ( ; readerIter != readerEnd; ++readerIter ) {
- MergeItem& item = (*readerIter);
- BamReader* reader = item.Reader;
- if ( reader == 0 ) continue;
-
- // if reader has no index, try to locate one
- if ( !reader->HasIndex() ) {
- if ( !reader->LocateIndex(preferredType) ) {
- m_errorString.append(1, '\t');
- m_errorString.append(reader->GetErrorString());
- m_errorString.append(1, '\n');
- errorsEncountered = true;
- }
- }
- }
-
- // check for errors encountered before returning success/fail
- if ( errorsEncountered ) {
- const string currentError = m_errorString;
- const string message = string("error while locating index files: ") + "\n" + currentError;
- SetErrorString("BamMultiReader::LocatingIndexes", message);
- return false;
- } else
- return true;
-}
-
-// opens BAM files
-bool BamMultiReaderPrivate::Open(const vector<string>& filenames) {
-
- m_errorString.clear();
-
- // put all current readers back at beginning (refreshes alignment cache)
- if ( !Rewind() ) {
- const string currentError = m_errorString;
- const string message = string("unable to rewind existing readers: \n\t") + currentError;
- SetErrorString("BamMultiReader::Open", message);
- return false;
- }
-
- // iterate over filenames
- bool errorsEncountered = false;
- vector<string>::const_iterator filenameIter = filenames.begin();
- vector<string>::const_iterator filenameEnd = filenames.end();
- for ( ; filenameIter != filenameEnd; ++filenameIter ) {
- const string& filename = (*filenameIter);
- if ( filename.empty() ) continue;
-
- // attempt to open BamReader
- BamReader* reader = new BamReader;
- const bool readerOpened = reader->Open(filename);
-
- // if opened OK, store it
- if ( readerOpened )
- m_readers.push_back( MergeItem(reader, new BamAlignment) );
-
- // otherwise store error & clean up invalid reader
- else {
- m_errorString.append(1, '\t');
- m_errorString += string("unable to open file: ") + filename;
- m_errorString.append(1, '\n');
- errorsEncountered = true;
-
- delete reader;
- reader = 0;
- }
- }
-
- // check for errors while opening
- if ( errorsEncountered ) {
- const string currentError = m_errorString;
- const string message = string("unable to open all files: \t\n") + currentError;
- SetErrorString("BamMultiReader::Open", message);
- return false;
- }
-
- // check for BAM file consistency
- if ( !ValidateReaders() ) {
- const string currentError = m_errorString;
- const string message = string("unable to open inconsistent files: \t\n") + currentError;
- SetErrorString("BamMultiReader::Open", message);
- return false;
- }
-
- // update alignment cache
- return UpdateAlignmentCache();
-}
-
-bool BamMultiReaderPrivate::OpenFile(const std::string& filename) {
- vector<string> filenames(1, filename);
- if ( Open(filenames) )
- return true;
- else {
- const string currentError = m_errorString;
- const string message = string("could not open file: ") + filename + "\n\t" + currentError;
- SetErrorString("BamMultiReader::OpenFile", message);
- return false;
- }
-}
-
-bool BamMultiReaderPrivate::OpenIndexes(const vector<string>& indexFilenames) {
-
- // TODO: This needs to be cleaner - should not assume same order.
- // And either way, shouldn't start at first reader. Should start at
- // first reader without an index?
-
- // make sure same number of index filenames as readers
- if ( m_readers.size() != indexFilenames.size() ) {
- const string message("size of index file list does not match current BAM file count");
- SetErrorString("BamMultiReader::OpenIndexes", message);
- return false;
- }
-
- bool errorsEncountered = false;
- m_errorString.clear();
-
- // iterate over BamReaders
- vector<string>::const_iterator indexFilenameIter = indexFilenames.begin();
- vector<string>::const_iterator indexFilenameEnd = indexFilenames.end();
- vector<MergeItem>::iterator readerIter = m_readers.begin();
- vector<MergeItem>::iterator readerEnd = m_readers.end();
- for ( ; readerIter != readerEnd; ++readerIter ) {
- MergeItem& item = (*readerIter);
- BamReader* reader = item.Reader;
-
- // open index filename on reader
- if ( reader ) {
- const string& indexFilename = (*indexFilenameIter);
- if ( !reader->OpenIndex(indexFilename) ) {
- m_errorString.append(1, '\t');
- m_errorString += reader->GetErrorString();
- m_errorString.append(1, '\n');
- errorsEncountered = true;
- }
- }
-
- // increment filename iterator, skip if no more index files to open
- if ( ++indexFilenameIter == indexFilenameEnd )
- break;
- }
-
- // return success/fail
- if ( errorsEncountered ) {
- const string currentError = m_errorString;
- const string message = string("could not open all index files: \n\t") + currentError;
- SetErrorString("BamMultiReader::OpenIndexes", message);
- return false;
- } else
- return true;
-}
-
-bool BamMultiReaderPrivate::PopNextCachedAlignment(BamAlignment& al, const bool needCharData) {
-
- // skip if no alignments available
- if ( m_alignmentCache == 0 || m_alignmentCache->IsEmpty() )
- return false;
-
- // pop next merge item entry from cache
- MergeItem item = m_alignmentCache->TakeFirst();
- BamReader* reader = item.Reader;
- BamAlignment* alignment = item.Alignment;
- if ( reader == 0 || alignment == 0 )
- return false;
-
- // set char data if requested
- if ( needCharData ) {
- alignment->BuildCharData();
- alignment->Filename = reader->GetFilename();
- }
-
- // store cached alignment into destination parameter (by copy)
- al = *alignment;
-
- // load next alignment from reader & store in cache
- SaveNextAlignment(reader, alignment);
- return true;
-}
-
-// returns BAM file pointers to beginning of alignment data & resets alignment cache
-bool BamMultiReaderPrivate::Rewind(void) {
-
- // skip if no readers open
- if ( m_readers.empty() )
- return true;
-
- // attempt to rewind files
- if ( !RewindReaders() ) {
- const string currentError = m_errorString;
- const string message = string("could not rewind readers: \n\t") + currentError;
- SetErrorString("BamMultiReader::Rewind", message);
- return false;
- }
-
- // return status of cache update
- return UpdateAlignmentCache();
-}
-
-// returns BAM file pointers to beginning of alignment data
-bool BamMultiReaderPrivate::RewindReaders(void) {
-
- m_errorString.clear();
- bool errorsEncountered = false;
-
- // iterate over readers
- vector<MergeItem>::iterator readerIter = m_readers.begin();
- vector<MergeItem>::iterator readerEnd = m_readers.end();
- for ( ; readerIter != readerEnd; ++readerIter ) {
- MergeItem& item = (*readerIter);
- BamReader* reader = item.Reader;
- if ( reader == 0 ) continue;
-
- // attempt rewind on BamReader
- if ( !reader->Rewind() ) {
- m_errorString.append(1, '\t');
- m_errorString.append( reader->GetErrorString() );
- m_errorString.append(1, '\n');
- errorsEncountered = true;
- }
- }
-
- return !errorsEncountered;
-}
-
-void BamMultiReaderPrivate::SaveNextAlignment(BamReader* reader, BamAlignment* alignment) {
-
- // if can read alignment from reader, store in cache
- //
- // N.B. - lazy building of alignment's char data - populated only:
- // automatically by alignment cache to maintain its sorting OR
- // on demand from client call to future call to GetNextAlignment()
-
- if ( reader->GetNextAlignmentCore(*alignment) )
- m_alignmentCache->Add( MergeItem(reader, alignment) );
-}
-
-void BamMultiReaderPrivate::SetErrorString(const string& where, const string& what) const {
- static const string SEPARATOR = ": ";
- m_errorString = where + SEPARATOR + what;
-}
-
-bool BamMultiReaderPrivate::SetRegion(const BamRegion& region) {
-
- // NB: While it may make sense to track readers in which we can
- // successfully SetRegion, In practice a failure of SetRegion means "no
- // alignments here." It makes sense to simply accept the failure,
- // UpdateAlignments(), and continue.
-
- // iterate over alignments
- vector<MergeItem>::iterator readerIter = m_readers.begin();
- vector<MergeItem>::iterator readerEnd = m_readers.end();
- for ( ; readerIter != readerEnd; ++readerIter ) {
- MergeItem& item = (*readerIter);
- BamReader* reader = item.Reader;
- if ( reader == 0 ) continue;
-
- // set region of interest
- reader->SetRegion(region);
- }
-
- // return status of cache update
- return UpdateAlignmentCache();
-}
-
-// updates our alignment cache
-bool BamMultiReaderPrivate::UpdateAlignmentCache(void) {
-
- // create alignment cache if not created yet
- if ( m_alignmentCache == 0 ) {
- m_alignmentCache = CreateAlignmentCache();
- if ( m_alignmentCache == 0 ) {
- SetErrorString("BamMultiReader::UpdateAlignmentCache", "unable to create new alignment cache");
- return false;
- }
- }
-
- // clear any prior cache data
- m_alignmentCache->Clear();
-
- // iterate over readers
- vector<MergeItem>::iterator readerIter = m_readers.begin();
- vector<MergeItem>::iterator readerEnd = m_readers.end();
- for ( ; readerIter != readerEnd; ++readerIter ) {
- MergeItem& item = (*readerIter);
- BamReader* reader = item.Reader;
- BamAlignment* alignment = item.Alignment;
- if ( reader == 0 || alignment == 0 ) continue;
-
- // save next alignment from each reader in cache
- SaveNextAlignment(reader, alignment);
- }
-
- // if we get here, ok
- return true;
-}
-
-// ValidateReaders checks that all the readers point to BAM files representing
-// alignments against the same set of reference sequences, and that the
-// sequences are identically ordered. If these checks fail the operation of
-// the multireader is undefined, so we force program exit.
-bool BamMultiReaderPrivate::ValidateReaders(void) const {
-
- m_errorString.clear();
-
- // skip if 0 or 1 readers opened
- if ( m_readers.empty() || (m_readers.size() == 1) )
- return true;
-
- // retrieve first reader
- const MergeItem& firstItem = m_readers.front();
- const BamReader* firstReader = firstItem.Reader;
- if ( firstReader == 0 ) return false;
-
- // retrieve first reader's header data
- const SamHeader& firstReaderHeader = firstReader->GetHeader();
- const string& firstReaderSortOrder = firstReaderHeader.SortOrder;
-
- // retrieve first reader's reference data
- const RefVector& firstReaderRefData = firstReader->GetReferenceData();
- const int firstReaderRefCount = firstReader->GetReferenceCount();
- const int firstReaderRefSize = firstReaderRefData.size();
-
- // iterate over all readers
- vector<MergeItem>::const_iterator readerIter = m_readers.begin();
- vector<MergeItem>::const_iterator readerEnd = m_readers.end();
- for ( ; readerIter != readerEnd; ++readerIter ) {
- const MergeItem& item = (*readerIter);
- BamReader* reader = item.Reader;
- if ( reader == 0 ) continue;
-
- // get current reader's header data
- const SamHeader& currentReaderHeader = reader->GetHeader();
- const string& currentReaderSortOrder = currentReaderHeader.SortOrder;
-
- // check compatible sort order
- if ( currentReaderSortOrder != firstReaderSortOrder ) {
- const string message = string("mismatched sort order in ") + reader->GetFilename() +
- ", expected " + firstReaderSortOrder +
- ", but found " + currentReaderSortOrder;
- SetErrorString("BamMultiReader::ValidateReaders", message);
- return false;
- }
-
- // get current reader's reference data
- const RefVector currentReaderRefData = reader->GetReferenceData();
- const int currentReaderRefCount = reader->GetReferenceCount();
- const int currentReaderRefSize = currentReaderRefData.size();
-
- // init reference data iterators
- RefVector::const_iterator firstRefIter = firstReaderRefData.begin();
- RefVector::const_iterator firstRefEnd = firstReaderRefData.end();
- RefVector::const_iterator currentRefIter = currentReaderRefData.begin();
-
- // compare reference counts from BamReader ( & container size, in case of BR error)
- if ( (currentReaderRefCount != firstReaderRefCount) ||
- (firstReaderRefSize != currentReaderRefSize) )
- {
- stringstream s("");
- s << "mismatched reference count in " << reader->GetFilename()
- << ", expected " << firstReaderRefCount
- << ", but found " << currentReaderRefCount;
- SetErrorString("BamMultiReader::ValidateReaders", s.str());
- return false;
- }
-
- // this will be ok; we just checked above that we have identically-sized sets of references
- // here we simply check if they are all, in fact, equal in content
- while ( firstRefIter != firstRefEnd ) {
- const RefData& firstRef = (*firstRefIter);
- const RefData& currentRef = (*currentRefIter);
-
- // compare reference name & length
- if ( (firstRef.RefName != currentRef.RefName) ||
- (firstRef.RefLength != currentRef.RefLength) )
- {
- stringstream s("");
- s << "mismatched references found in" << reader->GetFilename()
- << "expected: " << endl;
-
- // print first reader's reference data
- RefVector::const_iterator refIter = firstReaderRefData.begin();
- RefVector::const_iterator refEnd = firstReaderRefData.end();
- for ( ; refIter != refEnd; ++refIter ) {
- const RefData& entry = (*refIter);
- stringstream s("");
- s << entry.RefName << " " << endl;
- }
-
- s << "but found: " << endl;
-
- // print current reader's reference data
- refIter = currentReaderRefData.begin();
- refEnd = currentReaderRefData.end();
- for ( ; refIter != refEnd; ++refIter ) {
- const RefData& entry = (*refIter);
- s << entry.RefName << " " << entry.RefLength << endl;
- }
-
- SetErrorString("BamMultiReader::ValidateReaders", s.str());
- return false;
- }
-
- // update iterators
- ++firstRefIter;
- ++currentRefIter;
- }
- }
-
- // if we get here, everything checks out
- return true;
-}
+++ /dev/null
-// ***************************************************************************
-// BamMultiReader_p.h (c) 2010 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Functionality for simultaneously reading multiple BAM files
-// *************************************************************************
-
-#ifndef BAMMULTIREADER_P_H
-#define BAMMULTIREADER_P_H
-
-// -------------
-// W A R N I N G
-// -------------
-//
-// This file is not part of the BamTools API. It exists purely as an
-// implementation detail. This header file may change from version to version
-// without notice, or even be removed.
-//
-// We mean it.
-
-#include "api/SamHeader.h"
-#include "api/BamMultiReader.h"
-#include "api/internal/BamMultiMerger_p.h"
-#include <string>
-#include <vector>
-
-namespace BamTools {
-namespace Internal {
-
-class BamMultiReaderPrivate {
-
- // typedefs
- public:
- typedef std::pair<BamReader*, BamAlignment*> ReaderAlignment;
-
- // constructor / destructor
- public:
- BamMultiReaderPrivate(void);
- ~BamMultiReaderPrivate(void);
-
- // public interface
- public:
-
- // file operations
- bool Close(void);
- bool CloseFile(const std::string& filename);
- const std::vector<std::string> Filenames(void) const;
- bool Jump(int refID, int position = 0);
- bool Open(const std::vector<std::string>& filenames);
- bool OpenFile(const std::string& filename);
- bool Rewind(void);
- bool SetRegion(const BamRegion& region);
-
- // access alignment data
- bool GetNextAlignment(BamAlignment& al);
- bool GetNextAlignmentCore(BamAlignment& al);
- bool HasOpenReaders(void);
-
- // access auxiliary data
- SamHeader GetHeader(void) const;
- std::string GetHeaderText(void) const;
- int GetReferenceCount(void) const;
- const BamTools::RefVector GetReferenceData(void) const;
- int GetReferenceID(const std::string& refName) const;
-
- // BAM index operations
- bool CreateIndexes(const BamIndex::IndexType& type = BamIndex::STANDARD);
- bool HasIndexes(void) const;
- bool LocateIndexes(const BamIndex::IndexType& preferredType = BamIndex::STANDARD);
- bool OpenIndexes(const std::vector<std::string>& indexFilenames);
-
- // error handling
- std::string GetErrorString(void) const;
-
- // 'internal' methods
- public:
-
- bool CloseFiles(const std::vector<std::string>& filenames);
- IMultiMerger* CreateAlignmentCache(void) const;
- bool PopNextCachedAlignment(BamAlignment& al, const bool needCharData);
- bool RewindReaders(void);
- void SaveNextAlignment(BamReader* reader, BamAlignment* alignment);
- void SetErrorString(const std::string& where, const std::string& what) const; //
- bool UpdateAlignmentCache(void);
- bool ValidateReaders(void) const;
-
- // data members
- public:
- std::vector<MergeItem> m_readers;
- IMultiMerger* m_alignmentCache;
- mutable std::string m_errorString;
-};
-
-} // namespace Internal
-} // namespace BamTools
-
-#endif // BAMMULTIREADER_P_H
+++ /dev/null
-// ***************************************************************************
-// BamPipe_p.cpp (c) 2011 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides BAM pipe-specific IO behavior
-// ***************************************************************************
-
-#include "api/internal/BamPipe_p.h"
-using namespace BamTools;
-using namespace BamTools::Internal;
-
-#include <cstdio>
-#include <iostream>
-using namespace std;
-
-BamPipe::BamPipe(void) : ILocalIODevice() { }
-
-BamPipe::~BamPipe(void) { }
-
-bool BamPipe::IsRandomAccess(void) const {
- return false;
-}
-
-bool BamPipe::Open(const IBamIODevice::OpenMode mode) {
-
- // make sure we're starting with a fresh pipe
- Close();
-
- // open stdin/stdout depending on requested openmode
- if ( mode == IBamIODevice::ReadOnly )
- m_stream = freopen(0, "rb", stdin);
- else if ( mode == IBamIODevice::WriteOnly )
- m_stream = freopen(0, "wb", stdout);
- else {
- SetErrorString("BamPipe::Open", "unknown open mode requested");
- return false;
- }
-
- // check that we obtained a valid FILE*
- if ( m_stream == 0 ) {
- const string message_base = string("could not open handle on ");
- const string message = message_base + ( (mode == IBamIODevice::ReadOnly) ? "stdin" : "stdout" );
- SetErrorString("BamPipe::Open", message);
- return false;
- }
-
- // store current IO mode & return success
- m_mode = mode;
- return true;
-}
-
-bool BamPipe::Seek(const int64_t& ) {
- SetErrorString("BamPipe::Seek", "random access not allowed in FIFO pipe");
- return false;
-}
+++ /dev/null
-// ***************************************************************************
-// BamPipe_p.h (c) 2011 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides BAM pipe-specific IO behavior
-// ***************************************************************************
-
-#ifndef BAMPIPE_P_H
-#define BAMPIPE_P_H
-
-// -------------
-// W A R N I N G
-// -------------
-//
-// This file is not part of the BamTools API. It exists purely as an
-// implementation detail. This header file may change from version to version
-// without notice, or even be removed.
-//
-// We mean it.
-
-#include "api/internal/ILocalIODevice_p.h"
-#include <string>
-
-namespace BamTools {
-namespace Internal {
-
-class BamPipe : public ILocalIODevice {
-
- // ctor & dtor
- public:
- BamPipe(void);
- ~BamPipe(void);
-
- // IBamIODevice implementation
- public:
- bool IsRandomAccess(void) const;
- bool Open(const IBamIODevice::OpenMode mode);
- bool Seek(const int64_t& position);
-};
-
-} // namespace Internal
-} // namespace BamTools
-
-#endif // BAMPIPE_P_H
+++ /dev/null
-// ***************************************************************************
-// BamRandomAccessController_p.cpp (c) 2011 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011(DB)
-// ---------------------------------------------------------------------------
-// Manages random access operations in a BAM file
-// **************************************************************************
-
-#include "api/BamIndex.h"
-#include "api/internal/BamException_p.h"
-#include "api/internal/BamRandomAccessController_p.h"
-#include "api/internal/BamReader_p.h"
-#include "api/internal/BamIndexFactory_p.h"
-using namespace BamTools;
-using namespace BamTools::Internal;
-
-#include <cassert>
-#include <sstream>
-using namespace std;
-
-BamRandomAccessController::BamRandomAccessController(void)
- : m_index(0)
- , m_hasAlignmentsInRegion(true)
-{ }
-
-BamRandomAccessController::~BamRandomAccessController(void) {
- Close();
-}
-
-void BamRandomAccessController::AdjustRegion(const int& referenceCount) {
-
- // skip if no index available
- if ( m_index == 0 )
- return;
-
- // see if any references in region have alignments
- m_hasAlignmentsInRegion = false;
- int currentId = m_region.LeftRefID;
- const int rightBoundRefId = ( m_region.isRightBoundSpecified() ? m_region.RightRefID : referenceCount - 1 );
- while ( currentId <= rightBoundRefId ) {
- m_hasAlignmentsInRegion = m_index->HasAlignments(currentId);
- if ( m_hasAlignmentsInRegion ) break;
- ++currentId;
- }
-
- // if no data found on any reference in region
- if ( !m_hasAlignmentsInRegion )
- return;
-
- // if left bound of desired region had no data, use first reference that had data
- // otherwise, leave requested region as-is
- if ( currentId != m_region.LeftRefID ) {
- m_region.LeftRefID = currentId;
- m_region.LeftPosition = 0;
- }
-}
-
-// returns alignments' "RegionState": { Before|Overlaps|After } current region
-BamRandomAccessController::RegionState
-BamRandomAccessController::AlignmentState(const BamAlignment& alignment) const {
-
- // if region has no left bound at all
- if ( !m_region.isLeftBoundSpecified() )
- return OverlapsRegion;
-
- // handle unmapped reads - return AFTER region to halt processing
- if ( alignment.RefID == -1 )
- return AfterRegion;
-
- // if alignment is on any reference before left bound reference
- if ( alignment.RefID < m_region.LeftRefID )
- return BeforeRegion;
-
- // if alignment is on left bound reference
- else if ( alignment.RefID == m_region.LeftRefID ) {
-
- // if alignment starts at or after left bound position
- if ( alignment.Position >= m_region.LeftPosition) {
-
- if ( m_region.isRightBoundSpecified() && // right bound is specified AND
- m_region.LeftRefID == m_region.RightRefID && // left & right bounds on same reference AND
- alignment.Position >= m_region.RightPosition ) // alignment starts on or after right bound position
- return AfterRegion;
-
- // otherwise, alignment overlaps region
- else return OverlapsRegion;
- }
-
- // alignment starts before left bound position
- else {
-
- // if alignment overlaps left bound position
- if ( alignment.GetEndPosition() > m_region.LeftPosition )
- return OverlapsRegion;
- else
- return BeforeRegion;
- }
- }
-
- // otherwise alignment is on a reference after left bound reference
- else {
-
- // if region has a right bound
- if ( m_region.isRightBoundSpecified() ) {
-
- // alignment is on any reference between boundaries
- if ( alignment.RefID < m_region.RightRefID )
- return OverlapsRegion;
-
- // alignment is on any reference after right boundary
- else if ( alignment.RefID > m_region.RightRefID )
- return AfterRegion;
-
- // alignment is on right bound reference
- else {
-
- // if alignment starts before right bound position
- if ( alignment.Position < m_region.RightPosition )
- return OverlapsRegion;
- else
- return AfterRegion;
- }
- }
-
- // otherwise, alignment starts after left bound and there is no right bound given
- else return OverlapsRegion;
- }
-}
-
-void BamRandomAccessController::Close(void) {
- ClearIndex();
- ClearRegion();
-}
-
-void BamRandomAccessController::ClearIndex(void) {
- if ( m_index ) {
- delete m_index;
- m_index = 0;
- }
-}
-
-void BamRandomAccessController::ClearRegion(void) {
- m_region.clear();
- m_hasAlignmentsInRegion = true;
-}
-
-bool BamRandomAccessController::CreateIndex(BamReaderPrivate* reader,
- const BamIndex::IndexType& type)
-{
- // skip if reader is invalid
- assert(reader);
- if ( !reader->IsOpen() ) {
- SetErrorString("BamRandomAccessController::CreateIndex",
- "cannot create index for unopened reader");
- return false;
- }
-
- // create new index of requested type
- BamIndex* newIndex = BamIndexFactory::CreateIndexOfType(type, reader);
- if ( newIndex == 0 ) {
- stringstream s("");
- s << "could not create index of type: " << type;
- SetErrorString("BamRandomAccessController::CreateIndex", s.str());
- return false;
- }
-
- // attempt to build index from current BamReader file
- if ( !newIndex->Create() ) {
- const string indexError = newIndex->GetErrorString();
- const string message = "could not create index: \n\t" + indexError;
- SetErrorString("BamRandomAccessController::CreateIndex", message);
- return false;
- }
-
- // save new index & return success
- SetIndex(newIndex);
- return true;
-}
-
-string BamRandomAccessController::GetErrorString(void) const {
- return m_errorString;
-}
-
-bool BamRandomAccessController::HasIndex(void) const {
- return ( m_index != 0 );
-}
-
-bool BamRandomAccessController::HasRegion(void) const {
- return ( !m_region.isNull() );
-}
-
-bool BamRandomAccessController::IndexHasAlignmentsForReference(const int& refId) {
- return m_index->HasAlignments(refId);
-}
-
-bool BamRandomAccessController::LocateIndex(BamReaderPrivate* reader,
- const BamIndex::IndexType& preferredType)
-{
- // look up index filename, deferring to preferredType if possible
- assert(reader);
- const string& indexFilename = BamIndexFactory::FindIndexFilename(reader->Filename(), preferredType);
-
- // if no index file found (of any type)
- if ( indexFilename.empty() ) {
- const string message = string("could not find index file for:") + reader->Filename();
- SetErrorString("BamRandomAccessController::LocateIndex", message);
- return false;
- }
-
- // otherwise open & use index file that was found
- return OpenIndex(indexFilename, reader);
-}
-
-bool BamRandomAccessController::OpenIndex(const string& indexFilename, BamReaderPrivate* reader) {
-
- // attempt create new index of type based on filename
- BamIndex* index = BamIndexFactory::CreateIndexFromFilename(indexFilename, reader);
- if ( index == 0 ) {
- const string message = string("could not open index file: ") + indexFilename;
- SetErrorString("BamRandomAccessController::OpenIndex", message);
- return false;
- }
-
- // attempt to load data from index file
- if ( !index->Load(indexFilename) ) {
- const string indexError = index->GetErrorString();
- const string message = string("could not load index data from file: ") + indexFilename +
- "\n\t" + indexError;
- SetErrorString("BamRandomAccessController::OpenIndex", message);
- return false;
- }
-
- // save new index & return success
- SetIndex(index);
- return true;
-}
-
-bool BamRandomAccessController::RegionHasAlignments(void) const {
- return m_hasAlignmentsInRegion;
-}
-
-void BamRandomAccessController::SetErrorString(const string& where, const string& what) {
- m_errorString = where + ": " + what;
-}
-
-void BamRandomAccessController::SetIndex(BamIndex* index) {
- if ( m_index )
- ClearIndex();
- m_index = index;
-}
-
-bool BamRandomAccessController::SetRegion(const BamRegion& region, const int& referenceCount) {
-
- // store region
- m_region = region;
-
- // cannot jump when no index is available
- if ( !HasIndex() ) {
- SetErrorString("BamRandomAccessController", "cannot jump if no index data available");
- return false;
- }
-
- // adjust region as necessary to reflect where data actually begins
- AdjustRegion(referenceCount);
-
- // if no data present, return true
- // * Not an error, but future attempts to access alignments in this region will not return data
- // Returning true is useful in a BamMultiReader setting where some BAM files may
- // lack alignments in regions where other files still have data available.
- if ( !m_hasAlignmentsInRegion )
- return true;
-
- // return success/failure of jump to specified region,
- //
- // * Index::Jump() is allowed to modify the m_hasAlignmentsInRegion flag
- // This covers 'corner case' where a region is requested that lies beyond the last
- // alignment on a reference. If this occurs, any subsequent calls to GetNextAlignment[Core]
- // will not return data. BamMultiReader will still be able to successfully pull alignments
- // from a region from other files even if this one has no data.
- if ( !m_index->Jump(m_region, &m_hasAlignmentsInRegion) ) {
- const string indexError = m_index->GetErrorString();
- const string message = string("could not set region\n\t") + indexError;
- SetErrorString("BamRandomAccessController::OpenIndex", message);
- return false;
- }
- else
- return true;
-}
+++ /dev/null
-// ***************************************************************************
-// BamRandomAccessController_p.h (c) 2011 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011(DB)
-// ---------------------------------------------------------------------------
-// Manages random access operations in a BAM file
-// ***************************************************************************
-
-#ifndef BAMRACONTROLLER_P_H
-#define BAMRACONTROLLER_P_H
-
-// -------------
-// W A R N I N G
-// -------------
-//
-// This file is not part of the BamTools API. It exists purely as an
-// implementation detail. This header file may change from version to version
-// without notice, or even be removed.
-//
-// We mean it.
-
-#include "api/BamAux.h"
-#include "api/BamIndex.h"
-
-namespace BamTools {
-
-class BamAlignment;
-
-namespace Internal {
-
-class BamReaderPrivate;
-
-class BamRandomAccessController {
-
- // enums
- public: enum RegionState { BeforeRegion = 0
- , OverlapsRegion
- , AfterRegion
- };
-
- // ctor & dtor
- public:
- BamRandomAccessController(void);
- ~BamRandomAccessController(void);
-
- // BamRandomAccessController interface
- public:
-
- // index methods
- void ClearIndex(void);
- bool CreateIndex(BamReaderPrivate* reader, const BamIndex::IndexType& type);
- bool HasIndex(void) const;
- bool IndexHasAlignmentsForReference(const int& refId);
- bool LocateIndex(BamReaderPrivate* reader, const BamIndex::IndexType& preferredType);
- bool OpenIndex(const std::string& indexFilename, BamReaderPrivate* reader);
- void SetIndex(BamIndex* index);
-
- // region methods
- void ClearRegion(void);
- bool HasRegion(void) const;
- RegionState AlignmentState(const BamAlignment& alignment) const;
- bool RegionHasAlignments(void) const;
- bool SetRegion(const BamRegion& region, const int& referenceCount);
-
- // general methods
- void Close(void);
- std::string GetErrorString(void) const;
-
- // internal methods
- private:
- // adjusts requested region if necessary (depending on where data actually begins)
- void AdjustRegion(const int& referenceCount);
- // error-string handling
- void SetErrorString(const std::string& where, const std::string& what);
-
- // data members
- private:
-
- // index data
- BamIndex* m_index; // owns the index, not a copy - responsible for deleting
-
- // region data
- BamRegion m_region;
- bool m_hasAlignmentsInRegion;
-
- // general data
- std::string m_errorString;
-};
-
-} // namespace Internal
-} // namespace BamTools
-
-#endif // BAMRACONTROLLER_P_H
+++ /dev/null
-// ***************************************************************************
-// BamReader_p.cpp (c) 2009 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides the basic functionality for reading BAM files
-// ***************************************************************************
-
-#include "api/BamConstants.h"
-#include "api/BamReader.h"
-#include "api/IBamIODevice.h"
-#include "api/internal/BamDeviceFactory_p.h"
-#include "api/internal/BamException_p.h"
-#include "api/internal/BamHeader_p.h"
-#include "api/internal/BamRandomAccessController_p.h"
-#include "api/internal/BamReader_p.h"
-#include "api/internal/BamStandardIndex_p.h"
-#include "api/internal/BamToolsIndex_p.h"
-using namespace BamTools;
-using namespace BamTools::Internal;
-
-#include <algorithm>
-#include <cassert>
-#include <iostream>
-#include <iterator>
-#include <vector>
-using namespace std;
-
-// constructor
-BamReaderPrivate::BamReaderPrivate(BamReader* parent)
- : m_alignmentsBeginOffset(0)
- , m_parent(parent)
-{
- m_isBigEndian = BamTools::SystemIsBigEndian();
-}
-
-// destructor
-BamReaderPrivate::~BamReaderPrivate(void) {
- Close();
-}
-
-// closes the BAM file
-bool BamReaderPrivate::Close(void) {
-
- // clear BAM metadata
- m_references.clear();
- m_header.Clear();
-
- // clear filename
- m_filename.clear();
-
- // close random access controller
- m_randomAccessController.Close();
-
- // if stream is open, attempt close
- if ( IsOpen() ) {
- try {
- m_stream.Close();
- } catch ( BamException& e ) {
- const string streamError = e.what();
- const string message = string("encountered error closing BAM file: \n\t") + streamError;
- SetErrorString("BamReader::Close", message);
- return false;
- }
- }
-
- // return success
- return true;
-}
-
-// creates an index file of requested type on current BAM file
-bool BamReaderPrivate::CreateIndex(const BamIndex::IndexType& type) {
-
- // skip if BAM file not open
- if ( !IsOpen() ) {
- SetErrorString("BamReader::CreateIndex", "cannot create index on unopened BAM file");
- return false;
- }
-
- // attempt to create index
- if ( m_randomAccessController.CreateIndex(this, type) )
- return true;
- else {
- const string bracError = m_randomAccessController.GetErrorString();
- const string message = string("could not create index: \n\t") + bracError;
- SetErrorString("BamReader::CreateIndex", message);
- return false;
- }
-}
-
-// return path & filename of current BAM file
-const string BamReaderPrivate::Filename(void) const {
- return m_filename;
-}
-
-string BamReaderPrivate::GetErrorString(void) const {
- return m_errorString;
-}
-
-// return header data as std::string
-string BamReaderPrivate::GetHeaderText(void) const {
- return m_header.ToString();
-}
-
-// return header data as SamHeader object
-SamHeader BamReaderPrivate::GetSamHeader(void) const {
- return m_header.ToSamHeader();
-}
-
-// get next alignment (with character data fully parsed)
-bool BamReaderPrivate::GetNextAlignment(BamAlignment& alignment) {
-
- // if valid alignment found
- if ( GetNextAlignmentCore(alignment) ) {
-
- // store alignment's "source" filename
- alignment.Filename = m_filename;
-
- // return success/failure of parsing char data
- if ( alignment.BuildCharData() )
- return true;
- else {
- const string alError = alignment.GetErrorString();
- const string message = string("could not populate alignment data: \n\t") + alError;
- SetErrorString("BamReader::GetNextAlignment", message);
- return false;
- }
- }
-
- // no valid alignment found
- return false;
-}
-
-// retrieves next available alignment core data (returns success/fail)
-// ** DOES NOT populate any character data fields (read name, bases, qualities, tag data, filename)
-// these can be accessed, if necessary, from the supportData
-// useful for operations requiring ONLY positional or other alignment-related information
-bool BamReaderPrivate::GetNextAlignmentCore(BamAlignment& alignment) {
-
- // skip if stream not opened
- if ( !m_stream.IsOpen() )
- return false;
-
- try {
-
- // skip if region is set but has no alignments
- if ( m_randomAccessController.HasRegion() &&
- !m_randomAccessController.RegionHasAlignments() )
- {
- return false;
- }
-
- // if can't read next alignment
- if ( !LoadNextAlignment(alignment) )
- return false;
-
- // check alignment's region-overlap state
- BamRandomAccessController::RegionState state = m_randomAccessController.AlignmentState(alignment);
-
- // if alignment starts after region, no need to keep reading
- if ( state == BamRandomAccessController::AfterRegion )
- return false;
-
- // read until overlap is found
- while ( state != BamRandomAccessController::OverlapsRegion ) {
-
- // if can't read next alignment
- if ( !LoadNextAlignment(alignment) )
- return false;
-
- // check alignment's region-overlap state
- state = m_randomAccessController.AlignmentState(alignment);
-
- // if alignment starts after region, no need to keep reading
- if ( state == BamRandomAccessController::AfterRegion )
- return false;
- }
-
- // if we get here, we found the next 'valid' alignment
- // (e.g. overlaps current region if one was set, simply the next alignment if not)
- alignment.SupportData.HasCoreOnly = true;
- return true;
-
- } catch ( BamException& e ) {
- const string streamError = e.what();
- const string message = string("encountered error reading BAM alignment: \n\t") + streamError;
- SetErrorString("BamReader::GetNextAlignmentCore", message);
- return false;
- }
-}
-
-int BamReaderPrivate::GetReferenceCount(void) const {
- return m_references.size();
-}
-
-const RefVector& BamReaderPrivate::GetReferenceData(void) const {
- return m_references;
-}
-
-// returns RefID for given RefName (returns References.size() if not found)
-int BamReaderPrivate::GetReferenceID(const string& refName) const {
-
- // retrieve names from reference data
- vector<string> refNames;
- RefVector::const_iterator refIter = m_references.begin();
- RefVector::const_iterator refEnd = m_references.end();
- for ( ; refIter != refEnd; ++refIter)
- refNames.push_back( (*refIter).RefName );
-
- // return 'index-of' refName (or -1 if not found)
- int index = distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName));
- if ( index == (int)m_references.size() ) return -1;
- else return index;
-}
-
-bool BamReaderPrivate::HasIndex(void) const {
- return m_randomAccessController.HasIndex();
-}
-
-bool BamReaderPrivate::IsOpen(void) const {
- return m_stream.IsOpen();
-}
-
-// load BAM header data
-void BamReaderPrivate::LoadHeaderData(void) {
- m_header.Load(&m_stream);
-}
-
-// populates BamAlignment with alignment data under file pointer, returns success/fail
-bool BamReaderPrivate::LoadNextAlignment(BamAlignment& alignment) {
-
- // read in the 'block length' value, make sure it's not zero
- char buffer[sizeof(uint32_t)];
- m_stream.Read(buffer, sizeof(uint32_t));
- alignment.SupportData.BlockLength = BamTools::UnpackUnsignedInt(buffer);
- if ( m_isBigEndian ) BamTools::SwapEndian_32(alignment.SupportData.BlockLength);
- if ( alignment.SupportData.BlockLength == 0 )
- return false;
-
- // read in core alignment data, make sure the right size of data was read
- char x[Constants::BAM_CORE_SIZE];
- if ( m_stream.Read(x, Constants::BAM_CORE_SIZE) != Constants::BAM_CORE_SIZE )
- return false;
-
- // swap core endian-ness if necessary
- if ( m_isBigEndian ) {
- for ( unsigned int i = 0; i < Constants::BAM_CORE_SIZE; i+=sizeof(uint32_t) )
- BamTools::SwapEndian_32p(&x[i]);
- }
-
- // set BamAlignment 'core' and 'support' data
- alignment.RefID = BamTools::UnpackSignedInt(&x[0]);
- alignment.Position = BamTools::UnpackSignedInt(&x[4]);
-
- unsigned int tempValue = BamTools::UnpackUnsignedInt(&x[8]);
- alignment.Bin = tempValue >> 16;
- alignment.MapQuality = tempValue >> 8 & 0xff;
- alignment.SupportData.QueryNameLength = tempValue & 0xff;
-
- tempValue = BamTools::UnpackUnsignedInt(&x[12]);
- alignment.AlignmentFlag = tempValue >> 16;
- alignment.SupportData.NumCigarOperations = tempValue & 0xffff;
-
- alignment.SupportData.QuerySequenceLength = BamTools::UnpackUnsignedInt(&x[16]);
- alignment.MateRefID = BamTools::UnpackSignedInt(&x[20]);
- alignment.MatePosition = BamTools::UnpackSignedInt(&x[24]);
- alignment.InsertSize = BamTools::UnpackSignedInt(&x[28]);
-
- // set BamAlignment length
- alignment.Length = alignment.SupportData.QuerySequenceLength;
-
- // read in character data - make sure proper data size was read
- bool readCharDataOK = false;
- const unsigned int dataLength = alignment.SupportData.BlockLength - Constants::BAM_CORE_SIZE;
- RaiiBuffer allCharData(dataLength);
-
- if ( m_stream.Read(allCharData.Buffer, dataLength) == dataLength ) {
-
- // store 'allCharData' in supportData structure
- alignment.SupportData.AllCharData.assign((const char*)allCharData.Buffer, dataLength);
-
- // set success flag
- readCharDataOK = true;
-
- // save CIGAR ops
- // need to calculate this here so that BamAlignment::GetEndPosition() performs correctly,
- // even when GetNextAlignmentCore() is called
- const unsigned int cigarDataOffset = alignment.SupportData.QueryNameLength;
- uint32_t* cigarData = (uint32_t*)(allCharData.Buffer + cigarDataOffset);
- CigarOp op;
- alignment.CigarData.clear();
- alignment.CigarData.reserve(alignment.SupportData.NumCigarOperations);
- for ( unsigned int i = 0; i < alignment.SupportData.NumCigarOperations; ++i ) {
-
- // swap endian-ness if necessary
- if ( m_isBigEndian ) BamTools::SwapEndian_32(cigarData[i]);
-
- // build CigarOp structure
- op.Length = (cigarData[i] >> Constants::BAM_CIGAR_SHIFT);
- op.Type = Constants::BAM_CIGAR_LOOKUP[ (cigarData[i] & Constants::BAM_CIGAR_MASK) ];
-
- // save CigarOp
- alignment.CigarData.push_back(op);
- }
- }
-
- // return success/failure
- return readCharDataOK;
-}
-
-// loads reference data from BAM file
-bool BamReaderPrivate::LoadReferenceData(void) {
-
- // get number of reference sequences
- char buffer[sizeof(uint32_t)];
- m_stream.Read(buffer, sizeof(uint32_t));
- uint32_t numberRefSeqs = BamTools::UnpackUnsignedInt(buffer);
- if ( m_isBigEndian ) BamTools::SwapEndian_32(numberRefSeqs);
- m_references.reserve((int)numberRefSeqs);
-
- // iterate over all references in header
- for ( unsigned int i = 0; i != numberRefSeqs; ++i ) {
-
- // get length of reference name
- m_stream.Read(buffer, sizeof(uint32_t));
- uint32_t refNameLength = BamTools::UnpackUnsignedInt(buffer);
- if ( m_isBigEndian ) BamTools::SwapEndian_32(refNameLength);
- RaiiBuffer refName(refNameLength);
-
- // get reference name and reference sequence length
- m_stream.Read(refName.Buffer, refNameLength);
- m_stream.Read(buffer, sizeof(int32_t));
- int32_t refLength = BamTools::UnpackSignedInt(buffer);
- if ( m_isBigEndian ) BamTools::SwapEndian_32(refLength);
-
- // store data for reference
- RefData aReference;
- aReference.RefName = (string)((const char*)refName.Buffer);
- aReference.RefLength = refLength;
- m_references.push_back(aReference);
- }
-
- // return success
- return true;
-}
-
-bool BamReaderPrivate::LocateIndex(const BamIndex::IndexType& preferredType) {
-
- if ( m_randomAccessController.LocateIndex(this, preferredType) )
- return true;
- else {
- const string bracError = m_randomAccessController.GetErrorString();
- const string message = string("could not locate index: \n\t") + bracError;
- SetErrorString("BamReader::LocateIndex", message);
- return false;
- }
-}
-
-// opens BAM file (and index)
-bool BamReaderPrivate::Open(const string& filename) {
-
- try {
-
- // make sure we're starting with fresh state
- Close();
-
- // open BgzfStream
- m_stream.Open(filename, IBamIODevice::ReadOnly);
- assert(m_stream);
-
- // load BAM metadata
- LoadHeaderData();
- LoadReferenceData();
-
- // store filename & offset of first alignment
- m_filename = filename;
- m_alignmentsBeginOffset = m_stream.Tell();
-
- // return success
- return true;
-
- } catch ( BamException& e ) {
- const string error = e.what();
- const string message = string("could not open file: ") + filename +
- "\n\t" + error;
- SetErrorString("BamReader::Open", message);
- return false;
- }
-}
-
-bool BamReaderPrivate::OpenIndex(const std::string& indexFilename) {
-
- if ( m_randomAccessController.OpenIndex(indexFilename, this) )
- return true;
- else {
- const string bracError = m_randomAccessController.GetErrorString();
- const string message = string("could not open index: \n\t") + bracError;
- SetErrorString("BamReader::OpenIndex", message);
- return false;
- }
-}
-
-// returns BAM file pointer to beginning of alignment data
-bool BamReaderPrivate::Rewind(void) {
-
- // reset region
- m_randomAccessController.ClearRegion();
-
- // return status of seeking back to first alignment
- if ( Seek(m_alignmentsBeginOffset) )
- return true;
- else {
- const string currentError = m_errorString;
- const string message = string("could not rewind: \n\t") + currentError;
- SetErrorString("BamReader::Rewind", message);
- return false;
- }
-}
-
-bool BamReaderPrivate::Seek(const int64_t& position) {
-
- // skip if BAM file not open
- if ( !IsOpen() ) {
- SetErrorString("BamReader::Seek", "cannot seek on unopened BAM file");
- return false;
- }
-
- try {
- m_stream.Seek(position);
- return true;
- }
- catch ( BamException& e ) {
- const string streamError = e.what();
- const string message = string("could not seek in BAM file: \n\t") + streamError;
- SetErrorString("BamReader::Seek", message);
- return false;
- }
-}
-
-void BamReaderPrivate::SetErrorString(const string& where, const string& what) {
- static const string SEPARATOR = ": ";
- m_errorString = where + SEPARATOR + what;
-}
-
-void BamReaderPrivate::SetIndex(BamIndex* index) {
- m_randomAccessController.SetIndex(index);
-}
-
-// sets current region & attempts to jump to it
-// returns success/failure
-bool BamReaderPrivate::SetRegion(const BamRegion& region) {
-
- if ( m_randomAccessController.SetRegion(region, m_references.size()) )
- return true;
- else {
- const string bracError = m_randomAccessController.GetErrorString();
- const string message = string("could not set region: \n\t") + bracError;
- SetErrorString("BamReader::SetRegion", message);
- return false;
- }
-}
-
-int64_t BamReaderPrivate::Tell(void) const {
- return m_stream.Tell();
-}
+++ /dev/null
-// ***************************************************************************
-// BamReader_p.h (c) 2010 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides the basic functionality for reading BAM files
-// ***************************************************************************
-
-#ifndef BAMREADER_P_H
-#define BAMREADER_P_H
-
-// -------------
-// W A R N I N G
-// -------------
-//
-// This file is not part of the BamTools API. It exists purely as an
-// implementation detail. This header file may change from version to version
-// without notice, or even be removed.
-//
-// We mean it.
-
-#include "api/BamAlignment.h"
-#include "api/BamIndex.h"
-#include "api/BamReader.h"
-#include "api/SamHeader.h"
-#include "api/internal/BamHeader_p.h"
-#include "api/internal/BamRandomAccessController_p.h"
-#include "api/internal/BgzfStream_p.h"
-#include <string>
-
-namespace BamTools {
-namespace Internal {
-
-class BamReaderPrivate {
-
- // ctor & dtor
- public:
- BamReaderPrivate(BamReader* parent);
- ~BamReaderPrivate(void);
-
- // BamReader interface
- public:
-
- // file operations
- bool Close(void);
- const std::string Filename(void) const;
- bool IsOpen(void) const;
- bool Open(const std::string& filename);
- bool Rewind(void);
- bool SetRegion(const BamRegion& region);
-
- // access alignment data
- bool GetNextAlignment(BamAlignment& alignment);
- bool GetNextAlignmentCore(BamAlignment& alignment);
-
- // access auxiliary data
- std::string GetHeaderText(void) const;
- SamHeader GetSamHeader(void) const;
- int GetReferenceCount(void) const;
- const RefVector& GetReferenceData(void) const;
- int GetReferenceID(const std::string& refName) const;
-
- // index operations
- bool CreateIndex(const BamIndex::IndexType& type);
- bool HasIndex(void) const;
- bool LocateIndex(const BamIndex::IndexType& preferredType);
- bool OpenIndex(const std::string& indexFilename);
- void SetIndex(BamIndex* index);
-
- // error handling
- std::string GetErrorString(void) const;
- void SetErrorString(const std::string& where, const std::string& what);
-
- // internal methods, but available as a BamReaderPrivate 'interface'
- //
- // these methods should only be used by BamTools::Internal classes
- // (currently only used by the BamIndex subclasses)
- public:
- // retrieves header text from BAM file
- void LoadHeaderData(void);
- // retrieves BAM alignment under file pointer
- // (does no overlap checking or character data parsing)
- bool LoadNextAlignment(BamAlignment& alignment);
- // builds reference data structure from BAM file
- bool LoadReferenceData(void);
- // seek reader to file position
- bool Seek(const int64_t& position);
- // return reader's file position
- int64_t Tell(void) const;
-
- // data members
- public:
-
- // general BAM file data
- int64_t m_alignmentsBeginOffset;
- std::string m_filename;
- RefVector m_references;
-
- // system data
- bool m_isBigEndian;
-
- // parent BamReader
- BamReader* m_parent;
-
- // BamReaderPrivate components
- BamHeader m_header;
- BamRandomAccessController m_randomAccessController;
- BgzfStream m_stream;
-
- // error handling
- std::string m_errorString;
-};
-
-} // namespace Internal
-} // namespace BamTools
-
-#endif // BAMREADER_P_H
+++ /dev/null
-// ***************************************************************************
-// BamStandardIndex.cpp (c) 2010 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides index operations for the standardized BAM index format (".bai")
-// ***************************************************************************
-
-#include "api/BamAlignment.h"
-#include "api/internal/BamException_p.h"
-#include "api/internal/BamReader_p.h"
-#include "api/internal/BamStandardIndex_p.h"
-using namespace BamTools;
-using namespace BamTools::Internal;
-
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <algorithm>
-#include <sstream>
-using namespace std;
-
-// -----------------------------------
-// static BamStandardIndex constants
-// -----------------------------------
-
-const int BamStandardIndex::MAX_BIN = 37450; // =(8^6-1)/7+1
-const int BamStandardIndex::BAM_LIDX_SHIFT = 14;
-const string BamStandardIndex::BAI_EXTENSION = ".bai";
-const char* const BamStandardIndex::BAI_MAGIC = "BAI\1";
-const int BamStandardIndex::SIZEOF_ALIGNMENTCHUNK = sizeof(uint64_t)*2;
-const int BamStandardIndex::SIZEOF_BINCORE = sizeof(uint32_t) + sizeof(int32_t);
-const int BamStandardIndex::SIZEOF_LINEAROFFSET = sizeof(uint64_t);
-
-// ----------------------------
-// RaiiWrapper implementation
-// ----------------------------
-
-BamStandardIndex::RaiiWrapper::RaiiWrapper(void)
- : IndexStream(0)
- , Buffer(0)
-{ }
-
-BamStandardIndex::RaiiWrapper::~RaiiWrapper(void) {
-
- if ( IndexStream ) {
- fclose(IndexStream);
- IndexStream = 0;
- }
-
- if ( Buffer ) {
- delete[] Buffer;
- Buffer = 0;
- }
-}
-
-// ---------------------------------
-// BamStandardIndex implementation
-// ---------------------------------
-
-// ctor
-BamStandardIndex::BamStandardIndex(Internal::BamReaderPrivate* reader)
- : BamIndex(reader)
- , m_bufferLength(0)
-{
- m_isBigEndian = BamTools::SystemIsBigEndian();
-}
-
-// dtor
-BamStandardIndex::~BamStandardIndex(void) {
- CloseFile();
-}
-
-void BamStandardIndex::AdjustRegion(const BamRegion& region, uint32_t& begin, uint32_t& end) {
-
- // retrieve references from reader
- const RefVector& references = m_reader->GetReferenceData();
-
- // LeftPosition cannot be greater than or equal to reference length
- if ( region.LeftPosition >= references.at(region.LeftRefID).RefLength )
- throw BamException("BamStandardIndex::AdjustRegion", "invalid region requested");
-
- // set region 'begin'
- begin = (unsigned int)region.LeftPosition;
-
- // if right bound specified AND left&right bounds are on same reference
- // OK to use right bound position as region 'end'
- if ( region.isRightBoundSpecified() && ( region.LeftRefID == region.RightRefID ) )
- end = (unsigned int)region.RightPosition;
-
- // otherwise, set region 'end' to last reference base
- else end = (unsigned int)references.at(region.LeftRefID).RefLength;
-}
-
-// [begin, end)
-void BamStandardIndex::CalculateCandidateBins(const uint32_t& begin,
- const uint32_t& end,
- set<uint16_t>& candidateBins)
-{
- // initialize list, bin '0' is always a valid bin
- candidateBins.insert(0);
-
- // get rest of bins that contain this region
- unsigned int k;
- for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { candidateBins.insert(k); }
- for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { candidateBins.insert(k); }
- for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { candidateBins.insert(k); }
- for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { candidateBins.insert(k); }
- for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { candidateBins.insert(k); }
-}
-
-void BamStandardIndex::CalculateCandidateOffsets(const BaiReferenceSummary& refSummary,
- const uint64_t& minOffset,
- set<uint16_t>& candidateBins,
- vector<int64_t>& offsets)
-{
- // seek to first bin
- Seek(refSummary.FirstBinFilePosition, SEEK_SET);
-
- // iterate over reference bins
- uint32_t binId;
- int32_t numAlignmentChunks;
- set<uint16_t>::iterator candidateBinIter;
- for ( int i = 0; i < refSummary.NumBins; ++i ) {
-
- // read bin contents (if successful, alignment chunks are now in m_buffer)
- ReadBinIntoBuffer(binId, numAlignmentChunks);
-
- // see if bin is a 'candidate bin'
- candidateBinIter = candidateBins.find(binId);
-
- // if not, move on to next bin
- if ( candidateBinIter == candidateBins.end() )
- continue;
-
- // otherwise, check bin's contents against for overlap
- else {
-
- size_t offset = 0;
- uint64_t chunkStart;
- uint64_t chunkStop;
-
- // iterate over alignment chunks
- for ( int j = 0; j < numAlignmentChunks; ++j ) {
-
- // read chunk start & stop from buffer
- memcpy((char*)&chunkStart, Resources.Buffer+offset, sizeof(uint64_t));
- offset += sizeof(uint64_t);
- memcpy((char*)&chunkStop, Resources.Buffer+offset, sizeof(uint64_t));
- offset += sizeof(uint64_t);
-
- // swap endian-ness if necessary
- if ( m_isBigEndian ) {
- SwapEndian_64(chunkStart);
- SwapEndian_64(chunkStop);
- }
-
- // store alignment chunk's start offset
- // if its stop offset is larger than our 'minOffset'
- if ( chunkStop >= minOffset )
- offsets.push_back(chunkStart);
- }
-
- // 'pop' bin ID from candidate bins set
- candidateBins.erase(candidateBinIter);
-
- // quit if no more candidates
- if ( candidateBins.empty() )
- break;
- }
- }
-}
-
-uint64_t BamStandardIndex::CalculateMinOffset(const BaiReferenceSummary& refSummary,
- const uint32_t& begin)
-{
- // if no linear offsets exist, return 0
- if ( refSummary.NumLinearOffsets == 0 )
- return 0;
-
- // if 'begin' starts beyond last linear offset, use the last linear offset as minimum
- // else use the offset corresponding to the requested start position
- const int shiftedBegin = begin>>BamStandardIndex::BAM_LIDX_SHIFT;
- if ( shiftedBegin >= refSummary.NumLinearOffsets )
- return LookupLinearOffset( refSummary, refSummary.NumLinearOffsets-1 );
- else
- return LookupLinearOffset( refSummary, shiftedBegin );
-}
-
-void BamStandardIndex::CheckBufferSize(char*& buffer,
- unsigned int& bufferLength,
- const unsigned int& requestedBytes)
-{
- try {
- if ( requestedBytes > bufferLength ) {
- bufferLength = requestedBytes + 10;
- delete[] buffer;
- buffer = new char[bufferLength];
- }
- } catch ( std::bad_alloc& ) {
- stringstream s("");
- s << "out of memory when allocating " << requestedBytes << " bytes";
- throw BamException("BamStandardIndex::CheckBufferSize", s.str());
- }
-}
-
-void BamStandardIndex::CheckBufferSize(unsigned char*& buffer,
- unsigned int& bufferLength,
- const unsigned int& requestedBytes)
-{
- try {
- if ( requestedBytes > bufferLength ) {
- bufferLength = requestedBytes + 10;
- delete[] buffer;
- buffer = new unsigned char[bufferLength];
- }
- } catch ( std::bad_alloc& ) {
- stringstream s("");
- s << "out of memory when allocating " << requestedBytes << " bytes";
- throw BamException("BamStandardIndex::CheckBufferSize", s.str());
- }
-}
-
-void BamStandardIndex::CheckMagicNumber(void) {
-
- // check 'magic number' to see if file is BAI index
- char magic[4];
- const size_t elementsRead = fread(magic, sizeof(char), 4, Resources.IndexStream);
- if ( elementsRead != 4 )
- throw BamException("BamStandardIndex::CheckMagicNumber", "could not read BAI magic number");
-
- // compare to expected value
- if ( strncmp(magic, BamStandardIndex::BAI_MAGIC, 4) != 0 )
- throw BamException("BamStandardIndex::CheckMagicNumber", "invalid BAI magic number");
-}
-
-void BamStandardIndex::ClearReferenceEntry(BaiReferenceEntry& refEntry) {
- refEntry.ID = -1;
- refEntry.Bins.clear();
- refEntry.LinearOffsets.clear();
-}
-
-void BamStandardIndex::CloseFile(void) {
-
- // close file stream
- if ( IsFileOpen() ) {
- fclose(Resources.IndexStream);
- Resources.IndexStream = 0;
- }
-
- // clear index file summary data
- m_indexFileSummary.clear();
-
- // clean up I/O buffer
- delete[] Resources.Buffer;
- Resources.Buffer = 0;
- m_bufferLength = 0;
-}
-
-// builds index from associated BAM file & writes out to index file
-bool BamStandardIndex::Create(void) {
-
- // skip if BamReader is invalid or not open
- if ( m_reader == 0 || !m_reader->IsOpen() ) {
- SetErrorString("BamStandardIndex::Create", "could not create index: reader is not open");
- return false;
- }
-
- // rewind BamReader
- if ( !m_reader->Rewind() ) {
- const string readerError = m_reader->GetErrorString();
- const string message = "could not create index: \n\t" + readerError;
- SetErrorString("BamStandardIndex::Create", message);
- return false;
- }
-
- try {
-
- // open new index file (read & write)
- string indexFilename = m_reader->Filename() + Extension();
- OpenFile(indexFilename, "w+b");
-
- // initialize BaiFileSummary with number of references
- const int& numReferences = m_reader->GetReferenceCount();
- ReserveForSummary(numReferences);
-
- // initialize output file
- WriteHeader();
-
- // set up bin, ID, offset, & coordinate markers
- const uint32_t defaultValue = 0xffffffffu;
- uint32_t currentBin = defaultValue;
- uint32_t lastBin = defaultValue;
- int32_t currentRefID = defaultValue;
- int32_t lastRefID = defaultValue;
- uint64_t currentOffset = (uint64_t)m_reader->Tell();
- uint64_t lastOffset = currentOffset;
- int32_t lastPosition = defaultValue;
-
- // iterate through alignments in BAM file
- BamAlignment al;
- BaiReferenceEntry refEntry;
- while ( m_reader->LoadNextAlignment(al) ) {
-
- // changed to new reference
- if ( lastRefID != al.RefID ) {
-
- // if not first reference, save previous reference data
- if ( lastRefID != (int32_t)defaultValue ) {
-
- SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset);
- WriteReferenceEntry(refEntry);
- ClearReferenceEntry(refEntry);
-
- // write any empty references between (but *NOT* including) lastRefID & al.RefID
- for ( int i = lastRefID+1; i < al.RefID; ++i ) {
- BaiReferenceEntry emptyEntry(i);
- WriteReferenceEntry(emptyEntry);
- }
-
- // update bin markers
- currentOffset = lastOffset;
- currentBin = al.Bin;
- lastBin = al.Bin;
- currentRefID = al.RefID;
- }
-
- // otherwise, this is first pass
- // be sure to write any empty references up to (but *NOT* including) current RefID
- else {
- for ( int i = 0; i < al.RefID; ++i ) {
- BaiReferenceEntry emptyEntry(i);
- WriteReferenceEntry(emptyEntry);
- }
- }
-
- // update reference markers
- refEntry.ID = al.RefID;
- lastRefID = al.RefID;
- lastBin = defaultValue;
- }
-
- // if lastPosition greater than current alignment position - file not sorted properly
- else if ( lastPosition > al.Position ) {
- stringstream s("");
- s << "BAM file is not properly sorted by coordinate" << endl
- << "Current alignment position: " << al.Position
- << " < previous alignment position: " << lastPosition
- << " on reference ID: " << al.RefID << endl;
- SetErrorString("BamStandardIndex::Create", s.str());
- return false;
- }
-
- // if alignment's ref ID is valid & its bin is not a 'leaf'
- if ( (al.RefID >= 0) && (al.Bin < 4681) )
- SaveLinearOffsetEntry(refEntry.LinearOffsets, al.Position, al.GetEndPosition(), lastOffset);
-
- // changed to new BAI bin
- if ( al.Bin != lastBin ) {
-
- // if not first bin on reference, save previous bin data
- if ( currentBin != defaultValue )
- SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset);
-
- // update markers
- currentOffset = lastOffset;
- currentBin = al.Bin;
- lastBin = al.Bin;
- currentRefID = al.RefID;
-
- // if invalid RefID, break out
- if ( currentRefID < 0 )
- break;
- }
-
- // make sure that current file pointer is beyond lastOffset
- if ( m_reader->Tell() <= (int64_t)lastOffset ) {
- SetErrorString("BamStandardIndex::Create", "calculating offsets failed");
- return false;
- }
-
- // update lastOffset & lastPosition
- lastOffset = m_reader->Tell();
- lastPosition = al.Position;
- }
-
- // after finishing alignments, if any data was read, check:
- if ( currentRefID >= 0 ) {
-
- // store last alignment chunk to its bin, then write last reference entry with data
- SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset);
- WriteReferenceEntry(refEntry);
-
- // then write any empty references remaining at end of file
- for ( int i = currentRefID+1; i < numReferences; ++i ) {
- BaiReferenceEntry emptyEntry(i);
- WriteReferenceEntry(emptyEntry);
- }
- }
-
- } catch ( BamException& e) {
- m_errorString = e.what();
- return false;
- }
-
- // rewind BamReader
- if ( !m_reader->Rewind() ) {
- const string readerError = m_reader->GetErrorString();
- const string message = "could not create index: \n\t" + readerError;
- SetErrorString("BamStandardIndex::Create", message);
- return false;
- }
-
- // return success
- return true;
-}
-
-// returns format's file extension
-const string BamStandardIndex::Extension(void) {
- return BamStandardIndex::BAI_EXTENSION;
-}
-
-void BamStandardIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) {
-
- // cannot calculate offsets if unknown/invalid reference ID requested
- if ( region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size() )
- throw BamException("BamStandardIndex::GetOffset", "invalid reference ID requested");
-
- // retrieve index summary for left bound reference
- const BaiReferenceSummary& refSummary = m_indexFileSummary.at(region.LeftRefID);
-
- // set up region boundaries based on actual BamReader data
- uint32_t begin;
- uint32_t end;
- AdjustRegion(region, begin, end);
-
- // retrieve all candidate bin IDs for region
- set<uint16_t> candidateBins;
- CalculateCandidateBins(begin, end, candidateBins);
-
- // use reference's linear offsets to calculate the minimum offset
- // that must be considered to find overlap
- const uint64_t& minOffset = CalculateMinOffset(refSummary, begin);
-
- // attempt to use reference summary, minOffset, & candidateBins to calculate offsets
- // no data should not be error, just bail
- vector<int64_t> offsets;
- CalculateCandidateOffsets(refSummary, minOffset, candidateBins, offsets);
- if ( offsets.empty() )
- return;
-
- // ensure that offsets are sorted before processing
- sort( offsets.begin(), offsets.end() );
-
- // binary search for an overlapping block (may not be first one though)
- BamAlignment al;
- typedef vector<int64_t>::const_iterator OffsetConstIterator;
- OffsetConstIterator offsetFirst = offsets.begin();
- OffsetConstIterator offsetIter = offsetFirst;
- OffsetConstIterator offsetLast = offsets.end();
- iterator_traits<OffsetConstIterator>::difference_type count = distance(offsetFirst, offsetLast);
- iterator_traits<OffsetConstIterator>::difference_type step;
- while ( count > 0 ) {
- offsetIter = offsetFirst;
- step = count/2;
- advance(offsetIter, step);
-
- // attempt seek to candidate offset
- const int64_t& candidateOffset = (*offsetIter);
- if ( !m_reader->Seek(candidateOffset) ) {
- const string readerError = m_reader->GetErrorString();
- const string message = "could not seek in BAM file: \n\t" + readerError;
- throw BamException("BamToolsIndex::GetOffset", message);
- }
-
- // load first available alignment, setting flag to true if data exists
- *hasAlignmentsInRegion = m_reader->LoadNextAlignment(al);
-
- // check alignment against region
- if ( al.GetEndPosition() <= region.LeftPosition ) {
- offsetFirst = ++offsetIter;
- count -= step+1;
- } else count = step;
- }
-
- // step back to the offset before the 'current offset' (to make sure we cover overlaps)
- if ( offsetIter != offsets.begin() )
- --offsetIter;
- offset = (*offsetIter);
-}
-
-// returns whether reference has alignments or no
-bool BamStandardIndex::HasAlignments(const int& referenceID) const {
- if ( referenceID < 0 || referenceID >= (int)m_indexFileSummary.size() )
- return false;
- const BaiReferenceSummary& refSummary = m_indexFileSummary.at(referenceID);
- return ( refSummary.NumBins > 0 );
-}
-
-bool BamStandardIndex::IsFileOpen(void) const {
- return ( Resources.IndexStream != 0 );
-}
-
-// attempts to use index data to jump to @region, returns success/fail
-// a "successful" jump indicates no error, but not whether this region has data
-// * thus, the method sets a flag to indicate whether there are alignments
-// available after the jump position
-bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) {
-
- // clear out flag
- *hasAlignmentsInRegion = false;
-
- // skip if invalid reader or not open
- if ( m_reader == 0 || !m_reader->IsOpen() ) {
- SetErrorString("BamStandardIndex::Jump", "could not jump: reader is not open");
- return false;
- }
-
- // calculate nearest offset to jump to
- int64_t offset;
- try {
- GetOffset(region, offset, hasAlignmentsInRegion);
- } catch ( BamException& e ) {
- m_errorString = e.what();
- return false;
- }
-
- // if region has alignments, return success/fail of seeking there
- if ( *hasAlignmentsInRegion )
- return m_reader->Seek(offset);
-
- // otherwise, simply return true (but hasAlignmentsInRegion flag has been set to false)
- // (this is OK, BamReader will check this flag before trying to load data)
- return true;
-}
-
-// loads existing data from file into memory
-bool BamStandardIndex::Load(const std::string& filename) {
-
- try {
-
- // attempt to open file (read-only)
- OpenFile(filename, "rb");
-
- // validate format
- CheckMagicNumber();
-
- // load in-memory summary of index data
- SummarizeIndexFile();
-
- // return success
- return true;
-
- } catch ( BamException& e ) {
- m_errorString = e.what();
- return false;
- }
-}
-
-uint64_t BamStandardIndex::LookupLinearOffset(const BaiReferenceSummary& refSummary, const int& index) {
-
- // attempt seek to proper index file position
- const int64_t linearOffsetFilePosition = (int64_t)refSummary.FirstLinearOffsetFilePosition +
- index*BamStandardIndex::SIZEOF_LINEAROFFSET;
- Seek(linearOffsetFilePosition, SEEK_SET);
-
- // read linear offset from BAI file
- uint64_t linearOffset;
- ReadLinearOffset(linearOffset);
- return linearOffset;
-}
-
-void BamStandardIndex::MergeAlignmentChunks(BaiAlignmentChunkVector& chunks) {
-
- // skip if chunks are empty, nothing to merge
- if ( chunks.empty() )
- return;
-
- // set up merged alignment chunk container
- BaiAlignmentChunkVector mergedChunks;
- mergedChunks.push_back( chunks[0] );
-
- // iterate over chunks
- int i = 0;
- BaiAlignmentChunkVector::iterator chunkIter = chunks.begin();
- BaiAlignmentChunkVector::iterator chunkEnd = chunks.end();
- for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) {
-
- // get 'currentMergeChunk' based on numeric index
- BaiAlignmentChunk& currentMergeChunk = mergedChunks[i];
-
- // get sourceChunk based on source vector iterator
- BaiAlignmentChunk& sourceChunk = (*chunkIter);
-
- // if currentMergeChunk ends where sourceChunk starts, then merge the two
- if ( currentMergeChunk.Stop>>16 == sourceChunk.Start>>16 )
- currentMergeChunk.Stop = sourceChunk.Stop;
-
- // otherwise
- else {
- // append sourceChunk after currentMergeChunk
- mergedChunks.push_back(sourceChunk);
-
- // update i, so the next iteration will consider the
- // recently-appended sourceChunk as new mergeChunk candidate
- ++i;
- }
- }
-
- // saved newly-merged chunks into (parameter) chunks
- chunks = mergedChunks;
-}
-
-void BamStandardIndex::OpenFile(const std::string& filename, const char* mode) {
-
- // make sure any previous index file is closed
- CloseFile();
-
- // attempt to open file
- Resources.IndexStream = fopen(filename.c_str(), mode);
- if ( !IsFileOpen() ) {
- const string message = string("could not open file: ") + filename;
- throw BamException("BamStandardIndex::OpenFile", message);
- }
-}
-
-void BamStandardIndex::ReadBinID(uint32_t& binId) {
- const size_t elementsRead = fread(&binId, sizeof(binId), 1, Resources.IndexStream);
- if ( m_isBigEndian ) SwapEndian_32(binId);
- if ( elementsRead != 1 )
- throw BamException("BamStandardIndex::ReadBinID", "could not read BAI bin ID");
-}
-
-void BamStandardIndex::ReadBinIntoBuffer(uint32_t& binId, int32_t& numAlignmentChunks) {
-
- // read bin header
- ReadBinID(binId);
- ReadNumAlignmentChunks(numAlignmentChunks);
-
- // read bin contents
- const unsigned int bytesRequested = numAlignmentChunks*BamStandardIndex::SIZEOF_ALIGNMENTCHUNK;
- ReadIntoBuffer(bytesRequested);
-}
-
-void BamStandardIndex::ReadIntoBuffer(const unsigned int& bytesRequested) {
-
- // ensure that our buffer is big enough for request
- BamStandardIndex::CheckBufferSize(Resources.Buffer, m_bufferLength, bytesRequested);
-
- // read from BAI file stream
- const size_t bytesRead = fread( Resources.Buffer, sizeof(char), bytesRequested, Resources.IndexStream );
- if ( bytesRead != (size_t)bytesRequested ) {
- stringstream s("");
- s << "expected to read: " << bytesRequested << " bytes, "
- << "but instead read: " << bytesRead;
- throw BamException("BamStandardIndex::ReadIntoBuffer", s.str());
- }
-}
-
-void BamStandardIndex::ReadLinearOffset(uint64_t& linearOffset) {
- const size_t elementsRead = fread(&linearOffset, sizeof(linearOffset), 1, Resources.IndexStream);
- if ( m_isBigEndian ) SwapEndian_64(linearOffset);
- if ( elementsRead != 1 )
- throw BamException("BamStandardIndex::ReadLinearOffset", "could not read BAI linear offset");
-}
-
-void BamStandardIndex::ReadNumAlignmentChunks(int& numAlignmentChunks) {
- const size_t elementsRead = fread(&numAlignmentChunks, sizeof(numAlignmentChunks), 1, Resources.IndexStream);
- if ( m_isBigEndian ) SwapEndian_32(numAlignmentChunks);
- if ( elementsRead != 1 )
- throw BamException("BamStandardIndex::ReadNumAlignmentChunks", "could not read BAI chunk count");
-}
-
-void BamStandardIndex::ReadNumBins(int& numBins) {
- const size_t elementsRead = fread(&numBins, sizeof(numBins), 1, Resources.IndexStream);
- if ( m_isBigEndian ) SwapEndian_32(numBins);
- if ( elementsRead != 1 )
- throw BamException("BamStandardIndex::ReadNumBins", "could not read BAI bin count");
-}
-
-void BamStandardIndex::ReadNumLinearOffsets(int& numLinearOffsets) {
- const size_t elementsRead = fread(&numLinearOffsets, sizeof(numLinearOffsets), 1, Resources.IndexStream);
- if ( m_isBigEndian ) SwapEndian_32(numLinearOffsets);
- if ( elementsRead != 1 )
- throw BamException("BamStandardIndex::ReadNumAlignmentChunks", "could not read BAI linear offset count");
-}
-
-void BamStandardIndex::ReadNumReferences(int& numReferences) {
- const size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, Resources.IndexStream);
- if ( m_isBigEndian ) SwapEndian_32(numReferences);
- if ( elementsRead != 1 )
- throw BamException("BamStandardIndex::ReadNumReferences", "could not read reference count");
-}
-
-void BamStandardIndex::ReserveForSummary(const int& numReferences) {
- m_indexFileSummary.clear();
- m_indexFileSummary.assign( numReferences, BaiReferenceSummary() );
-}
-
-void BamStandardIndex::SaveAlignmentChunkToBin(BaiBinMap& binMap,
- const uint32_t& currentBin,
- const uint64_t& currentOffset,
- const uint64_t& lastOffset)
-{
- // create new alignment chunk
- BaiAlignmentChunk newChunk(currentOffset, lastOffset);
-
- // if no entry exists yet for this bin, create one and store alignment chunk
- BaiBinMap::iterator binIter = binMap.find(currentBin);
- if ( binIter == binMap.end() ) {
- BaiAlignmentChunkVector newChunks;
- newChunks.push_back(newChunk);
- binMap.insert( pair<uint32_t, BaiAlignmentChunkVector>(currentBin, newChunks));
- }
-
- // otherwise, just append alignment chunk
- else {
- BaiAlignmentChunkVector& binChunks = (*binIter).second;
- binChunks.push_back( newChunk );
- }
-}
-
-void BamStandardIndex::SaveBinsSummary(const int& refId, const int& numBins) {
- BaiReferenceSummary& refSummary = m_indexFileSummary.at(refId);
- refSummary.NumBins = numBins;
- refSummary.FirstBinFilePosition = Tell();
-}
-
-void BamStandardIndex::SaveLinearOffsetEntry(BaiLinearOffsetVector& offsets,
- const int& alignmentStartPosition,
- const int& alignmentStopPosition,
- const uint64_t& lastOffset)
-{
- // get converted offsets
- const int beginOffset = alignmentStartPosition >> BamStandardIndex::BAM_LIDX_SHIFT;
- const int endOffset = (alignmentStopPosition - 1) >> BamStandardIndex::BAM_LIDX_SHIFT;
-
- // resize vector if necessary
- int oldSize = offsets.size();
- int newSize = endOffset + 1;
- if ( oldSize < newSize )
- offsets.resize(newSize, 0);
-
- // store offset
- for( int i = beginOffset + 1; i <= endOffset; ++i ) {
- if ( offsets[i] == 0 )
- offsets[i] = lastOffset;
- }
-}
-
-void BamStandardIndex::SaveLinearOffsetsSummary(const int& refId, const int& numLinearOffsets) {
- BaiReferenceSummary& refSummary = m_indexFileSummary.at(refId);
- refSummary.NumLinearOffsets = numLinearOffsets;
- refSummary.FirstLinearOffsetFilePosition = Tell();
-}
-
-// seek to position in index file stream
-void BamStandardIndex::Seek(const int64_t& position, const int& origin) {
- if ( fseek64(Resources.IndexStream, position, origin) != 0 )
- throw BamException("BamStandardIndex::Seek", "could not seek in BAI file");
-}
-
-void BamStandardIndex::SkipBins(const int& numBins) {
- uint32_t binId;
- int32_t numAlignmentChunks;
- for (int i = 0; i < numBins; ++i)
- ReadBinIntoBuffer(binId, numAlignmentChunks); // results & buffer ignored
-}
-
-void BamStandardIndex::SkipLinearOffsets(const int& numLinearOffsets) {
- const unsigned int bytesRequested = numLinearOffsets*BamStandardIndex::SIZEOF_LINEAROFFSET;
- ReadIntoBuffer(bytesRequested);
-}
-
-void BamStandardIndex::SortLinearOffsets(BaiLinearOffsetVector& linearOffsets) {
- sort( linearOffsets.begin(), linearOffsets.end() );
-}
-
-void BamStandardIndex::SummarizeBins(BaiReferenceSummary& refSummary) {
-
- // load number of bins
- int numBins;
- ReadNumBins(numBins);
-
- // store bins summary for this reference
- refSummary.NumBins = numBins;
- refSummary.FirstBinFilePosition = Tell();
-
- // skip this reference's bins
- SkipBins(numBins);
-}
-
-void BamStandardIndex::SummarizeIndexFile(void) {
-
- // load number of reference sequences
- int numReferences;
- ReadNumReferences(numReferences);
-
- // initialize file summary data
- ReserveForSummary(numReferences);
-
- // iterate over reference entries
- BaiFileSummary::iterator summaryIter = m_indexFileSummary.begin();
- BaiFileSummary::iterator summaryEnd = m_indexFileSummary.end();
- for ( int i = 0; summaryIter != summaryEnd; ++summaryIter, ++i )
- SummarizeReference(*summaryIter);
-}
-
-void BamStandardIndex::SummarizeLinearOffsets(BaiReferenceSummary& refSummary) {
-
- // load number of linear offsets
- int numLinearOffsets;
- ReadNumLinearOffsets(numLinearOffsets);
-
- // store bin summary data for this reference
- refSummary.NumLinearOffsets = numLinearOffsets;
- refSummary.FirstLinearOffsetFilePosition = Tell();
-
- // skip linear offsets in index file
- SkipLinearOffsets(numLinearOffsets);
-}
-
-void BamStandardIndex::SummarizeReference(BaiReferenceSummary& refSummary) {
- SummarizeBins(refSummary);
- SummarizeLinearOffsets(refSummary);
-}
-
-// return position of file pointer in index file stream
-int64_t BamStandardIndex::Tell(void) const {
- return ftell64(Resources.IndexStream);
-}
-
-void BamStandardIndex::WriteAlignmentChunk(const BaiAlignmentChunk& chunk) {
-
- // localize alignment chunk offsets
- uint64_t start = chunk.Start;
- uint64_t stop = chunk.Stop;
-
- // swap endian-ness if necessary
- if ( m_isBigEndian ) {
- SwapEndian_64(start);
- SwapEndian_64(stop);
- }
-
- // write to index file
- size_t elementsWritten = 0;
- elementsWritten += fwrite(&start, sizeof(start), 1, Resources.IndexStream);
- elementsWritten += fwrite(&stop, sizeof(stop), 1, Resources.IndexStream);
- if ( elementsWritten != 2 )
- throw BamException("BamStandardIndex::WriteAlignmentChunk", "could not write BAI alignment chunk");
-}
-
-void BamStandardIndex::WriteAlignmentChunks(BaiAlignmentChunkVector& chunks) {
-
- // make sure chunks are merged (simplified) before writing & saving summary
- MergeAlignmentChunks(chunks);
-
- // write chunks
- int32_t chunkCount = chunks.size();
- if ( m_isBigEndian ) SwapEndian_32(chunkCount);
- const size_t elementsWritten = fwrite(&chunkCount, sizeof(chunkCount), 1, Resources.IndexStream);
- if ( elementsWritten != 1 )
- throw BamException("BamStandardIndex::WriteAlignmentChunks", "could not write BAI chunk count");
-
- // iterate over chunks
- BaiAlignmentChunkVector::const_iterator chunkIter = chunks.begin();
- BaiAlignmentChunkVector::const_iterator chunkEnd = chunks.end();
- for ( ; chunkIter != chunkEnd; ++chunkIter )
- WriteAlignmentChunk( (*chunkIter) );
-}
-
-void BamStandardIndex::WriteBin(const uint32_t& binId, BaiAlignmentChunkVector& chunks) { // writes one bin: its 32-bit ID followed by its alignment chunks
-
- // write BAM bin ID (a local copy is swapped so @binId itself is left untouched)
- uint32_t binKey = binId;
- if ( m_isBigEndian ) SwapEndian_32(binKey);
- const size_t elementsWritten = fwrite(&binKey, sizeof(binKey), 1, Resources.IndexStream);
- if ( elementsWritten != 1 )
- throw BamException("BamStandardIndex::WriteBin", "could not write bin ID");
-
- // write bin's alignment chunks (chunk count first, then the chunks)
- WriteAlignmentChunks(chunks);
-}
-
-void BamStandardIndex::WriteBins(const int& refId, BaiBinMap& bins) { // writes the bin count and every bin of reference @refId; throws BamException on a failed write
-
- // write number of bins
- int32_t binCount = bins.size();
- if ( m_isBigEndian ) SwapEndian_32(binCount);
- const size_t elementsWritten = fwrite(&binCount, sizeof(binCount), 1, Resources.IndexStream);
- if ( elementsWritten != 1 )
- throw BamException("BamStandardIndex::WriteBins", "could not write bin count");
-
- // save summary for reference's bins
- SaveBinsSummary(refId, bins.size()); // uses bins.size() rather than binCount, which may have been byte-swapped above
-
- // iterate over bins (key = bin ID, value = that bin's chunks)
- BaiBinMap::iterator binIter = bins.begin();
- BaiBinMap::iterator binEnd = bins.end();
- for ( ; binIter != binEnd; ++binIter )
- WriteBin( (*binIter).first, (*binIter).second );
-}
-
-void BamStandardIndex::WriteHeader(void) { // writes the BAI header: 4 magic bytes followed by the 32-bit reference count
-
- size_t elementsWritten = 0;
-
- // write magic number (4 single-byte elements)
- elementsWritten += fwrite(BamStandardIndex::BAI_MAGIC, sizeof(char), 4, Resources.IndexStream);
-
- // write number of reference sequences (taken from the size of the file summary)
- int32_t numReferences = m_indexFileSummary.size();
- if ( m_isBigEndian ) SwapEndian_32(numReferences);
- elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, Resources.IndexStream);
-
- if ( elementsWritten != 5 ) // 4 magic bytes + 1 reference count
- throw BamException("BamStandardIndex::WriteHeader", "could not write BAI header");
-}
-
-void BamStandardIndex::WriteLinearOffsets(const int& refId, BaiLinearOffsetVector& linearOffsets) { // writes the offset count and every 64-bit linear offset of reference @refId
-
- // make sure linear offsets are sorted before writing & saving summary
- SortLinearOffsets(linearOffsets); // takes the vector by non-const reference, so the caller's vector may be modified
-
- size_t elementsWritten = 0;
-
- // write number of linear offsets
- int32_t offsetCount = linearOffsets.size();
- if ( m_isBigEndian ) SwapEndian_32(offsetCount);
- elementsWritten += fwrite(&offsetCount, sizeof(offsetCount), 1, Resources.IndexStream);
-
- // save summary for reference's linear offsets
- SaveLinearOffsetsSummary(refId, linearOffsets.size()); // uses size() rather than offsetCount, which may have been byte-swapped above
-
- // iterate over linear offsets
- BaiLinearOffsetVector::const_iterator offsetIter = linearOffsets.begin();
- BaiLinearOffsetVector::const_iterator offsetEnd = linearOffsets.end();
- for ( ; offsetIter != offsetEnd; ++offsetIter ) {
-
- // write linear offset (a local copy is swapped, the stored value is left untouched)
- uint64_t linearOffset = (*offsetIter);
- if ( m_isBigEndian ) SwapEndian_64(linearOffset);
- elementsWritten += fwrite(&linearOffset, sizeof(linearOffset), 1, Resources.IndexStream);
- }
-
- if ( elementsWritten != (linearOffsets.size() + 1) ) // 1 count + 1 element per offset; all writes are checked together here
- throw BamException("BamStandardIndex::WriteLinearOffsets", "could not write BAI linear offsets");
-}
-
-void BamStandardIndex::WriteReferenceEntry(BaiReferenceEntry& refEntry) { // writes one reference's full index data
- WriteBins(refEntry.ID, refEntry.Bins); // bins section first
- WriteLinearOffsets(refEntry.ID, refEntry.LinearOffsets); // linear offsets section second
-}
+++ /dev/null
-// ***************************************************************************
-// BamStandardIndex.h (c) 2010 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides index operations for the standardized BAM index format (".bai")
-// ***************************************************************************
-
-#ifndef BAM_STANDARD_INDEX_FORMAT_H
-#define BAM_STANDARD_INDEX_FORMAT_H
-
-// -------------
-// W A R N I N G
-// -------------
-//
-// This file is not part of the BamTools API. It exists purely as an
-// implementation detail. This header file may change from version to
-// version without notice, or even be removed.
-//
-// We mean it.
-
-#include "api/BamAux.h"
-#include "api/BamIndex.h"
-#include <map>
-#include <set>
-#include <string>
-#include <vector>
-
-namespace BamTools {
-namespace Internal {
-
-// -----------------------------------------------------------------------------
-// BamStandardIndex data structures
-
-// defines start and end (as 64-bit offsets) of a contiguous run of alignments
-struct BaiAlignmentChunk {
-
- // data members
- uint64_t Start;
- uint64_t Stop;
-
- // constructor - both offsets default to 0, so this also serves as the default constructor
- BaiAlignmentChunk(const uint64_t& start = 0,
- const uint64_t& stop = 0)
- : Start(start)
- , Stop(stop)
- { }
-};
-
-// comparison operator (for sorting) - orders chunks by Start only; Stop is not compared
-inline
-bool operator<(const BaiAlignmentChunk& lhs, const BaiAlignmentChunk& rhs) {
- return lhs.Start < rhs.Start;
-}
-
-// convenience typedef for a list of all alignment 'chunks' in a BAI bin
-typedef std::vector<BaiAlignmentChunk> BaiAlignmentChunkVector;
-
-// convenience typedef for a map of all BAI bins in a reference (bin ID => that bin's chunks)
-typedef std::map<uint32_t, BaiAlignmentChunkVector> BaiBinMap;
-
-// convenience typedef for a list of all 'linear offsets' (64-bit values) in a reference
-typedef std::vector<uint64_t> BaiLinearOffsetVector;
-
-// contains all fields necessary for building, loading, & writing
-// full BAI index data for a single reference
-struct BaiReferenceEntry {
-
- // data members
- int32_t ID;
- BaiBinMap Bins;
- BaiLinearOffsetVector LinearOffsets;
-
- // ctor - ID defaults to -1; Bins and LinearOffsets start out empty
- BaiReferenceEntry(const int32_t& id = -1)
- : ID(id)
- { }
-};
-
-// provides (persistent) summary of BaiReferenceEntry's index data:
-// element counts plus the index file positions where each section begins
-struct BaiReferenceSummary {
-
- // data members
- int NumBins;
- int NumLinearOffsets;
- uint64_t FirstBinFilePosition;
- uint64_t FirstLinearOffsetFilePosition;
-
- // ctor - all counts and file positions start at 0
- BaiReferenceSummary(void)
- : NumBins(0)
- , NumLinearOffsets(0)
- , FirstBinFilePosition(0)
- , FirstLinearOffsetFilePosition(0)
- { }
-};
-
-// convenience typedef for describing a full BAI index file summary (one entry per reference)
-typedef std::vector<BaiReferenceSummary> BaiFileSummary;
-
-// end BamStandardIndex data structures
-// -----------------------------------------------------------------------------
-
-class BamStandardIndex : public BamIndex { // BamIndex implementation for the standardized BAM index format (".bai")
-
- // ctor & dtor
- public:
- BamStandardIndex(Internal::BamReaderPrivate* reader);
- ~BamStandardIndex(void);
-
- // BamIndex implementation
- public:
- // builds index from associated BAM file & writes out to index file
- bool Create(void);
- // returns whether reference has alignments or not
- bool HasAlignments(const int& referenceID) const;
- // attempts to use index data to jump to @region, returns success/fail
- // a "successful" jump indicates no error, but not whether this region has data
- // * thus, the method sets a flag to indicate whether there are alignments
- // available after the jump position
- bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
- // loads existing data from file into memory
- bool Load(const std::string& filename);
- public:
- // returns format's file extension
- static const std::string Extension(void);
-
- // internal methods
- private:
-
- // index file ops
- void CheckMagicNumber(void);
- void CloseFile(void);
- bool IsFileOpen(void) const;
- void OpenFile(const std::string& filename, const char* mode);
- void Seek(const int64_t& position, const int& origin);
- int64_t Tell(void) const;
-
- // BAI index building methods
- void ClearReferenceEntry(BaiReferenceEntry& refEntry);
- void SaveAlignmentChunkToBin(BaiBinMap& binMap,
- const uint32_t& currentBin,
- const uint64_t& currentOffset,
- const uint64_t& lastOffset);
- void SaveLinearOffsetEntry(BaiLinearOffsetVector& offsets,
- const int& alignmentStartPosition,
- const int& alignmentStopPosition,
- const uint64_t& lastOffset);
-
- // random-access methods
- void AdjustRegion(const BamRegion& region, uint32_t& begin, uint32_t& end);
- void CalculateCandidateBins(const uint32_t& begin,
- const uint32_t& end,
- std::set<uint16_t>& candidateBins);
- void CalculateCandidateOffsets(const BaiReferenceSummary& refSummary,
- const uint64_t& minOffset,
- std::set<uint16_t>& candidateBins,
- std::vector<int64_t>& offsets);
- uint64_t CalculateMinOffset(const BaiReferenceSummary& refSummary, const uint32_t& begin);
- void GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion);
- uint64_t LookupLinearOffset(const BaiReferenceSummary& refSummary, const int& index);
-
- // BAI summary (create/load) methods
- void ReserveForSummary(const int& numReferences);
- void SaveBinsSummary(const int& refId, const int& numBins);
- void SaveLinearOffsetsSummary(const int& refId, const int& numLinearOffsets);
- void SkipBins(const int& numBins);
- void SkipLinearOffsets(const int& numLinearOffsets);
- void SummarizeBins(BaiReferenceSummary& refSummary);
- void SummarizeIndexFile(void);
- void SummarizeLinearOffsets(BaiReferenceSummary& refSummary);
- void SummarizeReference(BaiReferenceSummary& refSummary);
-
- // BAI full index input methods
- void ReadBinID(uint32_t& binId);
- void ReadBinIntoBuffer(uint32_t& binId, int32_t& numAlignmentChunks);
- void ReadIntoBuffer(const unsigned int& bytesRequested);
- void ReadLinearOffset(uint64_t& linearOffset);
- void ReadNumAlignmentChunks(int& numAlignmentChunks);
- void ReadNumBins(int& numBins);
- void ReadNumLinearOffsets(int& numLinearOffsets);
- void ReadNumReferences(int& numReferences);
-
- // BAI full index output methods
- // (the Write* methods report a failed write by throwing BamException)
- void MergeAlignmentChunks(BaiAlignmentChunkVector& chunks);
- void SortLinearOffsets(BaiLinearOffsetVector& linearOffsets);
- void WriteAlignmentChunk(const BaiAlignmentChunk& chunk);
- void WriteAlignmentChunks(BaiAlignmentChunkVector& chunks);
- void WriteBin(const uint32_t& binId, BaiAlignmentChunkVector& chunks);
- void WriteBins(const int& refId, BaiBinMap& bins);
- void WriteHeader(void);
- void WriteLinearOffsets(const int& refId, BaiLinearOffsetVector& linearOffsets);
- void WriteReferenceEntry(BaiReferenceEntry& refEntry);
-
- // data members
- private:
- bool m_isBigEndian;
- BaiFileSummary m_indexFileSummary;
-
- // length of our input buffer (presumably the buffer itself is Resources.Buffer - confirm in the .cpp)
- unsigned int m_bufferLength;
-
- struct RaiiWrapper { // groups the index file stream and the raw input buffer; its ctor/dtor are defined in the .cpp
- FILE* IndexStream;
- char* Buffer;
- RaiiWrapper(void);
- ~RaiiWrapper(void);
- };
- RaiiWrapper Resources;
-
- // static methods
- private:
- // checks if the buffer is large enough to accommodate the requested size
- static void CheckBufferSize(char*& buffer,
- unsigned int& bufferLength,
- const unsigned int& requestedBytes);
- // checks if the buffer is large enough to accommodate the requested size
- static void CheckBufferSize(unsigned char*& buffer,
- unsigned int& bufferLength,
- const unsigned int& requestedBytes);
- // static constants
- private:
- static const int MAX_BIN;
- static const int BAM_LIDX_SHIFT;
- static const std::string BAI_EXTENSION;
- static const char* const BAI_MAGIC;
- static const int SIZEOF_ALIGNMENTCHUNK;
- static const int SIZEOF_BINCORE;
- static const int SIZEOF_LINEAROFFSET;
-};
-
-} // namespace Internal
-} // namespace BamTools
-
-#endif // BAM_STANDARD_INDEX_FORMAT_H
+++ /dev/null
-// ***************************************************************************
-// BamToolsIndex.cpp (c) 2010 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides index operations for the BamTools index format (".bti")
-// ***************************************************************************
-
-#include "api/BamAlignment.h"
-#include "api/internal/BamException_p.h"
-#include "api/internal/BamReader_p.h"
-#include "api/internal/BamToolsIndex_p.h"
-#include "api/internal/BgzfStream_p.h"
-using namespace BamTools;
-using namespace BamTools::Internal;
-
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <algorithm>
-#include <iostream>
-#include <iterator>
-#include <map>
-using namespace std;
-
-// --------------------------------
-// static BamToolsIndex constants
-// --------------------------------
-
-const uint32_t BamToolsIndex::DEFAULT_BLOCK_LENGTH = 1000; // initial m_blockSize: number of alignments counted before Create() closes a block
-const string BamToolsIndex::BTI_EXTENSION = ".bti"; // returned by Extension()
-const char* const BamToolsIndex::BTI_MAGIC = "BTI\1"; // 4-byte magic number written first in the header
-const int BamToolsIndex::SIZEOF_BLOCK = sizeof(int32_t)*2 + sizeof(int64_t); // on-disk block size: two 32-bit positions + one 64-bit offset (see WriteBlock)
-
-// ----------------------------
-// RaiiWrapper implementation
-// ----------------------------
-
-BamToolsIndex::RaiiWrapper::RaiiWrapper(void) // starts with no open stream
- : IndexStream(0)
-{ }
-
-BamToolsIndex::RaiiWrapper::~RaiiWrapper(void) { // closes the index stream if one is still open
- if ( IndexStream ) // CloseFile() resets the pointer to 0 after closing, so this does not close twice
- fclose(IndexStream);
-}
-
-// ------------------------------
-// BamToolsIndex implementation
-// ------------------------------
-
-// ctor - starts with the default block size, no input version read yet, and detects host endianness
-BamToolsIndex::BamToolsIndex(Internal::BamReaderPrivate* reader)
- : BamIndex(reader)
- , m_blockSize(BamToolsIndex::DEFAULT_BLOCK_LENGTH)
- , m_inputVersion(0)
- , m_outputVersion(BTI_2_0) // latest version - used for writing new index files
-{
- m_isBigEndian = BamTools::SystemIsBigEndian(); // controls the byte swapping done in the read/write methods
-}
-
-// dtor - closes the index file (if open) and clears the file summary
-BamToolsIndex::~BamToolsIndex(void) {
- CloseFile();
-}
-
-void BamToolsIndex::CheckMagicNumber(void) { // reads 4 bytes from the index stream and throws BamException unless they equal BTI_MAGIC
-
- // read magic number
- char magic[4];
- size_t elementsRead = fread(magic, sizeof(char), 4, Resources.IndexStream);
- if ( elementsRead != 4 )
- throw BamException("BamToolsIndex::CheckMagicNumber", "could not read BTI magic number");
-
- // validate expected magic number (compares at most 4 characters)
- if ( strncmp(magic, BamToolsIndex::BTI_MAGIC, 4) != 0 )
- throw BamException("BamToolsIndex::CheckMagicNumber", "invalid BTI magic number");
-}
-
-// reads the index file version into m_inputVersion; throws BamException if it cannot be read or is unsupported
-void BamToolsIndex::CheckVersion(void) {
-
- // read version from file
- size_t elementsRead = fread(&m_inputVersion, sizeof(m_inputVersion), 1, Resources.IndexStream);
- if ( elementsRead != 1 )
- throw BamException("BamToolsIndex::CheckVersion", "could not read format version");
- if ( m_isBigEndian ) SwapEndian_32(m_inputVersion);
-
- // if version is negative, or zero
- if ( m_inputVersion <= 0 )
- throw BamException("BamToolsIndex::CheckVersion", "invalid format version");
-
- // if version is newer than can be supported by this version of bamtools
- else if ( m_inputVersion > m_outputVersion ) {
- const string message = "unsupported format: this index was created by a newer version of BamTools. "
- "Update your local version of BamTools to use the index file.";
- throw BamException("BamToolsIndex::CheckVersion", message);
- }
-
- // ------------------------------------------------------------------
- // check for deprecated, unsupported versions
- // (the format had to be modified to accommodate a particular bug fix)
-
- // Version 2.0: introduced support for half-open intervals, instead of the old closed intervals
- // response: throw an exception - we're not going to try to handle the old BTI files.
- else if ( (Version)m_inputVersion < BamToolsIndex::BTI_2_0 ) {
- const string message = "unsupported format: this version of the index may not properly handle "
- "coordinate intervals. Please run 'bamtools index -bti -in yourData.bam' "
- "to generate an up-to-date, fixed BTI file.";
- throw BamException("BamToolsIndex::CheckVersion", message);
- }
-}
-
-void BamToolsIndex::ClearReferenceEntry(BtiReferenceEntry& refEntry) { // resets @refEntry so it can be reused for another reference
- refEntry.ID = -1; // same value the BtiReferenceEntry ctor uses by default
- refEntry.Blocks.clear();
-}
-
-void BamToolsIndex::CloseFile(void) { // closes the index stream (if open) and discards the in-memory file summary
- if ( IsFileOpen() ) {
- fclose(Resources.IndexStream);
- Resources.IndexStream = 0; // mark as closed so IsFileOpen() and the RaiiWrapper dtor see no stream
- }
- m_indexFileSummary.clear(); // cleared even when no file was open
-}
-
-// builds index from associated BAM file & writes out to index file (<BAM filename> + Extension());
-// returns false, with the error string set, if the reader is unusable, cannot be rewound, or a write fails
-bool BamToolsIndex::Create(void) {
-
- // skip if BamReader is invalid or not open
- if ( m_reader == 0 || !m_reader->IsOpen() ) {
- SetErrorString("BamToolsIndex::Create", "could not create index: reader is not open");
- return false;
- }
-
- // rewind BamReader
- if ( !m_reader->Rewind() ) {
- const string readerError = m_reader->GetErrorString();
- const string message = "could not create index: \n\t" + readerError;
- SetErrorString("BamToolsIndex::Create", message);
- return false;
- }
-
- try {
- // open new index file (read & write)
- const string indexFilename = m_reader->Filename() + Extension();
- OpenFile(indexFilename, "w+b");
-
- // initialize BtiFileSummary with number of references
- const int& numReferences = m_reader->GetReferenceCount();
- InitializeFileSummary(numReferences);
-
- // initialize output file header
- WriteHeader();
-
- // index building markers
- uint32_t currentBlockCount = 0;
- int64_t currentAlignmentOffset = m_reader->Tell();
- int32_t blockRefId = -1;
- int32_t blockMaxEndPosition = -1;
- int64_t blockStartOffset = currentAlignmentOffset;
- int32_t blockStartPosition = -1;
-
- // plow through alignments, storing index entries
- BamAlignment al;
- BtiReferenceEntry refEntry;
- while ( m_reader->LoadNextAlignment(al) ) {
-
- // if moved to new reference
- if ( al.RefID != blockRefId ) {
-
- // if first pass, check:
- if ( currentBlockCount == 0 ) {
-
- // write any empty references up to (but not including) al.RefID
- for ( int i = 0; i < al.RefID; ++i )
- WriteReferenceEntry( BtiReferenceEntry(i) );
- }
-
- // not first pass:
- else {
-
- // store previous BTI block data in reference entry
- const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
- refEntry.Blocks.push_back(block);
-
- // write reference entry, then clear
- WriteReferenceEntry(refEntry);
- ClearReferenceEntry(refEntry);
-
- // write any empty references between (but not including)
- // the last blockRefID and current al.RefID
- for ( int i = blockRefId+1; i < al.RefID; ++i )
- WriteReferenceEntry( BtiReferenceEntry(i) );
-
- // reset block count
- currentBlockCount = 0;
- }
-
- // set ID for new reference entry
- refEntry.ID = al.RefID;
- }
-
- // if beginning of block, update counters
- if ( currentBlockCount == 0 ) {
- blockRefId = al.RefID;
- blockStartOffset = currentAlignmentOffset;
- blockStartPosition = al.Position;
- blockMaxEndPosition = al.GetEndPosition();
- }
-
- // increment block counter
- ++currentBlockCount;
-
- // check end position
- const int32_t alignmentEndPosition = al.GetEndPosition();
- if ( alignmentEndPosition > blockMaxEndPosition )
- blockMaxEndPosition = alignmentEndPosition;
-
- // if block is full, get offset for next block, reset currentBlockCount
- if ( currentBlockCount == m_blockSize ) {
-
- // store previous block data in reference entry
- const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
- refEntry.Blocks.push_back(block);
-
- // update markers
- blockStartOffset = m_reader->Tell();
- currentBlockCount = 0;
- }
-
- // not the best name, but for the next iteration, this value will be the offset of the
- // *current* alignment. this is necessary because we won't know if this next alignment
- // is on a new reference until we actually read it
- currentAlignmentOffset = m_reader->Tell();
- }
-
- // after finishing alignments, if any data was read, check:
- if ( blockRefId >= 0 ) { // NOTE(review): when this is false, no reference entries follow the header - confirm this is intended
-
- // store last BTI block data in reference entry
- const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
- refEntry.Blocks.push_back(block);
-
- // write last reference entry, then clear
- WriteReferenceEntry(refEntry);
- ClearReferenceEntry(refEntry);
-
- // then write any empty references remaining at end of file
- for ( int i = blockRefId+1; i < numReferences; ++i )
- WriteReferenceEntry( BtiReferenceEntry(i) );
- }
-
- } catch ( BamException& e ) { // thrown by OpenFile() and the Write* methods; the reader is not rewound on this path
- m_errorString = e.what();
- return false;
- }
-
- // rewind BamReader
- if ( !m_reader->Rewind() ) {
- const string readerError = m_reader->GetErrorString();
- const string message = "could not create index: \n\t" + readerError;
- SetErrorString("BamToolsIndex::Create", message);
- return false;
- }
-
- // return success
- return true;
-}
-
-// returns format's file extension (BTI_EXTENSION, i.e. ".bti")
-const std::string BamToolsIndex::Extension(void) {
- return BamToolsIndex::BTI_EXTENSION;
-}
-
-void BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) { // finds the block offset to jump to for @region; @offset is only assigned when a block is selected
-
- // throw if ref ID is not a valid index in file summary data
- if ( region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size() )
- throw BamException("BamToolsIndex::GetOffset", "invalid region requested");
-
- // retrieve reference index data for left bound reference
- BtiReferenceEntry refEntry(region.LeftRefID);
- ReadReferenceEntry(refEntry);
-
- // binary search for an overlapping block (may not be first one though)
- bool found = false;
- typedef BtiBlockVector::const_iterator BtiBlockConstIterator;
- BtiBlockConstIterator blockFirst = refEntry.Blocks.begin();
- BtiBlockConstIterator blockIter = blockFirst;
- BtiBlockConstIterator blockLast = refEntry.Blocks.end();
- iterator_traits<BtiBlockConstIterator>::difference_type count = distance(blockFirst, blockLast);
- iterator_traits<BtiBlockConstIterator>::difference_type step;
- while ( count > 0 ) {
- blockIter = blockFirst;
- step = count/2;
- advance(blockIter, step);
-
- const BtiBlock& block = (*blockIter);
- if ( block.StartPosition <= region.RightPosition ) {
- if ( block.MaxEndPosition > region.LeftPosition ) {
- offset = block.StartOffset;
- break;
- }
- blockFirst = ++blockIter; // block ends at or before the region's left bound: continue in the upper half
- count -= step+1;
- }
- else count = step; // block starts past the region's right bound: continue in the lower half
- }
-
- // if we didn't search "off the end" of the blocks
- if ( blockIter != blockLast ) {
-
- // "walk back" until we've gone too far
- // (blockFirst may have been advanced by the search above, which bounds this walk)
- while ( blockIter != blockFirst ) {
- const BtiBlock& currentBlock = (*blockIter);
-
- --blockIter;
- const BtiBlock& previousBlock = (*blockIter);
- if ( previousBlock.MaxEndPosition <= region.LeftPosition ) {
- offset = currentBlock.StartOffset;
- found = true;
- break;
- }
- }
-
- // if we walked all the way to first block, just return that and let the reader's
- // region overlap parsing do the rest
- if ( blockIter == blockFirst ) {
- const BtiBlock& block = (*blockIter);
- offset = block.StartOffset;
- found = true;
- }
- }
-
-
- // sets to false if blocks container is empty, or if no matching block could be found
- *hasAlignmentsInRegion = found;
-}
-
-// returns whether reference has alignments or not (false for an out-of-range @referenceID)
-bool BamToolsIndex::HasAlignments(const int& referenceID) const {
- if ( referenceID < 0 || referenceID >= (int)m_indexFileSummary.size() )
- return false;
- const BtiReferenceSummary& refSummary = m_indexFileSummary.at(referenceID);
- return ( refSummary.NumBlocks > 0 ); // a reference has alignments when its summary records at least one block
-}
-
-// resets the file summary to @numReferences default-constructed reference summaries
-void BamToolsIndex::InitializeFileSummary(const int& numReferences) {
- m_indexFileSummary.clear();
- for ( int i = 0; i < numReferences; ++i ) // adds nothing when @numReferences is zero or negative
- m_indexFileSummary.push_back( BtiReferenceSummary() );
-}
-
-// returns true if the index stream is open (i.e. the stream pointer is non-null)
-bool BamToolsIndex::IsFileOpen(void) const {
- return ( Resources.IndexStream != 0 );
-}
-
-// attempts to use index data to jump to @region, returns success/fail
-// a "successful" jump indicates no error, but not whether this region has data
-// * thus, the method sets a flag to indicate whether there are alignments
-// available after the jump position
-// returns false (with the error string set) if the reader is not open, if the
-// region's left reference ID or left position is out of range, or if the
-// offset lookup or the seek fails
-bool BamToolsIndex::Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion) {
-
- // clear flag
- *hasAlignmentsInRegion = false;
-
- // skip if invalid reader or not open
- if ( m_reader == 0 || !m_reader->IsOpen() ) {
- SetErrorString("BamToolsIndex::Jump", "could not jump: reader is not open");
- return false;
- }
-
- // make sure left-bound reference ID is valid before indexing the reference data
- // (otherwise at() would throw std::out_of_range out of this bool-returning method)
- const RefVector& references = m_reader->GetReferenceData();
- if ( region.LeftRefID < 0 || region.LeftRefID >= (int)references.size() ) {
- SetErrorString("BamToolsIndex::Jump", "could not jump: invalid region requested");
- return false;
- }
-
- // make sure left-bound position is valid
- if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) {
- SetErrorString("BamToolsIndex::Jump", "could not jump: invalid region requested");
- return false;
- }
-
- // calculate nearest offset to jump to
- // (initialized because GetOffset() leaves it untouched when no block matches)
- int64_t offset = 0;
- try {
- GetOffset(region, offset, hasAlignmentsInRegion);
- } catch ( BamException& e ) {
- m_errorString = e.what();
- return false;
- }
-
- // no block selected: nothing to seek to; not an error, the flag stays false for the caller
- if ( !(*hasAlignmentsInRegion) )
- return true;
-
- // return success/failure of seek
- return m_reader->Seek(offset);
-}
-
-// opens @filename read-only, validates its header & builds the in-memory summary;
-// returns false (with the error string set) if any step throws BamException
-bool BamToolsIndex::Load(const std::string& filename) {
-
- try {
-
- // attempt to open file (read-only)
- OpenFile(filename, "rb");
-
- // load metadata & generate in-memory summary
- LoadHeader();
- LoadFileSummary();
-
- // return success
- return true;
-
- } catch ( BamException& e ) {
- m_errorString = e.what();
- return false;
- }
-}
-
-void BamToolsIndex::LoadFileSummary(void) { // reads the reference count, then summarizes each reference in file order
-
- // load number of reference sequences
- int numReferences;
- LoadNumReferences(numReferences);
-
- // initialize file summary data (one default summary per reference)
- InitializeFileSummary(numReferences);
-
- // load summary for each reference
- BtiFileSummary::iterator summaryIter = m_indexFileSummary.begin();
- BtiFileSummary::iterator summaryEnd = m_indexFileSummary.end();
- for ( ; summaryIter != summaryEnd; ++summaryIter )
- LoadReferenceSummary(*summaryIter);
-}
-
-void BamToolsIndex::LoadHeader(void) { // validates magic number & version, then reads the block size into m_blockSize
-
- // check BTI file metadata (both checks throw BamException on failure)
- CheckMagicNumber();
- CheckVersion();
-
- // use file's BTI block size to set member variable
- const size_t elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, Resources.IndexStream);
- if ( m_isBigEndian ) SwapEndian_32(m_blockSize); // swap runs before the read is checked; a failed read throws just below
- if ( elementsRead != 1 )
- throw BamException("BamToolsIndex::LoadHeader", "could not read BTI block size");
-}
-
-void BamToolsIndex::LoadNumBlocks(int& numBlocks) { // reads one block count from the index stream into @numBlocks; throws BamException on a failed read
- const size_t elementsRead = fread(&numBlocks, sizeof(numBlocks), 1, Resources.IndexStream);
- if ( m_isBigEndian ) SwapEndian_32(numBlocks); // swap runs before the read is checked; a failed read throws just below
- if ( elementsRead != 1 )
- throw BamException("BamToolsIndex::LoadNumBlocks", "could not read number of BTI blocks");
-}
-
-void BamToolsIndex::LoadNumReferences(int& numReferences) { // reads the reference count from the index stream into @numReferences; throws BamException on a failed read
- const size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, Resources.IndexStream);
- if ( m_isBigEndian ) SwapEndian_32(numReferences); // swap runs before the read is checked; a failed read throws just below
- if ( elementsRead != 1 )
- throw BamException("BamToolsIndex::LoadNumReferences", "could not read number of references");
-}
-
-void BamToolsIndex::LoadReferenceSummary(BtiReferenceSummary& refSummary) { // records one reference's block count & first-block file position, then skips its blocks
-
- // load number of blocks
- int numBlocks;
- LoadNumBlocks(numBlocks);
-
- // store block summary data for this reference
- refSummary.NumBlocks = numBlocks;
- refSummary.FirstBlockFilePosition = Tell(); // stream now sits just past the block count, i.e. at the first block
-
- // skip reference's blocks (leaves the stream at the next reference's block count)
- SkipBlocks(numBlocks);
-}
-
-void BamToolsIndex::OpenFile(const std::string& filename, const char* mode) { // opens @filename with fopen() @mode; throws BamException if the file cannot be opened
-
- // make sure any previous index file is closed (this also clears the file summary)
- CloseFile();
-
- // attempt to open file
- Resources.IndexStream = fopen(filename.c_str(), mode);
- if ( !IsFileOpen() ) {
- const string message = string("could not open file: ") + filename;
- throw BamException("BamToolsIndex::OpenFile", message);
- }
-}
-
-void BamToolsIndex::ReadBlock(BtiBlock& block) { // reads one block from the index stream into @block; throws BamException on a short read
-
- // read in block data members (same field order that WriteBlock() uses)
- size_t elementsRead = 0;
- elementsRead += fread(&block.MaxEndPosition, sizeof(block.MaxEndPosition), 1, Resources.IndexStream);
- elementsRead += fread(&block.StartOffset, sizeof(block.StartOffset), 1, Resources.IndexStream);
- elementsRead += fread(&block.StartPosition, sizeof(block.StartPosition), 1, Resources.IndexStream);
-
- // swap endian-ness if necessary
- if ( m_isBigEndian ) {
- SwapEndian_32(block.MaxEndPosition);
- SwapEndian_64(block.StartOffset);
- SwapEndian_32(block.StartPosition);
- }
-
- if ( elementsRead != 3 ) // one element per field; @block may be partially filled when this throws
- throw BamException("BamToolsIndex::ReadBlock", "could not read block");
-}
-
-void BamToolsIndex::ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks) { // replaces the contents of @blocks with the blocks that @refSummary describes
-
- // prep blocks container
- blocks.clear();
- blocks.reserve(refSummary.NumBlocks);
-
- // skip to first block entry
- Seek( refSummary.FirstBlockFilePosition, SEEK_SET );
-
- // read & store block entries
- BtiBlock block;
- for ( int i = 0; i < refSummary.NumBlocks; ++i ) {
- ReadBlock(block);
- blocks.push_back(block);
- }
-}
-
-void BamToolsIndex::ReadReferenceEntry(BtiReferenceEntry& refEntry) { // loads the blocks for reference @refEntry.ID into @refEntry.Blocks
-
- // throw if refId is not a valid index in file summary structure
- if ( refEntry.ID < 0 || refEntry.ID >= (int)m_indexFileSummary.size() )
- throw BamException("BamToolsIndex::ReadReferenceEntry", "invalid reference requested");
-
- // use index summary to assist reading the reference's BTI blocks
- const BtiReferenceSummary& refSummary = m_indexFileSummary.at(refEntry.ID);
- ReadBlocks(refSummary, refEntry.Blocks);
-}
-
-// moves the index stream to @position relative to @origin (SEEK_SET, SEEK_CUR, ...);
-// throws BamException if fseek64() reports failure
-void BamToolsIndex::Seek(const int64_t& position, const int& origin) {
- if ( fseek64(Resources.IndexStream, position, origin) != 0 )
- throw BamException("BamToolsIndex::Seek", "could not seek in BTI file"); // this class handles BTI files, not BAI
-}
-
-// advances the index stream past @numBlocks on-disk blocks, relative to the current position
-void BamToolsIndex::SkipBlocks(const int& numBlocks) {
- // widen before multiplying so a large block count cannot overflow int
- Seek( static_cast<int64_t>(numBlocks)*BamToolsIndex::SIZEOF_BLOCK, SEEK_CUR );
-}
-
-int64_t BamToolsIndex::Tell(void) const { // returns the current position of the index file stream, as reported by ftell64()
- return ftell64(Resources.IndexStream);
-}
-
-void BamToolsIndex::WriteBlock(const BtiBlock& block) { // writes @block as: 32-bit max end position, 64-bit start offset, 32-bit start position
-
- // copy entry data (so it can be byte-swapped without touching @block)
- int32_t maxEndPosition = block.MaxEndPosition;
- int64_t startOffset = block.StartOffset;
- int32_t startPosition = block.StartPosition;
-
- // swap endian-ness if necessary
- if ( m_isBigEndian ) {
- SwapEndian_32(maxEndPosition);
- SwapEndian_64(startOffset);
- SwapEndian_32(startPosition);
- }
-
- // write the block's three fields
- size_t elementsWritten = 0;
- elementsWritten += fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, Resources.IndexStream);
- elementsWritten += fwrite(&startOffset, sizeof(startOffset), 1, Resources.IndexStream);
- elementsWritten += fwrite(&startPosition, sizeof(startPosition), 1, Resources.IndexStream);
- if ( elementsWritten != 3 ) // one element per field
- throw BamException("BamToolsIndex::WriteBlock", "could not write BTI block");
-}
-
-void BamToolsIndex::WriteBlocks(const BtiBlockVector& blocks) { // writes every block in @blocks, in container order
- BtiBlockVector::const_iterator blockIter = blocks.begin();
- BtiBlockVector::const_iterator blockEnd = blocks.end();
- for ( ; blockIter != blockEnd; ++blockIter )
- WriteBlock(*blockIter); // throws BamException on a failed write
-}
-
-// writes the BTI header: 4 magic bytes, then the 32-bit format version,
-// block size and reference count; throws BamException if any write fails
-void BamToolsIndex::WriteHeader(void) {
-
- size_t elementsWritten = 0;
-
- // write BTI index format 'magic number' (4 single-byte elements)
- elementsWritten += fwrite(BamToolsIndex::BTI_MAGIC, 1, 4, Resources.IndexStream);
-
- // write BTI index format version
- int32_t currentVersion = (int32_t)m_outputVersion;
- if ( m_isBigEndian ) SwapEndian_32(currentVersion);
- elementsWritten += fwrite(&currentVersion, sizeof(currentVersion), 1, Resources.IndexStream);
-
- // write block size
- uint32_t blockSize = m_blockSize;
- if ( m_isBigEndian ) SwapEndian_32(blockSize);
- elementsWritten += fwrite(&blockSize, sizeof(blockSize), 1, Resources.IndexStream);
-
- // write number of references (taken from the size of the file summary)
- int32_t numReferences = m_indexFileSummary.size();
- if ( m_isBigEndian ) SwapEndian_32(numReferences);
- elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, Resources.IndexStream);
-
- // 4 magic bytes + version + block size + reference count
- if ( elementsWritten != 7 )
- throw BamException("BamToolsIndex::WriteHeader", "could not write BTI header");
-}
-
-void BamToolsIndex::WriteReferenceEntry(const BtiReferenceEntry& refEntry) { // writes one reference: its 32-bit block count, then its blocks
-
- // write number of blocks for this reference
- uint32_t numBlocks = refEntry.Blocks.size();
- if ( m_isBigEndian ) SwapEndian_32(numBlocks);
- const size_t elementsWritten = fwrite(&numBlocks, sizeof(numBlocks), 1, Resources.IndexStream);
- if ( elementsWritten != 1 )
- throw BamException("BamToolsIndex::WriteReferenceEntry", "could not write number of blocks");
-
- // write actual block entries (nothing further is written for an empty reference)
- WriteBlocks(refEntry.Blocks);
-}
+++ /dev/null
-// ***************************************************************************
-// BamToolsIndex.h (c) 2010 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides index operations for the BamTools index format (".bti")
-// ***************************************************************************
-
-#ifndef BAMTOOLS_INDEX_FORMAT_H
-#define BAMTOOLS_INDEX_FORMAT_H
-
-// -------------
-// W A R N I N G
-// -------------
-//
-// This file is not part of the BamTools API. It exists purely as an
-// implementation detail. This header file may change from version to
-// version without notice, or even be removed.
-//
-// We mean it.
-
-#include "api/BamAux.h"
-#include "api/BamIndex.h"
-#include <map>
-#include <string>
-#include <vector>
-
-namespace BamTools {
-namespace Internal {
-
-// contains data for each 'block' in a BTI index
-struct BtiBlock {
-
- // data members
- int32_t MaxEndPosition;
- int64_t StartOffset;
- int32_t StartPosition;
-
- // ctor
- BtiBlock(const int32_t& maxEndPosition = 0,
- const int64_t& startOffset = 0,
- const int32_t& startPosition = 0)
- : MaxEndPosition(maxEndPosition)
- , StartOffset(startOffset)
- , StartPosition(startPosition)
- { }
-};
-
-// convenience typedef for describing a a list of BTI blocks on a reference
-typedef std::vector<BtiBlock> BtiBlockVector;
-
-// contains all fields necessary for building, loading, & writing
-// full BTI index data for a single reference
-struct BtiReferenceEntry {
-
- // data members
- int32_t ID;
- BtiBlockVector Blocks;
-
- // ctor
- BtiReferenceEntry(const int& id = -1)
- : ID(id)
- { }
-};
-
-// provides (persistent) summary of BtiReferenceEntry's index data
-struct BtiReferenceSummary {
-
- // data members
- int NumBlocks;
- uint64_t FirstBlockFilePosition;
-
- // ctor
- BtiReferenceSummary(void)
- : NumBlocks(0)
- , FirstBlockFilePosition(0)
- { }
-};
-
-// convenience typedef for describing a full BTI index file summary
-typedef std::vector<BtiReferenceSummary> BtiFileSummary;
-
-class BamToolsIndex : public BamIndex {
-
- // keep a list of any supported versions here
- // (might be useful later to handle any 'legacy' versions if the format changes)
- // listed for example like: BTI_1_0 = 1, BTI_1_1 = 2, BTI_1_2 = 3, BTI_2_0 = 4, and so on
- //
- // so a change introduced in BTI_1_2 may be handled from then on by:
- //
- // if ( indexVersion >= BTI_1_2 )
- // do something new
- // else
- // do the old thing
- enum Version { BTI_1_0 = 1
- , BTI_1_1
- , BTI_1_2
- , BTI_2_0
- };
-
- // ctor & dtor
- public:
- BamToolsIndex(Internal::BamReaderPrivate* reader);
- ~BamToolsIndex(void);
-
- // BamIndex implementation
- public:
- // builds index from associated BAM file & writes out to index file
- bool Create(void);
- // returns whether reference has alignments or no
- bool HasAlignments(const int& referenceID) const;
- // attempts to use index data to jump to @region, returns success/fail
- // a "successful" jump indicates no error, but not whether this region has data
- // * thus, the method sets a flag to indicate whether there are alignments
- // available after the jump position
- bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
- // loads existing data from file into memory
- bool Load(const std::string& filename);
- public:
- // returns format's file extension
- static const std::string Extension(void);
-
- // internal methods
- private:
-
- // index file ops
- void CheckMagicNumber(void);
- void CheckVersion(void);
- void CloseFile(void);
- bool IsFileOpen(void) const;
- void OpenFile(const std::string& filename, const char* mode);
- void Seek(const int64_t& position, const int& origin);
- int64_t Tell(void) const;
-
- // index-creation methods
- void ClearReferenceEntry(BtiReferenceEntry& refEntry);
- void WriteBlock(const BtiBlock& block);
- void WriteBlocks(const BtiBlockVector& blocks);
- void WriteHeader(void);
- void WriteReferenceEntry(const BtiReferenceEntry& refEntry);
-
- // random-access methods
- void GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion);
- void ReadBlock(BtiBlock& block);
- void ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks);
- void ReadReferenceEntry(BtiReferenceEntry& refEntry);
-
- // BTI summary data methods
- void InitializeFileSummary(const int& numReferences);
- void LoadFileSummary(void);
- void LoadHeader(void);
- void LoadNumBlocks(int& numBlocks);
- void LoadNumReferences(int& numReferences);
- void LoadReferenceSummary(BtiReferenceSummary& refSummary);
- void SkipBlocks(const int& numBlocks);
-
- // data members
- private:
- bool m_isBigEndian;
- BtiFileSummary m_indexFileSummary;
- uint32_t m_blockSize;
- int32_t m_inputVersion; // Version is serialized as int
- Version m_outputVersion;
-
- struct RaiiWrapper {
- FILE* IndexStream;
- RaiiWrapper(void);
- ~RaiiWrapper(void);
- };
- RaiiWrapper Resources;
-
- // static constants
- private:
- static const uint32_t DEFAULT_BLOCK_LENGTH;
- static const std::string BTI_EXTENSION;
- static const char* const BTI_MAGIC;
- static const int SIZEOF_BLOCK;
-};
-
-} // namespace Internal
-} // namespace BamTools
-
-#endif // BAMTOOLS_INDEX_FORMAT_H
+++ /dev/null
-// ***************************************************************************
-// BamWriter_p.cpp (c) 2010 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides the basic functionality for producing BAM files
-// ***************************************************************************
-
-#include "api/BamAlignment.h"
-#include "api/BamConstants.h"
-#include "api/IBamIODevice.h"
-#include "api/internal/BamException_p.h"
-#include "api/internal/BamWriter_p.h"
-using namespace BamTools;
-using namespace BamTools::Internal;
-
-#include <cstdlib>
-#include <cstring>
-using namespace std;
-
-// ctor
-BamWriterPrivate::BamWriterPrivate(void)
- : m_isBigEndian( BamTools::SystemIsBigEndian() )
-{ }
-
-// dtor
-BamWriterPrivate::~BamWriterPrivate(void) {
- Close();
-}
-
-// calculates minimum bin for a BAM alignment interval [begin, end)
-uint32_t BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const {
- --end;
- if ( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14);
- if ( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17);
- if ( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20);
- if ( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23);
- if ( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26);
- return 0;
-}
-
-// closes the alignment archive
-void BamWriterPrivate::Close(void) {
-
- // skip if file not open
- if ( !IsOpen() ) return;
-
- // close output stream
- try {
- m_stream.Close();
- } catch ( BamException& e ) {
- m_errorString = e.what();
- }
-}
-
-// creates a cigar string from the supplied alignment
-void BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
-
- // initialize
- const size_t numCigarOperations = cigarOperations.size();
- packedCigar.resize(numCigarOperations * Constants::BAM_SIZEOF_INT);
-
- // pack the cigar data into the string
- unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
-
- // iterate over cigar operations
- vector<CigarOp>::const_iterator coIter = cigarOperations.begin();
- vector<CigarOp>::const_iterator coEnd = cigarOperations.end();
- for ( ; coIter != coEnd; ++coIter ) {
-
- // store op in packedCigar
- uint8_t cigarOp;
- switch ( coIter->Type ) {
- case (Constants::BAM_CIGAR_MATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_MATCH; break;
- case (Constants::BAM_CIGAR_INS_CHAR) : cigarOp = Constants::BAM_CIGAR_INS; break;
- case (Constants::BAM_CIGAR_DEL_CHAR) : cigarOp = Constants::BAM_CIGAR_DEL; break;
- case (Constants::BAM_CIGAR_REFSKIP_CHAR) : cigarOp = Constants::BAM_CIGAR_REFSKIP; break;
- case (Constants::BAM_CIGAR_SOFTCLIP_CHAR) : cigarOp = Constants::BAM_CIGAR_SOFTCLIP; break;
- case (Constants::BAM_CIGAR_HARDCLIP_CHAR) : cigarOp = Constants::BAM_CIGAR_HARDCLIP; break;
- case (Constants::BAM_CIGAR_PAD_CHAR) : cigarOp = Constants::BAM_CIGAR_PAD; break;
- case (Constants::BAM_CIGAR_SEQMATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_SEQMATCH; break;
- case (Constants::BAM_CIGAR_MISMATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_MISMATCH; break;
- default:
- const string message = string("invalid CIGAR operation type") + coIter->Type;
- throw BamException("BamWriter::CreatePackedCigar", message);
- }
-
- *pPackedCigar = coIter->Length << Constants::BAM_CIGAR_SHIFT | cigarOp;
- pPackedCigar++;
- }
-}
-
-// encodes the supplied query sequence into 4-bit notation
-void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {
-
- // prepare the encoded query string
- const size_t queryLength = query.size();
- const size_t encodedQueryLength = static_cast<size_t>((queryLength+1)/2);
- encodedQuery.resize(encodedQueryLength);
- char* pEncodedQuery = (char*)encodedQuery.data();
- const char* pQuery = (const char*)query.data();
-
- // walk through original query sequence, encoding its bases
- unsigned char nucleotideCode;
- bool useHighWord = true;
- while ( *pQuery ) {
- switch ( *pQuery ) {
- case (Constants::BAM_DNA_EQUAL) : nucleotideCode = Constants::BAM_BASECODE_EQUAL; break;
- case (Constants::BAM_DNA_A) : nucleotideCode = Constants::BAM_BASECODE_A; break;
- case (Constants::BAM_DNA_C) : nucleotideCode = Constants::BAM_BASECODE_C; break;
- case (Constants::BAM_DNA_M) : nucleotideCode = Constants::BAM_BASECODE_M; break;
- case (Constants::BAM_DNA_G) : nucleotideCode = Constants::BAM_BASECODE_G; break;
- case (Constants::BAM_DNA_R) : nucleotideCode = Constants::BAM_BASECODE_R; break;
- case (Constants::BAM_DNA_S) : nucleotideCode = Constants::BAM_BASECODE_S; break;
- case (Constants::BAM_DNA_V) : nucleotideCode = Constants::BAM_BASECODE_V; break;
- case (Constants::BAM_DNA_T) : nucleotideCode = Constants::BAM_BASECODE_T; break;
- case (Constants::BAM_DNA_W) : nucleotideCode = Constants::BAM_BASECODE_W; break;
- case (Constants::BAM_DNA_Y) : nucleotideCode = Constants::BAM_BASECODE_Y; break;
- case (Constants::BAM_DNA_H) : nucleotideCode = Constants::BAM_BASECODE_H; break;
- case (Constants::BAM_DNA_K) : nucleotideCode = Constants::BAM_BASECODE_K; break;
- case (Constants::BAM_DNA_D) : nucleotideCode = Constants::BAM_BASECODE_D; break;
- case (Constants::BAM_DNA_B) : nucleotideCode = Constants::BAM_BASECODE_B; break;
- case (Constants::BAM_DNA_N) : nucleotideCode = Constants::BAM_BASECODE_N; break;
- default:
- const string message = string("invalid base: ") + *pQuery;
- throw BamException("BamWriter::EncodeQuerySequence", message);
- }
-
- // pack the nucleotide code
- if ( useHighWord ) {
- *pEncodedQuery = nucleotideCode << 4;
- useHighWord = false;
- } else {
- *pEncodedQuery |= nucleotideCode;
- ++pEncodedQuery;
- useHighWord = true;
- }
-
- // increment the query position
- ++pQuery;
- }
-}
-
-// returns a description of the last error that occurred
-std::string BamWriterPrivate::GetErrorString(void) const {
- return m_errorString;
-}
-
-// returns whether BAM file is open for writing or not
-bool BamWriterPrivate::IsOpen(void) const {
- return m_stream.IsOpen();
-}
-
-// opens the alignment archive
-bool BamWriterPrivate::Open(const string& filename,
- const string& samHeaderText,
- const RefVector& referenceSequences)
-{
- try {
-
- // open the BGZF file for writing
- m_stream.Open(filename, IBamIODevice::WriteOnly);
-
- // write BAM file 'metadata' components
- WriteMagicNumber();
- WriteSamHeaderText(samHeaderText);
- WriteReferences(referenceSequences);
-
- // return success
- return true;
-
- } catch ( BamException& e ) {
- m_errorString = e.what();
- return false;
- }
-}
-
-// saves the alignment to the alignment archive
-bool BamWriterPrivate::SaveAlignment(const BamAlignment& al) {
-
- try {
-
- // if BamAlignment contains only the core data and a raw char data buffer
- // (as a result of BamReader::GetNextAlignmentCore())
- if ( al.SupportData.HasCoreOnly )
- WriteCoreAlignment(al);
-
- // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc
- // (resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code)
- else WriteAlignment(al);
-
- // if we get here, everything OK
- return true;
-
- } catch ( BamException& e ) {
- m_errorString = e.what();
- return false;
- }
-}
-
-void BamWriterPrivate::SetWriteCompressed(bool ok) {
- // modifying compression is not allowed if BAM file is open
- if ( !IsOpen() )
- m_stream.SetWriteCompressed(ok);
-}
-
-void BamWriterPrivate::WriteAlignment(const BamAlignment& al) {
-
- // calculate char lengths
- const unsigned int nameLength = al.Name.size() + 1;
- const unsigned int numCigarOperations = al.CigarData.size();
- const unsigned int queryLength = al.QueryBases.size();
- const unsigned int tagDataLength = al.TagData.size();
-
- // no way to tell if alignment's bin is already defined (there is no default, invalid value)
- // so we'll go ahead calculate its bin ID before storing
- const uint32_t alignmentBin = CalculateMinimumBin(al.Position, al.GetEndPosition());
-
- // create our packed cigar string
- string packedCigar;
- CreatePackedCigar(al.CigarData, packedCigar);
- const unsigned int packedCigarLength = packedCigar.size();
-
- // encode the query
- string encodedQuery;
- EncodeQuerySequence(al.QueryBases, encodedQuery);
- const unsigned int encodedQueryLength = encodedQuery.size();
-
- // write the block size
- const unsigned int dataBlockSize = nameLength +
- packedCigarLength +
- encodedQueryLength +
- queryLength +
- tagDataLength;
- unsigned int blockSize = Constants::BAM_CORE_SIZE + dataBlockSize;
- if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize);
- m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT);
-
- // assign the BAM core data
- uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE];
- buffer[0] = al.RefID;
- buffer[1] = al.Position;
- buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;
- buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
- buffer[4] = queryLength;
- buffer[5] = al.MateRefID;
- buffer[6] = al.MatePosition;
- buffer[7] = al.InsertSize;
-
- // swap BAM core endian-ness, if necessary
- if ( m_isBigEndian ) {
- for ( int i = 0; i < 8; ++i )
- BamTools::SwapEndian_32(buffer[i]);
- }
-
- // write the BAM core
- m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE);
-
- // write the query name
- m_stream.Write(al.Name.c_str(), nameLength);
-
- // write the packed cigar
- if ( m_isBigEndian ) {
- char* cigarData = new char[packedCigarLength]();
- memcpy(cigarData, packedCigar.data(), packedCigarLength);
- if ( m_isBigEndian ) {
- for ( size_t i = 0; i < packedCigarLength; ++i )
- BamTools::SwapEndian_32p(&cigarData[i]);
- }
- m_stream.Write(cigarData, packedCigarLength);
- delete[] cigarData; // TODO: cleanup on Write exception thrown?
- }
- else
- m_stream.Write(packedCigar.data(), packedCigarLength);
-
- // write the encoded query sequence
- m_stream.Write(encodedQuery.data(), encodedQueryLength);
-
- // write the base qualities
- char* pBaseQualities = (char*)al.Qualities.data();
- for ( size_t i = 0; i < queryLength; ++i )
- pBaseQualities[i] -= 33; // FASTQ conversion
- m_stream.Write(pBaseQualities, queryLength);
-
- // write the read group tag
- if ( m_isBigEndian ) {
-
- char* tagData = new char[tagDataLength]();
- memcpy(tagData, al.TagData.data(), tagDataLength);
-
- size_t i = 0;
- while ( i < tagDataLength ) {
-
- i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.)
- const char type = tagData[i]; // get tag type at position i
- ++i;
-
- switch ( type ) {
-
- case(Constants::BAM_TAG_TYPE_ASCII) :
- case(Constants::BAM_TAG_TYPE_INT8) :
- case(Constants::BAM_TAG_TYPE_UINT8) :
- ++i;
- break;
-
- case(Constants::BAM_TAG_TYPE_INT16) :
- case(Constants::BAM_TAG_TYPE_UINT16) :
- BamTools::SwapEndian_16p(&tagData[i]);
- i += sizeof(uint16_t);
- break;
-
- case(Constants::BAM_TAG_TYPE_FLOAT) :
- case(Constants::BAM_TAG_TYPE_INT32) :
- case(Constants::BAM_TAG_TYPE_UINT32) :
- BamTools::SwapEndian_32p(&tagData[i]);
- i += sizeof(uint32_t);
- break;
-
- case(Constants::BAM_TAG_TYPE_HEX) :
- case(Constants::BAM_TAG_TYPE_STRING) :
- // no endian swapping necessary for hex-string/string data
- while ( tagData[i] )
- ++i;
- // increment one more for null terminator
- ++i;
- break;
-
- case(Constants::BAM_TAG_TYPE_ARRAY) :
-
- {
- // read array type
- const char arrayType = tagData[i];
- ++i;
-
- // swap endian-ness of number of elements in place, then retrieve for loop
- BamTools::SwapEndian_32p(&tagData[i]);
- int32_t numElements;
- memcpy(&numElements, &tagData[i], sizeof(uint32_t));
- i += sizeof(uint32_t);
-
- // swap endian-ness of array elements
- for ( int j = 0; j < numElements; ++j ) {
- switch (arrayType) {
- case (Constants::BAM_TAG_TYPE_INT8) :
- case (Constants::BAM_TAG_TYPE_UINT8) :
- // no endian-swapping necessary
- ++i;
- break;
- case (Constants::BAM_TAG_TYPE_INT16) :
- case (Constants::BAM_TAG_TYPE_UINT16) :
- BamTools::SwapEndian_16p(&tagData[i]);
- i += sizeof(uint16_t);
- break;
- case (Constants::BAM_TAG_TYPE_FLOAT) :
- case (Constants::BAM_TAG_TYPE_INT32) :
- case (Constants::BAM_TAG_TYPE_UINT32) :
- BamTools::SwapEndian_32p(&tagData[i]);
- i += sizeof(uint32_t);
- break;
- default:
- delete[] tagData;
- const string message = string("invalid binary array type: ") + arrayType;
- throw BamException("BamWriter::SaveAlignment", message);
- }
- }
-
- break;
- }
-
- default :
- delete[] tagData;
- const string message = string("invalid tag type: ") + type;
- throw BamException("BamWriter::SaveAlignment", message);
- }
- }
-
- m_stream.Write(tagData, tagDataLength);
- delete[] tagData; // TODO: cleanup on Write exception thrown?
- }
- else
- m_stream.Write(al.TagData.data(), tagDataLength);
-}
-
-void BamWriterPrivate::WriteCoreAlignment(const BamAlignment& al) {
-
- // write the block size
- unsigned int blockSize = al.SupportData.BlockLength;
- if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize);
- m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT);
-
- // re-calculate bin (in case BamAlignment's position has been previously modified)
- const uint32_t alignmentBin = CalculateMinimumBin(al.Position, al.GetEndPosition());
-
- // assign the BAM core data
- uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE];
- buffer[0] = al.RefID;
- buffer[1] = al.Position;
- buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;
- buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;
- buffer[4] = al.SupportData.QuerySequenceLength;
- buffer[5] = al.MateRefID;
- buffer[6] = al.MatePosition;
- buffer[7] = al.InsertSize;
-
- // swap BAM core endian-ness, if necessary
- if ( m_isBigEndian ) {
- for ( int i = 0; i < 8; ++i )
- BamTools::SwapEndian_32(buffer[i]);
- }
-
- // write the BAM core
- m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE);
-
- // write the raw char data
- m_stream.Write((char*)al.SupportData.AllCharData.data(),
- al.SupportData.BlockLength-Constants::BAM_CORE_SIZE);
-}
-
-void BamWriterPrivate::WriteMagicNumber(void) {
- // write BAM file 'magic number'
- m_stream.Write(Constants::BAM_HEADER_MAGIC, Constants::BAM_HEADER_MAGIC_LENGTH);
-}
-
-void BamWriterPrivate::WriteReferences(const BamTools::RefVector& referenceSequences) {
-
- // write the number of reference sequences
- uint32_t numReferenceSequences = referenceSequences.size();
- if ( m_isBigEndian ) BamTools::SwapEndian_32(numReferenceSequences);
- m_stream.Write((char*)&numReferenceSequences, Constants::BAM_SIZEOF_INT);
-
- // foreach reference sequence
- RefVector::const_iterator rsIter = referenceSequences.begin();
- RefVector::const_iterator rsEnd = referenceSequences.end();
- for ( ; rsIter != rsEnd; ++rsIter ) {
-
- // write the reference sequence name length
- uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;
- if ( m_isBigEndian ) BamTools::SwapEndian_32(referenceSequenceNameLen);
- m_stream.Write((char*)&referenceSequenceNameLen, Constants::BAM_SIZEOF_INT);
-
- // write the reference sequence name
- m_stream.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);
-
- // write the reference sequence length
- int32_t referenceLength = rsIter->RefLength;
- if ( m_isBigEndian ) BamTools::SwapEndian_32(referenceLength);
- m_stream.Write((char*)&referenceLength, Constants::BAM_SIZEOF_INT);
- }
-}
-
-void BamWriterPrivate::WriteSamHeaderText(const std::string& samHeaderText) {
-
- // write the SAM header text length
- uint32_t samHeaderLen = samHeaderText.size();
- if ( m_isBigEndian ) BamTools::SwapEndian_32(samHeaderLen);
- m_stream.Write((char*)&samHeaderLen, Constants::BAM_SIZEOF_INT);
-
- // write the SAM header text
- if ( samHeaderLen > 0 )
- m_stream.Write(samHeaderText.data(), samHeaderLen);
-}
+++ /dev/null
-// ***************************************************************************
-// BamWriter_p.h (c) 2010 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides the basic functionality for producing BAM files
-// ***************************************************************************
-
-#ifndef BAMWRITER_P_H
-#define BAMWRITER_P_H
-
-// -------------
-// W A R N I N G
-// -------------
-//
-// This file is not part of the BamTools API. It exists purely as an
-// implementation detail. This header file may change from version to
-// version without notice, or even be removed.
-//
-// We mean it.
-
-#include "api/BamAux.h"
-#include "api/internal/BgzfStream_p.h"
-#include <string>
-#include <vector>
-
-namespace BamTools {
-
-class BamAlignment;
-
-namespace Internal {
-
-class BamWriterPrivate {
-
- // ctor & dtor
- public:
- BamWriterPrivate(void);
- ~BamWriterPrivate(void);
-
- // interface methods
- public:
- void Close(void);
- std::string GetErrorString(void) const;
- bool IsOpen(void) const;
- bool Open(const std::string& filename,
- const std::string& samHeaderText,
- const BamTools::RefVector& referenceSequences);
- bool SaveAlignment(const BamAlignment& al);
- void SetWriteCompressed(bool ok);
-
- // 'internal' methods
- public:
- uint32_t CalculateMinimumBin(const int begin, int end) const;
- void CreatePackedCigar(const std::vector<BamTools::CigarOp>& cigarOperations, std::string& packedCigar);
- void EncodeQuerySequence(const std::string& query, std::string& encodedQuery);
- void WriteAlignment(const BamAlignment& al);
- void WriteCoreAlignment(const BamAlignment& al);
- void WriteMagicNumber(void);
- void WriteReferences(const BamTools::RefVector& referenceSequences);
- void WriteSamHeaderText(const std::string& samHeaderText);
-
- // data members
- private:
- BgzfStream m_stream;
- bool m_isBigEndian;
- std::string m_errorString;
-};
-
-} // namespace Internal
-} // namespace BamTools
-
-#endif // BAMWRITER_P_H
+++ /dev/null
-// ***************************************************************************
-// BgzfStream_p.cpp (c) 2011 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 11 October 2011(DB)
-// ---------------------------------------------------------------------------
-// Based on BGZF routines developed at the Broad Institute.
-// Provides the basic functionality for reading & writing BGZF files
-// Replaces the old BGZF.* files to avoid clashing with other toolkits
-// ***************************************************************************
-
-#include "api/BamAux.h"
-#include "api/BamConstants.h"
-#include "api/internal/BamDeviceFactory_p.h"
-#include "api/internal/BamException_p.h"
-#include "api/internal/BgzfStream_p.h"
-using namespace BamTools;
-using namespace BamTools::Internal;
-
-#include "zlib.h"
-
-#include <cstring>
-#include <algorithm>
-#include <iostream>
-#include <sstream>
-using namespace std;
-
-// ----------------------------
-// RaiiWrapper implementation
-// ----------------------------
-
-BgzfStream::RaiiWrapper::RaiiWrapper(void) {
- CompressedBlock = new char[Constants::BGZF_MAX_BLOCK_SIZE];
- UncompressedBlock = new char[Constants::BGZF_DEFAULT_BLOCK_SIZE];
-}
-
-BgzfStream::RaiiWrapper::~RaiiWrapper(void) {
-
- // clean up buffers
- delete[] CompressedBlock;
- delete[] UncompressedBlock;
- CompressedBlock = 0;
- UncompressedBlock = 0;
-}
-
-// ---------------------------
-// BgzfStream implementation
-// ---------------------------
-
-// constructor
-BgzfStream::BgzfStream(void)
- : m_blockLength(0)
- , m_blockOffset(0)
- , m_blockAddress(0)
- , m_isWriteCompressed(true)
- , m_device(0)
-{ }
-
-// destructor
-BgzfStream::~BgzfStream(void) {
- Close();
-}
-
-// checks BGZF block header
-bool BgzfStream::CheckBlockHeader(char* header) {
- return (header[0] == Constants::GZIP_ID1 &&
- header[1] == Constants::GZIP_ID2 &&
- header[2] == Z_DEFLATED &&
- (header[3] & Constants::FLG_FEXTRA) != 0 &&
- BamTools::UnpackUnsignedShort(&header[10]) == Constants::BGZF_XLEN &&
- header[12] == Constants::BGZF_ID1 &&
- header[13] == Constants::BGZF_ID2 &&
- BamTools::UnpackUnsignedShort(&header[14]) == Constants::BGZF_LEN );
-}
-
-// closes BGZF file
-void BgzfStream::Close(void) {
-
- // skip if no device open
- if ( m_device == 0 ) return;
-
- // if writing to file, flush the current BGZF block,
- // then write an empty block (as EOF marker)
- if ( m_device->IsOpen() && (m_device->Mode() == IBamIODevice::WriteOnly) ) {
- FlushBlock();
- const size_t blockLength = DeflateBlock();
- m_device->Write(Resources.CompressedBlock, blockLength);
- }
-
- // close device
- m_device->Close();
- delete m_device;
- m_device = 0;
-
- // reset state
- m_blockLength = 0;
- m_blockOffset = 0;
- m_blockAddress = 0;
- m_isWriteCompressed = true;
-}
-
-// compresses the current block
-size_t BgzfStream::DeflateBlock(void) {
-
- // initialize the gzip header
- char* buffer = Resources.CompressedBlock;
- memset(buffer, 0, 18);
- buffer[0] = Constants::GZIP_ID1;
- buffer[1] = Constants::GZIP_ID2;
- buffer[2] = Constants::CM_DEFLATE;
- buffer[3] = Constants::FLG_FEXTRA;
- buffer[9] = Constants::OS_UNKNOWN;
- buffer[10] = Constants::BGZF_XLEN;
- buffer[12] = Constants::BGZF_ID1;
- buffer[13] = Constants::BGZF_ID2;
- buffer[14] = Constants::BGZF_LEN;
-
- // set compression level
- const int compressionLevel = ( m_isWriteCompressed ? Z_DEFAULT_COMPRESSION : 0 );
-
- // loop to retry for blocks that do not compress enough
- int inputLength = m_blockOffset;
- size_t compressedLength = 0;
- const unsigned int bufferSize = Constants::BGZF_MAX_BLOCK_SIZE;
-
- while ( true ) {
-
- // initialize zstream values
- z_stream zs;
- zs.zalloc = NULL;
- zs.zfree = NULL;
- zs.next_in = (Bytef*)Resources.UncompressedBlock;
- zs.avail_in = inputLength;
- zs.next_out = (Bytef*)&buffer[Constants::BGZF_BLOCK_HEADER_LENGTH];
- zs.avail_out = bufferSize -
- Constants::BGZF_BLOCK_HEADER_LENGTH -
- Constants::BGZF_BLOCK_FOOTER_LENGTH;
-
- // initialize the zlib compression algorithm
- int status = deflateInit2(&zs,
- compressionLevel,
- Z_DEFLATED,
- Constants::GZIP_WINDOW_BITS,
- Constants::Z_DEFAULT_MEM_LEVEL,
- Z_DEFAULT_STRATEGY);
- if ( status != Z_OK )
- throw BamException("BgzfStream::DeflateBlock", "zlib deflateInit2 failed");
-
- // compress the data
- status = deflate(&zs, Z_FINISH);
-
- // if not at stream end
- if ( status != Z_STREAM_END ) {
-
- deflateEnd(&zs);
-
- // there was not enough space available in buffer
- // try to reduce the input length & re-start loop
- if ( status == Z_OK ) {
- inputLength -= 1024;
- if ( inputLength < 0 )
- throw BamException("BgzfStream::DeflateBlock", "input reduction failed");
- continue;
- }
-
- throw BamException("BgzfStream::DeflateBlock", "zlib deflate failed");
- }
-
- // finalize the compression routine
- status = deflateEnd(&zs);
- if ( status != Z_OK )
- throw BamException("BgzfStream::DeflateBlock", "zlib deflateEnd failed");
-
- // update compressedLength
- compressedLength = zs.total_out +
- Constants::BGZF_BLOCK_HEADER_LENGTH +
- Constants::BGZF_BLOCK_FOOTER_LENGTH;
- if ( compressedLength > Constants::BGZF_MAX_BLOCK_SIZE )
- throw BamException("BgzfStream::DeflateBlock", "deflate overflow");
-
- // quit while loop
- break;
- }
-
- // store the compressed length
- BamTools::PackUnsignedShort(&buffer[16], static_cast<uint16_t>(compressedLength - 1));
-
- // store the CRC32 checksum
- uint32_t crc = crc32(0, NULL, 0);
- crc = crc32(crc, (Bytef*)Resources.UncompressedBlock, inputLength);
- BamTools::PackUnsignedInt(&buffer[compressedLength - 8], crc);
- BamTools::PackUnsignedInt(&buffer[compressedLength - 4], inputLength);
-
- // ensure that we have less than a block of data left
- int remaining = m_blockOffset - inputLength;
- if ( remaining > 0 ) {
- if ( remaining > inputLength )
- throw BamException("BgzfStream::DeflateBlock", "after deflate, remainder too large");
- memcpy(Resources.UncompressedBlock, Resources.UncompressedBlock + inputLength, remaining);
- }
-
- // update block data
- m_blockOffset = remaining;
-
- // return result
- return compressedLength;
-}
-
-// flushes the data in the BGZF block
-void BgzfStream::FlushBlock(void) {
-
- BT_ASSERT_X( m_device, "BgzfStream::FlushBlock() - attempting to flush to null device" );
-
- // flush all of the remaining blocks
- while ( m_blockOffset > 0 ) {
-
- // compress the data block
- const size_t blockLength = DeflateBlock();
-
- // flush the data to our output device
- const size_t numBytesWritten = m_device->Write(Resources.CompressedBlock, blockLength);
- if ( numBytesWritten != blockLength ) {
- stringstream s("");
- s << "expected to write " << blockLength
- << " bytes during flushing, but wrote " << numBytesWritten;
- throw BamException("BgzfStream::FlushBlock", s.str());
- }
-
- // update block data
- m_blockAddress += blockLength;
- }
-}
-
-// decompresses the current block
-size_t BgzfStream::InflateBlock(const size_t& blockLength) {
-
- // setup zlib stream object
- z_stream zs;
- zs.zalloc = NULL;
- zs.zfree = NULL;
- zs.next_in = (Bytef*)Resources.CompressedBlock + 18;
- zs.avail_in = blockLength - 16;
- zs.next_out = (Bytef*)Resources.UncompressedBlock;
- zs.avail_out = Constants::BGZF_DEFAULT_BLOCK_SIZE;
-
- // initialize
- int status = inflateInit2(&zs, Constants::GZIP_WINDOW_BITS);
- if ( status != Z_OK )
- throw BamException("BgzfStream::InflateBlock", "zlib inflateInit failed");
-
- // decompress
- status = inflate(&zs, Z_FINISH);
- if ( status != Z_STREAM_END ) {
- inflateEnd(&zs);
- throw BamException("BgzfStream::InflateBlock", "zlib inflate failed");
- }
-
- // finalize
- status = inflateEnd(&zs);
- if ( status != Z_OK ) {
- inflateEnd(&zs);
- throw BamException("BgzfStream::InflateBlock", "zlib inflateEnd failed");
- }
-
- // return result
- return zs.total_out;
-}
-
-bool BgzfStream::IsOpen(void) const {
- if ( m_device == 0 )
- return false;
- return m_device->IsOpen();
-}
-
-void BgzfStream::Open(const string& filename, const IBamIODevice::OpenMode mode) {
-
- // close current device if necessary
- Close();
- BT_ASSERT_X( (m_device == 0), "BgzfStream::Open() - unable to properly close previous IO device" );
-
- // retrieve new IO device depending on filename
- m_device = BamDeviceFactory::CreateDevice(filename);
- BT_ASSERT_X( m_device, "BgzfStream::Open() - unable to create IO device from filename" );
-
- // if device fails to open
- if ( !m_device->Open(mode) ) {
- const string deviceError = m_device->GetErrorString();
- const string message = string("could not open BGZF stream: \n\t") + deviceError;
- throw BamException("BgzfStream::Open", message);
- }
-}
-
-// reads BGZF data into a byte buffer
-//
-// Copies up to 'dataLength' bytes of uncompressed data into 'data', calling
-// ReadBlock() whenever the current uncompressed block has nothing left.
-// Returns the number of bytes actually copied; this is less than 'dataLength'
-// when no further data could be obtained. Returns 0 right away if 'dataLength'
-// is 0 or if the device is not open in read-only mode.
-size_t BgzfStream::Read(char* data, const size_t dataLength) {
-
- if ( dataLength == 0 )
- return 0;
-
- // if stream not open for reading
- BT_ASSERT_X( m_device, "BgzfStream::Read() - trying to read from null device");
- if ( !m_device->IsOpen() || (m_device->Mode() != IBamIODevice::ReadOnly) )
- return 0;
-
- // read blocks as needed until desired data length is retrieved
- char* output = data;
- size_t numBytesRead = 0;
- while ( numBytesRead < dataLength ) {
-
- // determine bytes available in current block
- // NOTE(review): both members are unsigned int; storing the difference in a
- // signed int is what lets "offset beyond length" (e.g. right after Seek(),
- // which zeroes m_blockLength but keeps the offset) show up as <= 0 below.
- int bytesAvailable = m_blockLength - m_blockOffset;
-
- // read (and decompress) next block if needed
- if ( bytesAvailable <= 0 ) {
- ReadBlock();
- bytesAvailable = m_blockLength - m_blockOffset;
- // still nothing available after loading a block: stop and return what we have
- if ( bytesAvailable <= 0 )
- break;
- }
-
- // copy data from uncompressed source buffer into data destination buffer
- const size_t copyLength = min( (dataLength-numBytesRead), (size_t)bytesAvailable );
- memcpy(output, Resources.UncompressedBlock + m_blockOffset, copyLength);
-
- // update counters
- m_blockOffset += copyLength;
- output += copyLength;
- numBytesRead += copyLength;
- }
-
- // update block data
- // current block fully consumed: record the device position as the next
- // block's address and reset offset/length
- if ( m_blockOffset == m_blockLength ) {
- m_blockAddress = m_device->Tell();
- m_blockOffset = 0;
- m_blockLength = 0;
-
- }
-
- // return actual number of bytes read
- return numBytesRead;
-}
-
-// reads a BGZF block
-//
-// Reads one block header from the IO device, validates it, reads the rest of
-// the compressed block and inflates it via InflateBlock(). On success the
-// block address and length members describe the newly loaded block. If the
-// device yields no header bytes at all, the block length is set to 0 and the
-// function returns quietly.
-//
-// Throws BamException if the header is truncated, fails CheckBlockHeader(),
-// declares a total block length smaller than the header itself, or if the
-// remainder of the block cannot be read in full.
-void BgzfStream::ReadBlock(void) {
-
-    BT_ASSERT_X( m_device, "BgzfStream::ReadBlock() - trying to read from null IO device");
-
-    // store block's starting address
-    const int64_t blockAddress = m_device->Tell();
-
-    // read block header from file
-    char header[Constants::BGZF_BLOCK_HEADER_LENGTH];
-    size_t numBytesRead = m_device->Read(header, Constants::BGZF_BLOCK_HEADER_LENGTH);
-
-    // if block header empty
-    if ( numBytesRead == 0 ) {
-        m_blockLength = 0;
-        return;
-    }
-
-    // if block header invalid size
-    if ( numBytesRead != Constants::BGZF_BLOCK_HEADER_LENGTH )
-        throw BamException("BgzfStream::ReadBlock", "invalid block header size");
-
-    // validate block header contents
-    if ( !BgzfStream::CheckBlockHeader(header) )
-        throw BamException("BgzfStream::ReadBlock", "invalid block header contents");
-
-    // total block length is stored (minus one) in the 2 bytes at header offset 16
-    const size_t blockLength = BamTools::UnpackUnsignedShort(&header[16]) + 1;
-
-    // a declared length shorter than the header would make 'remaining' below
-    // wrap around (size_t) and request an enormous read into the compressed
-    // buffer, so reject such blocks up front
-    if ( blockLength < Constants::BGZF_BLOCK_HEADER_LENGTH )
-        throw BamException("BgzfStream::ReadBlock", "invalid block length in header");
-
-    // copy header contents to compressed buffer
-    memcpy(Resources.CompressedBlock, header, Constants::BGZF_BLOCK_HEADER_LENGTH);
-
-    // read remainder of block
-    const size_t remaining = blockLength - Constants::BGZF_BLOCK_HEADER_LENGTH;
-    numBytesRead = m_device->Read(&Resources.CompressedBlock[Constants::BGZF_BLOCK_HEADER_LENGTH], remaining);
-    if ( numBytesRead != remaining )
-        throw BamException("BgzfStream::ReadBlock", "could not read data from block");
-
-    // decompress block data
-    numBytesRead = InflateBlock(blockLength);
-
-    // update block data
-    // the offset is reset only when a block was already loaded; when the
-    // previous length is 0 (e.g. after Seek()) the stored offset is kept
-    if ( m_blockLength != 0 )
-        m_blockOffset = 0;
-    m_blockAddress = blockAddress;
-    m_blockLength  = numBytesRead;
-}
-
-// seek to position in BGZF file
-//
-// 'position' is a virtual offset: the low 16 bits are the offset inside the
-// uncompressed block, the bits above them are the block's address on the
-// device. Does nothing if the stream is not open. Throws BamException if the
-// device is not random-access or the device-level seek fails.
-void BgzfStream::Seek(const int64_t& position) {
-
-    BT_ASSERT_X( m_device, "BgzfStream::Seek() - trying to seek on null IO device");
-
-    // skip if device is not open
-    if ( !IsOpen() )
-        return;
-
-    // split virtual offset into its two components
-    int blockOffset = (position & 0xFFFF);
-    int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
-
-    // bail out if the device cannot (or fails to) move to the block address
-    if ( !m_device->IsRandomAccess() || !m_device->Seek(blockAddress) ) {
-        stringstream s("");
-        s << "unable to seek to position: " << position;
-        throw BamException("BgzfStream::Seek", s.str());
-    }
-
-    // seek succeeded: forget the loaded block, remember where we are
-    m_blockLength  = 0;
-    m_blockAddress = blockAddress;
-    m_blockOffset  = blockOffset;
-}
-
-// enable/disable compressed output (stores the flag for later use)
-void BgzfStream::SetWriteCompressed(bool ok)
-{
-    this->m_isWriteCompressed = ok;
-}
-
-// get file position in BGZF file
-//
-// Returns the virtual offset (block address in the upper bits, offset within
-// the block in the low 16 bits), or 0 if the stream is not open.
-int64_t BgzfStream::Tell(void) const {
-
-    if ( IsOpen() ) {
-        // pack address and in-block offset into one value
-        return ( (m_blockAddress << 16) | (m_blockOffset & 0xFFFF) );
-    }
-
-    return 0;
-}
-
-// writes the supplied data into the BGZF buffer
-//
-// Copies 'dataLength' bytes from 'data' into the uncompressed block buffer,
-// calling FlushBlock() each time the buffer reaches the default block size.
-// Returns the number of bytes accepted, or 0 if the stream is not open.
-size_t BgzfStream::Write(const char* data, const size_t dataLength) {
-
-    BT_ASSERT_X( m_device, "BgzfStream::Write() - trying to write to null IO device");
-    BT_ASSERT_X( (m_device->Mode() == IBamIODevice::WriteOnly),
-                 "BgzfStream::Write() - trying to write to non-writable IO device");
-
-    // skip if file not open for writing
-    if ( !IsOpen() )
-        return 0;
-
-    const size_t blockCapacity = Constants::BGZF_DEFAULT_BLOCK_SIZE;
-    const char* source = data;
-    size_t totalWritten = 0;
-
-    // keep filling (and flushing) blocks until every byte has been consumed
-    while ( totalWritten < dataLength ) {
-
-        // take as much as fits in the current block, but no more than is left
-        const size_t roomInBlock  = blockCapacity - m_blockOffset;
-        const size_t stillToWrite = dataLength - totalWritten;
-        unsigned int chunkLength  = min(roomInBlock, stillToWrite);
-
-        // append chunk to the uncompressed output buffer
-        memcpy(Resources.UncompressedBlock + m_blockOffset, source, chunkLength);
-
-        // advance all cursors
-        m_blockOffset += chunkLength;
-        source        += chunkLength;
-        totalWritten  += chunkLength;
-
-        // flush (& compress) output buffer when full
-        if ( m_blockOffset == blockCapacity )
-            FlushBlock();
-    }
-
-    // return actual number of bytes written
-    return totalWritten;
-}
+++ /dev/null
-// ***************************************************************************
-// BgzfStream_p.h (c) 2011 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Based on BGZF routines developed at the Broad Institute.
-// Provides the basic functionality for reading & writing BGZF files
-// Replaces the old BGZF.* files to avoid clashing with other toolkits
-// ***************************************************************************
-
-#ifndef BGZFSTREAM_P_H
-#define BGZFSTREAM_P_H
-
-// -------------
-// W A R N I N G
-// -------------
-//
-// This file is not part of the BamTools API. It exists purely as an
-// implementation detail. This header file may change from version to version
-// without notice, or even be removed.
-//
-// We mean it.
-
-#include "api/api_global.h"
-#include "api/IBamIODevice.h"
-#include <string>
-
-namespace BamTools {
-namespace Internal {
-
-// Reads and writes BGZF data through an IBamIODevice, one block at a time.
-// NOTE(review): no copy constructor/assignment is declared here while the
-// class holds a raw device pointer and raw buffers - confirm copying is never
-// intended.
-class BgzfStream {
-
- // constructor & destructor
- public:
- BgzfStream(void);
- ~BgzfStream(void);
-
- // main interface methods
- public:
- // closes BGZF file
- void Close(void);
- // returns true if BgzfStream open for IO
- bool IsOpen(void) const;
- // opens the BGZF file
- void Open(const std::string& filename, const IBamIODevice::OpenMode mode);
- // reads BGZF data into a byte buffer
- size_t Read(char* data, const size_t dataLength);
- // seek to position in BGZF file
- void Seek(const int64_t& position);
- // sets IO device (closes previous, if any, but does not attempt to open)
- void SetIODevice(IBamIODevice* device);
- // enable/disable compressed output
- void SetWriteCompressed(bool ok);
- // get file position in BGZF file
- int64_t Tell(void) const;
- // writes the supplied data into the BGZF buffer
- size_t Write(const char* data, const size_t dataLength);
-
- // internal methods
- private:
- // compresses the current block
- size_t DeflateBlock(void);
- // flushes the data in the BGZF block
- void FlushBlock(void);
- // de-compresses the current block
- size_t InflateBlock(const size_t& blockLength);
- // reads a BGZF block
- void ReadBlock(void);
-
- // static 'utility' methods
- public:
- // checks BGZF block header
- static bool CheckBlockHeader(char* header);
-
- // data members
- public:
- // number of uncompressed bytes in the currently loaded block (0 = none loaded)
- unsigned int m_blockLength;
- // current read/write position within the uncompressed block buffer
- unsigned int m_blockOffset;
- // device position at which the current block starts
- uint64_t m_blockAddress;
-
- // flag set through SetWriteCompressed()
- bool m_isWriteCompressed;
- // IO device used for all reads/writes; 0 when no device is attached
- IBamIODevice* m_device;
-
- // holds the two working buffers; constructor/destructor are defined elsewhere
- // (presumably they allocate and release the buffers - confirm)
- struct RaiiWrapper {
- RaiiWrapper(void);
- ~RaiiWrapper(void);
- char* UncompressedBlock;
- char* CompressedBlock;
- };
- RaiiWrapper Resources;
-};
-
-} // namespace Internal
-} // namespace BamTools
-
-#endif // BGZFSTREAM_P_H
+++ /dev/null
-// ***************************************************************************
-// ILocalIODevice_p.cpp (c) 2011 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides shared behavior for files & pipes
-// ***************************************************************************
-
-#include "api/internal/ILocalIODevice_p.h"
-using namespace BamTools;
-using namespace BamTools::Internal;
-
-#include <cstdio>
-using namespace std;
-
-// constructs a device with no FILE* stream attached yet
-ILocalIODevice::ILocalIODevice(void)
-    : IBamIODevice(), m_stream(0)
-{
-}
-
-// makes sure the underlying stream is closed when the device goes away
-ILocalIODevice::~ILocalIODevice(void)
-{
-    this->Close();
-}
-
-// flushes and closes the FILE* stream (no-op if the device is not open)
-void ILocalIODevice::Close(void) {
-
-    if ( IsOpen() ) {
-
-        // push out pending data, then release the FILE*
-        fflush(m_stream);
-        fclose(m_stream);
-        m_stream = 0;
-
-        // device is back in its 'not open' state
-        m_mode = IBamIODevice::NotOpen;
-    }
-}
-
-// reads up to 'numBytes' bytes from the stream into 'data';
-// returns the count reported by fread()
-size_t ILocalIODevice::Read(char* data, const unsigned int numBytes) {
-    BT_ASSERT_X( m_stream, "ILocalIODevice::Read: trying to read from null stream" );
-    BT_ASSERT_X( (m_mode == IBamIODevice::ReadOnly), "ILocalIODevice::Read: device not in read-only mode");
-    const size_t bytesRead = fread(data, sizeof(char), numBytes, m_stream);
-    return bytesRead;
-}
-
-// returns the current position of the underlying stream, as reported by ftell64()
-int64_t ILocalIODevice::Tell(void) const {
-    // assertion text previously read "fromnull stream" (missing space)
-    BT_ASSERT_X( m_stream, "ILocalIODevice::Tell: trying to get file position from null stream" );
-    return ftell64(m_stream);
-}
-
-// writes 'numBytes' bytes from 'data' to the stream;
-// returns the count reported by fwrite()
-size_t ILocalIODevice::Write(const char* data, const unsigned int numBytes) {
-    // assertion text previously read "tryint to write" (typo)
-    BT_ASSERT_X( m_stream, "ILocalIODevice::Write: trying to write to null stream" );
-    BT_ASSERT_X( (m_mode == IBamIODevice::WriteOnly), "ILocalIODevice::Write: device not in write-only mode" );
-    return fwrite(data, sizeof(char), numBytes, m_stream);
-}
+++ /dev/null
-// ***************************************************************************
-// ILocalIODevice_p.h (c) 2011 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides shared behavior for files & pipes
-// ***************************************************************************
-
-#ifndef ILOCALIODEVICE_P_H
-#define ILOCALIODEVICE_P_H
-
-// -------------
-// W A R N I N G
-// -------------
-//
-// This file is not part of the BamTools API. It exists purely as an
-// implementation detail. This header file may change from version to version
-// without notice, or even be removed.
-//
-// We mean it.
-
-#include "api/IBamIODevice.h"
-
-namespace BamTools {
-namespace Internal {
-
-// Base class providing the FILE*-backed parts of the IBamIODevice interface
-// (Close/Read/Tell/Write) that are shared by files & pipes.
-// NOTE(review): FILE is used below but this header does not include <cstdio>
-// itself; presumably it arrives via api/IBamIODevice.h - confirm.
-class ILocalIODevice : public IBamIODevice {
-
- // ctor & dtor
- public:
- ILocalIODevice(void);
- virtual ~ILocalIODevice(void);
-
- // IBamIODevice implementation
- public:
- // flushes & closes the stream, resets mode to NotOpen
- virtual void Close(void);
- // fread()-based read; returns number of bytes read
- virtual size_t Read(char* data, const unsigned int numBytes);
- // current stream position
- virtual int64_t Tell(void) const;
- // fwrite()-based write; returns number of bytes written
- virtual size_t Write(const char* data, const unsigned int numBytes);
-
- // data members
- protected:
- // underlying C stream; 0 while no stream is attached
- FILE* m_stream;
-};
-
-} // namespace Internal
-} // namespace BamTools
-
-#endif // ILOCALIODEVICE_P_H
+++ /dev/null
-// ***************************************************************************
-// SamFormatParser.cpp (c) 2010 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides functionality for parsing SAM header text into SamHeader object
-// ***************************************************************************
-
-#include "api/SamConstants.h"
-#include "api/SamHeader.h"
-#include "api/internal/BamException_p.h"
-#include "api/internal/SamFormatParser_p.h"
-using namespace BamTools;
-using namespace BamTools::Internal;
-
-#include <iostream>
-#include <sstream>
-#include <vector>
-using namespace std;
-
-// binds the parser to the header object it will populate
-SamFormatParser::SamFormatParser(SamHeader& header) : m_header(header)
-{
-}
-
-// nothing to release: the parser only references the header
-SamFormatParser::~SamFormatParser(void)
-{
-}
-
-// replaces the header's contents with the data parsed from 'headerText'
-//
-// The header is always cleared first; empty text simply leaves it cleared.
-// Otherwise each line of the text is handed to ParseSamLine().
-void SamFormatParser::Parse(const string& headerText) {
-
-    // start from a clean slate
-    m_header.Clear();
-
-    if ( !headerText.empty() ) {
-
-        // feed the text to the line parser, one line at a time
-        istringstream headerStream(headerText);
-        for ( string headerLine(""); getline(headerStream, headerLine); )
-            ParseSamLine(headerLine);
-    }
-}
-
-// dispatches one header line to the parser for its record type
-//
-// Lines shorter than 5 characters are ignored. The first 3 characters select
-// the record type; everything from the 5th character on is passed to the
-// type-specific parser. Throws BamException for an unrecognized leading token.
-void SamFormatParser::ParseSamLine(const string& line) {
-
-    // too short to hold a token plus any content
-    if ( line.length() < 5 )
-        return;
-
-    // separate leading token from the remaining fields
-    const string leadingToken = line.substr(0,3);
-    const string remainder    = line.substr(4);
-
-    if ( leadingToken == Constants::SAM_HD_BEGIN_TOKEN ) { ParseHDLine(remainder); return; }
-    if ( leadingToken == Constants::SAM_SQ_BEGIN_TOKEN ) { ParseSQLine(remainder); return; }
-    if ( leadingToken == Constants::SAM_RG_BEGIN_TOKEN ) { ParseRGLine(remainder); return; }
-    if ( leadingToken == Constants::SAM_PG_BEGIN_TOKEN ) { ParsePGLine(remainder); return; }
-    if ( leadingToken == Constants::SAM_CO_BEGIN_TOKEN ) { ParseCOLine(remainder); return; }
-
-    // none of the known record types matched
-    const string message = string("unknown token: ") + leadingToken;
-    throw BamException("SamFormatParser::ParseSamLine", message);
-}
-
-// parses the tab-delimited tags of an @HD line into the header's metadata
-//
-// line : contents of the @HD line after the leading token (as passed in by
-//        ParseSamLine())
-// Throws BamException if a field is too short to be "TAG:value", if a tag is
-// not recognized, or if no version (VN) was found.
-void SamFormatParser::ParseHDLine(const string& line) {
-
-    // split HD lines into tokens
-    const vector<string> tokens = Split(line, Constants::SAM_TAB);
-
-    // iterate over tokens
-    vector<string>::const_iterator tokenIter = tokens.begin();
-    vector<string>::const_iterator tokenEnd  = tokens.end();
-    for ( ; tokenIter != tokenEnd; ++tokenIter ) {
-        const string& token = (*tokenIter);
-
-        // substr(3) throws std::out_of_range for fields shorter than 3 chars
-        // (e.g. an empty field between two tabs); report those as a
-        // BamException like every other parse failure in this class
-        if ( token.length() < 3 ) {
-            const string message = string("malformed HD tag: ") + token;
-            throw BamException("SamFormatParser::ParseHDLine", message);
-        }
-
-        // get tag/value
-        const string tokenTag   = token.substr(0,2);
-        const string tokenValue = token.substr(3);
-
-        // set header contents
-        if      ( tokenTag == Constants::SAM_HD_VERSION_TAG    ) m_header.Version    = tokenValue;
-        else if ( tokenTag == Constants::SAM_HD_SORTORDER_TAG  ) m_header.SortOrder  = tokenValue;
-        else if ( tokenTag == Constants::SAM_HD_GROUPORDER_TAG ) m_header.GroupOrder = tokenValue;
-        else {
-            const string message = string("unknown HD tag: ") + tokenTag;
-            throw BamException("SamFormatParser::ParseHDLine", message);
-        }
-    }
-
-    // check for required tags
-    if ( !m_header.HasVersion() )
-        throw BamException("SamFormatParser::ParseHDLine", "@HD line is missing VN tag");
-}
-
-// parses the tab-delimited tags of an @SQ line and adds the resulting
-// sequence entry to the header
-//
-// line : contents of the @SQ line after the leading token (as passed in by
-//        ParseSamLine())
-// Throws BamException if a field is too short to be "TAG:value", if a tag is
-// not recognized, or if the name (SN) or length (LN) is missing.
-void SamFormatParser::ParseSQLine(const string& line) {
-
-    SamSequence seq;
-
-    // split SQ line into tokens
-    const vector<string> tokens = Split(line, Constants::SAM_TAB);
-
-    // iterate over tokens
-    vector<string>::const_iterator tokenIter = tokens.begin();
-    vector<string>::const_iterator tokenEnd  = tokens.end();
-    for ( ; tokenIter != tokenEnd; ++tokenIter ) {
-        const string& token = (*tokenIter);
-
-        // substr(3) throws std::out_of_range for fields shorter than 3 chars
-        // (e.g. an empty field between two tabs); report those as a
-        // BamException like every other parse failure in this class
-        if ( token.length() < 3 ) {
-            const string message = string("malformed SQ tag: ") + token;
-            throw BamException("SamFormatParser::ParseSQLine", message);
-        }
-
-        // get tag/value
-        const string tokenTag   = token.substr(0,2);
-        const string tokenValue = token.substr(3);
-
-        // set sequence contents
-        if      ( tokenTag == Constants::SAM_SQ_NAME_TAG       ) seq.Name       = tokenValue;
-        else if ( tokenTag == Constants::SAM_SQ_LENGTH_TAG     ) seq.Length     = tokenValue;
-        else if ( tokenTag == Constants::SAM_SQ_ASSEMBLYID_TAG ) seq.AssemblyID = tokenValue;
-        else if ( tokenTag == Constants::SAM_SQ_CHECKSUM_TAG   ) seq.Checksum   = tokenValue;
-        else if ( tokenTag == Constants::SAM_SQ_SPECIES_TAG    ) seq.Species    = tokenValue;
-        else if ( tokenTag == Constants::SAM_SQ_URI_TAG        ) seq.URI        = tokenValue;
-        else {
-            const string message = string("unknown SQ tag: ") + tokenTag;
-            throw BamException("SamFormatParser::ParseSQLine", message);
-        }
-    }
-
-    // check for required tags
-    if ( !seq.HasName() )
-        throw BamException("SamFormatParser::ParseSQLine", "@SQ line is missing SN tag");
-    if ( !seq.HasLength() )
-        throw BamException("SamFormatParser::ParseSQLine", "@SQ line is missing LN tag");
-
-    // store SAM sequence entry
-    m_header.Sequences.Add(seq);
-}
-
-// parses the tab-delimited tags of an @RG line and adds the resulting
-// read group entry to the header
-//
-// line : contents of the @RG line after the leading token (as passed in by
-//        ParseSamLine())
-// Throws BamException if a field is too short to be "TAG:value", if a tag is
-// not recognized, or if the read group ID is missing.
-void SamFormatParser::ParseRGLine(const string& line) {
-
-    SamReadGroup rg;
-
-    // split string into tokens
-    const vector<string> tokens = Split(line, Constants::SAM_TAB);
-
-    // iterate over tokens
-    vector<string>::const_iterator tokenIter = tokens.begin();
-    vector<string>::const_iterator tokenEnd  = tokens.end();
-    for ( ; tokenIter != tokenEnd; ++tokenIter ) {
-        const string& token = (*tokenIter);
-
-        // substr(3) throws std::out_of_range for fields shorter than 3 chars
-        // (e.g. an empty field between two tabs); report those as a
-        // BamException like every other parse failure in this class
-        if ( token.length() < 3 ) {
-            const string message = string("malformed RG tag: ") + token;
-            throw BamException("SamFormatParser::ParseRGLine", message);
-        }
-
-        // get token tag/value
-        const string tokenTag   = token.substr(0,2);
-        const string tokenValue = token.substr(3);
-
-        // set read group contents
-        if      ( tokenTag == Constants::SAM_RG_ID_TAG                  ) rg.ID                   = tokenValue;
-        else if ( tokenTag == Constants::SAM_RG_DESCRIPTION_TAG         ) rg.Description          = tokenValue;
-        else if ( tokenTag == Constants::SAM_RG_FLOWORDER_TAG           ) rg.FlowOrder            = tokenValue;
-        else if ( tokenTag == Constants::SAM_RG_KEYSEQUENCE_TAG         ) rg.KeySequence          = tokenValue;
-        else if ( tokenTag == Constants::SAM_RG_LIBRARY_TAG             ) rg.Library              = tokenValue;
-        else if ( tokenTag == Constants::SAM_RG_PLATFORMUNIT_TAG        ) rg.PlatformUnit         = tokenValue;
-        else if ( tokenTag == Constants::SAM_RG_PREDICTEDINSERTSIZE_TAG ) rg.PredictedInsertSize  = tokenValue;
-        else if ( tokenTag == Constants::SAM_RG_PRODUCTIONDATE_TAG      ) rg.ProductionDate       = tokenValue;
-        else if ( tokenTag == Constants::SAM_RG_PROGRAM_TAG             ) rg.Program              = tokenValue;
-        else if ( tokenTag == Constants::SAM_RG_SAMPLE_TAG              ) rg.Sample               = tokenValue;
-        else if ( tokenTag == Constants::SAM_RG_SEQCENTER_TAG           ) rg.SequencingCenter     = tokenValue;
-        else if ( tokenTag == Constants::SAM_RG_SEQTECHNOLOGY_TAG       ) rg.SequencingTechnology = tokenValue;
-        else {
-            const string message = string("unknown RG tag: ") + tokenTag;
-            throw BamException("SamFormatParser::ParseRGLine", message);
-        }
-    }
-
-    // check for required tags
-    if ( !rg.HasID() )
-        throw BamException("SamFormatParser::ParseRGLine", "@RG line is missing ID tag");
-
-    // store SAM read group entry
-    m_header.ReadGroups.Add(rg);
-}
-
-// parses the tab-delimited tags of a @PG line and adds the resulting
-// program record to the header
-//
-// line : contents of the @PG line after the leading token (as passed in by
-//        ParseSamLine())
-// Throws BamException if a field is too short to be "TAG:value", if a tag is
-// not recognized, or if the program ID is missing.
-void SamFormatParser::ParsePGLine(const string& line) {
-
-    SamProgram pg;
-
-    // split string into tokens
-    const vector<string> tokens = Split(line, Constants::SAM_TAB);
-
-    // iterate over tokens
-    vector<string>::const_iterator tokenIter = tokens.begin();
-    vector<string>::const_iterator tokenEnd  = tokens.end();
-    for ( ; tokenIter != tokenEnd; ++tokenIter ) {
-        const string& token = (*tokenIter);
-
-        // substr(3) throws std::out_of_range for fields shorter than 3 chars
-        // (e.g. an empty field between two tabs); report those as a
-        // BamException like every other parse failure in this class
-        if ( token.length() < 3 ) {
-            const string message = string("malformed PG tag: ") + token;
-            throw BamException("SamFormatParser::ParsePGLine", message);
-        }
-
-        // get token tag/value
-        const string tokenTag   = token.substr(0,2);
-        const string tokenValue = token.substr(3);
-
-        // set program record contents
-        if      ( tokenTag == Constants::SAM_PG_ID_TAG              ) pg.ID                = tokenValue;
-        else if ( tokenTag == Constants::SAM_PG_NAME_TAG            ) pg.Name              = tokenValue;
-        else if ( tokenTag == Constants::SAM_PG_COMMANDLINE_TAG     ) pg.CommandLine       = tokenValue;
-        else if ( tokenTag == Constants::SAM_PG_PREVIOUSPROGRAM_TAG ) pg.PreviousProgramID = tokenValue;
-        else if ( tokenTag == Constants::SAM_PG_VERSION_TAG         ) pg.Version           = tokenValue;
-        else {
-            const string message = string("unknown PG tag: ") + tokenTag;
-            throw BamException("SamFormatParser::ParsePGLine", message);
-        }
-    }
-
-    // check for required tags
-    if ( !pg.HasID() )
-        throw BamException("SamFormatParser::ParsePGLine", "@PG line is missing ID tag");
-
-    // store SAM program entry
-    m_header.Programs.Add(pg);
-}
-
-// stores the comment text verbatim; @CO lines carry no tags to parse
-void SamFormatParser::ParseCOLine(const string& line)
-{
-    m_header.Comments.push_back(line);
-}
-
-// splits 'line' on 'delim' using getline(), returning the pieces in order
-const vector<string> SamFormatParser::Split(const string& line, const char delim) {
-
-    vector<string> fields;
-    stringstream lineStream(line);
-
-    // pull out one delimited field per iteration until the stream runs dry
-    for ( string field; getline(lineStream, field, delim); )
-        fields.push_back(field);
-
-    return fields;
-}
+++ /dev/null
-// ***************************************************************************
-// SamFormatParser.h (c) 2010 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 23 December 2010 (DB)
-// ---------------------------------------------------------------------------
-// Provides functionality for parsing SAM header text into SamHeader object
-// ***************************************************************************
-
-#ifndef SAM_FORMAT_PARSER_H
-#define SAM_FORMAT_PARSER_H
-
-// -------------
-// W A R N I N G
-// -------------
-//
-// This file is not part of the BamTools API. It exists purely as an
-// implementation detail. This header file may change from version to version
-// without notice, or even be removed.
-//
-// We mean it.
-
-#include <string>
-#include <vector>
-
-namespace BamTools {
-
-class SamHeader;
-
-namespace Internal {
-
-// Parses SAM header text and fills in the SamHeader object it was
-// constructed with. The header is held by reference.
-class SamFormatParser {
-
- // ctor & dtor
- public:
- SamFormatParser(BamTools::SamHeader& header);
- ~SamFormatParser(void);
-
- // parse text & populate header data
- public:
- // clears the header, then parses 'headerText' line by line
- void Parse(const std::string& headerText);
-
- // internal methods
- private:
- // dispatches a single line to one of the record-specific parsers below
- void ParseSamLine(const std::string& line);
- void ParseHDLine(const std::string& line);
- void ParseSQLine(const std::string& line);
- void ParseRGLine(const std::string& line);
- void ParsePGLine(const std::string& line);
- void ParseCOLine(const std::string& line);
- // splits 'line' into fields separated by 'delim'
- const std::vector<std::string> Split(const std::string& line, const char delim);
-
- // data members
- private:
- // header being populated
- SamHeader& m_header;
-};
-
-} // namespace Internal
-} // namespace BamTools
-
-#endif // SAM_FORMAT_PARSER_H
+++ /dev/null
-// ***************************************************************************
-// SamFormatPrinter.cpp (c) 2010 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 14 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides functionality for printing formatted SAM header to string
-// ***************************************************************************
-
-#include "api/SamConstants.h"
-#include "api/SamHeader.h"
-#include "api/internal/SamFormatPrinter_p.h"
-using namespace BamTools;
-using namespace BamTools::Internal;
-
-#include <iostream>
-#include <sstream>
-#include <vector>
-using namespace std;
-
-// ------------------------
-// static utility methods
-// ------------------------
-
-// builds "<TAB><tag><COLON><value>" for a single header field
-static inline
-const string FormatTag(const string& tag, const string& value) {
-    string formatted;
-    formatted += Constants::SAM_TAB;
-    formatted += tag;
-    formatted += Constants::SAM_COLON;
-    formatted += value;
-    return formatted;
-}
-
-// ---------------------------------
-// SamFormatPrinter implementation
-// ---------------------------------
-
-// binds the printer to the (read-only) header it will format
-SamFormatPrinter::SamFormatPrinter(const SamHeader& header) : m_header(header)
-{
-}
-
-// nothing to release: the printer only references the header
-SamFormatPrinter::~SamFormatPrinter(void)
-{
-}
-
-// returns the header formatted as SAM text: @HD first, followed by the
-// @SQ, @RG, @PG and @CO sections in that order
-const string SamFormatPrinter::ToString(void) const {
-
-    // all sections are appended to this one stream
-    stringstream formatted("");
-
-    PrintHD(formatted);
-    PrintSQ(formatted);
-    PrintRG(formatted);
-    PrintPG(formatted);
-    PrintCO(formatted);
-
-    const string text = formatted.str();
-    return text;
-}
-
-// appends the @HD line, but only if the header has a version;
-// sort order and group order are added when present
-void SamFormatPrinter::PrintHD(std::stringstream& out) const {
-
-    // no version, no @HD line
-    if ( !m_header.HasVersion() )
-        return;
-
-    // @HD VN:<Version>
-    out << Constants::SAM_HD_BEGIN_TOKEN;
-    out << FormatTag(Constants::SAM_HD_VERSION_TAG, m_header.Version);
-
-    // SO:<SortOrder>
-    if ( m_header.HasSortOrder() ) {
-        out << FormatTag(Constants::SAM_HD_SORTORDER_TAG, m_header.SortOrder);
-    }
-
-    // GO:<GroupOrder>
-    if ( m_header.HasGroupOrder() ) {
-        out << FormatTag(Constants::SAM_HD_GROUPORDER_TAG, m_header.GroupOrder);
-    }
-
-    // terminate the line
-    out << endl;
-}
-
-// appends one @SQ line per sequence entry: name and length always,
-// the remaining tags only when the entry has them
-void SamFormatPrinter::PrintSQ(std::stringstream& out) const {
-
-    SamSequenceConstIterator current = m_header.Sequences.ConstBegin();
-    SamSequenceConstIterator last    = m_header.Sequences.ConstEnd();
-    while ( current != last ) {
-        const SamSequence& entry = (*current);
-
-        // @SQ SN:<Name> LN:<Length>
-        out << Constants::SAM_SQ_BEGIN_TOKEN;
-        out << FormatTag(Constants::SAM_SQ_NAME_TAG, entry.Name);
-        out << FormatTag(Constants::SAM_SQ_LENGTH_TAG, entry.Length);
-
-        // AS:<AssemblyID>
-        if ( entry.HasAssemblyID() ) { out << FormatTag(Constants::SAM_SQ_ASSEMBLYID_TAG, entry.AssemblyID); }
-
-        // M5:<Checksum>
-        if ( entry.HasChecksum() ) { out << FormatTag(Constants::SAM_SQ_CHECKSUM_TAG, entry.Checksum); }
-
-        // SP:<Species>
-        if ( entry.HasSpecies() ) { out << FormatTag(Constants::SAM_SQ_SPECIES_TAG, entry.Species); }
-
-        // UR:<URI>
-        if ( entry.HasURI() ) { out << FormatTag(Constants::SAM_SQ_URI_TAG, entry.URI); }
-
-        // terminate the line and move on
-        out << endl;
-        ++current;
-    }
-}
-
-// appends one @RG line per read group entry: the ID always, the remaining
-// tags (in the fixed order below) only when the entry has them
-void SamFormatPrinter::PrintRG(std::stringstream& out) const {
-
-    SamReadGroupConstIterator current = m_header.ReadGroups.ConstBegin();
-    SamReadGroupConstIterator last    = m_header.ReadGroups.ConstEnd();
-    while ( current != last ) {
-        const SamReadGroup& group = (*current);
-
-        // @RG ID:<ID>
-        out << Constants::SAM_RG_BEGIN_TOKEN;
-        out << FormatTag(Constants::SAM_RG_ID_TAG, group.ID);
-
-        // CN:<SequencingCenter>
-        if ( group.HasSequencingCenter() ) { out << FormatTag(Constants::SAM_RG_SEQCENTER_TAG, group.SequencingCenter); }
-
-        // DS:<Description>
-        if ( group.HasDescription() ) { out << FormatTag(Constants::SAM_RG_DESCRIPTION_TAG, group.Description); }
-
-        // DT:<ProductionDate>
-        if ( group.HasProductionDate() ) { out << FormatTag(Constants::SAM_RG_PRODUCTIONDATE_TAG, group.ProductionDate); }
-
-        // FO:<FlowOrder>
-        if ( group.HasFlowOrder() ) { out << FormatTag(Constants::SAM_RG_FLOWORDER_TAG, group.FlowOrder); }
-
-        // KS:<KeySequence>
-        if ( group.HasKeySequence() ) { out << FormatTag(Constants::SAM_RG_KEYSEQUENCE_TAG, group.KeySequence); }
-
-        // LB:<Library>
-        if ( group.HasLibrary() ) { out << FormatTag(Constants::SAM_RG_LIBRARY_TAG, group.Library); }
-
-        // PG:<Program>
-        if ( group.HasProgram() ) { out << FormatTag(Constants::SAM_RG_PROGRAM_TAG, group.Program); }
-
-        // PI:<PredictedInsertSize>
-        if ( group.HasPredictedInsertSize() ) { out << FormatTag(Constants::SAM_RG_PREDICTEDINSERTSIZE_TAG, group.PredictedInsertSize); }
-
-        // PL:<SequencingTechnology>
-        if ( group.HasSequencingTechnology() ) { out << FormatTag(Constants::SAM_RG_SEQTECHNOLOGY_TAG, group.SequencingTechnology); }
-
-        // PU:<PlatformUnit>
-        if ( group.HasPlatformUnit() ) { out << FormatTag(Constants::SAM_RG_PLATFORMUNIT_TAG, group.PlatformUnit); }
-
-        // SM:<Sample>
-        if ( group.HasSample() ) { out << FormatTag(Constants::SAM_RG_SAMPLE_TAG, group.Sample); }
-
-        // terminate the line and move on
-        out << endl;
-        ++current;
-    }
-}
-
-// appends one @PG line per program record: the ID always, the remaining
-// tags only when the record has them
-void SamFormatPrinter::PrintPG(std::stringstream& out) const {
-
-    SamProgramConstIterator current = m_header.Programs.ConstBegin();
-    SamProgramConstIterator last    = m_header.Programs.ConstEnd();
-    while ( current != last ) {
-        const SamProgram& program = (*current);
-
-        // @PG ID:<ID>
-        out << Constants::SAM_PG_BEGIN_TOKEN;
-        out << FormatTag(Constants::SAM_PG_ID_TAG, program.ID);
-
-        // PN:<Name>
-        if ( program.HasName() ) { out << FormatTag(Constants::SAM_PG_NAME_TAG, program.Name); }
-
-        // CL:<CommandLine>
-        if ( program.HasCommandLine() ) { out << FormatTag(Constants::SAM_PG_COMMANDLINE_TAG, program.CommandLine); }
-
-        // PP:<PreviousProgramID>
-        if ( program.HasPreviousProgramID() ) { out << FormatTag(Constants::SAM_PG_PREVIOUSPROGRAM_TAG, program.PreviousProgramID); }
-
-        // VN:<Version>
-        if ( program.HasVersion() ) { out << FormatTag(Constants::SAM_PG_VERSION_TAG, program.Version); }
-
-        // terminate the line and move on
-        out << endl;
-        ++current;
-    }
-}
-
-// appends one "@CO<TAB><comment>" line per stored comment
-void SamFormatPrinter::PrintCO(std::stringstream& out) const {
-
-    vector<string>::const_iterator current = m_header.Comments.begin();
-    vector<string>::const_iterator last    = m_header.Comments.end();
-    while ( current != last ) {
-
-        // @CO <Comment>
-        out << Constants::SAM_CO_BEGIN_TOKEN << Constants::SAM_TAB;
-        out << (*current) << endl;
-
-        ++current;
-    }
-}
+++ /dev/null
-// ***************************************************************************
-// SamFormatPrinter.h (c) 2010 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 6 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides functionality for printing formatted SAM header to string
-// ***************************************************************************
-
-#ifndef SAM_FORMAT_PRINTER_H
-#define SAM_FORMAT_PRINTER_H
-
-// -------------
-// W A R N I N G
-// -------------
-//
-// This file is not part of the BamTools API. It exists purely as an
-// implementation detail. This header file may change from version to version
-// without notice, or even be removed.
-//
-// We mean it.
-
-#include <sstream>
-#include <string>
-
-namespace BamTools {
-
-class SamHeader;
-
-namespace Internal {
-
-// Produces SAM-formatted header text from a SamHeader object.
-// The header is held by const reference and is never modified.
-class SamFormatPrinter {
-
- // ctor & dtor
- public:
- SamFormatPrinter(const BamTools::SamHeader& header);
- ~SamFormatPrinter(void);
-
- // generates SAM-formatted string from header data
- public:
- const std::string ToString(void) const;
-
- // internal methods
- private:
- // each method appends the lines for one record type to 'out'
- void PrintHD(std::stringstream& out) const;
- void PrintSQ(std::stringstream& out) const;
- void PrintRG(std::stringstream& out) const;
- void PrintPG(std::stringstream& out) const;
- void PrintCO(std::stringstream& out) const;
-
- // data members
- private:
- // header being formatted
- const SamHeader& m_header;
-};
-
-} // namespace Internal
-} // namespace BamTools
-
-#endif // SAM_FORMAT_PRINTER_H
+++ /dev/null
-// ***************************************************************************
-// SamHeaderValidator.cpp (c) 2010 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 14 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides functionality for validating SamHeader data
-// ***************************************************************************
-
-#include "api/SamConstants.h"
-#include "api/SamHeader.h"
-#include "api/internal/SamHeaderValidator_p.h"
-#include "api/internal/SamHeaderVersion_p.h"
-using namespace BamTools;
-using namespace BamTools::Internal;
-
-#include <cctype>
-#include <set>
-#include <sstream>
-using namespace std;
-
-// ------------------------
-// static utility methods
-// -------------------------
-
-// returns true if 'lhs' and 'rhs' have the same length and every pair of
-// characters matches after a *basic* (single-byte, current C locale)
-// toupper() conversion
-static
-bool caseInsensitiveCompare(const string& lhs, const string& rhs) {
-
-    // can omit checking chars if lengths not equal
-    const string::size_type length = lhs.length();
-    if ( length != rhs.length() )
-        return false;
-
-    // toupper() requires a value representable as unsigned char (or EOF);
-    // passing a negative plain char is undefined, so convert explicitly
-    for ( string::size_type i = 0; i < length; ++i ) {
-        const int lhsUpper = toupper( static_cast<unsigned char>(lhs[i]) );
-        const int rhsUpper = toupper( static_cast<unsigned char>(rhs[i]) );
-        if ( lhsUpper != rhsUpper )
-            return false;
-    }
-
-    // otherwise OK
-    return true;
-}
-
-// ------------------------------------------------------------------------
-// Allow validation rules to vary, as needed, between SAM header versions
-//
-// use SAM_VERSION_X_Y to tag important changes
-//
-// Together, they will allow for comparisons like:
-// if ( m_version < SAM_VERSION_2_0 ) {
-//     // use some older rule
-// } else {
-//     // use rule introduced with version 2.0
-// }
-
-// one constant per known SAM header version (major, minor)
-static const SamHeaderVersion SAM_VERSION_1_0 = SamHeaderVersion(1,0);
-static const SamHeaderVersion SAM_VERSION_1_1 = SamHeaderVersion(1,1);
-static const SamHeaderVersion SAM_VERSION_1_2 = SamHeaderVersion(1,2);
-static const SamHeaderVersion SAM_VERSION_1_3 = SamHeaderVersion(1,3);
-static const SamHeaderVersion SAM_VERSION_1_4 = SamHeaderVersion(1,4);
-
-// TODO: This functionality is currently unused.
-// Make validation "version-aware."
-//
-// ------------------------------------------------------------------------
-
-// text placed before (prefixes) and after (newline) every message stored by
-// AddError() / AddWarning()
-const string SamHeaderValidator::ERROR_PREFIX = "ERROR: ";
-const string SamHeaderValidator::WARN_PREFIX = "WARNING: ";
-const string SamHeaderValidator::NEWLINE = "\n";
-
-// binds the validator to the (read-only) header it will check
-SamHeaderValidator::SamHeaderValidator(const SamHeader& header) : m_header(header)
-{
-}
-
-// nothing to release explicitly
-SamHeaderValidator::~SamHeaderValidator(void)
-{
-}
-
-// records an error as "<ERROR_PREFIX><message><NEWLINE>"
-void SamHeaderValidator::AddError(const string& message) {
-    string entry(ERROR_PREFIX);
-    entry += message;
-    entry += NEWLINE;
-    m_errorMessages.push_back(entry);
-}
-
-// records a warning as "<WARN_PREFIX><message><NEWLINE>"
-void SamHeaderValidator::AddWarning(const string& message) {
-    string entry(WARN_PREFIX);
-    entry += message;
-    entry += NEWLINE;
-    m_warningMessages.push_back(entry);
-}
-
-// writes a summary line followed by every stored error message to 'stream';
-// writes nothing when there are no errors
-void SamHeaderValidator::PrintErrorMessages(ostream& stream) {
-
-    if ( !m_errorMessages.empty() ) {
-
-        // summary line with the number of errors
-        stream << "* SAM header has " << m_errorMessages.size() << " errors:" << endl;
-
-        // stored messages already end in a newline
-        vector<string>::const_iterator current = m_errorMessages.begin();
-        while ( current != m_errorMessages.end() ) {
-            stream << (*current);
-            ++current;
-        }
-    }
-}
-
-void SamHeaderValidator::PrintMessages(ostream& stream) {
- PrintErrorMessages(stream);
- PrintWarningMessages(stream);
-}
-
-void SamHeaderValidator::PrintWarningMessages(ostream& stream) {
-
- // skip if no warning messages
- if ( m_warningMessages.empty() )
- return;
-
- // print warning header line
- stream << "* SAM header has " << m_warningMessages.size() << " warnings:" << endl;
-
- // print each warning message
- vector<string>::const_iterator warnIter = m_warningMessages.begin();
- vector<string>::const_iterator warnEnd = m_warningMessages.end();
- for ( ; warnIter != warnEnd; ++warnIter )
- stream << (*warnIter);
-}
-
-// entry point for validation
-bool SamHeaderValidator::Validate(void) {
- bool isValid = true;
- isValid &= ValidateMetadata();
- isValid &= ValidateSequenceDictionary();
- isValid &= ValidateReadGroupDictionary();
- isValid &= ValidateProgramChain();
- return isValid;
-}
-
-// check all SAM header 'metadata'
-bool SamHeaderValidator::ValidateMetadata(void) {
- bool isValid = true;
- isValid &= ValidateVersion();
- isValid &= ValidateSortOrder();
- isValid &= ValidateGroupOrder();
- return isValid;
-}
-
-// check SAM header version tag
-bool SamHeaderValidator::ValidateVersion(void) {
-
- const string& version = m_header.Version;
-
- // warn if version not present
- if ( version.empty() ) {
- AddWarning("Version (VN) missing. Not required, but strongly recommended");
- return true;
- }
-
- // invalid if version does not contain a period
- const size_t periodFound = version.find(Constants::SAM_PERIOD);
- if ( periodFound == string::npos ) {
- AddError("Invalid version (VN) format: " + version);
- return false;
- }
-
- // invalid if major version is empty or contains non-digits
- const string majorVersion = version.substr(0, periodFound);
- if ( majorVersion.empty() || !ContainsOnlyDigits(majorVersion) ) {
- AddError("Invalid version (VN) format: " + version);
- return false;
- }
-
- // invalid if major version is empty or contains non-digits
- const string minorVersion = version.substr(periodFound + 1);
- if ( minorVersion.empty() || !ContainsOnlyDigits(minorVersion) ) {
- AddError("Invalid version (VN) format: " + version);
- return false;
- }
-
- // TODO: check if version is not just syntactically OK,
- // but is also a valid SAM version ( 1.0 .. CURRENT )
-
- // all checked out this far, then version is OK
- return true;
-}
-
-// assumes non-empty input string
-bool SamHeaderValidator::ContainsOnlyDigits(const string& s) {
- const size_t nonDigitPosition = s.find_first_not_of(Constants::SAM_DIGITS);
- return ( nonDigitPosition == string::npos ) ;
-}
-
-// validate SAM header sort order tag
-bool SamHeaderValidator::ValidateSortOrder(void) {
-
- const string& sortOrder = m_header.SortOrder;
-
- // warn if sort order not present
- if ( sortOrder.empty() ) {
- AddWarning("Sort order (SO) missing. Not required, but strongly recommended");
- return true;
- }
-
- // if sort order is valid keyword
- if ( sortOrder == Constants::SAM_HD_SORTORDER_COORDINATE ||
- sortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME ||
- sortOrder == Constants::SAM_HD_SORTORDER_UNSORTED
- )
- {
- return true;
- }
-
- // otherwise
- AddError("Invalid sort order (SO): " + sortOrder);
- return false;
-}
-
-// validate SAM header group order tag
-bool SamHeaderValidator::ValidateGroupOrder(void) {
-
- const string& groupOrder = m_header.GroupOrder;
-
- // if no group order, no problem, just return OK
- if ( groupOrder.empty() )
- return true;
-
- // if group order is valid keyword
- if ( groupOrder == Constants::SAM_HD_GROUPORDER_NONE ||
- groupOrder == Constants::SAM_HD_GROUPORDER_QUERY ||
- groupOrder == Constants::SAM_HD_GROUPORDER_REFERENCE
- )
- {
- return true;
- }
-
- // otherwise
- AddError("Invalid group order (GO): " + groupOrder);
- return false;
-}
-
-// validate SAM header sequence dictionary
-bool SamHeaderValidator::ValidateSequenceDictionary(void) {
-
- bool isValid = true;
-
- // check for unique sequence names
- isValid &= ContainsUniqueSequenceNames();
-
- // iterate over sequences
- const SamSequenceDictionary& sequences = m_header.Sequences;
- SamSequenceConstIterator seqIter = sequences.ConstBegin();
- SamSequenceConstIterator seqEnd = sequences.ConstEnd();
- for ( ; seqIter != seqEnd; ++seqIter ) {
- const SamSequence& seq = (*seqIter);
- isValid &= ValidateSequence(seq);
- }
-
- // return validation state
- return isValid;
-}
-
-// make sure all SQ names are unique
-bool SamHeaderValidator::ContainsUniqueSequenceNames(void) {
-
- bool isValid = true;
- set<string> sequenceNames;
- set<string>::iterator nameIter;
-
- // iterate over sequences
- const SamSequenceDictionary& sequences = m_header.Sequences;
- SamSequenceConstIterator seqIter = sequences.ConstBegin();
- SamSequenceConstIterator seqEnd = sequences.ConstEnd();
- for ( ; seqIter != seqEnd; ++seqIter ) {
- const SamSequence& seq = (*seqIter);
-
- // lookup sequence name
- const string& name = seq.Name;
- nameIter = sequenceNames.find(name);
-
- // error if found (duplicate entry)
- if ( nameIter != sequenceNames.end() ) {
- AddError("Sequence name (SN): " + name + " is not unique");
- isValid = false;
- }
-
- // otherwise ok, store name
- sequenceNames.insert(name);
- }
-
- // return validation state
- return isValid;
-}
-
-// validate SAM header sequence entry
-bool SamHeaderValidator::ValidateSequence(const SamSequence& seq) {
- bool isValid = true;
- isValid &= CheckNameFormat(seq.Name);
- isValid &= CheckLengthInRange(seq.Length);
- return isValid;
-}
-
-// check sequence name is valid format
-bool SamHeaderValidator::CheckNameFormat(const string& name) {
-
- // invalid if name is empty
- if ( name.empty() ) {
- AddError("Sequence entry (@SQ) is missing SN tag");
- return false;
- }
-
- // invalid if first character is a reserved char
- const char firstChar = name.at(0);
- if ( firstChar == Constants::SAM_EQUAL || firstChar == Constants::SAM_STAR ) {
- AddError("Invalid sequence name (SN): " + name);
- return false;
- }
- // otherwise OK
- return true;
-}
-
-// check that sequence length is within accepted range
-bool SamHeaderValidator::CheckLengthInRange(const string& length) {
-
- // invalid if empty
- if ( length.empty() ) {
- AddError("Sequence entry (@SQ) is missing LN tag");
- return false;
- }
-
- // convert string length to numeric
- stringstream lengthStream(length);
- unsigned int sequenceLength;
- lengthStream >> sequenceLength;
-
- // invalid if length outside accepted range
- if ( sequenceLength < Constants::SAM_SQ_LENGTH_MIN || sequenceLength > Constants::SAM_SQ_LENGTH_MAX ) {
- AddError("Sequence length (LN): " + length + " out of range");
- return false;
- }
-
- // otherwise OK
- return true;
-}
-
-// validate SAM header read group dictionary
-bool SamHeaderValidator::ValidateReadGroupDictionary(void) {
-
- bool isValid = true;
-
- // check for unique read group IDs & platform units
- isValid &= ContainsUniqueIDsAndPlatformUnits();
-
- // iterate over read groups
- const SamReadGroupDictionary& readGroups = m_header.ReadGroups;
- SamReadGroupConstIterator rgIter = readGroups.ConstBegin();
- SamReadGroupConstIterator rgEnd = readGroups.ConstEnd();
- for ( ; rgIter != rgEnd; ++rgIter ) {
- const SamReadGroup& rg = (*rgIter);
- isValid &= ValidateReadGroup(rg);
- }
-
- // return validation state
- return isValid;
-}
-
-// make sure RG IDs and platform units are unique
-bool SamHeaderValidator::ContainsUniqueIDsAndPlatformUnits(void) {
-
- bool isValid = true;
- set<string> readGroupIds;
- set<string> platformUnits;
- set<string>::iterator idIter;
- set<string>::iterator puIter;
-
- // iterate over sequences
- const SamReadGroupDictionary& readGroups = m_header.ReadGroups;
- SamReadGroupConstIterator rgIter = readGroups.ConstBegin();
- SamReadGroupConstIterator rgEnd = readGroups.ConstEnd();
- for ( ; rgIter != rgEnd; ++rgIter ) {
- const SamReadGroup& rg = (*rgIter);
-
- // --------------------------------
- // check for unique ID
-
- // lookup read group ID
- const string& id = rg.ID;
- idIter = readGroupIds.find(id);
-
- // error if found (duplicate entry)
- if ( idIter != readGroupIds.end() ) {
- AddError("Read group ID (ID): " + id + " is not unique");
- isValid = false;
- }
-
- // otherwise ok, store id
- readGroupIds.insert(id);
-
- // --------------------------------
- // check for unique platform unit
-
- // lookup platform unit
- const string& pu = rg.PlatformUnit;
- puIter = platformUnits.find(pu);
-
- // error if found (duplicate entry)
- if ( puIter != platformUnits.end() ) {
- AddError("Platform unit (PU): " + pu + " is not unique");
- isValid = false;
- }
-
- // otherwise ok, store platform unit
- platformUnits.insert(pu);
- }
-
- // return validation state
- return isValid;
-}
-
-// validate SAM header read group entry
-bool SamHeaderValidator::ValidateReadGroup(const SamReadGroup& rg) {
- bool isValid = true;
- isValid &= CheckReadGroupID(rg.ID);
- isValid &= CheckSequencingTechnology(rg.SequencingTechnology);
- return isValid;
-}
-
-// make sure RG ID exists
-bool SamHeaderValidator::CheckReadGroupID(const string& id) {
-
- // invalid if empty
- if ( id.empty() ) {
- AddError("Read group entry (@RG) is missing ID tag");
- return false;
- }
-
- // otherwise OK
- return true;
-}
-
-// make sure RG sequencing tech is one of the accepted keywords
-bool SamHeaderValidator::CheckSequencingTechnology(const string& technology) {
-
- // if no technology provided, no problem, just return OK
- if ( technology.empty() )
- return true;
-
- // if technology is valid keyword
- if ( caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_CAPILLARY) ||
- caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_HELICOS) ||
- caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA) ||
- caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_IONTORRENT) ||
- caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_LS454) ||
- caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_PACBIO) ||
- caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_SOLID)
- )
- {
- return true;
- }
-
- // otherwise
- AddError("Invalid read group sequencing platform (PL): " + technology);
- return false;
-}
-
-// validate the SAM header "program chain"
-bool SamHeaderValidator::ValidateProgramChain(void) {
- bool isValid = true;
- isValid &= ContainsUniqueProgramIds();
- isValid &= ValidatePreviousProgramIds();
- return isValid;
-}
-
-// make sure all PG IDs are unique
-bool SamHeaderValidator::ContainsUniqueProgramIds(void) {
-
- bool isValid = true;
- set<string> programIds;
- set<string>::iterator pgIdIter;
-
- // iterate over program records
- const SamProgramChain& programs = m_header.Programs;
- SamProgramConstIterator pgIter = programs.ConstBegin();
- SamProgramConstIterator pgEnd = programs.ConstEnd();
- for ( ; pgIter != pgEnd; ++pgIter ) {
- const SamProgram& pg = (*pgIter);
-
- // lookup program ID
- const string& pgId = pg.ID;
- pgIdIter = programIds.find(pgId);
-
- // error if found (duplicate entry)
- if ( pgIdIter != programIds.end() ) {
- AddError("Program ID (ID): " + pgId + " is not unique");
- isValid = false;
- }
-
- // otherwise ok, store ID
- programIds.insert(pgId);
- }
-
- // return validation state
- return isValid;
-}
-
-// make sure that any PP tags present point to existing @PG IDs
-bool SamHeaderValidator::ValidatePreviousProgramIds(void) {
-
- bool isValid = true;
-
- // iterate over program records
- const SamProgramChain& programs = m_header.Programs;
- SamProgramConstIterator pgIter = programs.ConstBegin();
- SamProgramConstIterator pgEnd = programs.ConstEnd();
- for ( ; pgIter != pgEnd; ++pgIter ) {
- const SamProgram& pg = (*pgIter);
-
- // ignore record for validation if PreviousProgramID is empty
- const string& ppId = pg.PreviousProgramID;
- if ( ppId.empty() )
- continue;
-
- // see if program "chain" contains an entry for ppId
- if ( !programs.Contains(ppId) ) {
- AddError("PreviousProgramID (PP): " + ppId + " is not a known ID");
- isValid = false;
- }
- }
-
- // return validation state
- return isValid;
-}
+++ /dev/null
-// ***************************************************************************
-// SamHeaderValidator.h (c) 2010 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 6 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides functionality for validating SamHeader data
-// ***************************************************************************
-
-#ifndef SAM_HEADER_VALIDATOR_P_H
-#define SAM_HEADER_VALIDATOR_P_H
-
-// -------------
-// W A R N I N G
-// -------------
-//
-// This file is not part of the BamTools API. It exists purely as an
-// implementation detail. This header file may change from version to version
-// without notice, or even be removed.
-//
-// We mean it.
-
-#include <iostream>
-#include <string>
-#include <vector>
-
-namespace BamTools {
-
-class SamHeader;
-class SamReadGroup;
-class SamSequence;
-
-namespace Internal {
-
-class SamHeaderValidator {
-
- // ctor & dtor
- public:
- SamHeaderValidator(const SamHeader& header);
- ~SamHeaderValidator(void);
-
- // SamHeaderValidator interface
- public:
-
- // prints error & warning messages
- void PrintMessages(std::ostream& stream);
-
- // validates SamHeader data, returns true/false accordingly
- bool Validate(void);
-
- // internal methods
- private:
-
- // validate header metadata
- bool ValidateMetadata(void);
- bool ValidateVersion(void);
- bool ContainsOnlyDigits(const std::string& s);
- bool ValidateSortOrder(void);
- bool ValidateGroupOrder(void);
-
- // validate sequence dictionary
- bool ValidateSequenceDictionary(void);
- bool ContainsUniqueSequenceNames(void);
- bool CheckNameFormat(const std::string& name);
- bool ValidateSequence(const SamSequence& seq);
- bool CheckLengthInRange(const std::string& length);
-
- // validate read group dictionary
- bool ValidateReadGroupDictionary(void);
- bool ContainsUniqueIDsAndPlatformUnits(void);
- bool ValidateReadGroup(const SamReadGroup& rg);
- bool CheckReadGroupID(const std::string& id);
- bool CheckSequencingTechnology(const std::string& technology);
-
- // validate program data
- bool ValidateProgramChain(void);
- bool ContainsUniqueProgramIds(void);
- bool ValidatePreviousProgramIds(void);
-
- // error reporting
- void AddError(const std::string& message);
- void AddWarning(const std::string& message);
- void PrintErrorMessages(std::ostream& stream);
- void PrintWarningMessages(std::ostream& stream);
-
- // data members
- private:
-
- // SamHeader being validated
- const SamHeader& m_header;
-
- // error reporting helpers
- static const std::string ERROR_PREFIX;
- static const std::string WARN_PREFIX;
- static const std::string NEWLINE;
-
- // error reporting messages
- std::vector<std::string> m_errorMessages;
- std::vector<std::string> m_warningMessages;
-};
-
-} // namespace Internal
-} // namespace BamTools
-
-#endif // SAM_HEADER_VALIDATOR_P_H
+++ /dev/null
-// ***************************************************************************
-// SamHeaderVersion.h (c) 2010 Derek Barnett
-// Marth Lab, Department of Biology, Boston College
-// ---------------------------------------------------------------------------
-// Last modified: 10 October 2011 (DB)
-// ---------------------------------------------------------------------------
-// Provides functionality for comparing SAM header versions
-// *************************************************************************
-
-#ifndef SAM_HEADERVERSION_P_H
-#define SAM_HEADERVERSION_P_H
-
-// -------------
-// W A R N I N G
-// -------------
-//
-// This file is not part of the BamTools API. It exists purely as an
-// implementation detail. This header file may change from version to version
-// without notice, or even be removed.
-//
-// We mean it.
-
-#include "api/SamConstants.h"
-#include <sstream>
-#include <string>
-
-namespace BamTools {
-namespace Internal {
-
-class SamHeaderVersion {
-
- // ctors & dtor
- public:
- SamHeaderVersion(void)
- : m_majorVersion(0)
- , m_minorVersion(0)
- { }
-
- explicit SamHeaderVersion(const std::string& version)
- : m_majorVersion(0)
- , m_minorVersion(0)
- {
- SetVersion(version);
- }
-
- SamHeaderVersion(const unsigned int& major, const unsigned int& minor)
- : m_majorVersion(major)
- , m_minorVersion(minor)
- { }
-
- ~SamHeaderVersion(void) {
- m_majorVersion = 0;
- m_minorVersion = 0;
- }
-
- // acess data
- public:
- unsigned int MajorVersion(void) const { return m_majorVersion; }
- unsigned int MinorVersion(void) const { return m_minorVersion; }
-
- void SetVersion(const std::string& version);
- std::string ToString(void) const;
-
- // data members
- private:
- unsigned int m_majorVersion;
- unsigned int m_minorVersion;
-};
-
-inline
-void SamHeaderVersion::SetVersion(const std::string& version) {
-
- // do nothing if version is empty
- if ( !version.empty() ) {
-
- std::stringstream versionStream("");
-
- // do nothing if period not found
- const size_t periodFound = version.find(Constants::SAM_PERIOD);
- if ( periodFound != std::string::npos ) {
-
- // store major version if non-empty and contains only digits
- const std::string& majorVersion = version.substr(0, periodFound);
- versionStream.str(majorVersion);
- if ( !majorVersion.empty() ) {
- const size_t nonDigitFound = majorVersion.find_first_not_of(Constants::SAM_DIGITS);
- if ( nonDigitFound == std::string::npos )
- versionStream >> m_majorVersion;
- }
-
- // store minor version if non-empty and contains only digits
- const std::string& minorVersion = version.substr(periodFound + 1);
- versionStream.str(minorVersion);
- if ( !minorVersion.empty() ) {
- const size_t nonDigitFound = minorVersion.find_first_not_of(Constants::SAM_DIGITS);
- if ( nonDigitFound == std::string::npos )
- versionStream >> m_minorVersion;
- }
- }
- }
-}
-
-// -----------------------------------------------------
-// printing
-
-inline std::string SamHeaderVersion::ToString(void) const {
- std::stringstream version;
- version << m_majorVersion << Constants::SAM_PERIOD << m_minorVersion;
- return version.str();
-}
-
-// -----------------------------------------------------
-// comparison operators
-
-inline bool operator==(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) {
- return (lhs.MajorVersion() == rhs.MajorVersion()) &&
- (lhs.MinorVersion() == rhs.MinorVersion());
-}
-
-inline bool operator<(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) {
- if ( lhs.MajorVersion() == rhs.MajorVersion() )
- return lhs.MinorVersion() < rhs.MinorVersion();
- else
- return lhs.MajorVersion() < rhs.MajorVersion();
-}
-
-inline bool operator> (const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) { return rhs < lhs; }
-inline bool operator<=(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) { return !(lhs>rhs); }
-inline bool operator>=(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) { return !(lhs<rhs); }
-
-} // namespace Internal
-} // namespace BamTools
-
-#endif // SAM_HEADERVERSION_P_H
--- /dev/null
+// ***************************************************************************
+// BamHeader_p.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for handling BAM headers.
+// ***************************************************************************
+
+#include "api/BamAux.h"
+#include "api/BamConstants.h"
+#include "api/internal/bam/BamHeader_p.h"
+#include "api/internal/io/BgzfStream_p.h"
+#include "api/internal/utils/BamException_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cstdlib>
+#include <cstring>
+using namespace std;
+
+// ------------------------
+// static utility methods
+// ------------------------
+
+// compares the leading bytes of @buffer against the expected BAM magic number,
+// returns true on an exact match
+static inline
+bool isValidMagicNumber(const char* buffer) {
+    const int comparison = strncmp(buffer,
+                                   Constants::BAM_HEADER_MAGIC,
+                                   Constants::BAM_HEADER_MAGIC_LENGTH);
+    return ( comparison == 0 );
+}
+
+// --------------------------
+// BamHeader implementation
+// --------------------------
+
+// ctor - empty body; the SamHeader member is default-constructed
+BamHeader::BamHeader(void) { }
+
+// dtor - empty body; no explicit cleanup is performed here
+BamHeader::~BamHeader(void) { }
+
+// reads magic number from BGZF stream and validates it
+// returns nothing: throws BamException if the magic number cannot be read in
+// full, or if the bytes read do not match the expected BAM magic number
+void BamHeader::CheckMagicNumber(BgzfStream* stream) {
+
+    // expected byte count, kept as size_t so the check below does not mix
+    // signed and unsigned operands
+    const size_t magicLength = static_cast<size_t>(Constants::BAM_HEADER_MAGIC_LENGTH);
+
+    // try to read magic number
+    char buffer[Constants::BAM_HEADER_MAGIC_LENGTH];
+    const size_t numBytesRead = stream->Read(buffer, Constants::BAM_HEADER_MAGIC_LENGTH);
+    if ( numBytesRead != magicLength )
+        throw BamException("BamHeader::CheckMagicNumber", "could not read magic number");
+
+    // validate magic number
+    if ( !isValidMagicNumber(buffer) )
+        throw BamException("BamHeader::CheckMagicNumber", "invalid magic number");
+}
+
+// clear SamHeader data (forwards to SamHeader::Clear on the stored header)
+void BamHeader::Clear(void) {
+ m_header.Clear();
+}
+
+// return true if SamHeader data is valid
+// (the decision is delegated entirely to SamHeader::IsValid)
+bool BamHeader::IsValid(void) const {
+ return m_header.IsValid();
+}
+
+// loads the BAM header from @stream: the 'magic number' first, then the SAM
+// header text length, then the SAM header text itself
+// (each of the three steps throws BamException on failure)
+void BamHeader::Load(BgzfStream* stream) {
+
+    // magic number must check out before anything else is read
+    CheckMagicNumber(stream);
+
+    // header text is stored as <length><text>
+    uint32_t textLength = 0;
+    ReadHeaderLength(stream, textLength);
+    ReadHeaderText(stream, textLength);
+}
+
+// reads the SAM header text length from the BGZF stream into @length
+// throws BamException if the full length field could not be read
+void BamHeader::ReadHeaderLength(BgzfStream* stream, uint32_t& length) {
+
+    // pull the raw bytes of the length field off the stream
+    const size_t fieldSize = sizeof(uint32_t);
+    char rawLength[sizeof(uint32_t)];
+    const size_t bytesObtained = stream->Read(rawLength, fieldSize);
+    if ( bytesObtained != fieldSize )
+        throw BamException("BamHeader::ReadHeaderLength", "could not read header length");
+
+    // unpack the bytes, swapping byte order when running on a big-endian system
+    length = BamTools::UnpackUnsignedInt(rawLength);
+    if ( BamTools::SystemIsBigEndian() )
+        BamTools::SwapEndian_32(length);
+}
+
+// reads SAM header text from BGZF stream, stores in SamHeader object
+// throws BamException if fewer than @length bytes could be read, or if
+// @length is too large to be buffered on this platform
+void BamHeader::ReadHeaderText(BgzfStream* stream, const uint32_t& length) {
+
+    // size the buffer using size_t arithmetic: 'length + 1' evaluated as
+    // uint32_t wraps to 0 when length == UINT32_MAX, which would under-allocate
+    const size_t bufferSize = static_cast<size_t>(length) + 1;
+    if ( bufferSize == 0 )
+        throw BamException("BamHeader::ReadHeaderText", "header text length too large");
+
+    // zero-filled buffer that releases itself; the extra byte keeps the text
+    // null-terminated. Allocation failure surfaces as an exception instead of
+    // a null pointer, and nothing leaks if Read() or SetHeaderText() throws.
+    string headerText(bufferSize, '\0');
+    const size_t bytesRead = stream->Read(&headerText[0], length);
+
+    // if error reading, throw (buffer is released automatically)
+    if ( bytesRead != length )
+        throw BamException("BamHeader::ReadHeaderText", "could not read header text");
+
+    // otherwise, text was read OK
+    // store everything up to the first null char, exactly as the previous
+    // C-string conversion did
+    m_header.SetHeaderText( string(headerText.c_str()) );
+}
+
+// returns *copy* of SamHeader data object
+// (return is by value, so changes made by the caller do not affect m_header)
+SamHeader BamHeader::ToSamHeader(void) const {
+ return m_header;
+}
+
+// returns SAM-formatted string of header data
+// (formatting is delegated to SamHeader::ToString)
+string BamHeader::ToString(void) const {
+ return m_header.ToString();
+}
--- /dev/null
+// ***************************************************************************
+// BamHeader_p.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for handling BAM headers.
+// ***************************************************************************
+
+#ifndef BAMHEADER_P_H
+#define BAMHEADER_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include "api/SamHeader.h"
+#include <string>
+
+namespace BamTools {
+namespace Internal {
+
+class BgzfStream;
+
+// Holds the SAM header data read from a BAM file's BGZF stream.
+// Loading failures are reported by throwing BamException, not by return value.
+class BamHeader {
+
+ // ctor & dtor
+ public:
+ BamHeader(void);
+ ~BamHeader(void);
+
+ // BamHeader interface
+ public:
+ // clear SamHeader data
+ void Clear(void);
+ // return true if SamHeader data is valid
+ bool IsValid(void) const;
+ // load BAM header ('magic number' and SAM header text) from BGZF stream
+ // returns nothing; throws BamException if any part cannot be read or
+ // if the magic number is invalid
+ void Load(BgzfStream* stream);
+ // returns (editable) copy of SamHeader data object
+ SamHeader ToSamHeader(void) const;
+ // returns SAM-formatted string of header data
+ std::string ToString(void) const;
+
+ // internal methods
+ private:
+ // reads magic number from BGZF stream, throws BamException if it
+ // cannot be read or does not match
+ void CheckMagicNumber(BgzfStream* stream);
+ // reads SAM header length from BGZF stream, stores it in @length
+ void ReadHeaderLength(BgzfStream* stream, uint32_t& length);
+ // reads SAM header text from BGZF stream, stores in SamHeader object
+ void ReadHeaderText(BgzfStream* stream, const uint32_t& length);
+
+ // data members
+ private:
+ // parsed header data; filled in by ReadHeaderText(), emptied by Clear()
+ SamHeader m_header;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMHEADER_P_H
--- /dev/null
+// ***************************************************************************
+// BamMultiMerger_p.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides merging functionality for BamMultiReader. At this point, supports
+// sorting results by (refId, position) or by read name.
+// ***************************************************************************
+
+#ifndef BAMMULTIMERGER_P_H
+#define BAMMULTIMERGER_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include "api/BamAlignment.h"
+#include "api/BamReader.h"
+#include "api/algorithms/Sort.h"
+#include <deque>
+#include <functional>
+#include <set>
+#include <string>
+
+namespace BamTools {
+namespace Internal {
+
+// Pairs a reader with an alignment so mergers can track which reader each
+// cached alignment belongs to. Both pointers are stored as-is: copying a
+// MergeItem copies the pointers only, and the destructor frees neither.
+struct MergeItem {
+
+ // data members
+ BamReader* Reader;
+ BamAlignment* Alignment;
+
+ // ctors & dtor
+ // both pointers default to 0 (null)
+ MergeItem(BamReader* reader = 0,
+ BamAlignment* alignment = 0)
+ : Reader(reader)
+ , Alignment(alignment)
+ { }
+
+ // shallow copy - the new item points at the same reader & alignment
+ MergeItem(const MergeItem& other)
+ : Reader(other.Reader)
+ , Alignment(other.Alignment)
+ { }
+
+ // empty - pointed-to objects are not deleted here
+ ~MergeItem(void) { }
+};
+
+// adapts a BamAlignment comparator (@Compare) so that it can order MergeItems:
+// two items are compared by comparing the alignments they point to
+template<typename Compare>
+struct MergeItemSorter : public std::binary_function<MergeItem, MergeItem, bool> {
+
+    public:
+        MergeItemSorter(const Compare& comp = Compare())
+            : m_comp(comp)
+        { }
+
+        // const-qualified: ordered containers (e.g. std::multiset) may invoke
+        // their comparison object through a const reference
+        // N.B. - both items must hold a non-null Alignment pointer
+        bool operator()(const MergeItem& lhs, const MergeItem& rhs) const {
+            const BamAlignment& l = *lhs.Alignment;
+            const BamAlignment& r = *rhs.Alignment;
+            return m_comp(l,r);
+        }
+
+    private:
+        // mutable so that Compare types whose own operator() is not declared
+        // const can still be called from the const operator() above
+        mutable Compare m_comp;
+};
+
+// pure ABC so we can just work polymorphically with any specific merger implementation
+// (the method notes below describe the MultiMerger implementations in this header)
+class IMultiMerger {
+
+ public:
+ IMultiMerger(void) { }
+ virtual ~IMultiMerger(void) { }
+ public:
+ // stores @item in the merger
+ virtual void Add(MergeItem item) =0;
+ // removes all stored items
+ virtual void Clear(void) =0;
+ // returns reference to the first stored item, without removing it
+ // N.B. - implementations in this header do not check for an empty merger
+ virtual const MergeItem& First(void) const =0;
+ // returns true if no items are stored
+ virtual bool IsEmpty(void) const =0;
+ // removes the first stored item whose reader has the same filename as @reader
+ virtual void Remove(BamReader* reader) =0;
+ // returns number of stored items
+ virtual int Size(void) const =0;
+ // removes the first stored item and returns a copy of it
+ // N.B. - implementations in this header do not check for an empty merger
+ virtual MergeItem TakeFirst(void) =0;
+};
+
+// general merger
+// keeps its items in a std::multiset ordered by MergeItemSorter<Compare>,
+// so First()/TakeFirst() work on the item that sorts first under @Compare
+template<typename Compare>
+class MultiMerger : public IMultiMerger {
+
+ public:
+ typedef Compare CompareType;
+ typedef MergeItemSorter<CompareType> MergeType;
+
+ public:
+ // @comp is wrapped in a MergeItemSorter and handed to the container
+ explicit MultiMerger(const Compare& comp = Compare())
+ : IMultiMerger()
+ , m_data( MergeType(comp) )
+ { }
+ ~MultiMerger(void) { }
+
+ // IMultiMerger implementation
+ public:
+ void Add(MergeItem item);
+ void Clear(void);
+ const MergeItem& First(void) const;
+ bool IsEmpty(void) const;
+ void Remove(BamReader* reader);
+ int Size(void) const;
+ MergeItem TakeFirst(void);
+
+ private:
+ typedef MergeItem ValueType;
+ // multiset: items that compare equal are all kept
+ typedef std::multiset<ValueType, MergeType> ContainerType;
+ typedef typename ContainerType::iterator DataIterator;
+ typedef typename ContainerType::const_iterator DataConstIterator;
+ ContainerType m_data;
+};
+
+// inserts @item into the sorted cache, first building the alignment's
+// character data when the comparator says it needs it
+template <typename Compare>
+inline void MultiMerger<Compare>::Add(MergeItem item) {
+
+    // N.B. - any future custom Compare types must define UsesCharData()
+    //        see algorithms/Sort.h
+    const bool needsCharData = CompareType::UsesCharData();
+    if ( needsCharData )
+        item.Alignment->BuildCharData();
+
+    m_data.insert(item);
+}
+
+// removes every cached item (pointed-to readers & alignments are not deleted here)
+template <typename Compare>
+inline void MultiMerger<Compare>::Clear(void) {
+ m_data.clear();
+}
+
+// returns a reference to the item that sorts first, leaving it in the cache
+// N.B. - no emptiness check is made here
+template <typename Compare>
+inline const MergeItem& MultiMerger<Compare>::First(void) const {
+    return *m_data.begin();
+}
+
+// returns true if no items are cached
+template <typename Compare>
+inline bool MultiMerger<Compare>::IsEmpty(void) const {
+ return m_data.empty();
+}
+// removes the first cached item whose reader has the same filename as @reader
+// (does nothing if @reader is null or no cached item matches)
+template <typename Compare>
+inline void MultiMerger<Compare>::Remove(BamReader* reader) {
+
+    // nothing to do without a reader
+    if ( reader == 0 )
+        return;
+
+    const std::string& targetFilename = reader->GetFilename();
+
+    // scan cached items for the first one read from the same file
+    DataIterator itemIter = m_data.begin();
+    const DataIterator itemEnd = m_data.end();
+    while ( itemIter != itemEnd ) {
+
+        // items without a reader can never match
+        const BamReader* cachedReader = itemIter->Reader;
+        if ( cachedReader != 0 && cachedReader->GetFilename() == targetFilename ) {
+            // erase that single entry & stop (the iterator is not used again)
+            m_data.erase(itemIter);
+            return;
+        }
+
+        ++itemIter;
+    }
+}
+// returns the number of cached items (container size converted to int)
+template <typename Compare>
+inline int MultiMerger<Compare>::Size(void) const {
+    return static_cast<int>( m_data.size() );
+}
+
+// removes the item that sorts first from the cache and returns a copy of it
+// N.B. - no emptiness check is made here
+template <typename Compare>
+inline MergeItem MultiMerger<Compare>::TakeFirst(void) {
+
+    // copy the item out before its node is erased
+    const DataIterator head = m_data.begin();
+    MergeItem taken( *head );
+    m_data.erase(head);
+    return taken;
+}
+
+// unsorted "merger"
+// specialization that does no sorting: items are kept in a std::deque,
+// appended at the back by Add() and taken from the front by TakeFirst()
+template<>
+class MultiMerger<Algorithms::Sort::Unsorted> : public IMultiMerger {
+
+ public:
+ // @comp is accepted for signature parity with the primary template; it is not used
+ explicit MultiMerger(const Algorithms::Sort::Unsorted& comp = Algorithms::Sort::Unsorted())
+ : IMultiMerger()
+ { }
+ ~MultiMerger(void) { }
+
+ // IMultiMerger implementation
+ public:
+ void Add(MergeItem item);
+ void Clear(void);
+ const MergeItem& First(void) const;
+ bool IsEmpty(void) const;
+ void Remove(BamReader* reader);
+ int Size(void) const;
+ MergeItem TakeFirst(void);
+
+ private:
+ typedef MergeItem ValueType;
+ typedef std::deque<ValueType> ContainerType;
+ typedef ContainerType::iterator DataIterator;
+ typedef ContainerType::const_iterator DataConstIterator;
+ ContainerType m_data;
+};
+
+// appends @item at the back of the queue
+// (unlike the sorted merger, BuildCharData() is not called here)
+inline
+void MultiMerger<Algorithms::Sort::Unsorted>::Add(MergeItem item) {
+ m_data.push_back(item);
+}
+
+// removes every queued item (pointed-to readers & alignments are not deleted here)
+inline
+void MultiMerger<Algorithms::Sort::Unsorted>::Clear(void) {
+ m_data.clear();
+}
+
+// returns a reference to the item at the front of the queue, without removing it
+// N.B. - no emptiness check is made here
+inline
+const MergeItem& MultiMerger<Algorithms::Sort::Unsorted>::First(void) const {
+ return m_data.front();
+}
+
+// returns true if no items are queued
+inline
+bool MultiMerger<Algorithms::Sort::Unsorted>::IsEmpty(void) const {
+ return m_data.empty();
+}
+
+// removes the first queued item whose reader has the same filename as @reader
+// (does nothing if @reader is null or no queued item matches)
+inline
+void MultiMerger<Algorithms::Sort::Unsorted>::Remove(BamReader* reader) {
+
+    // nothing to do without a reader
+    if ( reader == 0 )
+        return;
+
+    const std::string targetFilename = reader->GetFilename();
+
+    // scan queued items for the first one read from the same file
+    DataIterator itemIter = m_data.begin();
+    const DataIterator itemEnd = m_data.end();
+    while ( itemIter != itemEnd ) {
+
+        // items without a reader can never match
+        const BamReader* queuedReader = itemIter->Reader;
+        if ( queuedReader != 0 && queuedReader->GetFilename() == targetFilename ) {
+            // erase that single entry & stop (iterators are not used again)
+            m_data.erase(itemIter);
+            return;
+        }
+
+        ++itemIter;
+    }
+}
+
+// returns the number of queued items (container size converted to int)
+inline
+int MultiMerger<Algorithms::Sort::Unsorted>::Size(void) const {
+    return static_cast<int>( m_data.size() );
+}
+
+// removes the item at the front of the queue and returns a copy of it
+// N.B. - no emptiness check is made here
+inline
+MergeItem MultiMerger<Algorithms::Sort::Unsorted>::TakeFirst(void) {
+
+    // copy the front item out before it is popped
+    MergeItem taken( m_data.front() );
+    m_data.pop_front();
+    return taken;
+}
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMMULTIMERGER_P_H
--- /dev/null
+// ***************************************************************************
+// BamMultiReader_p.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Functionality for simultaneously reading multiple BAM files
+// *************************************************************************
+
+#include "api/BamAlignment.h"
+#include "api/BamMultiReader.h"
+#include "api/SamConstants.h"
+#include "api/algorithms/Sort.h"
+#include "api/internal/bam/BamMultiReader_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <sstream>
+using namespace std;
+
+// ctor
+BamMultiReaderPrivate::BamMultiReaderPrivate(void)
+ : m_alignmentCache(0)
+{ }
+
+// dtor - closes any files that are still open (result of Close() is ignored)
+BamMultiReaderPrivate::~BamMultiReaderPrivate(void) {
+    this->Close();
+}
+
+// Closes all BAM files currently held by the multireader.
+// Returns true if every reader closed cleanly; otherwise stores a combined
+// error message (see GetErrorString()) and returns false.
+bool BamMultiReaderPrivate::Close(void) {
+
+    m_errorString.clear();
+
+    // delegate to CloseFiles(), passing the filenames of all current readers
+    const bool closedOk = CloseFiles( Filenames() );
+    if ( !closedOk ) {
+        // wrap the per-reader errors gathered by CloseFiles()
+        const string details = m_errorString;
+        const string message = string("error encountered while closing all files: \n\t") + details;
+        SetErrorString("BamMultiReader::Close", message);
+    }
+    return closedOk;
+}
+
+// Closes the single BAM file identified by 'filename'.
+// Returns true on success; on failure stores an error message naming the
+// file and returns false.
+bool BamMultiReaderPrivate::CloseFile(const string& filename) {
+
+    m_errorString.clear();
+
+    // reuse the multi-file implementation with a one-element list
+    const vector<string> singleFile(1, filename);
+    const bool closedOk = CloseFiles(singleFile);
+    if ( !closedOk ) {
+        const string details = m_errorString;
+        const string message = string("error while closing file: ") + filename + "\n" + details;
+        SetErrorString("BamMultiReader::CloseFile", message);
+    }
+    return closedOk;
+}
+
+// Closes the readers whose filenames appear in 'filenames'.
+//
+// For each non-empty filename, the first reader reporting that filename is
+// removed from the alignment cache (if a cache exists), closed, deleted along
+// with its alignment buffer, and erased from the reader list. Once no readers
+// remain, the alignment cache itself is destroyed.
+//
+// Returns true if every matched reader closed without error; otherwise the
+// readers' error strings are accumulated in m_errorString and false is returned.
+bool BamMultiReaderPrivate::CloseFiles(const vector<string>& filenames) {
+
+    bool errorsEncountered = false;
+    m_errorString.clear();
+
+    // iterate over filenames
+    vector<string>::const_iterator filesIter = filenames.begin();
+    vector<string>::const_iterator filesEnd  = filenames.end();
+    for ( ; filesIter != filesEnd; ++filesIter ) {
+        const string& filename = (*filesIter);
+        if ( filename.empty() ) continue;
+
+        // iterate over readers
+        vector<MergeItem>::iterator readerIter = m_readers.begin();
+        vector<MergeItem>::iterator readerEnd  = m_readers.end();
+        for ( ; readerIter != readerEnd; ++readerIter ) {
+            MergeItem& item = (*readerIter);
+            BamReader* reader = item.Reader;
+            if ( reader == 0 ) continue;
+
+            // skip readers that don't match requested filename
+            if ( reader->GetFilename() != filename ) continue;
+
+            // remove reader's entry from alignment cache
+            // N.B. - the cache may not exist yet: Open() stores readers before the
+            //        cache is created, and can fail (and return) in between
+            if ( m_alignmentCache )
+                m_alignmentCache->Remove(reader);
+
+            // close reader, recording any error it reports
+            if ( !reader->Close() ) {
+                m_errorString.append(1, '\t');
+                m_errorString.append(reader->GetErrorString());
+                m_errorString.append(1, '\n');
+                errorsEncountered = true;
+            }
+
+            // clean up reader & its alignment buffer
+            delete reader;
+            delete item.Alignment;
+
+            // remove reader from reader list
+            m_readers.erase(readerIter);
+
+            // on match, just go on to next filename
+            // (no need to keep looking and item iterator is invalid now anyway)
+            break;
+        }
+    }
+
+    // make sure alignment cache is cleaned up if all readers closed
+    if ( m_readers.empty() && m_alignmentCache ) {
+        m_alignmentCache->Clear();
+        delete m_alignmentCache;
+        m_alignmentCache = 0;
+    }
+
+    // return whether all readers closed OK
+    return !errorsEncountered;
+}
+
+// Creates an index of the requested 'type' for every open reader that does
+// not already have one. Returns true if all attempted creations succeeded;
+// otherwise stores the collected reader errors and returns false.
+bool BamMultiReaderPrivate::CreateIndexes(const BamIndex::IndexType& type) {
+
+    m_errorString.clear();
+    bool allCreated = true;
+
+    const size_t readerCount = m_readers.size();
+    for ( size_t i = 0; i < readerCount; ++i ) {
+        BamReader* reader = m_readers[i].Reader;
+
+        // nothing to do for null entries or readers that already have an index
+        if ( reader == 0 || reader->HasIndex() ) continue;
+
+        // attempt creation; on failure remember the reader's error text
+        if ( reader->CreateIndex(type) ) continue;
+        m_errorString.append(1, '\t');
+        m_errorString.append(reader->GetErrorString());
+        m_errorString.append(1, '\n');
+        allCreated = false;
+    }
+
+    if ( allCreated )
+        return true;
+
+    // wrap the accumulated per-reader errors in a single message
+    const string details = m_errorString;
+    const string message = string("error while creating index files: ") + "\n" + details;
+    SetErrorString("BamMultiReader::CreateIndexes", message);
+    return false;
+}
+
+// Allocates a new merger whose ordering matches the sort order declared in
+// the merged header. The caller takes ownership of the returned object.
+IMultiMerger* BamMultiReaderPrivate::CreateAlignmentCache(void) const {
+
+    // sort order comes from the merged SAM header
+    const SamHeader header = GetHeader();
+    const string& sortOrder = header.SortOrder;
+
+    IMultiMerger* merger = 0;
+    if ( sortOrder == Constants::SAM_HD_SORTORDER_COORDINATE ) {
+        // files sorted by position
+        merger = new MultiMerger<Algorithms::Sort::ByPosition>();
+    }
+    else if ( sortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME ) {
+        // files sorted by read name
+        merger = new MultiMerger<Algorithms::Sort::ByName>();
+    }
+    else {
+        // any other value: use the unsorted merger and just read in
+        merger = new MultiMerger<Algorithms::Sort::Unsorted>();
+    }
+    return merger;
+}
+
+// Returns the filenames of all current readers, in reader order.
+// Null reader entries and readers reporting an empty filename are omitted.
+const vector<string> BamMultiReaderPrivate::Filenames(void) const {
+
+    const size_t readerCount = m_readers.size();
+
+    vector<string> names;
+    names.reserve(readerCount);
+
+    for ( size_t i = 0; i < readerCount; ++i ) {
+        const BamReader* reader = m_readers[i].Reader;
+        if ( reader == 0 ) continue;
+
+        const string& name = reader->GetFilename();
+        if ( name.empty() ) continue;
+        names.push_back(name);
+    }
+
+    return names;
+}
+
+string BamMultiReaderPrivate::GetErrorString(void) const {
+ return m_errorString;
+}
+
+// Returns the merged header as a SamHeader object, built from the text
+// produced by GetHeaderText().
+SamHeader BamMultiReaderPrivate::GetHeader(void) const {
+    return SamHeader( GetHeaderText() );
+}
+
+// Builds a single "virtual" header text covering all BAM files in the multireader.
+//
+// N.B. - the current strategy is simple: take the first reader's header
+//        wholesale, then add the read groups (RG) of every other reader.
+// TODO: make this more intelligent wrt other header lines/fields
+//
+// Returns an empty string if no readers are open or the first reader entry is null.
+string BamMultiReaderPrivate::GetHeaderText(void) const {
+
+    // nothing to merge without readers
+    if ( m_readers.empty() )
+        return string();
+
+    vector<MergeItem>::const_iterator itemIter = m_readers.begin();
+    const vector<MergeItem>::const_iterator itemEnd = m_readers.end();
+
+    // the first reader's header is the starting point
+    const BamReader* firstReader = itemIter->Reader;
+    if ( firstReader == 0 )
+        return string();
+    SamHeader mergedHeader = firstReader->GetHeader();
+
+    // fold in read groups from the remaining readers
+    for ( ++itemIter; itemIter != itemEnd; ++itemIter ) {
+        const BamReader* otherReader = itemIter->Reader;
+        if ( otherReader == 0 ) continue;
+
+        // N.B. - SamReadGroupDictionary handles duplicate-checking
+        const SamHeader otherHeader = otherReader->GetHeader();
+        mergedHeader.ReadGroups.Add(otherHeader.ReadGroups);
+
+        // TODO: merge anything else??
+    }
+
+    // hand back the merged header in text form
+    return mergedHeader.ToString();
+}
+
+// Retrieves the next alignment among all files, with character data built.
+// Returns false when no alignment is available.
+bool BamMultiReaderPrivate::GetNextAlignment(BamAlignment& al) {
+    const bool needCharData = true;
+    return PopNextCachedAlignment(al, needCharData);
+}
+
+// Retrieves the next alignment among all files WITHOUT parsing the
+// alignment's character data. Returns false when no alignment is available.
+bool BamMultiReaderPrivate::GetNextAlignmentCore(BamAlignment& al) {
+    const bool needCharData = false;
+    return PopNextCachedAlignment(al, needCharData);
+}
+
+// ---------------------------------------------------------------------------------------
+//
+// NB: The following GetReferenceX() functions assume that we have identical
+// references for all BAM files. We enforce this by invoking the
+// ValidateReaders() method to verify that our reference data is the same
+// across all files on Open - so we will not encounter a situation in which
+// there is a mismatch and we are still live.
+//
+// ---------------------------------------------------------------------------------------
+
+// Returns the number of reference sequences, as reported by the first reader.
+// Yields 0 if no readers are open or the first reader entry is null.
+int BamMultiReaderPrivate::GetReferenceCount(void) const {
+
+    if ( m_readers.empty() )
+        return 0;
+
+    // all readers are expected to agree, so the first one is representative
+    const BamReader* firstReader = m_readers.front().Reader;
+    if ( firstReader != 0 )
+        return firstReader->GetReferenceCount();
+    return 0;
+}
+
+// Returns the reference sequence entries, as reported by the first reader.
+// Yields an empty RefVector if no readers are open or the first reader entry is null.
+const RefVector BamMultiReaderPrivate::GetReferenceData(void) const {
+
+    if ( m_readers.empty() )
+        return RefVector();
+
+    // all readers are expected to agree, so the first one is representative
+    const BamReader* firstReader = m_readers.front().Reader;
+    if ( firstReader != 0 )
+        return firstReader->GetReferenceData();
+    return RefVector();
+}
+
+// Returns the reference ID for 'refName', as resolved by the first reader.
+// Yields -1 if no readers are open or the first reader entry is null.
+int BamMultiReaderPrivate::GetReferenceID(const string& refName) const {
+
+    if ( m_readers.empty() )
+        return -1;
+
+    // all readers are expected to agree, so the first one is representative
+    const BamReader* firstReader = m_readers.front().Reader;
+    if ( firstReader != 0 )
+        return firstReader->GetReferenceID(refName);
+    return -1;
+}
+// ---------------------------------------------------------------------------------------
+
+// Returns true if every (non-null) reader has index data available, which
+// indicates whether Jump() or SetRegion() are possible.
+// An empty multireader yields false.
+bool BamMultiReaderPrivate::HasIndexes(void) const {
+
+    if ( m_readers.empty() )
+        return false;
+
+    // every reader is queried; a single reader without an index makes the answer false
+    bool allIndexed = true;
+    const size_t readerCount = m_readers.size();
+    for ( size_t i = 0; i < readerCount; ++i ) {
+        const BamReader* reader = m_readers[i].Reader;
+        if ( reader == 0 ) continue;
+
+        if ( !reader->HasIndex() )
+            allIndexed = false;
+    }
+
+    return allIndexed;
+}
+
+// Returns true as soon as one reader reports that it is open;
+// false if there are no readers or none of them is open.
+bool BamMultiReaderPrivate::HasOpenReaders(void) {
+
+    const size_t readerCount = m_readers.size();
+    for ( size_t i = 0; i < readerCount; ++i ) {
+        const BamReader* reader = m_readers[i].Reader;
+
+        // first open reader settles the question
+        if ( reader != 0 && reader->IsOpen() )
+            return true;
+    }
+
+    // no readers open
+    return false;
+}
+
+// Performs a random-access jump in every reader, using (refID, position) as
+// a left bound, then refreshes the alignment cache.
+// The return value is the status of the cache update only.
+bool BamMultiReaderPrivate::Jump(int refID, int position) {
+
+    // NB: Individual Jump() results are deliberately ignored. In practice a
+    //     failed Jump means "no alignments here", so we accept the failure,
+    //     call UpdateAlignmentCache(), and continue.
+
+    const size_t readerCount = m_readers.size();
+    for ( size_t i = 0; i < readerCount; ++i ) {
+        BamReader* reader = m_readers[i].Reader;
+        if ( reader != 0 )
+            reader->Jump(refID, position);
+    }
+
+    // reload one alignment per reader from the new positions
+    return UpdateAlignmentCache();
+}
+
+// Locates (& loads) index files for every reader that does not already have
+// one loaded, passing 'preferredType' on to each reader's LocateIndex().
+// Returns true if every attempted lookup succeeded; otherwise stores the
+// collected reader errors (see GetErrorString()) and returns false.
+bool BamMultiReaderPrivate::LocateIndexes(const BamIndex::IndexType& preferredType) {
+
+    bool errorsEncountered = false;
+    m_errorString.clear();
+
+    // iterate over readers
+    vector<MergeItem>::iterator readerIter = m_readers.begin();
+    vector<MergeItem>::iterator readerEnd  = m_readers.end();
+    for ( ; readerIter != readerEnd; ++readerIter ) {
+        MergeItem& item = (*readerIter);
+        BamReader* reader = item.Reader;
+        if ( reader == 0 ) continue;
+
+        // if reader has no index, try to locate one
+        if ( !reader->HasIndex() ) {
+            if ( !reader->LocateIndex(preferredType) ) {
+                m_errorString.append(1, '\t');
+                m_errorString.append(reader->GetErrorString());
+                m_errorString.append(1, '\n');
+                errorsEncountered = true;
+            }
+        }
+    }
+
+    // check for errors encountered before returning success/fail
+    if ( errorsEncountered ) {
+        const string currentError = m_errorString;
+        const string message = string("error while locating index files: ") + "\n" + currentError;
+        // origin tag names this method (it previously read "LocatingIndexes")
+        SetErrorString("BamMultiReader::LocateIndexes", message);
+        return false;
+    } else
+        return true;
+}
+
+// Opens the BAM files named in 'filenames' and adds them to the multireader.
+//
+// Steps: rewind any readers already open, open each non-empty filename,
+// check that all readers are mutually consistent, then refresh the alignment
+// cache. Returns false (with an error string set) if any step fails; readers
+// that did open successfully are kept in the reader list even then.
+bool BamMultiReaderPrivate::Open(const vector<string>& filenames) {
+
+    m_errorString.clear();
+
+    // put all current readers back at beginning (refreshes alignment cache)
+    if ( !Rewind() ) {
+        const string details = m_errorString;
+        const string message = string("unable to rewind existing readers: \n\t") + details;
+        SetErrorString("BamMultiReader::Open", message);
+        return false;
+    }
+
+    // try to open a reader for each requested file
+    bool allOpened = true;
+    const size_t fileCount = filenames.size();
+    for ( size_t i = 0; i < fileCount; ++i ) {
+        const string& filename = filenames[i];
+        if ( filename.empty() ) continue;
+
+        BamReader* reader = new BamReader;
+        if ( reader->Open(filename) ) {
+            // opened OK: store it together with a fresh alignment buffer
+            m_readers.push_back( MergeItem(reader, new BamAlignment) );
+            continue;
+        }
+
+        // failed: note the error & discard the unusable reader
+        m_errorString.append(1, '\t');
+        m_errorString += string("unable to open file: ") + filename;
+        m_errorString.append(1, '\n');
+        allOpened = false;
+        delete reader;
+    }
+
+    // report any files that could not be opened
+    if ( !allOpened ) {
+        const string details = m_errorString;
+        const string message = string("unable to open all files: \t\n") + details;
+        SetErrorString("BamMultiReader::Open", message);
+        return false;
+    }
+
+    // all readers must be consistent with each other
+    if ( !ValidateReaders() ) {
+        const string details = m_errorString;
+        const string message = string("unable to open inconsistent files: \t\n") + details;
+        SetErrorString("BamMultiReader::Open", message);
+        return false;
+    }
+
+    // final status is that of the cache update
+    return UpdateAlignmentCache();
+}
+
+// Opens a single BAM file by delegating to Open() with a one-element list.
+// On failure, wraps the error from Open() in a message naming the file.
+bool BamMultiReaderPrivate::OpenFile(const std::string& filename) {
+
+    const vector<string> singleFile(1, filename);
+    const bool opened = Open(singleFile);
+    if ( !opened ) {
+        const string details = m_errorString;
+        const string message = string("could not open file: ") + filename + "\n\t" + details;
+        SetErrorString("BamMultiReader::OpenFile", message);
+    }
+    return opened;
+}
+
+// Opens the given index files on the current readers, pairing them strictly
+// by position: the i-th index filename is opened on the i-th reader.
+// The list must contain exactly one filename per reader entry.
+// Returns true if every index opened; otherwise stores the collected errors
+// and returns false.
+bool BamMultiReaderPrivate::OpenIndexes(const vector<string>& indexFilenames) {
+
+    // TODO: This needs to be cleaner - should not assume same order.
+    //       And either way, shouldn't start at first reader. Should start at
+    //       first reader without an index?
+
+    // one index filename is required per reader
+    const size_t readerCount = m_readers.size();
+    if ( readerCount != indexFilenames.size() ) {
+        const string message("size of index file list does not match current BAM file count");
+        SetErrorString("BamMultiReader::OpenIndexes", message);
+        return false;
+    }
+
+    m_errorString.clear();
+    bool allOpened = true;
+
+    // walk readers & index filenames in lock-step
+    for ( size_t i = 0; i < readerCount; ++i ) {
+        BamReader* reader = m_readers[i].Reader;
+
+        // a null reader entry still consumes its slot in the filename list
+        if ( reader == 0 ) continue;
+
+        if ( !reader->OpenIndex(indexFilenames[i]) ) {
+            m_errorString.append(1, '\t');
+            m_errorString += reader->GetErrorString();
+            m_errorString.append(1, '\n');
+            allOpened = false;
+        }
+    }
+
+    if ( allOpened )
+        return true;
+
+    // wrap the accumulated per-reader errors in a single message
+    const string details = m_errorString;
+    const string message = string("could not open all index files: \n\t") + details;
+    SetErrorString("BamMultiReader::OpenIndexes", message);
+    return false;
+}
+
+// Takes the first entry from the alignment cache, copies its alignment into
+// 'al', and then refills the cache from the reader that entry came from.
+// When 'needCharData' is true, the alignment's character data is built and
+// its Filename field is set before the copy is made.
+// Returns false if there is no cache, the cache is empty, or the entry taken
+// from the cache has a null reader or alignment.
+bool BamMultiReaderPrivate::PopNextCachedAlignment(BamAlignment& al, const bool needCharData) {
+
+    // nothing to hand out
+    if ( m_alignmentCache == 0 )
+        return false;
+    if ( m_alignmentCache->IsEmpty() )
+        return false;
+
+    // take the first cache entry
+    const MergeItem next = m_alignmentCache->TakeFirst();
+    BamReader* const sourceReader = next.Reader;
+    BamAlignment* const cached = next.Alignment;
+    if ( !sourceReader || !cached )
+        return false;
+
+    // character data is built lazily, only when the caller asks for it
+    if ( needCharData ) {
+        cached->BuildCharData();
+        cached->Filename = sourceReader->GetFilename();
+    }
+
+    // copy out BEFORE the buffer is reused for the reader's next alignment
+    al = *cached;
+
+    // load next alignment from the same reader & store in cache
+    SaveNextAlignment(sourceReader, cached);
+    return true;
+}
+
+// Returns all BAM file pointers to the beginning of alignment data and
+// resets the alignment cache. Trivially succeeds when no readers are open.
+bool BamMultiReaderPrivate::Rewind(void) {
+
+    // nothing to rewind
+    if ( m_readers.empty() )
+        return true;
+
+    // on successful rewind, result is the status of the cache update
+    if ( RewindReaders() )
+        return UpdateAlignmentCache();
+
+    // rewind failed: wrap the per-reader errors
+    const string details = m_errorString;
+    const string message = string("could not rewind readers: \n\t") + details;
+    SetErrorString("BamMultiReader::Rewind", message);
+    return false;
+}
+
+// Calls Rewind() on every reader (the alignment cache is not touched here).
+// Returns true if all readers rewound; otherwise the readers' error strings
+// are accumulated in m_errorString and false is returned.
+bool BamMultiReaderPrivate::RewindReaders(void) {
+
+    m_errorString.clear();
+    bool allRewound = true;
+
+    const size_t readerCount = m_readers.size();
+    for ( size_t i = 0; i < readerCount; ++i ) {
+        BamReader* reader = m_readers[i].Reader;
+        if ( reader == 0 ) continue;
+
+        // keep going after a failure so every reader gets its attempt
+        if ( reader->Rewind() ) continue;
+        m_errorString.append(1, '\t');
+        m_errorString.append( reader->GetErrorString() );
+        m_errorString.append(1, '\n');
+        allRewound = false;
+    }
+
+    return allRewound;
+}
+
+// Reads the next alignment from 'reader' into 'alignment' and, if the read
+// succeeds, adds the (reader, alignment) pair to the alignment cache.
+// If the read fails, nothing is added to the cache.
+//
+// N.B. - only core data is read here; char data is built lazily, either
+//        by the alignment cache to maintain its sorting OR on demand when
+//        a client later calls GetNextAlignment()
+// N.B. - no null checks are made here on 'reader', 'alignment', or the cache
+void BamMultiReaderPrivate::SaveNextAlignment(BamReader* reader, BamAlignment* alignment) {
+
+    const bool gotAlignment = reader->GetNextAlignmentCore(*alignment);
+    if ( !gotAlignment )
+        return;
+
+    m_alignmentCache->Add( MergeItem(reader, alignment) );
+}
+
+void BamMultiReaderPrivate::SetErrorString(const string& where, const string& what) const {
+ static const string SEPARATOR = ": ";
+ m_errorString = where + SEPARATOR + what;
+}
+
+// Applies 'region' to every reader, then refreshes the alignment cache.
+// The return value is the status of the cache update only.
+bool BamMultiReaderPrivate::SetRegion(const BamRegion& region) {
+
+    // NB: Individual SetRegion() results are deliberately ignored. In practice
+    //     a failed SetRegion means "no alignments here", so we accept the
+    //     failure, call UpdateAlignmentCache(), and continue.
+
+    const size_t readerCount = m_readers.size();
+    for ( size_t i = 0; i < readerCount; ++i ) {
+        BamReader* reader = m_readers[i].Reader;
+        if ( reader != 0 )
+            reader->SetRegion(region);
+    }
+
+    // reload one alignment per reader from within the new region
+    return UpdateAlignmentCache();
+}
+
+// Rebuilds the alignment cache: creates it on first use, empties it, and then
+// loads the next alignment from each reader into it.
+// Returns false only if the cache could not be created.
+bool BamMultiReaderPrivate::UpdateAlignmentCache(void) {
+
+    // create the cache lazily, on first use
+    if ( !m_alignmentCache ) {
+        IMultiMerger* freshCache = CreateAlignmentCache();
+        if ( !freshCache ) {
+            SetErrorString("BamMultiReader::UpdateAlignmentCache", "unable to create new alignment cache");
+            return false;
+        }
+        m_alignmentCache = freshCache;
+    }
+
+    // discard whatever was cached before
+    m_alignmentCache->Clear();
+
+    // prime the cache with one alignment per usable reader entry
+    const size_t readerCount = m_readers.size();
+    for ( size_t i = 0; i < readerCount; ++i ) {
+        MergeItem& entry = m_readers[i];
+        if ( entry.Reader == 0 || entry.Alignment == 0 ) continue;
+        SaveNextAlignment(entry.Reader, entry.Alignment);
+    }
+
+    // if we get here, ok
+    return true;
+}
+
+// ValidateReaders checks that all the readers point to BAM files with the
+// same header sort order, representing alignments against the same set of
+// reference sequences, and that the sequences are identically ordered
+// (same names & lengths, position by position, as in the first reader).
+//
+// Returns true if the readers are consistent (or if fewer than two readers
+// are open). On the first mismatch found, stores a descriptive error string
+// (see GetErrorString()) and returns false. Also returns false, without an
+// error string, if the first reader entry is null.
+bool BamMultiReaderPrivate::ValidateReaders(void) const {
+
+    m_errorString.clear();
+
+    // skip if 0 or 1 readers opened
+    if ( m_readers.empty() || (m_readers.size() == 1) )
+        return true;
+
+    // retrieve first reader
+    const MergeItem& firstItem = m_readers.front();
+    const BamReader* firstReader = firstItem.Reader;
+    if ( firstReader == 0 ) return false;
+
+    // retrieve first reader's header data
+    const SamHeader& firstReaderHeader = firstReader->GetHeader();
+    const string& firstReaderSortOrder = firstReaderHeader.SortOrder;
+
+    // retrieve first reader's reference data
+    const RefVector& firstReaderRefData = firstReader->GetReferenceData();
+    const int firstReaderRefCount = firstReader->GetReferenceCount();
+    const int firstReaderRefSize = firstReaderRefData.size();
+
+    // iterate over all readers
+    vector<MergeItem>::const_iterator readerIter = m_readers.begin();
+    vector<MergeItem>::const_iterator readerEnd  = m_readers.end();
+    for ( ; readerIter != readerEnd; ++readerIter ) {
+        const MergeItem& item = (*readerIter);
+        const BamReader* reader = item.Reader;
+        if ( reader == 0 ) continue;
+
+        // get current reader's header data
+        const SamHeader& currentReaderHeader = reader->GetHeader();
+        const string& currentReaderSortOrder = currentReaderHeader.SortOrder;
+
+        // check compatible sort order
+        if ( currentReaderSortOrder != firstReaderSortOrder ) {
+            const string message = string("mismatched sort order in ") + reader->GetFilename() +
+                                   ", expected " + firstReaderSortOrder +
+                                   ", but found " + currentReaderSortOrder;
+            SetErrorString("BamMultiReader::ValidateReaders", message);
+            return false;
+        }
+
+        // get current reader's reference data
+        const RefVector currentReaderRefData = reader->GetReferenceData();
+        const int currentReaderRefCount = reader->GetReferenceCount();
+        const int currentReaderRefSize = currentReaderRefData.size();
+
+        // compare reference counts from BamReader ( & container size, in case of BR error)
+        if ( (currentReaderRefCount != firstReaderRefCount) ||
+             (firstReaderRefSize != currentReaderRefSize) )
+        {
+            stringstream s("");
+            s << "mismatched reference count in " << reader->GetFilename()
+              << ", expected " << firstReaderRefCount
+              << ", but found " << currentReaderRefCount;
+            SetErrorString("BamMultiReader::ValidateReaders", s.str());
+            return false;
+        }
+
+        // this will be ok; we just checked above that we have identically-sized sets of references
+        // here we simply check if they are all, in fact, equal in content
+        RefVector::const_iterator firstRefIter   = firstReaderRefData.begin();
+        RefVector::const_iterator firstRefEnd    = firstReaderRefData.end();
+        RefVector::const_iterator currentRefIter = currentReaderRefData.begin();
+        for ( ; firstRefIter != firstRefEnd; ++firstRefIter, ++currentRefIter ) {
+            const RefData& firstRef   = (*firstRefIter);
+            const RefData& currentRef = (*currentRefIter);
+
+            // references match if both name & length agree
+            if ( (firstRef.RefName == currentRef.RefName) &&
+                 (firstRef.RefLength == currentRef.RefLength) )
+                continue;
+
+            // mismatch: report both complete reference lists in one message
+            // N.B. - a single stream is used throughout, so that the "expected"
+            //        entries actually end up in the reported error
+            stringstream s("");
+            s << "mismatched references found in " << reader->GetFilename()
+              << " expected: " << endl;
+
+            // print first reader's reference data
+            RefVector::const_iterator refIter = firstReaderRefData.begin();
+            RefVector::const_iterator refEnd  = firstReaderRefData.end();
+            for ( ; refIter != refEnd; ++refIter ) {
+                const RefData& entry = (*refIter);
+                s << entry.RefName << " " << entry.RefLength << endl;
+            }
+
+            s << "but found: " << endl;
+
+            // print current reader's reference data
+            refIter = currentReaderRefData.begin();
+            refEnd  = currentReaderRefData.end();
+            for ( ; refIter != refEnd; ++refIter ) {
+                const RefData& entry = (*refIter);
+                s << entry.RefName << " " << entry.RefLength << endl;
+            }
+
+            SetErrorString("BamMultiReader::ValidateReaders", s.str());
+            return false;
+        }
+    }
+
+    // if we get here, everything checks out
+    return true;
+}
--- /dev/null
+// ***************************************************************************
+// BamMultiReader_p.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Functionality for simultaneously reading multiple BAM files
+// *************************************************************************
+
+#ifndef BAMMULTIREADER_P_H
+#define BAMMULTIREADER_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include "api/SamHeader.h"
+#include "api/BamMultiReader.h"
+#include "api/internal/bam/BamMultiMerger_p.h"
+#include <string>
+#include <vector>
+
+namespace BamTools {
+namespace Internal {
+
+// Implementation class behind BamMultiReader: owns a set of BamReaders and
+// hands out their alignments through a single merged stream.
+class BamMultiReaderPrivate {
+
+    // typedefs
+    public:
+        typedef std::pair<BamReader*, BamAlignment*> ReaderAlignment;
+
+    // constructor / destructor
+    public:
+        BamMultiReaderPrivate();
+        ~BamMultiReaderPrivate();   // closes any files still open
+
+    // public interface
+    public:
+
+        // file operations
+        bool Close();
+        bool CloseFile(const std::string& filename);
+        const std::vector<std::string> Filenames() const;
+        bool Jump(int refID, int position = 0);
+        bool Open(const std::vector<std::string>& filenames);
+        bool OpenFile(const std::string& filename);
+        bool Rewind();
+        bool SetRegion(const BamRegion& region);
+
+        // access alignment data
+        bool GetNextAlignment(BamAlignment& al);        // builds char data
+        bool GetNextAlignmentCore(BamAlignment& al);    // skips char data
+        bool HasOpenReaders();
+
+        // access auxiliary data
+        SamHeader GetHeader() const;
+        std::string GetHeaderText() const;
+        int GetReferenceCount() const;
+        const BamTools::RefVector GetReferenceData() const;
+        int GetReferenceID(const std::string& refName) const;
+
+        // BAM index operations
+        bool CreateIndexes(const BamIndex::IndexType& type = BamIndex::STANDARD);
+        bool HasIndexes() const;
+        bool LocateIndexes(const BamIndex::IndexType& preferredType = BamIndex::STANDARD);
+        bool OpenIndexes(const std::vector<std::string>& indexFilenames);
+
+        // error handling
+        std::string GetErrorString() const;
+
+    // 'internal' methods
+    public:
+
+        bool CloseFiles(const std::vector<std::string>& filenames);
+        IMultiMerger* CreateAlignmentCache() const;     // caller owns the result
+        bool PopNextCachedAlignment(BamAlignment& al, const bool needCharData);
+        bool RewindReaders();
+        void SaveNextAlignment(BamReader* reader, BamAlignment* alignment);
+        void SetErrorString(const std::string& where, const std::string& what) const;
+        bool UpdateAlignmentCache();
+        bool ValidateReaders() const;
+
+    // data members
+    public:
+        std::vector<MergeItem> m_readers;       // one (reader, alignment buffer) entry per open file
+        IMultiMerger* m_alignmentCache;         // next alignment from each reader; 0 until created
+        mutable std::string m_errorString;      // last error message; mutable so const methods can set it
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMMULTIREADER_P_H
--- /dev/null
+// ***************************************************************************
+// BamRandomAccessController_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Manages random access operations in a BAM file
+// **************************************************************************
+
+#include "api/BamIndex.h"
+#include "api/internal/bam/BamRandomAccessController_p.h"
+#include "api/internal/bam/BamReader_p.h"
+#include "api/internal/index/BamIndexFactory_p.h"
+#include "api/internal/utils/BamException_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cassert>
+#include <sstream>
+using namespace std;
+
+// ctor - starts with no index loaded; until a region is adjusted against an
+// index, the region is assumed to contain alignments
+BamRandomAccessController::BamRandomAccessController(void)
+    : m_index(0),
+      m_hasAlignmentsInRegion(true)
+{
+}
+
+// dtor - releases the index & resets the region via Close()
+BamRandomAccessController::~BamRandomAccessController(void) {
+    this->Close();
+}
+
+// Uses the index to find the first reference in the current region that has
+// alignments, and records whether any reference in the region has them.
+//
+// References are scanned from the region's left-bound reference up to its
+// right-bound reference (or up to 'referenceCount' - 1 if no right bound is
+// specified). If the first reference with data is not the requested left
+// bound, the region's left bound is moved to the start of that reference.
+// Does nothing when no index is loaded.
+void BamRandomAccessController::AdjustRegion(const int& referenceCount) {
+
+    // skip if no index available
+    if ( m_index == 0 )
+        return;
+
+    const int firstRefId = m_region.LeftRefID;
+    const int lastRefId  = ( m_region.isRightBoundSpecified() ? m_region.RightRefID
+                                                              : referenceCount - 1 );
+
+    // look for the first reference in range that has alignments
+    m_hasAlignmentsInRegion = false;
+    int refId = firstRefId;
+    for ( ; refId <= lastRefId; ++refId ) {
+        m_hasAlignmentsInRegion = m_index->HasAlignments(refId);
+        if ( m_hasAlignmentsInRegion )
+            break;
+    }
+
+    // if data was found, but not on the requested left-bound reference,
+    // start the region at the beginning of the first reference that had data;
+    // otherwise leave the requested region as-is
+    if ( m_hasAlignmentsInRegion && refId != firstRefId ) {
+        m_region.LeftRefID = refId;
+        m_region.LeftPosition = 0;
+    }
+}
+
+// Classifies 'alignment' relative to the current region and returns its
+// "RegionState": { BeforeRegion | OverlapsRegion | AfterRegion }.
+//
+// - with no left bound specified, every alignment overlaps
+// - an alignment with RefID == -1 is reported as AfterRegion, to halt processing
+// - the right bound is exclusive: an alignment starting at or after the
+//   right-bound position is AfterRegion
+BamRandomAccessController::RegionState
+BamRandomAccessController::AlignmentState(const BamAlignment& alignment) const {
+
+    // region has no left bound at all
+    if ( !m_region.isLeftBoundSpecified() )
+        return OverlapsRegion;
+
+    // unmapped reads - return AFTER region to halt processing
+    if ( alignment.RefID == -1 )
+        return AfterRegion;
+
+    // alignment is on a reference before the left bound reference
+    if ( alignment.RefID < m_region.LeftRefID )
+        return BeforeRegion;
+
+    // alignment is on the left bound reference
+    if ( alignment.RefID == m_region.LeftRefID ) {
+
+        // starts before the left bound position: overlaps only if it extends past it
+        if ( alignment.Position < m_region.LeftPosition ) {
+            if ( alignment.GetEndPosition() > m_region.LeftPosition )
+                return OverlapsRegion;
+            return BeforeRegion;
+        }
+
+        // starts at or after the left bound position:
+        // it is past the region only if the right bound lies on this same
+        // reference and the alignment starts on or after that right bound position
+        const bool startsPastRightBound = m_region.isRightBoundSpecified() &&
+                                          m_region.LeftRefID == m_region.RightRefID &&
+                                          alignment.Position >= m_region.RightPosition;
+        if ( startsPastRightBound )
+            return AfterRegion;
+        return OverlapsRegion;
+    }
+
+    // from here on, alignment is on a reference after the left bound reference
+
+    // no right bound given, so everything after the left bound overlaps
+    if ( !m_region.isRightBoundSpecified() )
+        return OverlapsRegion;
+
+    // on a reference strictly between the boundaries
+    if ( alignment.RefID < m_region.RightRefID )
+        return OverlapsRegion;
+
+    // on a reference after the right boundary
+    if ( alignment.RefID > m_region.RightRefID )
+        return AfterRegion;
+
+    // on the right bound reference: overlaps only if it starts before the right bound position
+    if ( alignment.Position < m_region.RightPosition )
+        return OverlapsRegion;
+    return AfterRegion;
+}
+
+// Releases all random-access state: drops the index, then resets the region.
+void BamRandomAccessController::Close(void) {
+    ClearIndex();
+    ClearRegion();
+}
+
+// Destroys the index held by this controller (if any) and nulls the pointer.
+void BamRandomAccessController::ClearIndex(void) {
+    // deleting a null pointer is a no-op, so no explicit check is needed
+    delete m_index;
+    m_index = 0;
+}
+
+// Forgets the current region and resets the has-alignments flag to true.
+void BamRandomAccessController::ClearRegion(void) {
+    m_hasAlignmentsInRegion = true;
+    m_region.clear();
+}
+
+// Builds a new index of the requested type from the reader's BAM file and,
+// on success, installs it as this controller's index.
+//
+// reader : reader whose file is indexed; must be non-null and open
+// type   : kind of index to create
+//
+// Returns true on success. On failure returns false and stores a description
+// retrievable via GetErrorString(); any previously set index is left in place.
+bool BamRandomAccessController::CreateIndex(BamReaderPrivate* reader,
+                                            const BamIndex::IndexType& type)
+{
+    // skip if reader is invalid
+    assert(reader);
+    if ( !reader->IsOpen() ) {
+        SetErrorString("BamRandomAccessController::CreateIndex",
+                       "cannot create index for unopened reader");
+        return false;
+    }
+
+    // create new index of requested type
+    BamIndex* newIndex = BamIndexFactory::CreateIndexOfType(type, reader);
+    if ( newIndex == 0 ) {
+        stringstream s("");
+        s << "could not create index of type: " << type;
+        SetErrorString("BamRandomAccessController::CreateIndex", s.str());
+        return false;
+    }
+
+    // attempt to build index from current BamReader file
+    if ( !newIndex->Create() ) {
+        const string indexError = newIndex->GetErrorString();
+        const string message = "could not create index: \n\t" + indexError;
+        SetErrorString("BamRandomAccessController::CreateIndex", message);
+
+        // the half-built index was never handed to SetIndex(), so it must be
+        // released here or it would leak
+        delete newIndex;
+        return false;
+    }
+
+    // save new index (ownership passes to this controller) & return success
+    SetIndex(newIndex);
+    return true;
+}
+
+string BamRandomAccessController::GetErrorString(void) const {
+ return m_errorString;
+}
+
+// Reports whether an index is currently set.
+bool BamRandomAccessController::HasIndex(void) const {
+    const bool indexIsSet = ( 0 != m_index );
+    return indexIsSet;
+}
+
+// Reports whether a (non-null) region is currently set.
+bool BamRandomAccessController::HasRegion(void) const {
+    if ( m_region.isNull() )
+        return false;
+    return true;
+}
+
+// Reports whether the index holds any alignments for the given reference ID.
+// Returns false when no index is set (previously this dereferenced a null pointer).
+bool BamRandomAccessController::IndexHasAlignmentsForReference(const int& refId) {
+    if ( m_index == 0 )
+        return false;
+    return m_index->HasAlignments(refId);
+}
+
+// Searches for an index file belonging to the reader's BAM file and opens it.
+//
+// reader        : reader whose filename is used for the lookup; must be non-null
+// preferredType : index type to favor when looking up the index filename
+//
+// Returns true if an index file was found and loaded. Otherwise returns false
+// and stores a description retrievable via GetErrorString().
+bool BamRandomAccessController::LocateIndex(BamReaderPrivate* reader,
+                                            const BamIndex::IndexType& preferredType)
+{
+    // look up index filename, deferring to preferredType if possible
+    assert(reader);
+    const string indexFilename = BamIndexFactory::FindIndexFilename(reader->Filename(), preferredType);
+
+    // if no index file found (of any type)
+    if ( indexFilename.empty() ) {
+        // separator space added so the filename does not run into the message text
+        const string message = string("could not find index file for: ") + reader->Filename();
+        SetErrorString("BamRandomAccessController::LocateIndex", message);
+        return false;
+    }
+
+    // otherwise open & use index file that was found
+    return OpenIndex(indexFilename, reader);
+}
+
+// Creates an index object matching the given index filename, loads the file's
+// data into it and, on success, installs it as this controller's index.
+//
+// indexFilename : path of the index file to load
+// reader        : reader the index is associated with
+//
+// Returns true on success. On failure returns false and stores a description
+// retrievable via GetErrorString(); any previously set index is left in place.
+bool BamRandomAccessController::OpenIndex(const string& indexFilename, BamReaderPrivate* reader) {
+
+    // attempt create new index of type based on filename
+    BamIndex* index = BamIndexFactory::CreateIndexFromFilename(indexFilename, reader);
+    if ( index == 0 ) {
+        const string message = string("could not open index file: ") + indexFilename;
+        SetErrorString("BamRandomAccessController::OpenIndex", message);
+        return false;
+    }
+
+    // attempt to load data from index file
+    if ( !index->Load(indexFilename) ) {
+        const string indexError = index->GetErrorString();
+        const string message = string("could not load index data from file: ") + indexFilename +
+                               "\n\t" + indexError;
+        SetErrorString("BamRandomAccessController::OpenIndex", message);
+
+        // the index was never handed to SetIndex(), so release it here to avoid a leak
+        delete index;
+        return false;
+    }
+
+    // save new index (ownership passes to this controller) & return success
+    SetIndex(index);
+    return true;
+}
+
+// Returns the flag recording whether the current region contains alignments.
+bool BamRandomAccessController::RegionHasAlignments(void) const {
+    return m_hasAlignmentsInRegion;
+}
+
+void BamRandomAccessController::SetErrorString(const string& where, const string& what) {
+ m_errorString = where + ": " + what;
+}
+
+// Installs 'index' as this controller's index, deleting any index held before.
+// Passing the pointer that is already installed is a no-op; previously that
+// deleted the index and left m_index dangling.
+void BamRandomAccessController::SetIndex(BamIndex* index) {
+    if ( index == m_index )
+        return;
+    if ( m_index )
+        ClearIndex();
+    m_index = index;
+}
+
+// Stores the requested region and asks the index to jump to it.
+//
+// region         : region of interest
+// referenceCount : number of references in the BAM file (used as the implicit
+//                  right bound when the region has none)
+//
+// Returns false (with an error string set) if no index is available or the
+// index fails to jump. Returns true otherwise - including when the region
+// holds no alignments, which is not treated as an error.
+bool BamRandomAccessController::SetRegion(const BamRegion& region, const int& referenceCount) {
+
+    // store region
+    m_region = region;
+
+    // cannot jump when no index is available
+    if ( !HasIndex() ) {
+        SetErrorString("BamRandomAccessController::SetRegion", "cannot jump if no index data available");
+        return false;
+    }
+
+    // adjust region as necessary to reflect where data actually begins
+    AdjustRegion(referenceCount);
+
+    // if no data present, return true
+    // * Not an error, but future attempts to access alignments in this region will not return data
+    //   Returning true is useful in a BamMultiReader setting where some BAM files may
+    //   lack alignments in regions where other files still have data available.
+    if ( !m_hasAlignmentsInRegion )
+        return true;
+
+    // return success/failure of jump to specified region,
+    //
+    // * Index::Jump() is allowed to modify the m_hasAlignmentsInRegion flag
+    //   This covers 'corner case' where a region is requested that lies beyond the last
+    //   alignment on a reference. If this occurs, any subsequent calls to GetNextAlignment[Core]
+    //   will not return data. BamMultiReader will still be able to successfully pull alignments
+    //   from a region from other files even if this one has no data.
+    if ( !m_index->Jump(m_region, &m_hasAlignmentsInRegion) ) {
+        const string indexError = m_index->GetErrorString();
+        const string message = string("could not set region\n\t") + indexError;
+        // report this method as the error origin (was mislabeled as OpenIndex)
+        SetErrorString("BamRandomAccessController::SetRegion", message);
+        return false;
+    }
+
+    return true;
+}
--- /dev/null
+// ***************************************************************************
+// BamRandomAccessController_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Manages random access operations in a BAM file
+// ***************************************************************************
+
+#ifndef BAMRACONTROLLER_P_H
+#define BAMRACONTROLLER_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include "api/BamAux.h"
+#include "api/BamIndex.h"
+
+namespace BamTools {
+
+class BamAlignment;
+
+namespace Internal {
+
+class BamReaderPrivate;
+
+// Manages random access within a BAM file: it holds a BamIndex and the
+// currently requested BamRegion, and classifies alignments against that region.
+class BamRandomAccessController {
+
+ // enums
+ // position of an alignment relative to the current region (see AlignmentState())
+ public: enum RegionState { BeforeRegion = 0
+ , OverlapsRegion
+ , AfterRegion
+ };
+
+ // ctor & dtor
+ public:
+ BamRandomAccessController(void);
+ ~BamRandomAccessController(void);
+
+ // BamRandomAccessController interface
+ public:
+
+ // index methods
+ // deletes the current index (if any)
+ void ClearIndex(void);
+ // builds an index of the requested type from the reader's file and installs it
+ bool CreateIndex(BamReaderPrivate* reader, const BamIndex::IndexType& type);
+ // true if an index is currently set
+ bool HasIndex(void) const;
+ // asks the index whether it has alignments for the given reference
+ bool IndexHasAlignmentsForReference(const int& refId);
+ // looks up an index file for the reader's BAM file and opens it
+ bool LocateIndex(BamReaderPrivate* reader, const BamIndex::IndexType& preferredType);
+ // loads the given index file and installs the resulting index
+ bool OpenIndex(const std::string& indexFilename, BamReaderPrivate* reader);
+ // replaces the current index with 'index'; the previous one is deleted
+ void SetIndex(BamIndex* index);
+
+ // region methods
+ // clears the region and resets the has-alignments flag to true
+ void ClearRegion(void);
+ // true if the stored region is not null
+ bool HasRegion(void) const;
+ // classifies an alignment as before, overlapping or after the current region
+ RegionState AlignmentState(const BamAlignment& alignment) const;
+ // value of the has-alignments flag for the current region
+ bool RegionHasAlignments(void) const;
+ // stores the region and jumps to it via the index; false if no index or jump fails
+ bool SetRegion(const BamRegion& region, const int& referenceCount);
+
+ // general methods
+ // clears both index and region
+ void Close(void);
+ // returns the last stored error description
+ std::string GetErrorString(void) const;
+
+ // internal methods
+ private:
+ // adjusts requested region if necessary (depending on where data actually begins)
+ void AdjustRegion(const int& referenceCount);
+ // error-string handling
+ // stores "<where>: <what>" as the current error description
+ void SetErrorString(const std::string& where, const std::string& what);
+
+ // data members
+ private:
+
+ // index data
+ BamIndex* m_index; // owns the index, not a copy - responsible for deleting
+
+ // region data
+ BamRegion m_region;
+ // whether the current region has alignments; reset to true by ClearRegion()
+ bool m_hasAlignmentsInRegion;
+
+ // general data
+ std::string m_errorString;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMRACONTROLLER_P_H
--- /dev/null
+// ***************************************************************************
+// BamReader_p.cpp (c) 2009 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for reading BAM files
+// ***************************************************************************
+
+#include "api/BamConstants.h"
+#include "api/BamReader.h"
+#include "api/IBamIODevice.h"
+#include "api/internal/bam/BamHeader_p.h"
+#include "api/internal/bam/BamRandomAccessController_p.h"
+#include "api/internal/bam/BamReader_p.h"
+#include "api/internal/index/BamStandardIndex_p.h"
+#include "api/internal/index/BamToolsIndex_p.h"
+#include "api/internal/io/BamDeviceFactory_p.h"
+#include "api/internal/utils/BamException_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <algorithm>
+#include <cassert>
+#include <iostream>
+#include <iterator>
+#include <vector>
+using namespace std;
+
+// constructor - remembers the parent BamReader and detects host endianness
+BamReaderPrivate::BamReaderPrivate(BamReader* parent)
+    : m_alignmentsBeginOffset(0)
+    , m_isBigEndian( BamTools::SystemIsBigEndian() )
+    , m_parent(parent)
+{ }
+
+// destructor - closes the reader; the result of Close() is ignored
+BamReaderPrivate::~BamReaderPrivate(void) {
+    Close();
+}
+
+// closes the BAM file
+// Resets all per-file state and closes the underlying stream if it is open.
+// Returns false (with an error string set) only if closing the stream throws.
+bool BamReaderPrivate::Close(void) {
+
+    // drop BAM metadata, filename and random-access state
+    m_references.clear();
+    m_header.Clear();
+    m_filename.clear();
+    m_randomAccessController.Close();
+
+    // nothing more to do if the stream was never opened
+    if ( !IsOpen() )
+        return true;
+
+    // attempt to close the stream, reporting any failure
+    try {
+        m_stream.Close();
+    } catch ( BamException& e ) {
+        const string streamError = e.what();
+        const string message = string("encountered error closing BAM file: \n\t") + streamError;
+        SetErrorString("BamReader::Close", message);
+        return false;
+    }
+
+    return true;
+}
+
+// creates an index of the requested type on the current BAM file
+// Returns false (with an error string set) if the file is not open or the
+// random-access controller fails to build the index.
+bool BamReaderPrivate::CreateIndex(const BamIndex::IndexType& type) {
+
+    // an index can only be built on an open file
+    if ( !IsOpen() ) {
+        SetErrorString("BamReader::CreateIndex", "cannot create index on unopened BAM file");
+        return false;
+    }
+
+    // delegate to the random-access controller, wrapping its error on failure
+    if ( !m_randomAccessController.CreateIndex(this, type) ) {
+        const string bracError = m_randomAccessController.GetErrorString();
+        const string message = string("could not create index: \n\t") + bracError;
+        SetErrorString("BamReader::CreateIndex", message);
+        return false;
+    }
+
+    return true;
+}
+
+// returns the stored filename of the current BAM file (cleared by Close())
+const string BamReaderPrivate::Filename(void) const {
+    return m_filename;
+}
+
+string BamReaderPrivate::GetErrorString(void) const {
+ return m_errorString;
+}
+
+// returns the header as a std::string (delegates to BamHeader::ToString)
+string BamReaderPrivate::GetHeaderText(void) const {
+    return m_header.ToString();
+}
+
+// returns the header as a SamHeader object (delegates to BamHeader::ToSamHeader)
+SamHeader BamReaderPrivate::GetSamHeader(void) const {
+    return m_header.ToSamHeader();
+}
+
+// get next alignment (with character data fully parsed)
+// Returns false if no further alignment is available, or (with an error
+// string set) if the alignment's character data cannot be built.
+bool BamReaderPrivate::GetNextAlignment(BamAlignment& alignment) {
+
+    // fetch core data first; stop if there is no valid alignment
+    if ( !GetNextAlignmentCore(alignment) )
+        return false;
+
+    // tag the alignment with its "source" filename
+    alignment.Filename = m_filename;
+
+    // parse character data, reporting any failure
+    if ( !alignment.BuildCharData() ) {
+        const string alError = alignment.GetErrorString();
+        const string message = string("could not populate alignment data: \n\t") + alError;
+        SetErrorString("BamReader::GetNextAlignment", message);
+        return false;
+    }
+
+    return true;
+}
+
+// retrieves next available alignment core data (returns success/fail)
+// ** character data is NOT parsed here (no BuildCharData() call); the alignment
+//    is flagged via SupportData.HasCoreOnly
+// Alignments lying before the current region are skipped; reading stops at the
+// first alignment lying after it.
+bool BamReaderPrivate::GetNextAlignmentCore(BamAlignment& alignment) {
+
+    // nothing can be read from a closed stream
+    if ( !m_stream.IsOpen() )
+        return false;
+
+    try {
+
+        // a region that is known to be empty can never yield an alignment
+        const bool regionIsEmpty = m_randomAccessController.HasRegion() &&
+                                   !m_randomAccessController.RegionHasAlignments();
+        if ( regionIsEmpty )
+            return false;
+
+        // keep reading until an alignment overlaps the region
+        BamRandomAccessController::RegionState state;
+        do {
+
+            // stop if no further alignment can be read
+            if ( !LoadNextAlignment(alignment) )
+                return false;
+
+            // once an alignment lies after the region, no need to keep reading
+            state = m_randomAccessController.AlignmentState(alignment);
+            if ( state == BamRandomAccessController::AfterRegion )
+                return false;
+
+        } while ( state != BamRandomAccessController::OverlapsRegion );
+
+        // found the next 'valid' alignment
+        // (overlaps current region if one was set, simply the next alignment if not)
+        alignment.SupportData.HasCoreOnly = true;
+        return true;
+
+    } catch ( BamException& e ) {
+        const string streamError = e.what();
+        const string message = string("encountered error reading BAM alignment: \n\t") + streamError;
+        SetErrorString("BamReader::GetNextAlignmentCore", message);
+        return false;
+    }
+}
+
+// returns the number of reference sequences currently stored
+int BamReaderPrivate::GetReferenceCount(void) const {
+    return m_references.size();
+}
+
+// returns a read-only reference to the stored reference-sequence entries
+const RefVector& BamReaderPrivate::GetReferenceData(void) const {
+    return m_references;
+}
+
+// returns RefID for given RefName (returns -1 if not found)
+// The RefID is the position of the first reference entry whose RefName equals
+// 'refName'.
+int BamReaderPrivate::GetReferenceID(const string& refName) const {
+
+    // scan reference data directly; no need to copy all names into a temporary
+    RefVector::const_iterator refIter = m_references.begin();
+    RefVector::const_iterator refEnd  = m_references.end();
+    for ( int index = 0; refIter != refEnd; ++refIter, ++index ) {
+        if ( refIter->RefName == refName )
+            return index;
+    }
+
+    // no reference with that name
+    return -1;
+}
+
+// reports whether the random-access controller currently holds an index
+bool BamReaderPrivate::HasIndex(void) const {
+    return m_randomAccessController.HasIndex();
+}
+
+// reports whether the underlying stream is open
+bool BamReaderPrivate::IsOpen(void) const {
+    return m_stream.IsOpen();
+}
+
+// loads BAM header data from the stream into m_header
+void BamReaderPrivate::LoadHeaderData(void) {
+    m_header.Load(&m_stream);
+}
+
+// populates BamAlignment with alignment data under file pointer, returns success/fail
+//
+// Reads one alignment record: the block length, the fixed-size core, and the
+// variable-length character data (stored raw in SupportData.AllCharData).
+// CIGAR operations are decoded here; no other character data is parsed.
+// Returns false on a short read or when the record's sizes are inconsistent.
+bool BamReaderPrivate::LoadNextAlignment(BamAlignment& alignment) {
+
+    // read in the 'block length' value; a short read (e.g. at end of data) means no alignment
+    // (buffer is zero-initialized so it never holds indeterminate bytes)
+    char buffer[sizeof(uint32_t)] = { 0 };
+    if ( m_stream.Read(buffer, sizeof(uint32_t)) != sizeof(uint32_t) )
+        return false;
+    alignment.SupportData.BlockLength = BamTools::UnpackUnsignedInt(buffer);
+    if ( m_isBigEndian ) BamTools::SwapEndian_32(alignment.SupportData.BlockLength);
+
+    // a block must hold at least the fixed-size core; a smaller value (including zero)
+    // would make the character-data length below wrap around to a huge number
+    if ( alignment.SupportData.BlockLength < Constants::BAM_CORE_SIZE )
+        return false;
+
+    // read in core alignment data, make sure the right size of data was read
+    char x[Constants::BAM_CORE_SIZE];
+    if ( m_stream.Read(x, Constants::BAM_CORE_SIZE) != Constants::BAM_CORE_SIZE )
+        return false;
+
+    // swap core endian-ness if necessary
+    if ( m_isBigEndian ) {
+        for ( unsigned int i = 0; i < Constants::BAM_CORE_SIZE; i+=sizeof(uint32_t) )
+            BamTools::SwapEndian_32p(&x[i]);
+    }
+
+    // set BamAlignment 'core' and 'support' data
+    alignment.RefID    = BamTools::UnpackSignedInt(&x[0]);
+    alignment.Position = BamTools::UnpackSignedInt(&x[4]);
+
+    // word at offset 8 packs: bin (upper 16 bits), map quality (bits 8-15), query name length (low 8 bits)
+    unsigned int tempValue = BamTools::UnpackUnsignedInt(&x[8]);
+    alignment.Bin        = tempValue >> 16;
+    alignment.MapQuality = tempValue >> 8 & 0xff;
+    alignment.SupportData.QueryNameLength = tempValue & 0xff;
+
+    // word at offset 12 packs: flag (upper 16 bits), number of CIGAR ops (low 16 bits)
+    tempValue = BamTools::UnpackUnsignedInt(&x[12]);
+    alignment.AlignmentFlag = tempValue >> 16;
+    alignment.SupportData.NumCigarOperations = tempValue & 0xffff;
+
+    alignment.SupportData.QuerySequenceLength = BamTools::UnpackUnsignedInt(&x[16]);
+    alignment.MateRefID    = BamTools::UnpackSignedInt(&x[20]);
+    alignment.MatePosition = BamTools::UnpackSignedInt(&x[24]);
+    alignment.InsertSize   = BamTools::UnpackSignedInt(&x[28]);
+
+    // set BamAlignment length
+    alignment.Length = alignment.SupportData.QuerySequenceLength;
+
+    // read in character data - make sure proper data size was read
+    const unsigned int dataLength = alignment.SupportData.BlockLength - Constants::BAM_CORE_SIZE;
+    RaiiBuffer allCharData(dataLength);
+    if ( m_stream.Read(allCharData.Buffer, dataLength) != dataLength )
+        return false;
+
+    // CIGAR data follows the query name; make sure it lies entirely inside the
+    // data that was read before touching it
+    const unsigned int cigarDataOffset = alignment.SupportData.QueryNameLength;
+    const size_t cigarDataEnd = cigarDataOffset +
+                                alignment.SupportData.NumCigarOperations * sizeof(uint32_t);
+    if ( cigarDataEnd > dataLength )
+        return false;
+
+    // store 'allCharData' in supportData structure
+    alignment.SupportData.AllCharData.assign((const char*)allCharData.Buffer, dataLength);
+
+    // save CIGAR ops
+    // need to calculate this here so that BamAlignment::GetEndPosition() performs correctly,
+    // even when GetNextAlignmentCore() is called
+    uint32_t* cigarData = (uint32_t*)(allCharData.Buffer + cigarDataOffset);
+    CigarOp op;
+    alignment.CigarData.clear();
+    alignment.CigarData.reserve(alignment.SupportData.NumCigarOperations);
+    for ( unsigned int i = 0; i < alignment.SupportData.NumCigarOperations; ++i ) {
+
+        // swap endian-ness if necessary
+        if ( m_isBigEndian ) BamTools::SwapEndian_32(cigarData[i]);
+
+        // build CigarOp structure
+        op.Length = (cigarData[i] >> Constants::BAM_CIGAR_SHIFT);
+        op.Type   = Constants::BAM_CIGAR_LOOKUP[ (cigarData[i] & Constants::BAM_CIGAR_MASK) ];
+
+        // save CigarOp
+        alignment.CigarData.push_back(op);
+    }
+
+    // return success
+    return true;
+}
+
+// loads reference data from BAM file
+//
+// Reads the reference count, then for each reference its name length, name
+// bytes and sequence length, appending one RefData entry to m_references.
+// Always returns true; the return values of the stream reads are not checked.
+bool BamReaderPrivate::LoadReferenceData(void) {
+
+    // get number of reference sequences
+    char buffer[sizeof(uint32_t)];
+    m_stream.Read(buffer, sizeof(uint32_t));
+    uint32_t numberRefSeqs = BamTools::UnpackUnsignedInt(buffer);
+    if ( m_isBigEndian ) BamTools::SwapEndian_32(numberRefSeqs);
+    m_references.reserve((int)numberRefSeqs);
+
+    // iterate over all references in header
+    for ( unsigned int i = 0; i != numberRefSeqs; ++i ) {
+
+        // get length of reference name
+        m_stream.Read(buffer, sizeof(uint32_t));
+        uint32_t refNameLength = BamTools::UnpackUnsignedInt(buffer);
+        if ( m_isBigEndian ) BamTools::SwapEndian_32(refNameLength);
+        RaiiBuffer refName(refNameLength);
+
+        // get reference name and reference sequence length
+        m_stream.Read(refName.Buffer, refNameLength);
+        m_stream.Read(buffer, sizeof(int32_t));
+        int32_t refLength = BamTools::UnpackSignedInt(buffer);
+        if ( m_isBigEndian ) BamTools::SwapEndian_32(refLength);
+
+        // store data for reference
+        // the name ends at the first NUL, but never beyond the bytes that were read:
+        // a zero-length or unterminated name must not be scanned past its buffer
+        const char* nameBegin = (const char*)refName.Buffer;
+        const char* nameEnd   = find(nameBegin, nameBegin + refNameLength, '\0');
+        RefData aReference;
+        aReference.RefName.assign(nameBegin, nameEnd);
+        aReference.RefLength = refLength;
+        m_references.push_back(aReference);
+    }
+
+    // return success
+    return true;
+}
+
+// asks the random-access controller to find & load an index file for this BAM file
+// Returns false (with an error string set) if none could be located.
+bool BamReaderPrivate::LocateIndex(const BamIndex::IndexType& preferredType) {
+
+    if ( !m_randomAccessController.LocateIndex(this, preferredType) ) {
+        const string bracError = m_randomAccessController.GetErrorString();
+        const string message = string("could not locate index: \n\t") + bracError;
+        SetErrorString("BamReader::LocateIndex", message);
+        return false;
+    }
+
+    return true;
+}
+
+// opens BAM file
+// Closes any previously open file, opens the stream read-only, loads header and
+// reference data, and records the offset of the first alignment.
+// Returns false (with an error string set) if a BamException is raised.
+bool BamReaderPrivate::Open(const string& filename) {
+
+    try {
+
+        // start from a clean slate
+        Close();
+
+        // open BgzfStream
+        m_stream.Open(filename, IBamIODevice::ReadOnly);
+        assert(m_stream);
+
+        // read BAM metadata
+        LoadHeaderData();
+        LoadReferenceData();
+
+        // remember filename & where the alignments begin
+        m_filename = filename;
+        m_alignmentsBeginOffset = m_stream.Tell();
+
+        return true;
+
+    } catch ( BamException& e ) {
+        string message("could not open file: ");
+        message += filename;
+        message += "\n\t";
+        message += e.what();
+        SetErrorString("BamReader::Open", message);
+        return false;
+    }
+}
+
+// asks the random-access controller to load the given index file
+// Returns false (with an error string set) if the index could not be opened.
+bool BamReaderPrivate::OpenIndex(const std::string& indexFilename) {
+
+    if ( !m_randomAccessController.OpenIndex(indexFilename, this) ) {
+        const string bracError = m_randomAccessController.GetErrorString();
+        const string message = string("could not open index: \n\t") + bracError;
+        SetErrorString("BamReader::OpenIndex", message);
+        return false;
+    }
+
+    return true;
+}
+
+// returns BAM file pointer to beginning of alignment data
+// Also clears any region that was set. Returns false (with an error string
+// set) if the seek fails.
+bool BamReaderPrivate::Rewind(void) {
+
+    // forget current region
+    m_randomAccessController.ClearRegion();
+
+    // seek back to first alignment; on failure wrap the error left by Seek()
+    if ( !Seek(m_alignmentsBeginOffset) ) {
+        const string currentError = m_errorString;
+        const string message = string("could not rewind: \n\t") + currentError;
+        SetErrorString("BamReader::Rewind", message);
+        return false;
+    }
+
+    return true;
+}
+
+// moves the stream to the given position
+// Returns false (with an error string set) if the file is not open or the
+// stream raises a BamException.
+bool BamReaderPrivate::Seek(const int64_t& position) {
+
+    // seeking requires an open file
+    if ( !IsOpen() ) {
+        SetErrorString("BamReader::Seek", "cannot seek on unopened BAM file");
+        return false;
+    }
+
+    try {
+        m_stream.Seek(position);
+    }
+    catch ( BamException& e ) {
+        const string streamError = e.what();
+        const string message = string("could not seek in BAM file: \n\t") + streamError;
+        SetErrorString("BamReader::Seek", message);
+        return false;
+    }
+
+    return true;
+}
+
+void BamReaderPrivate::SetErrorString(const string& where, const string& what) {
+ static const string SEPARATOR = ": ";
+ m_errorString = where + SEPARATOR + what;
+}
+
+// hands the given index to the random-access controller
+void BamReaderPrivate::SetIndex(BamIndex* index) {
+    m_randomAccessController.SetIndex(index);
+}
+
+// sets current region & attempts to jump to it
+// returns success/failure (an error string is set on failure)
+bool BamReaderPrivate::SetRegion(const BamRegion& region) {
+
+    if ( !m_randomAccessController.SetRegion(region, m_references.size()) ) {
+        const string bracError = m_randomAccessController.GetErrorString();
+        const string message = string("could not set region: \n\t") + bracError;
+        SetErrorString("BamReader::SetRegion", message);
+        return false;
+    }
+
+    return true;
+}
+
+// returns the stream's current position
+int64_t BamReaderPrivate::Tell(void) const {
+    return m_stream.Tell();
+}
--- /dev/null
+// ***************************************************************************
+// BamReader_p.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for reading BAM files
+// ***************************************************************************
+
+#ifndef BAMREADER_P_H
+#define BAMREADER_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include "api/BamAlignment.h"
+#include "api/BamIndex.h"
+#include "api/BamReader.h"
+#include "api/SamHeader.h"
+#include "api/internal/bam/BamHeader_p.h"
+#include "api/internal/bam/BamRandomAccessController_p.h"
+#include "api/internal/io/BgzfStream_p.h"
+#include <string>
+
+namespace BamTools {
+namespace Internal {
+
+// Implementation class behind BamReader: owns the BGZF stream, the header,
+// the reference data and the random-access controller for one BAM file.
+class BamReaderPrivate {
+
+ // ctor & dtor
+ public:
+ BamReaderPrivate(BamReader* parent);
+ ~BamReaderPrivate(void);
+
+ // BamReader interface
+ public:
+
+ // file operations
+ // clears per-file state and closes the stream if open
+ bool Close(void);
+ // returns the stored filename
+ const std::string Filename(void) const;
+ // true if the underlying stream is open
+ bool IsOpen(void) const;
+ // opens the file and loads header & reference data
+ bool Open(const std::string& filename);
+ // clears the region and seeks back to the first alignment
+ bool Rewind(void);
+ // sets the region of interest via the random-access controller
+ bool SetRegion(const BamRegion& region);
+
+ // access alignment data
+ // next alignment with character data built
+ bool GetNextAlignment(BamAlignment& alignment);
+ // next alignment with core data only (SupportData.HasCoreOnly is set)
+ bool GetNextAlignmentCore(BamAlignment& alignment);
+
+ // access auxiliary data
+ std::string GetHeaderText(void) const;
+ SamHeader GetSamHeader(void) const;
+ int GetReferenceCount(void) const;
+ const RefVector& GetReferenceData(void) const;
+ // returns index of the reference named refName, or -1 if not found
+ int GetReferenceID(const std::string& refName) const;
+
+ // index operations
+ // each of these delegates to the random-access controller
+ bool CreateIndex(const BamIndex::IndexType& type);
+ bool HasIndex(void) const;
+ bool LocateIndex(const BamIndex::IndexType& preferredType);
+ bool OpenIndex(const std::string& indexFilename);
+ void SetIndex(BamIndex* index);
+
+ // error handling
+ std::string GetErrorString(void) const;
+ // stores "<where>: <what>" as the current error description
+ void SetErrorString(const std::string& where, const std::string& what);
+
+ // internal methods, but available as a BamReaderPrivate 'interface'
+ //
+ // these methods should only be used by BamTools::Internal classes
+ // (currently only used by the BamIndex subclasses)
+ public:
+ // retrieves header text from BAM file
+ void LoadHeaderData(void);
+ // retrieves BAM alignment under file pointer
+ // (does no overlap checking or character data parsing)
+ bool LoadNextAlignment(BamAlignment& alignment);
+ // builds reference data structure from BAM file
+ bool LoadReferenceData(void);
+ // seek reader to file position
+ bool Seek(const int64_t& position);
+ // return reader's file position
+ int64_t Tell(void) const;
+
+ // data members
+ public:
+
+ // general BAM file data
+ // stream position of the first alignment, recorded by Open() and used by Rewind()
+ int64_t m_alignmentsBeginOffset;
+ std::string m_filename;
+ RefVector m_references;
+
+ // system data
+ // result of BamTools::SystemIsBigEndian(); controls byte swapping when reading
+ bool m_isBigEndian;
+
+ // parent BamReader
+ BamReader* m_parent;
+
+ // BamReaderPrivate components
+ BamHeader m_header;
+ BamRandomAccessController m_randomAccessController;
+ BgzfStream m_stream;
+
+ // error handling
+ std::string m_errorString;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMREADER_P_H
--- /dev/null
+// ***************************************************************************
+// BamWriter_p.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for producing BAM files
+// ***************************************************************************
+
+#include "api/BamAlignment.h"
+#include "api/BamConstants.h"
+#include "api/IBamIODevice.h"
+#include "api/internal/bam/BamWriter_p.h"
+#include "api/internal/utils/BamException_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cstdlib>
+#include <cstring>
+using namespace std;
+
+// ctor - records host endianness
+BamWriterPrivate::BamWriterPrivate(void)
+    : m_isBigEndian( BamTools::SystemIsBigEndian() )
+{
+}
+
+// dtor - closes the output stream if it is still open
+BamWriterPrivate::~BamWriterPrivate(void) {
+    Close();
+}
+
+// calculates minimum bin for a BAM alignment interval [begin, end)
+// Bin levels are tried from the finest (shift 14) to the coarsest (shift 26);
+// the first level at which begin and end-1 share a bin wins. Returns 0 if none does.
+uint32_t BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const {
+
+    // per-level bit shift and the bin number offset of that level
+    static const int LEVEL_SHIFTS[]  = { 14, 17, 20, 23, 26 };
+    static const int LEVEL_OFFSETS[] = { 4681, 585, 73, 9, 1 };
+    static const int LEVEL_COUNT     = 5;
+
+    // interval is half-open, so compare against the last covered position
+    const int lastPosition = end - 1;
+
+    for ( int level = 0; level < LEVEL_COUNT; ++level ) {
+        const int shift = LEVEL_SHIFTS[level];
+        if ( (begin >> shift) == (lastPosition >> shift) )
+            return LEVEL_OFFSETS[level] + (begin >> shift);
+    }
+    return 0;
+}
+
+// closes the alignment archive
+// Does nothing if the file is not open. A BamException raised while closing
+// is recorded in the error string rather than propagated.
+void BamWriterPrivate::Close(void) {
+
+    if ( IsOpen() ) {
+        try {
+            m_stream.Close();
+        } catch ( BamException& e ) {
+            m_errorString = e.what();
+        }
+    }
+}
+
+// creates a packed (binary) cigar string from the supplied CIGAR operations
+//
+// cigarOperations : operations to encode
+// packedCigar     : output; resized to one BAM_SIZEOF_INT-sized word per operation,
+//                   each word holding (length << BAM_CIGAR_SHIFT) | op-code
+//
+// Throws BamException if an operation has an unrecognized type character.
+void BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
+
+    // initialize
+    const size_t numCigarOperations = cigarOperations.size();
+    packedCigar.resize(numCigarOperations * Constants::BAM_SIZEOF_INT);
+
+    // pack the cigar data into the string
+    unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
+
+    // iterate over cigar operations
+    vector<CigarOp>::const_iterator coIter = cigarOperations.begin();
+    vector<CigarOp>::const_iterator coEnd  = cigarOperations.end();
+    for ( ; coIter != coEnd; ++coIter ) {
+
+        // translate the operation's type character into its numeric BAM code
+        uint8_t cigarOp;
+        switch ( coIter->Type ) {
+            case (Constants::BAM_CIGAR_MATCH_CHAR)    : cigarOp = Constants::BAM_CIGAR_MATCH;    break;
+            case (Constants::BAM_CIGAR_INS_CHAR)      : cigarOp = Constants::BAM_CIGAR_INS;      break;
+            case (Constants::BAM_CIGAR_DEL_CHAR)      : cigarOp = Constants::BAM_CIGAR_DEL;      break;
+            case (Constants::BAM_CIGAR_REFSKIP_CHAR)  : cigarOp = Constants::BAM_CIGAR_REFSKIP;  break;
+            case (Constants::BAM_CIGAR_SOFTCLIP_CHAR) : cigarOp = Constants::BAM_CIGAR_SOFTCLIP; break;
+            case (Constants::BAM_CIGAR_HARDCLIP_CHAR) : cigarOp = Constants::BAM_CIGAR_HARDCLIP; break;
+            case (Constants::BAM_CIGAR_PAD_CHAR)      : cigarOp = Constants::BAM_CIGAR_PAD;      break;
+            case (Constants::BAM_CIGAR_SEQMATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_SEQMATCH; break;
+            case (Constants::BAM_CIGAR_MISMATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_MISMATCH; break;
+            default:
+                // separator added so the offending character is readable in the message
+                const string message = string("invalid CIGAR operation type: ") + coIter->Type;
+                throw BamException("BamWriter::CreatePackedCigar", message);
+        }
+
+        // store op in packedCigar
+        *pPackedCigar = coIter->Length << Constants::BAM_CIGAR_SHIFT | cigarOp;
+        pPackedCigar++;
+    }
+}
+
+// encodes the supplied query sequence into 4-bit notation
+//
+// query        : bases to encode, one character per base
+// encodedQuery : output; resized to (query.size()+1)/2 bytes, two bases per byte
+//                with the first base of each pair in the high 4 bits
+//
+// Throws BamException on any character that is not a recognized base code.
+// The whole of 'query' (by its length) is encoded; the loop no longer stops at
+// the first NUL character.
+void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {
+
+    // prepare the encoded query string
+    const size_t queryLength = query.size();
+    const size_t encodedQueryLength = static_cast<size_t>((queryLength+1)/2);
+    encodedQuery.resize(encodedQueryLength);
+    char* pEncodedQuery = (char*)encodedQuery.data();
+    const char* pQuery = (const char*)query.data();
+    const char* const pQueryEnd = pQuery + queryLength;
+
+    // walk through original query sequence, encoding its bases
+    unsigned char nucleotideCode;
+    bool useHighWord = true;
+    for ( ; pQuery != pQueryEnd; ++pQuery ) {
+        switch ( *pQuery ) {
+            case (Constants::BAM_DNA_EQUAL) : nucleotideCode = Constants::BAM_BASECODE_EQUAL; break;
+            case (Constants::BAM_DNA_A)     : nucleotideCode = Constants::BAM_BASECODE_A;     break;
+            case (Constants::BAM_DNA_C)     : nucleotideCode = Constants::BAM_BASECODE_C;     break;
+            case (Constants::BAM_DNA_M)     : nucleotideCode = Constants::BAM_BASECODE_M;     break;
+            case (Constants::BAM_DNA_G)     : nucleotideCode = Constants::BAM_BASECODE_G;     break;
+            case (Constants::BAM_DNA_R)     : nucleotideCode = Constants::BAM_BASECODE_R;     break;
+            case (Constants::BAM_DNA_S)     : nucleotideCode = Constants::BAM_BASECODE_S;     break;
+            case (Constants::BAM_DNA_V)     : nucleotideCode = Constants::BAM_BASECODE_V;     break;
+            case (Constants::BAM_DNA_T)     : nucleotideCode = Constants::BAM_BASECODE_T;     break;
+            case (Constants::BAM_DNA_W)     : nucleotideCode = Constants::BAM_BASECODE_W;     break;
+            case (Constants::BAM_DNA_Y)     : nucleotideCode = Constants::BAM_BASECODE_Y;     break;
+            case (Constants::BAM_DNA_H)     : nucleotideCode = Constants::BAM_BASECODE_H;     break;
+            case (Constants::BAM_DNA_K)     : nucleotideCode = Constants::BAM_BASECODE_K;     break;
+            case (Constants::BAM_DNA_D)     : nucleotideCode = Constants::BAM_BASECODE_D;     break;
+            case (Constants::BAM_DNA_B)     : nucleotideCode = Constants::BAM_BASECODE_B;     break;
+            case (Constants::BAM_DNA_N)     : nucleotideCode = Constants::BAM_BASECODE_N;     break;
+            default:
+                const string message = string("invalid base: ") + *pQuery;
+                throw BamException("BamWriter::EncodeQuerySequence", message);
+        }
+
+        // pack the nucleotide code: first base of a pair fills the high 4 bits
+        // (overwriting the byte), second base is OR-ed into the low 4 bits
+        if ( useHighWord ) {
+            *pEncodedQuery = nucleotideCode << 4;
+            useHighWord = false;
+        } else {
+            *pEncodedQuery |= nucleotideCode;
+            ++pEncodedQuery;
+            useHighWord = true;
+        }
+    }
+}
+
+ // returns a description of the last error that occurred (a copy of m_errorString, which Open() and SaveAlignment() fill from the caught BamException)
+ std::string BamWriterPrivate::GetErrorString(void) const {
+ return m_errorString;
+ }
+
+ // returns whether BAM file is open for writing or not (simply forwards the underlying stream's IsOpen() result)
+ bool BamWriterPrivate::IsOpen(void) const {
+ return m_stream.IsOpen();
+ }
+
+ // opens the alignment archive
+ //
+ // Opens @filename for writing and emits the leading BAM sections in file order:
+ // magic number, SAM header text, reference sequence entries.
+ // Returns true on success. If a BamException is raised along the way, its message
+ // is stored (see GetErrorString()) and false is returned.
+ bool BamWriterPrivate::Open(const string& filename,
+                             const string& samHeaderText,
+                             const RefVector& referenceSequences)
+ {
+     try {
+         // open the BGZF file for writing
+         m_stream.Open(filename, IBamIODevice::WriteOnly);
+
+         // write BAM file 'metadata' components
+         WriteMagicNumber();
+         WriteSamHeaderText(samHeaderText);
+         WriteReferences(referenceSequences);
+     }
+     catch ( BamException& openError ) {
+         m_errorString = openError.what();
+         return false;
+     }
+
+     // nothing was thrown - archive is ready to receive alignments
+     return true;
+ }
+
+ // saves the alignment to the alignment archive
+ //
+ // Returns true if the record was written. If a BamException is raised while writing,
+ // its message is stored (see GetErrorString()) and false is returned.
+ bool BamWriterPrivate::SaveAlignment(const BamAlignment& al) {
+
+     try {
+         // an alignment holding only the core data plus a raw char data buffer
+         // (as a result of BamReader::GetNextAlignmentCore()) is written back as-is;
+         // anything else should have its standard fields populated: Name, QueryBases, etc
+         // (resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code)
+         const bool hasRawDataOnly = al.SupportData.HasCoreOnly;
+         if ( hasRawDataOnly )
+             WriteCoreAlignment(al);
+         else
+             WriteAlignment(al);
+     }
+     catch ( BamException& saveError ) {
+         m_errorString = saveError.what();
+         return false;
+     }
+
+     // if we get here, everything OK
+     return true;
+ }
+
+ // selects compressed (@ok == true) or uncompressed output on the underlying stream
+ // the request is silently ignored while a BAM file is open
+ void BamWriterPrivate::SetWriteCompressed(bool ok) {
+
+     // modifying compression is not allowed if BAM file is open
+     if ( IsOpen() )
+         return;
+
+     m_stream.SetWriteCompressed(ok);
+ }
+
+ // writes a fully-populated alignment as one BAM record
+ //
+ // Written in order: 32-bit block size, 8-word BAM core, null-terminated read name,
+ // packed CIGAR, 4-bit encoded bases, base qualities, tag data. Multi-byte values are
+ // byte-swapped before writing when m_isBigEndian is set.
+ //
+ // The alignment itself is never modified: qualities, CIGAR and tag data are converted
+ // in private copies, which are released automatically even if a write throws.
+ //
+ // throws BamException on an invalid CIGAR operation, base, tag type or array type
+ void BamWriterPrivate::WriteAlignment(const BamAlignment& al) {
+
+     // calculate char lengths
+     const unsigned int nameLength = al.Name.size() + 1;
+     const unsigned int numCigarOperations = al.CigarData.size();
+     const unsigned int queryLength = al.QueryBases.size();
+     const unsigned int tagDataLength = al.TagData.size();
+
+     // no way to tell if alignment's bin is already defined (there is no default, invalid value)
+     // so we'll go ahead calculate its bin ID before storing
+     const uint32_t alignmentBin = CalculateMinimumBin(al.Position, al.GetEndPosition());
+
+     // create our packed cigar string
+     string packedCigar;
+     CreatePackedCigar(al.CigarData, packedCigar);
+     const unsigned int packedCigarLength = packedCigar.size();
+
+     // encode the query
+     string encodedQuery;
+     EncodeQuerySequence(al.QueryBases, encodedQuery);
+     const unsigned int encodedQueryLength = encodedQuery.size();
+
+     // write the block size
+     const unsigned int dataBlockSize = nameLength +
+                                        packedCigarLength +
+                                        encodedQueryLength +
+                                        queryLength +
+                                        tagDataLength;
+     unsigned int blockSize = Constants::BAM_CORE_SIZE + dataBlockSize;
+     if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize);
+     m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT);
+
+     // assign the BAM core data
+     uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE];
+     buffer[0] = al.RefID;
+     buffer[1] = al.Position;
+     buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;
+     buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
+     buffer[4] = queryLength;
+     buffer[5] = al.MateRefID;
+     buffer[6] = al.MatePosition;
+     buffer[7] = al.InsertSize;
+
+     // swap BAM core endian-ness, if necessary
+     if ( m_isBigEndian ) {
+         for ( int i = 0; i < 8; ++i )
+             BamTools::SwapEndian_32(buffer[i]);
+     }
+
+     // write the BAM core
+     m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE);
+
+     // write the query name
+     m_stream.Write(al.Name.c_str(), nameLength);
+
+     // write the packed cigar
+     if ( m_isBigEndian ) {
+
+         // byte-swap a private copy, one whole 32-bit CIGAR operation at a time
+         // (stepping by a single byte would make the swaps overlap and run past the end)
+         string swappedCigar(packedCigar);
+         for ( size_t i = 0; i + sizeof(uint32_t) <= packedCigarLength; i += sizeof(uint32_t) )
+             BamTools::SwapEndian_32p(&swappedCigar[i]);
+         m_stream.Write(swappedCigar.data(), packedCigarLength);
+     }
+     else
+         m_stream.Write(packedCigar.data(), packedCigarLength);
+
+     // write the encoded query sequence
+     m_stream.Write(encodedQuery.data(), encodedQueryLength);
+
+     // write the base qualities
+     // convert a private copy: 'al' is const, and converting in place would subtract
+     // the FASTQ offset again every time the same alignment is saved
+     string baseQualities(al.Qualities, 0, queryLength);
+     for ( size_t i = 0; i < baseQualities.size(); ++i )
+         baseQualities[i] -= 33; // FASTQ conversion
+
+     // one quality byte is written per base; if fewer qualities than bases were
+     // supplied, mark the remainder as missing (0xFF) rather than reading out of bounds
+     baseQualities.resize(queryLength, static_cast<char>(0xFF));
+     m_stream.Write(baseQualities.data(), queryLength);
+
+     // write the tag data
+     if ( m_isBigEndian && tagDataLength > 0 ) {
+
+         // tag values are swapped in a private copy before writing
+         string tagBuffer(al.TagData);
+         char* tagData = &tagBuffer[0];
+
+         size_t i = 0;
+         while ( i < tagDataLength ) {
+
+             i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.)
+             const char type = tagData[i];    // get tag type at position i
+             ++i;
+
+             switch ( type ) {
+
+                 case(Constants::BAM_TAG_TYPE_ASCII) :
+                 case(Constants::BAM_TAG_TYPE_INT8) :
+                 case(Constants::BAM_TAG_TYPE_UINT8) :
+                     ++i;
+                     break;
+
+                 case(Constants::BAM_TAG_TYPE_INT16) :
+                 case(Constants::BAM_TAG_TYPE_UINT16) :
+                     BamTools::SwapEndian_16p(&tagData[i]);
+                     i += sizeof(uint16_t);
+                     break;
+
+                 case(Constants::BAM_TAG_TYPE_FLOAT) :
+                 case(Constants::BAM_TAG_TYPE_INT32) :
+                 case(Constants::BAM_TAG_TYPE_UINT32) :
+                     BamTools::SwapEndian_32p(&tagData[i]);
+                     i += sizeof(uint32_t);
+                     break;
+
+                 case(Constants::BAM_TAG_TYPE_HEX) :
+                 case(Constants::BAM_TAG_TYPE_STRING) :
+                     // no endian swapping necessary for hex-string/string data
+                     while ( tagData[i] )
+                         ++i;
+                     // increment one more for null terminator
+                     ++i;
+                     break;
+
+                 case(Constants::BAM_TAG_TYPE_ARRAY) :
+                 {
+                     // read array type
+                     const char arrayType = tagData[i];
+                     ++i;
+
+                     // the element count is still in host byte order at this point, so
+                     // retrieve it for the loop *before* swapping it into its on-disk form
+                     int32_t numElements;
+                     memcpy(&numElements, &tagData[i], sizeof(uint32_t));
+                     BamTools::SwapEndian_32p(&tagData[i]);
+                     i += sizeof(uint32_t);
+
+                     // swap endian-ness of array elements
+                     for ( int j = 0; j < numElements; ++j ) {
+                         switch (arrayType) {
+                             case (Constants::BAM_TAG_TYPE_INT8) :
+                             case (Constants::BAM_TAG_TYPE_UINT8) :
+                                 // no endian-swapping necessary
+                                 ++i;
+                                 break;
+                             case (Constants::BAM_TAG_TYPE_INT16) :
+                             case (Constants::BAM_TAG_TYPE_UINT16) :
+                                 BamTools::SwapEndian_16p(&tagData[i]);
+                                 i += sizeof(uint16_t);
+                                 break;
+                             case (Constants::BAM_TAG_TYPE_FLOAT) :
+                             case (Constants::BAM_TAG_TYPE_INT32) :
+                             case (Constants::BAM_TAG_TYPE_UINT32) :
+                                 BamTools::SwapEndian_32p(&tagData[i]);
+                                 i += sizeof(uint32_t);
+                                 break;
+                             default: {
+                                 const string message = string("invalid binary array type: ") + arrayType;
+                                 throw BamException("BamWriter::SaveAlignment", message);
+                             }
+                         }
+                     }
+
+                     break;
+                 }
+
+                 default : {
+                     const string message = string("invalid tag type: ") + type;
+                     throw BamException("BamWriter::SaveAlignment", message);
+                 }
+             }
+         }
+
+         m_stream.Write(tagBuffer.data(), tagDataLength);
+     }
+     else
+         m_stream.Write(al.TagData.data(), tagDataLength);
+ }
+
+ // writes an alignment that holds only core data plus its raw char data buffer
+ //
+ // The block size and char data are taken from al.SupportData unchanged; the 8-word
+ // core is rebuilt from the alignment's fields, with a freshly calculated bin.
+ void BamWriterPrivate::WriteCoreAlignment(const BamAlignment& al) {
+
+     // the block size leads the record (byte-swapped first on big-endian hosts)
+     unsigned int blockSize = al.SupportData.BlockLength;
+     if ( m_isBigEndian )
+         BamTools::SwapEndian_32(blockSize);
+     m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT);
+
+     // re-calculate bin (in case BamAlignment's position has been previously modified)
+     const uint32_t recalculatedBin = CalculateMinimumBin(al.Position, al.GetEndPosition());
+
+     // assemble the BAM core words
+     uint32_t core[Constants::BAM_CORE_BUFFER_SIZE];
+     core[0] = al.RefID;
+     core[1] = al.Position;
+     core[2] = (recalculatedBin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;
+     core[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;
+     core[4] = al.SupportData.QuerySequenceLength;
+     core[5] = al.MateRefID;
+     core[6] = al.MatePosition;
+     core[7] = al.InsertSize;
+
+     // swap BAM core endian-ness, if necessary
+     if ( m_isBigEndian ) {
+         for ( int word = 0; word < 8; ++word )
+             BamTools::SwapEndian_32(core[word]);
+     }
+
+     // core first, then everything after it straight from the raw char data
+     m_stream.Write((char*)&core, Constants::BAM_CORE_SIZE);
+     m_stream.Write((char*)al.SupportData.AllCharData.data(),
+                    al.SupportData.BlockLength-Constants::BAM_CORE_SIZE);
+ }
+
+ void BamWriterPrivate::WriteMagicNumber(void) {
+ // write BAM file 'magic number' (BAM_HEADER_MAGIC_LENGTH bytes of BAM_HEADER_MAGIC; Open() calls this first, right after opening the stream)
+ m_stream.Write(Constants::BAM_HEADER_MAGIC, Constants::BAM_HEADER_MAGIC_LENGTH);
+ }
+
+ // writes the reference sequence entries
+ //
+ // Emits the 32-bit number of references, then for each reference: the 32-bit name
+ // length (including the null terminator), the null-terminated name, and the 32-bit
+ // reference length. Integers are byte-swapped before writing when m_isBigEndian is set.
+ void BamWriterPrivate::WriteReferences(const BamTools::RefVector& referenceSequences) {
+
+     // write the number of reference sequences
+     uint32_t numReferenceSequences = referenceSequences.size();
+     if ( m_isBigEndian ) BamTools::SwapEndian_32(numReferenceSequences);
+     m_stream.Write((char*)&numReferenceSequences, Constants::BAM_SIZEOF_INT);
+
+     // foreach reference sequence
+     RefVector::const_iterator rsIter = referenceSequences.begin();
+     RefVector::const_iterator rsEnd = referenceSequences.end();
+     for ( ; rsIter != rsEnd; ++rsIter ) {
+
+         // write the reference sequence name length
+         // keep the host-order length intact: it is still needed as the byte count
+         // for the name itself, so only a copy is swapped for output
+         const uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;
+         uint32_t nameLenToWrite = referenceSequenceNameLen;
+         if ( m_isBigEndian ) BamTools::SwapEndian_32(nameLenToWrite);
+         m_stream.Write((char*)&nameLenToWrite, Constants::BAM_SIZEOF_INT);
+
+         // write the reference sequence name (with its null terminator)
+         m_stream.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);
+
+         // write the reference sequence length
+         int32_t referenceLength = rsIter->RefLength;
+         if ( m_isBigEndian ) BamTools::SwapEndian_32(referenceLength);
+         m_stream.Write((char*)&referenceLength, Constants::BAM_SIZEOF_INT);
+     }
+ }
+
+ // writes the SAM header text section: a 32-bit length followed by the text itself
+ // (nothing follows the length when the text is empty)
+ void BamWriterPrivate::WriteSamHeaderText(const std::string& samHeaderText) {
+
+     // write the SAM header text length
+     // keep the host-order length intact: it is still needed as the byte count for
+     // the text, so only a copy is swapped for output
+     const uint32_t samHeaderLen = samHeaderText.size();
+     uint32_t headerLenToWrite = samHeaderLen;
+     if ( m_isBigEndian ) BamTools::SwapEndian_32(headerLenToWrite);
+     m_stream.Write((char*)&headerLenToWrite, Constants::BAM_SIZEOF_INT);
+
+     // write the SAM header text
+     if ( samHeaderLen > 0 )
+         m_stream.Write(samHeaderText.data(), samHeaderLen);
+ }
--- /dev/null
+// ***************************************************************************
+// BamWriter_p.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for producing BAM files
+// ***************************************************************************
+
+#ifndef BAMWRITER_P_H
+#define BAMWRITER_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to
+// version without notice, or even be removed.
+//
+// We mean it.
+
+#include "api/BamAux.h"
+#include "api/internal/io/BgzfStream_p.h"
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+class BamAlignment;
+
+namespace Internal {
+
+ class BamWriterPrivate { // provides the basic functionality for producing BAM files (methods are implemented in BamWriter_p.cpp)
+
+ // ctor & dtor
+ public:
+ BamWriterPrivate(void);
+ ~BamWriterPrivate(void);
+
+ // interface methods
+ public:
+ void Close(void);
+ std::string GetErrorString(void) const; // description of the last error that occurred
+ bool IsOpen(void) const; // true while the output stream is open
+ bool Open(const std::string& filename,
+ const std::string& samHeaderText,
+ const BamTools::RefVector& referenceSequences); // opens file, writes magic number, header text & references; false on error
+ bool SaveAlignment(const BamAlignment& al); // writes one alignment; false on error (see GetErrorString)
+ void SetWriteCompressed(bool ok); // ignored while a file is open
+
+ // 'internal' methods
+ public:
+ uint32_t CalculateMinimumBin(const int begin, int end) const;
+ void CreatePackedCigar(const std::vector<BamTools::CigarOp>& cigarOperations, std::string& packedCigar);
+ void EncodeQuerySequence(const std::string& query, std::string& encodedQuery); // 4-bit base encoding, two bases per byte
+ void WriteAlignment(const BamAlignment& al); // for alignments with standard fields populated
+ void WriteCoreAlignment(const BamAlignment& al); // for core-only alignments carrying raw char data
+ void WriteMagicNumber(void);
+ void WriteReferences(const BamTools::RefVector& referenceSequences);
+ void WriteSamHeaderText(const std::string& samHeaderText);
+
+ // data members
+ private:
+ BgzfStream m_stream; // output stream all Write* methods emit to
+ bool m_isBigEndian; // when true, multi-byte values are byte-swapped before being written
+ std::string m_errorString; // message of the last caught BamException
+ };
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMWRITER_P_H
--- /dev/null
+// ***************************************************************************
+// BamIndexFactory_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides interface for generating BamIndex implementations
+// ***************************************************************************
+
+#include "api/BamAux.h"
+#include "api/internal/index/BamIndexFactory_p.h"
+#include "api/internal/index/BamStandardIndex_p.h"
+#include "api/internal/index/BamToolsIndex_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+using namespace std;
+
+ // generates index filename from BAM filename (depending on requested type)
+ // the index format's extension is appended to @bamFilename
+ // if type is unknown, returns empty string
+ const string BamIndexFactory::CreateIndexFilename(const string& bamFilename,
+                                                    const BamIndex::IndexType& type)
+ {
+     if ( type == BamIndex::STANDARD )
+         return ( bamFilename + BamStandardIndex::Extension() );
+
+     if ( type == BamIndex::BAMTOOLS )
+         return ( bamFilename + BamToolsIndex::Extension() );
+
+     // not a recognized index type
+     return string();
+ }
+
+ // creates a new BamIndex object, depending on extension of @indexFilename
+ // returns a null index if the file does not exist, has no usable extension,
+ // or the extension matches no supported index format
+ BamIndex* BamIndexFactory::CreateIndexFromFilename(const string& indexFilename, BamReaderPrivate* reader) {
+
+     BamIndex* index = 0;
+
+     // only an existing file can be associated with an index
+     if ( BamTools::FileExists(indexFilename) ) {
+
+         // get file extension from index filename, including dot (".EXT")
+         const string extension = FileExtension(indexFilename);
+
+         // create index based on extension (nothing is created without one)
+         if ( !extension.empty() ) {
+             if ( extension == BamStandardIndex::Extension() )
+                 index = new BamStandardIndex(reader);
+             else if ( extension == BamToolsIndex::Extension() )
+                 index = new BamToolsIndex(reader);
+         }
+     }
+
+     return index;
+ }
+
+ // creates a new BamIndex object of requested @type, bound to @reader
+ // returns a null index if @type is not a supported index type
+ BamIndex* BamIndexFactory::CreateIndexOfType(const BamIndex::IndexType& type,
+                                              BamReaderPrivate* reader)
+ {
+     if ( type == BamIndex::STANDARD )
+         return new BamStandardIndex(reader);
+
+     if ( type == BamIndex::BAMTOOLS )
+         return new BamToolsIndex(reader);
+
+     // unknown type requested
+     return 0;
+ }
+
+ // retrieves file extension (including '.')
+ // returns empty string if @filename has 4 or fewer characters or contains no '.'
+ const string BamIndexFactory::FileExtension(const string& filename) {
+
+     // a filename this short cannot contain valid path + extension
+     if ( filename.length() > 4 ) {
+
+         // everything from the last dot onwards is the extension
+         const size_t dotPosition = filename.rfind('.');
+         if ( dotPosition != string::npos )
+             return filename.substr(dotPosition);
+     }
+
+     // too short, or no dot present
+     return string();
+ }
+
+ // returns name of existing index file that corresponds to @bamFilename
+ // will defer to @preferredType if possible, if not will attempt to load any supported type
+ // returns empty string if not found
+ const string BamIndexFactory::FindIndexFilename(const string& bamFilename,
+                                                  const BamIndex::IndexType& preferredType)
+ {
+     // skip if BAM filename provided is empty
+     if ( bamFilename.empty() )
+         return string();
+
+     // probe order: the preferred type first, then the remaining supported types
+     const BamIndex::IndexType searchOrder[] = { preferredType,
+                                                  BamIndex::STANDARD,
+                                                  BamIndex::BAMTOOLS };
+     const size_t numCandidates = sizeof(searchOrder) / sizeof(searchOrder[0]);
+
+     for ( size_t i = 0; i < numCandidates; ++i ) {
+
+         // the preferred type was already tried in the first slot
+         if ( i > 0 && searchOrder[i] == preferredType )
+             continue;
+
+         // return index filename if found
+         const string indexFilename = CreateIndexFilename(bamFilename, searchOrder[i]);
+         if ( !indexFilename.empty() && BamTools::FileExists(indexFilename) )
+             return indexFilename;
+     }
+
+     // otherwise couldn't find any index matching this filename
+     return string();
+ }
--- /dev/null
+// ***************************************************************************
+// BamIndexFactory_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides interface for generating BamIndex implementations
+// ***************************************************************************
+
+#ifndef BAMINDEX_FACTORY_P_H
+#define BAMINDEX_FACTORY_P_H
+
+#include "api/BamIndex.h"
+#include <string>
+
+namespace BamTools {
+namespace Internal {
+
+ class BamIndexFactory { // static-only helper: maps index filenames & types to the supported BamIndex implementations
+
+ // static interface methods
+ public:
+ // creates a new BamIndex object, depending on extension of @indexFilename
+ // (returns null if the file does not exist or its extension matches no supported format)
+ static BamIndex* CreateIndexFromFilename(const std::string& indexFilename,
+ BamReaderPrivate* reader);
+ // creates a new BamIndex object, of requested @type (returns null if @type is unknown)
+ static BamIndex* CreateIndexOfType(const BamIndex::IndexType& type,
+ BamReaderPrivate* reader);
+ // returns name of existing index file that corresponds to @bamFilename
+ // will defer to @preferredType if possible
+ // if @preferredType not found, will attempt to load any supported index type
+ // returns empty string if no index file (of any type) is found
+ static const std::string FindIndexFilename(const std::string& bamFilename,
+ const BamIndex::IndexType& preferredType);
+
+ // internal methods
+ public:
+ // generates index filename from BAM filename (depending on requested type)
+ // if type is unknown, returns empty string
+ static const std::string CreateIndexFilename(const std::string& bamFilename,
+ const BamIndex::IndexType& type);
+ // retrieves file extension (including '.'); empty string if @filename has 4 or fewer characters or no '.'
+ static const std::string FileExtension(const std::string& filename);
+ };
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMINDEX_FACTORY_P_H
--- /dev/null
+// ***************************************************************************
+// BamStandardIndex.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides index operations for the standardized BAM index format (".bai")
+// ***************************************************************************
+
+#include "api/BamAlignment.h"
+#include "api/internal/bam/BamReader_p.h"
+#include "api/internal/index/BamStandardIndex_p.h"
+#include "api/internal/utils/BamException_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <algorithm>
+#include <sstream>
+using namespace std;
+
+// -----------------------------------
+// static BamStandardIndex constants
+// -----------------------------------
+
+ const int BamStandardIndex::MAX_BIN = 37450; // =(8^6-1)/7+1
+ const int BamStandardIndex::BAM_LIDX_SHIFT = 14; // right-shift applied to a start position in CalculateMinOffset() to select its linear offset
+ const string BamStandardIndex::BAI_EXTENSION = ".bai"; // value returned by Extension()
+ const char* const BamStandardIndex::BAI_MAGIC = "BAI\1"; // the 4 leading bytes CheckMagicNumber() expects
+ const int BamStandardIndex::SIZEOF_ALIGNMENTCHUNK = sizeof(uint64_t)*2; // two 64-bit values per chunk (start & stop, as read in CalculateCandidateOffsets())
+ const int BamStandardIndex::SIZEOF_BINCORE = sizeof(uint32_t) + sizeof(int32_t); // presumably bin ID + alignment chunk count - TODO confirm against ReadBinIntoBuffer()
+ const int BamStandardIndex::SIZEOF_LINEAROFFSET = sizeof(uint64_t); // one 64-bit value per linear offset
+
+// ----------------------------
+// RaiiWrapper implementation
+// ----------------------------
+
+ BamStandardIndex::RaiiWrapper::RaiiWrapper(void) // starts out empty: null index stream, null buffer
+ : IndexStream(0)
+ , Buffer(0)
+ { }
+
+ // releases whatever the wrapper still holds: closes the index stream (if any)
+ // and frees the buffer (if any), leaving both members null
+ BamStandardIndex::RaiiWrapper::~RaiiWrapper(void) {
+
+     // close the index file handle, if one is held
+     if ( IndexStream != 0 )
+         fclose(IndexStream);
+     IndexStream = 0;
+
+     // delete[] on a null pointer does nothing, so no separate check is needed
+     delete[] Buffer;
+     Buffer = 0;
+ }
+
+// ---------------------------------
+// BamStandardIndex implementation
+// ---------------------------------
+
+ // ctor - passes @reader to the BamIndex base, starts with a zero buffer length, and records whether the host is big-endian
+ BamStandardIndex::BamStandardIndex(Internal::BamReaderPrivate* reader)
+ : BamIndex(reader)
+ , m_bufferLength(0)
+ {
+ m_isBigEndian = BamTools::SystemIsBigEndian();
+ }
+
+ // dtor - CloseFile() closes the index stream (if open), clears the summary data and frees the I/O buffer
+ BamStandardIndex::~BamStandardIndex(void) {
+ CloseFile();
+ }
+
+ // converts @region into concrete coordinates on its left-bound reference
+ //
+ // @begin receives the region's left position. @end receives the right position when
+ // a right bound is specified on the same reference; otherwise the reference length.
+ //
+ // throws BamException if the left position is not within the reference
+ void BamStandardIndex::AdjustRegion(const BamRegion& region, uint32_t& begin, uint32_t& end) {
+
+     // retrieve references from reader
+     const RefVector& references = m_reader->GetReferenceData();
+
+     // LeftPosition cannot be greater than or equal to reference length
+     if ( region.LeftPosition >= references.at(region.LeftRefID).RefLength )
+         throw BamException("BamStandardIndex::AdjustRegion", "invalid region requested");
+
+     // region 'begin' is simply the left position
+     begin = (unsigned int)region.LeftPosition;
+
+     // the right bound is usable only if specified AND on the same reference as the left bound;
+     // otherwise the region runs to the last reference base
+     const bool useRightBound = region.isRightBoundSpecified() &&
+                                ( region.LeftRefID == region.RightRefID );
+     if ( useRightBound )
+         end = (unsigned int)region.RightPosition;
+     else
+         end = (unsigned int)references.at(region.LeftRefID).RefLength;
+ }
+
+ // [begin, end)
+ //
+ // fills @candidateBins with bin '0' plus, for each finer level of the binning
+ // scheme, every bin ID from the one containing @begin through the one containing @end
+ void BamStandardIndex::CalculateCandidateBins(const uint32_t& begin,
+                                               const uint32_t& end,
+                                               set<uint16_t>& candidateBins)
+ {
+     // initialize list, bin '0' is always a valid bin
+     candidateBins.insert(0);
+
+     // per level: the ID of its first bin, and the shift that turns a coordinate
+     // into a bin index within that level
+     static const unsigned int LEVEL_FIRST_BIN[] = { 1, 9, 73, 585, 4681 };
+     static const unsigned int LEVEL_SHIFT[]     = { 26, 23, 20, 17, 14 };
+     const size_t numLevels = sizeof(LEVEL_SHIFT) / sizeof(LEVEL_SHIFT[0]);
+
+     // get rest of bins that contain this region
+     for ( size_t level = 0; level < numLevels; ++level ) {
+         const unsigned int firstBin = LEVEL_FIRST_BIN[level] + (begin >> LEVEL_SHIFT[level]);
+         const unsigned int lastBin  = LEVEL_FIRST_BIN[level] + (end   >> LEVEL_SHIFT[level]);
+         for ( unsigned int k = firstBin; k <= lastBin; ++k )
+             candidateBins.insert(k);
+     }
+ }
+
+ // scans the bins stored for one reference and, for every bin that is also listed in
+ // @candidateBins, appends to @offsets the start offset of each alignment chunk whose
+ // stop offset is at or beyond @minOffset
+ //
+ // matched bin IDs are removed from @candidateBins as they are found, and the scan
+ // stops early once that set is empty
+ void BamStandardIndex::CalculateCandidateOffsets(const BaiReferenceSummary& refSummary,
+                                                  const uint64_t& minOffset,
+                                                  set<uint16_t>& candidateBins,
+                                                  vector<int64_t>& offsets)
+ {
+     // seek to first bin
+     Seek(refSummary.FirstBinFilePosition, SEEK_SET);
+
+     // iterate over reference bins
+     uint32_t binId;
+     int32_t numAlignmentChunks;
+     for ( int binIndex = 0; binIndex < refSummary.NumBins; ++binIndex ) {
+
+         // read bin contents (if successful, alignment chunks are now in the I/O buffer)
+         ReadBinIntoBuffer(binId, numAlignmentChunks);
+
+         // bins that are not 'candidate bins' need no further inspection
+         set<uint16_t>::iterator matchedBin = candidateBins.find(binId);
+         if ( matchedBin == candidateBins.end() )
+             continue;
+
+         // check the bin's alignment chunks for overlap
+         size_t bufferPosition = 0;
+         for ( int chunk = 0; chunk < numAlignmentChunks; ++chunk ) {
+
+             // read chunk start & stop from buffer
+             uint64_t chunkStart;
+             uint64_t chunkStop;
+             memcpy((char*)&chunkStart, Resources.Buffer+bufferPosition, sizeof(uint64_t));
+             bufferPosition += sizeof(uint64_t);
+             memcpy((char*)&chunkStop, Resources.Buffer+bufferPosition, sizeof(uint64_t));
+             bufferPosition += sizeof(uint64_t);
+
+             // swap endian-ness if necessary
+             if ( m_isBigEndian ) {
+                 SwapEndian_64(chunkStart);
+                 SwapEndian_64(chunkStop);
+             }
+
+             // keep the chunk's start offset unless the chunk ends before 'minOffset'
+             if ( chunkStop >= minOffset )
+                 offsets.push_back(chunkStart);
+         }
+
+         // 'pop' bin ID from candidate bins set
+         candidateBins.erase(matchedBin);
+
+         // quit if no more candidates
+         if ( candidateBins.empty() )
+             break;
+     }
+ }
+
+ // returns the linear offset to use as the lower bound when searching from @begin
+ // (0 if the reference has no linear offsets at all)
+ uint64_t BamStandardIndex::CalculateMinOffset(const BaiReferenceSummary& refSummary,
+                                               const uint32_t& begin)
+ {
+     // if no linear offsets exist, return 0
+     if ( refSummary.NumLinearOffsets == 0 )
+         return 0;
+
+     // use the offset corresponding to the requested start position, when there is one
+     const int shiftedBegin = begin>>BamStandardIndex::BAM_LIDX_SHIFT;
+     if ( shiftedBegin < refSummary.NumLinearOffsets )
+         return LookupLinearOffset( refSummary, shiftedBegin );
+
+     // 'begin' starts beyond last linear offset: use the last linear offset as minimum
+     return LookupLinearOffset( refSummary, refSummary.NumLinearOffsets-1 );
+ }
+
+ // makes sure @buffer can hold at least @requestedBytes
+ //
+ // If the current capacity (@bufferLength) is too small, a new buffer of
+ // requestedBytes+10 bytes replaces the old one; previous contents are NOT preserved.
+ // If allocation fails, @buffer and @bufferLength are left untouched (so the caller
+ // never ends up holding an already-deleted pointer) and a BamException is thrown.
+ void BamStandardIndex::CheckBufferSize(char*& buffer,
+                                        unsigned int& bufferLength,
+                                        const unsigned int& requestedBytes)
+ {
+     // current buffer is already large enough
+     if ( requestedBytes <= bufferLength )
+         return;
+
+     // allocate the replacement *before* giving up the old buffer
+     const unsigned int newLength = requestedBytes + 10;
+     char* newBuffer = 0;
+     try {
+         newBuffer = new char[newLength];
+     } catch ( std::bad_alloc& ) {
+         stringstream s("");
+         s << "out of memory when allocating " << requestedBytes << " bytes";
+         throw BamException("BamStandardIndex::CheckBufferSize", s.str());
+     }
+
+     // allocation succeeded - now it is safe to swap buffers
+     delete[] buffer;
+     buffer = newBuffer;
+     bufferLength = newLength;
+ }
+
+ // makes sure @buffer can hold at least @requestedBytes (unsigned char flavor)
+ //
+ // If the current capacity (@bufferLength) is too small, a new buffer of
+ // requestedBytes+10 bytes replaces the old one; previous contents are NOT preserved.
+ // If allocation fails, @buffer and @bufferLength are left untouched (so the caller
+ // never ends up holding an already-deleted pointer) and a BamException is thrown.
+ void BamStandardIndex::CheckBufferSize(unsigned char*& buffer,
+                                        unsigned int& bufferLength,
+                                        const unsigned int& requestedBytes)
+ {
+     // current buffer is already large enough
+     if ( requestedBytes <= bufferLength )
+         return;
+
+     // allocate the replacement *before* giving up the old buffer
+     const unsigned int newLength = requestedBytes + 10;
+     unsigned char* newBuffer = 0;
+     try {
+         newBuffer = new unsigned char[newLength];
+     } catch ( std::bad_alloc& ) {
+         stringstream s("");
+         s << "out of memory when allocating " << requestedBytes << " bytes";
+         throw BamException("BamStandardIndex::CheckBufferSize", s.str());
+     }
+
+     // allocation succeeded - now it is safe to swap buffers
+     delete[] buffer;
+     buffer = newBuffer;
+     bufferLength = newLength;
+ }
+
+ // reads the next 4 bytes of the index stream and verifies they are the BAI 'magic number'
+ // throws BamException if the bytes cannot be read or do not match
+ void BamStandardIndex::CheckMagicNumber(void) {
+
+     const size_t magicLength = 4;
+
+     // pull the magic number off the stream
+     char magic[magicLength];
+     if ( fread(magic, sizeof(char), magicLength, Resources.IndexStream) != magicLength )
+         throw BamException("BamStandardIndex::CheckMagicNumber", "could not read BAI magic number");
+
+     // compare to expected value
+     const bool isBaiMagic = ( strncmp(magic, BamStandardIndex::BAI_MAGIC, magicLength) == 0 );
+     if ( !isBaiMagic )
+         throw BamException("BamStandardIndex::CheckMagicNumber", "invalid BAI magic number");
+ }
+
+ void BamStandardIndex::ClearReferenceEntry(BaiReferenceEntry& refEntry) { // resets @refEntry for reuse: ID back to -1, bins & linear offsets emptied
+ refEntry.ID = -1;
+ refEntry.Bins.clear();
+ refEntry.LinearOffsets.clear();
+ }
+
+ // closes the index file (if open) and discards all state tied to it:
+ // the index file summary and the I/O buffer
+ void BamStandardIndex::CloseFile(void) {
+
+     // release the file stream, if one is currently open
+     const bool streamIsOpen = IsFileOpen();
+     if ( streamIsOpen ) {
+         fclose(Resources.IndexStream);
+         Resources.IndexStream = 0;
+     }
+
+     // clear index file summary data
+     m_indexFileSummary.clear();
+
+     // drop the I/O buffer and record that nothing is allocated
+     delete[] Resources.Buffer;
+     Resources.Buffer = 0;
+     m_bufferLength = 0;
+ }
+
+ /* builds index from associated BAM file & writes out to index file
+  *
+  * Rewinds the reader, opens "<BAM filename><Extension()>" for read & write, writes the
+  * header, then walks every alignment once while accumulating alignment chunks per bin
+  * and linear offsets for the current reference. A reference entry is written whenever
+  * the reference ID changes; empty entries are written for reference IDs skipped before,
+  * between, and after populated ones. The reader is rewound again before returning.
+  *
+  * Returns true on success. Returns false (with the error string set) if the reader is
+  * missing or not open, a rewind fails, the alignments are not sorted by position within
+  * a reference, the reader's file position fails to advance, or a BamException is caught.
+  *
+  * NOTE(review): when the loop breaks on an alignment with RefID < 0, currentRefID is
+  * negative, so the post-loop block (which also pads the trailing empty references) is
+  * skipped - confirm that this is intended.
+  */
+ bool BamStandardIndex::Create(void) {
+
+ // skip if BamReader is invalid or not open
+ if ( m_reader == 0 || !m_reader->IsOpen() ) {
+ SetErrorString("BamStandardIndex::Create", "could not create index: reader is not open");
+ return false;
+ }
+
+ // rewind BamReader
+ if ( !m_reader->Rewind() ) {
+ const string readerError = m_reader->GetErrorString();
+ const string message = "could not create index: \n\t" + readerError;
+ SetErrorString("BamStandardIndex::Create", message);
+ return false;
+ }
+
+ try {
+
+ // open new index file (read & write)
+ string indexFilename = m_reader->Filename() + Extension();
+ OpenFile(indexFilename, "w+b");
+
+ // initialize BaiFileSummary with number of references
+ const int& numReferences = m_reader->GetReferenceCount();
+ ReserveForSummary(numReferences);
+
+ // initialize output file
+ WriteHeader();
+
+ // set up bin, ID, offset, & coordinate markers
+ const uint32_t defaultValue = 0xffffffffu; // 'not set yet' sentinel for the markers below
+ uint32_t currentBin = defaultValue;
+ uint32_t lastBin = defaultValue;
+ int32_t currentRefID = defaultValue;
+ int32_t lastRefID = defaultValue;
+ uint64_t currentOffset = (uint64_t)m_reader->Tell();
+ uint64_t lastOffset = currentOffset;
+ int32_t lastPosition = defaultValue;
+
+ // iterate through alignments in BAM file
+ BamAlignment al;
+ BaiReferenceEntry refEntry;
+ while ( m_reader->LoadNextAlignment(al) ) {
+
+ // changed to new reference
+ if ( lastRefID != al.RefID ) {
+
+ // if not first reference, save previous reference data
+ if ( lastRefID != (int32_t)defaultValue ) {
+
+ SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset);
+ WriteReferenceEntry(refEntry);
+ ClearReferenceEntry(refEntry);
+
+ // write any empty references between (but *NOT* including) lastRefID & al.RefID
+ for ( int i = lastRefID+1; i < al.RefID; ++i ) {
+ BaiReferenceEntry emptyEntry(i);
+ WriteReferenceEntry(emptyEntry);
+ }
+
+ // update bin markers
+ currentOffset = lastOffset;
+ currentBin = al.Bin;
+ lastBin = al.Bin;
+ currentRefID = al.RefID;
+ }
+
+ // otherwise, this is first pass
+ // be sure to write any empty references up to (but *NOT* including) current RefID
+ else {
+ for ( int i = 0; i < al.RefID; ++i ) {
+ BaiReferenceEntry emptyEntry(i);
+ WriteReferenceEntry(emptyEntry);
+ }
+ }
+
+ // update reference markers
+ refEntry.ID = al.RefID;
+ lastRefID = al.RefID;
+ lastBin = defaultValue; // guarantees the 'changed to new BAI bin' branch below runs for this alignment
+ }
+
+ // if lastPosition greater than current alignment position - file not sorted properly
+ else if ( lastPosition > al.Position ) {
+ stringstream s("");
+ s << "BAM file is not properly sorted by coordinate" << endl
+ << "Current alignment position: " << al.Position
+ << " < previous alignment position: " << lastPosition
+ << " on reference ID: " << al.RefID << endl;
+ SetErrorString("BamStandardIndex::Create", s.str());
+ return false;
+ }
+
+ // if alignment's ref ID is valid & its bin is not a 'leaf'
+ if ( (al.RefID >= 0) && (al.Bin < 4681) )
+ SaveLinearOffsetEntry(refEntry.LinearOffsets, al.Position, al.GetEndPosition(), lastOffset);
+
+ // changed to new BAI bin
+ if ( al.Bin != lastBin ) {
+
+ // if not first bin on reference, save previous bin data
+ if ( currentBin != defaultValue )
+ SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset);
+
+ // update markers
+ currentOffset = lastOffset;
+ currentBin = al.Bin;
+ lastBin = al.Bin;
+ currentRefID = al.RefID;
+
+ // if invalid RefID, break out
+ if ( currentRefID < 0 )
+ break; // indexing stops at the first alignment that has no valid reference
+ }
+
+ // make sure that current file pointer is beyond lastOffset
+ if ( m_reader->Tell() <= (int64_t)lastOffset ) {
+ SetErrorString("BamStandardIndex::Create", "calculating offsets failed");
+ return false;
+ }
+
+ // update lastOffset & lastPosition
+ lastOffset = m_reader->Tell();
+ lastPosition = al.Position;
+ }
+
+ // after finishing alignments, if any data was read, check:
+ if ( currentRefID >= 0 ) {
+
+ // store last alignment chunk to its bin, then write last reference entry with data
+ SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset);
+ WriteReferenceEntry(refEntry);
+
+ // then write any empty references remaining at end of file
+ for ( int i = currentRefID+1; i < numReferences; ++i ) {
+ BaiReferenceEntry emptyEntry(i);
+ WriteReferenceEntry(emptyEntry);
+ }
+ }
+
+ } catch ( BamException& e) {
+ m_errorString = e.what();
+ return false;
+ }
+
+ // rewind BamReader
+ if ( !m_reader->Rewind() ) {
+ const string readerError = m_reader->GetErrorString();
+ const string message = "could not create index: \n\t" + readerError;
+ SetErrorString("BamStandardIndex::Create", message);
+ return false;
+ }
+
+ // return success
+ return true;
+ }
+
+ // returns format's file extension (BAI_EXTENSION, i.e. ".bai", leading dot included)
+ const string BamStandardIndex::Extension(void) {
+ return BamStandardIndex::BAI_EXTENSION;
+ }
+
+// Calculates the BAM file offset to jump to for @region, by probing the BAM
+// file at candidate offsets derived from the index.
+//
+// @region                 region of interest
+// @offset                 (out) receives the selected file offset; it is NOT
+//                         modified when the index yields no candidate offsets
+// @hasAlignmentsInRegion  (out) receives the result of each attempt to load an
+//                         alignment at a probed offset (so it ends up holding the
+//                         result of the last probe); it is NOT modified when
+//                         there are no candidate offsets
+//
+// Throws BamException if region.LeftRefID lies outside the loaded index summary,
+// or if seeking in the BAM file fails.
+void BamStandardIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) {
+
+    // cannot calculate offsets if unknown/invalid reference ID requested
+    if ( region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size() )
+        throw BamException("BamStandardIndex::GetOffset", "invalid reference ID requested");
+
+    // retrieve index summary for left bound reference
+    const BaiReferenceSummary& refSummary = m_indexFileSummary.at(region.LeftRefID);
+
+    // set up region boundaries based on actual BamReader data
+    uint32_t begin;
+    uint32_t end;
+    AdjustRegion(region, begin, end);
+
+    // retrieve all candidate bin IDs for region
+    set<uint16_t> candidateBins;
+    CalculateCandidateBins(begin, end, candidateBins);
+
+    // use reference's linear offsets to calculate the minimum offset
+    // that must be considered to find overlap
+    const uint64_t& minOffset = CalculateMinOffset(refSummary, begin);
+
+    // attempt to use reference summary, minOffset, & candidateBins to calculate offsets
+    // no data should not be error, just bail
+    vector<int64_t> offsets;
+    CalculateCandidateOffsets(refSummary, minOffset, candidateBins, offsets);
+    if ( offsets.empty() )
+        return;
+
+    // ensure that offsets are sorted before processing
+    sort( offsets.begin(), offsets.end() );
+
+    // binary search for an overlapping block (may not be first one though)
+    BamAlignment al;
+    typedef vector<int64_t>::const_iterator OffsetConstIterator;
+    OffsetConstIterator offsetFirst = offsets.begin();
+    OffsetConstIterator offsetIter  = offsetFirst;
+    OffsetConstIterator offsetLast  = offsets.end();
+    iterator_traits<OffsetConstIterator>::difference_type count = distance(offsetFirst, offsetLast);
+    iterator_traits<OffsetConstIterator>::difference_type step;
+    while ( count > 0 ) {
+        offsetIter = offsetFirst;
+        step = count/2;
+        advance(offsetIter, step);
+
+        // attempt seek to candidate offset
+        // (error is attributed to this class, not BamToolsIndex)
+        const int64_t& candidateOffset = (*offsetIter);
+        if ( !m_reader->Seek(candidateOffset) ) {
+            const string readerError = m_reader->GetErrorString();
+            const string message = "could not seek in BAM file: \n\t" + readerError;
+            throw BamException("BamStandardIndex::GetOffset", message);
+        }
+
+        // load first available alignment, setting flag to true if data exists
+        *hasAlignmentsInRegion = m_reader->LoadNextAlignment(al);
+
+        // check alignment against region:
+        // alignment ends at/before the left bound => continue in the upper half,
+        // otherwise continue in the lower half
+        if ( al.GetEndPosition() <= region.LeftPosition ) {
+            offsetFirst = ++offsetIter;
+            count -= step+1;
+        } else count = step;
+    }
+
+    // step back to the offset before the 'current offset' (to make sure we cover overlaps)
+    if ( offsetIter != offsets.begin() )
+        --offsetIter;
+    offset = (*offsetIter);
+}
+
+// Returns true if the loaded index summary records at least one bin for
+// @referenceID; returns false for IDs outside the summary's range.
+bool BamStandardIndex::HasAlignments(const int& referenceID) const {
+    const int summaryCount = (int)m_indexFileSummary.size();
+    const bool isKnownReference = ( referenceID >= 0 && referenceID < summaryCount );
+    if ( !isKnownReference )
+        return false;
+    return ( m_indexFileSummary.at(referenceID).NumBins > 0 );
+}
+
+// Returns true while an index file stream is held (stream pointer is non-null).
+bool BamStandardIndex::IsFileOpen(void) const {
+    const bool streamIsNull = ( Resources.IndexStream == 0 );
+    return !streamIsNull;
+}
+
+// Uses index data to position the reader for @region.
+// Return value reports only whether an error occurred; whether any alignments
+// are available after the jump position is reported through @hasAlignmentsInRegion.
+// On failure, the error string is set and false is returned.
+bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) {
+
+    // start from 'no alignments' until GetOffset() says otherwise
+    *hasAlignmentsInRegion = false;
+
+    // a jump requires a valid, open reader
+    const bool readerUsable = ( m_reader != 0 && m_reader->IsOpen() );
+    if ( !readerUsable ) {
+        SetErrorString("BamStandardIndex::Jump", "could not jump: reader is not open");
+        return false;
+    }
+
+    // ask the index for the nearest offset; any BamException becomes our error string
+    int64_t offset;
+    try {
+        GetOffset(region, offset, hasAlignmentsInRegion);
+    } catch ( BamException& e ) {
+        m_errorString = e.what();
+        return false;
+    }
+
+    // nothing to seek to - still a success, the flag tells the caller there is no data
+    // (BamReader will check this flag before trying to load data)
+    if ( !(*hasAlignmentsInRegion) )
+        return true;
+
+    // region has alignments: success is whatever the seek reports
+    return m_reader->Seek(offset);
+}
+
+// Opens @filename read-only, validates its magic number, and builds the
+// in-memory summary of the index. Returns false (with the error string set
+// from the exception) if any of those steps throws a BamException.
+bool BamStandardIndex::Load(const std::string& filename) {
+
+    try {
+        OpenFile(filename, "rb");   // read-only, binary
+        CheckMagicNumber();         // validate format
+        SummarizeIndexFile();       // load in-memory summary of index data
+    } catch ( BamException& e ) {
+        m_errorString = e.what();
+        return false;
+    }
+
+    // all steps completed
+    return true;
+}
+
+// Reads the linear offset at position @index (0-based) of the reference described
+// by @refSummary, directly from the index file. Moves the index file pointer.
+// Seek/read failures surface as BamException from the helpers called here.
+uint64_t BamStandardIndex::LookupLinearOffset(const BaiReferenceSummary& refSummary, const int& index) {
+
+    // file position = start of this reference's linear offsets + index * entry size
+    const int64_t firstEntryPosition = (int64_t)refSummary.FirstLinearOffsetFilePosition;
+    const int64_t targetPosition = firstEntryPosition + index*BamStandardIndex::SIZEOF_LINEAROFFSET;
+    Seek(targetPosition, SEEK_SET);
+
+    // fetch the entry stored there
+    uint64_t storedOffset;
+    ReadLinearOffset(storedOffset);
+    return storedOffset;
+}
+
+// Simplifies @chunks in place: walking the chunks in their existing order, a chunk
+// is folded into the previously kept chunk whenever (previous.Stop >> 16) equals
+// (chunk.Start >> 16); otherwise it is kept as a new entry.
+// An empty container is left untouched.
+void BamStandardIndex::MergeAlignmentChunks(BaiAlignmentChunkVector& chunks) {
+
+    // nothing to merge
+    if ( chunks.empty() )
+        return;
+
+    // the first chunk always seeds the merged list
+    BaiAlignmentChunkVector merged;
+    merged.push_back( chunks.front() );
+
+    // consider every remaining chunk against the most recently kept one
+    const size_t chunkCount = chunks.size();
+    for ( size_t k = 1; k < chunkCount; ++k ) {
+        const BaiAlignmentChunk& candidate = chunks[k];
+        BaiAlignmentChunk& tail = merged.back();
+
+        // same upper bits => extend the kept chunk to cover the candidate
+        if ( (tail.Stop >> 16) == (candidate.Start >> 16) )
+            tail.Stop = candidate.Stop;
+
+        // otherwise the candidate becomes the new 'most recently kept' chunk
+        else
+            merged.push_back(candidate);
+    }
+
+    // hand the merged list back through the parameter
+    chunks = merged;
+}
+
+// Closes any index file currently held, then opens @filename with the given
+// fopen() @mode. Throws BamException if the file cannot be opened.
+void BamStandardIndex::OpenFile(const std::string& filename, const char* mode) {
+
+    // release whatever was open before
+    CloseFile();
+
+    // open the requested file; done if that worked
+    Resources.IndexStream = fopen(filename.c_str(), mode);
+    if ( IsFileOpen() )
+        return;
+
+    // report failure
+    const string message = string("could not open file: ") + filename;
+    throw BamException("BamStandardIndex::OpenFile", message);
+}
+
+// Reads one 32-bit bin ID from the index stream into @binId (byte-swapped when
+// m_isBigEndian is set). Throws BamException if the value could not be read.
+void BamStandardIndex::ReadBinID(uint32_t& binId) {
+    const bool readOk = ( fread(&binId, sizeof(binId), 1, Resources.IndexStream) == 1 );
+    if ( m_isBigEndian )
+        SwapEndian_32(binId);
+    if ( !readOk )
+        throw BamException("BamStandardIndex::ReadBinID", "could not read BAI bin ID");
+}
+
+// Reads one bin from the index stream: its ID and chunk count are returned through
+// @binId and @numAlignmentChunks, and its chunk data is loaded into the input buffer.
+// Throws BamException if the header or contents cannot be read, or if the stored
+// chunk count is negative (corrupt file).
+void BamStandardIndex::ReadBinIntoBuffer(uint32_t& binId, int32_t& numAlignmentChunks) {
+
+    // read bin header
+    ReadBinID(binId);
+    ReadNumAlignmentChunks(numAlignmentChunks);
+
+    // a negative count would otherwise be converted into a huge unsigned byte request
+    if ( numAlignmentChunks < 0 )
+        throw BamException("BamStandardIndex::ReadBinIntoBuffer", "invalid (negative) BAI chunk count");
+
+    // read bin contents (size computed in unsigned arithmetic)
+    const unsigned int bytesRequested = (unsigned int)numAlignmentChunks*BamStandardIndex::SIZEOF_ALIGNMENTCHUNK;
+    ReadIntoBuffer(bytesRequested);
+}
+
+// Reads exactly @bytesRequested bytes from the index stream into the input buffer,
+// after letting CheckBufferSize() adjust the buffer for the request.
+// Throws BamException (reporting expected vs. actual byte counts) on a short read.
+void BamStandardIndex::ReadIntoBuffer(const unsigned int& bytesRequested) {
+
+    // make sure the buffer can take the request
+    BamStandardIndex::CheckBufferSize(Resources.Buffer, m_bufferLength, bytesRequested);
+
+    // pull the bytes from the BAI file stream; done if we got them all
+    const size_t bytesRead = fread( Resources.Buffer, sizeof(char), bytesRequested, Resources.IndexStream );
+    if ( bytesRead == (size_t)bytesRequested )
+        return;
+
+    // short read - describe the mismatch
+    stringstream s("");
+    s << "expected to read: " << bytesRequested << " bytes, "
+      << "but instead read: " << bytesRead;
+    throw BamException("BamStandardIndex::ReadIntoBuffer", s.str());
+}
+
+// Reads one 64-bit linear offset from the index stream into @linearOffset
+// (byte-swapped when m_isBigEndian is set). Throws BamException on read failure.
+void BamStandardIndex::ReadLinearOffset(uint64_t& linearOffset) {
+    const bool readOk = ( fread(&linearOffset, sizeof(linearOffset), 1, Resources.IndexStream) == 1 );
+    if ( m_isBigEndian )
+        SwapEndian_64(linearOffset);
+    if ( !readOk )
+        throw BamException("BamStandardIndex::ReadLinearOffset", "could not read BAI linear offset");
+}
+
+// Reads a bin's alignment chunk count from the index stream into @numAlignmentChunks
+// (byte-swapped when m_isBigEndian is set). Throws BamException on read failure.
+void BamStandardIndex::ReadNumAlignmentChunks(int& numAlignmentChunks) {
+    const bool readOk = ( fread(&numAlignmentChunks, sizeof(numAlignmentChunks), 1, Resources.IndexStream) == 1 );
+    if ( m_isBigEndian )
+        SwapEndian_32(numAlignmentChunks);
+    if ( !readOk )
+        throw BamException("BamStandardIndex::ReadNumAlignmentChunks", "could not read BAI chunk count");
+}
+
+// Reads a reference's bin count from the index stream into @numBins
+// (byte-swapped when m_isBigEndian is set). Throws BamException on read failure.
+void BamStandardIndex::ReadNumBins(int& numBins) {
+    const bool readOk = ( fread(&numBins, sizeof(numBins), 1, Resources.IndexStream) == 1 );
+    if ( m_isBigEndian )
+        SwapEndian_32(numBins);
+    if ( !readOk )
+        throw BamException("BamStandardIndex::ReadNumBins", "could not read BAI bin count");
+}
+
+// Reads a reference's linear offset count from the index stream into
+// @numLinearOffsets (byte-swapped when m_isBigEndian is set).
+// Throws BamException on read failure; the exception names this method
+// (it previously carried the label of ReadNumAlignmentChunks).
+void BamStandardIndex::ReadNumLinearOffsets(int& numLinearOffsets) {
+    const size_t elementsRead = fread(&numLinearOffsets, sizeof(numLinearOffsets), 1, Resources.IndexStream);
+    if ( m_isBigEndian ) SwapEndian_32(numLinearOffsets);
+    if ( elementsRead != 1 )
+        throw BamException("BamStandardIndex::ReadNumLinearOffsets", "could not read BAI linear offset count");
+}
+
+// Reads the number of reference entries from the index stream into @numReferences
+// (byte-swapped when m_isBigEndian is set). Throws BamException on read failure.
+void BamStandardIndex::ReadNumReferences(int& numReferences) {
+    const bool readOk = ( fread(&numReferences, sizeof(numReferences), 1, Resources.IndexStream) == 1 );
+    if ( m_isBigEndian )
+        SwapEndian_32(numReferences);
+    if ( !readOk )
+        throw BamException("BamStandardIndex::ReadNumReferences", "could not read reference count");
+}
+
+// Resets the index file summary to @numReferences default-constructed entries,
+// discarding whatever it held before.
+void BamStandardIndex::ReserveForSummary(const int& numReferences) {
+    const BaiReferenceSummary blankSummary;
+    m_indexFileSummary.clear();
+    m_indexFileSummary.assign(numReferences, blankSummary);
+}
+
+// Appends the chunk [@currentOffset, @lastOffset] to the chunk list stored in
+// @binMap under @currentBin, creating that bin's list first if it does not exist.
+void BamStandardIndex::SaveAlignmentChunkToBin(BaiBinMap& binMap,
+                                               const uint32_t& currentBin,
+                                               const uint64_t& currentOffset,
+                                               const uint64_t& lastOffset)
+{
+    // operator[] yields the bin's existing list, or inserts an empty one for a new bin
+    BaiAlignmentChunkVector& chunksForBin = binMap[currentBin];
+    chunksForBin.push_back( BaiAlignmentChunk(currentOffset, lastOffset) );
+}
+
+// Records, in the summary entry for @refId, the bin count and the current index
+// file position (as reported by Tell()) as the location of the first bin.
+void BamStandardIndex::SaveBinsSummary(const int& refId, const int& numBins) {
+    BaiReferenceSummary& summary = m_indexFileSummary.at(refId);
+    summary.FirstBinFilePosition = Tell();
+    summary.NumBins = numBins;
+}
+
+// Updates the linear offsets in @offsets for an alignment spanning
+// [@alignmentStartPosition, @alignmentStopPosition).
+// Positions are converted to slot indexes by shifting right BAM_LIDX_SHIFT bits;
+// the vector is grown (zero-filled) to include the alignment's last slot, and every
+// still-zero slot after the start slot, up to and including the last slot, is set
+// to @lastOffset.
+void BamStandardIndex::SaveLinearOffsetEntry(BaiLinearOffsetVector& offsets,
+                                             const int& alignmentStartPosition,
+                                             const int& alignmentStopPosition,
+                                             const uint64_t& lastOffset)
+{
+    // slot indexes covered by this alignment
+    const int firstSlot = alignmentStartPosition >> BamStandardIndex::BAM_LIDX_SHIFT;
+    const int lastSlot  = (alignmentStopPosition - 1) >> BamStandardIndex::BAM_LIDX_SHIFT;
+
+    // grow the vector when the last slot lies beyond its current end
+    const int requiredSize = lastSlot + 1;
+    const int currentSize  = offsets.size();
+    if ( requiredSize > currentSize )
+        offsets.resize(requiredSize, 0);
+
+    // fill only slots that have not been assigned yet (zero means 'unset')
+    for ( int slot = firstSlot + 1; slot <= lastSlot; ++slot ) {
+        uint64_t& entry = offsets[slot];
+        if ( entry == 0 )
+            entry = lastOffset;
+    }
+}
+
+// Records, in the summary entry for @refId, the linear offset count and the current
+// index file position (as reported by Tell()) as the location of the first offset.
+void BamStandardIndex::SaveLinearOffsetsSummary(const int& refId, const int& numLinearOffsets) {
+    BaiReferenceSummary& summary = m_indexFileSummary.at(refId);
+    summary.FirstLinearOffsetFilePosition = Tell();
+    summary.NumLinearOffsets = numLinearOffsets;
+}
+
+// Moves the index file pointer to @position, interpreted relative to @origin
+// (passed through to fseek64). Throws BamException if the seek fails.
+void BamStandardIndex::Seek(const int64_t& position, const int& origin) {
+    const bool seekFailed = ( fseek64(Resources.IndexStream, position, origin) != 0 );
+    if ( seekFailed )
+        throw BamException("BamStandardIndex::Seek", "could not seek in BAI file");
+}
+
+// Advances the index file pointer past @numBins bins by reading each one;
+// the values and buffer contents read along the way are discarded.
+void BamStandardIndex::SkipBins(const int& numBins) {
+    uint32_t ignoredBinId;
+    int32_t ignoredChunkCount;
+    int binsRemaining = numBins;
+    while ( binsRemaining > 0 ) {
+        ReadBinIntoBuffer(ignoredBinId, ignoredChunkCount);
+        --binsRemaining;
+    }
+}
+
+// Advances the index file pointer past @numLinearOffsets linear offsets by reading
+// them into the input buffer (contents are not used by this method).
+// Throws BamException if the count is negative (corrupt file) or the read fails.
+void BamStandardIndex::SkipLinearOffsets(const int& numLinearOffsets) {
+
+    // a negative count would otherwise be converted into a huge unsigned byte request
+    if ( numLinearOffsets < 0 )
+        throw BamException("BamStandardIndex::SkipLinearOffsets", "invalid (negative) BAI linear offset count");
+
+    // size computed in unsigned arithmetic
+    const unsigned int bytesRequested = (unsigned int)numLinearOffsets*BamStandardIndex::SIZEOF_LINEAROFFSET;
+    ReadIntoBuffer(bytesRequested);
+}
+
+// Sorts @linearOffsets in place, in ascending order.
+void BamStandardIndex::SortLinearOffsets(BaiLinearOffsetVector& linearOffsets) {
+    BaiLinearOffsetVector::iterator first = linearOffsets.begin();
+    BaiLinearOffsetVector::iterator last  = linearOffsets.end();
+    sort(first, last);
+}
+
+// Reads the bin count at the current index file position, stores it in
+// @refSummary together with the file position of the first bin, then
+// moves the file pointer past all of the reference's bins.
+void BamStandardIndex::SummarizeBins(BaiReferenceSummary& refSummary) {
+
+    // how many bins does this reference have?
+    int binCount;
+    ReadNumBins(binCount);
+
+    // the first bin starts right after the count just read
+    refSummary.FirstBinFilePosition = Tell();
+    refSummary.NumBins = binCount;
+
+    // move on to whatever follows the bins
+    SkipBins(binCount);
+}
+
+// Builds the in-memory summary for the whole index file: reads the reference
+// count, sizes the summary accordingly, then summarizes each reference in order.
+void BamStandardIndex::SummarizeIndexFile(void) {
+
+    // number of reference entries stored in the file
+    int referenceCount;
+    ReadNumReferences(referenceCount);
+
+    // one (default) summary per reference
+    ReserveForSummary(referenceCount);
+
+    // fill in each summary, in file order
+    const size_t summaryCount = m_indexFileSummary.size();
+    for ( size_t r = 0; r < summaryCount; ++r )
+        SummarizeReference( m_indexFileSummary[r] );
+}
+
+// Reads the linear offset count at the current index file position, stores it in
+// @refSummary together with the file position of the first linear offset, then
+// moves the file pointer past all of the reference's linear offsets.
+void BamStandardIndex::SummarizeLinearOffsets(BaiReferenceSummary& refSummary) {
+
+    // how many linear offsets does this reference have?
+    int offsetCount;
+    ReadNumLinearOffsets(offsetCount);
+
+    // the first offset starts right after the count just read
+    refSummary.FirstLinearOffsetFilePosition = Tell();
+    refSummary.NumLinearOffsets = offsetCount;
+
+    // move on to whatever follows the linear offsets
+    SkipLinearOffsets(offsetCount);
+}
+
+// Summarizes one reference entry at the current index file position:
+// its bins first, then its linear offsets (the order in which they are read).
+void BamStandardIndex::SummarizeReference(BaiReferenceSummary& refSummary) {
+
+    // bins section
+    SummarizeBins(refSummary);
+
+    // linear offsets section
+    SummarizeLinearOffsets(refSummary);
+}
+
+// Reports the current position of the index file pointer (result of ftell64).
+int64_t BamStandardIndex::Tell(void) const {
+    const int64_t currentPosition = ftell64(Resources.IndexStream);
+    return currentPosition;
+}
+
+// Writes @chunk to the index file as two 64-bit values, Start then Stop
+// (each byte-swapped when m_isBigEndian is set).
+// Throws BamException unless both values were written.
+void BamStandardIndex::WriteAlignmentChunk(const BaiAlignmentChunk& chunk) {
+
+    // work on copies, so the caller's chunk is never byte-swapped
+    uint64_t chunkStart = chunk.Start;
+    uint64_t chunkStop  = chunk.Stop;
+    if ( m_isBigEndian ) {
+        SwapEndian_64(chunkStart);
+        SwapEndian_64(chunkStop);
+    }
+
+    // write both fields, then verify
+    const size_t startWritten = fwrite(&chunkStart, sizeof(chunkStart), 1, Resources.IndexStream);
+    const size_t stopWritten  = fwrite(&chunkStop,  sizeof(chunkStop),  1, Resources.IndexStream);
+    if ( (startWritten + stopWritten) != 2 )
+        throw BamException("BamStandardIndex::WriteAlignmentChunk", "could not write BAI alignment chunk");
+}
+
+// Merges @chunks (modifying the caller's container), then writes the resulting
+// chunk count followed by every chunk to the index file.
+// Throws BamException if the count, or any chunk, cannot be written.
+void BamStandardIndex::WriteAlignmentChunks(BaiAlignmentChunkVector& chunks) {
+
+    // simplify first, so the count written below matches the chunks that follow
+    MergeAlignmentChunks(chunks);
+
+    // chunk count (byte-swapped when m_isBigEndian is set)
+    int32_t numChunks = chunks.size();
+    if ( m_isBigEndian )
+        SwapEndian_32(numChunks);
+    if ( fwrite(&numChunks, sizeof(numChunks), 1, Resources.IndexStream) != 1 )
+        throw BamException("BamStandardIndex::WriteAlignmentChunks", "could not write BAI chunk count");
+
+    // the chunks themselves, in order
+    const size_t total = chunks.size();
+    for ( size_t c = 0; c < total; ++c )
+        WriteAlignmentChunk( chunks[c] );
+}
+
+// Writes one bin to the index file: its 32-bit ID (byte-swapped when m_isBigEndian
+// is set), followed by its alignment chunks (which are merged in the process).
+// Throws BamException if the ID cannot be written.
+void BamStandardIndex::WriteBin(const uint32_t& binId, BaiAlignmentChunkVector& chunks) {
+
+    // bin ID goes first; swap a copy, never the caller's value
+    uint32_t idToWrite = binId;
+    if ( m_isBigEndian )
+        SwapEndian_32(idToWrite);
+    if ( fwrite(&idToWrite, sizeof(idToWrite), 1, Resources.IndexStream) != 1 )
+        throw BamException("BamStandardIndex::WriteBin", "could not write bin ID");
+
+    // then the bin's contents
+    WriteAlignmentChunks(chunks);
+}
+
+// Writes all bins of reference @refId to the index file: the bin count, then each
+// bin in map order. The summary entry for @refId is updated right after the count
+// is written. Throws BamException if the count cannot be written.
+void BamStandardIndex::WriteBins(const int& refId, BaiBinMap& bins) {
+
+    // bin count (byte-swapped when m_isBigEndian is set)
+    int32_t numBins = bins.size();
+    if ( m_isBigEndian )
+        SwapEndian_32(numBins);
+    if ( fwrite(&numBins, sizeof(numBins), 1, Resources.IndexStream) != 1 )
+        throw BamException("BamStandardIndex::WriteBins", "could not write bin count");
+
+    // file pointer now sits at the first bin - remember that for this reference
+    SaveBinsSummary(refId, bins.size());
+
+    // write every (ID, chunks) pair
+    BaiBinMap::iterator binIter = bins.begin();
+    while ( binIter != bins.end() ) {
+        WriteBin( binIter->first, binIter->second );
+        ++binIter;
+    }
+}
+
+// Writes the index file header: the 4-character magic number, followed by the
+// number of entries in the index file summary as a 32-bit value (byte-swapped
+// when m_isBigEndian is set). Throws BamException unless everything was written.
+void BamStandardIndex::WriteHeader(void) {
+
+    // magic number: 4 single-char elements
+    const size_t magicWritten = fwrite(BamStandardIndex::BAI_MAGIC, sizeof(char), 4, Resources.IndexStream);
+
+    // reference count: 1 element
+    int32_t referenceCount = m_indexFileSummary.size();
+    if ( m_isBigEndian )
+        SwapEndian_32(referenceCount);
+    const size_t countWritten = fwrite(&referenceCount, sizeof(referenceCount), 1, Resources.IndexStream);
+
+    // 4 + 1 elements expected in total
+    if ( (magicWritten + countWritten) != 5 )
+        throw BamException("BamStandardIndex::WriteHeader", "could not write BAI header");
+}
+
+// Writes the linear offsets of reference @refId to the index file: the offsets are
+// sorted (modifying the caller's container), then the count and each offset are
+// written. The summary entry for @refId is updated right after the count is
+// written. Throws BamException unless the count and all offsets were written.
+void BamStandardIndex::WriteLinearOffsets(const int& refId, BaiLinearOffsetVector& linearOffsets) {
+
+    // offsets are written in sorted order
+    SortLinearOffsets(linearOffsets);
+
+    // offset count (byte-swapped when m_isBigEndian is set)
+    int32_t numOffsets = linearOffsets.size();
+    if ( m_isBigEndian )
+        SwapEndian_32(numOffsets);
+    size_t itemsWritten = fwrite(&numOffsets, sizeof(numOffsets), 1, Resources.IndexStream);
+
+    // file pointer now sits at the first offset - remember that for this reference
+    SaveLinearOffsetsSummary(refId, linearOffsets.size());
+
+    // write each offset, swapping a copy so the container keeps host byte order
+    const size_t total = linearOffsets.size();
+    for ( size_t k = 0; k < total; ++k ) {
+        uint64_t value = linearOffsets[k];
+        if ( m_isBigEndian )
+            SwapEndian_64(value);
+        itemsWritten += fwrite(&value, sizeof(value), 1, Resources.IndexStream);
+    }
+
+    // one element for the count, plus one per offset
+    if ( itemsWritten != (total + 1) )
+        throw BamException("BamStandardIndex::WriteLinearOffsets", "could not write BAI linear offsets");
+}
+
+// Writes one reference's index data to the index file:
+// its bins first, then its linear offsets.
+void BamStandardIndex::WriteReferenceEntry(BaiReferenceEntry& refEntry) {
+
+    // bins section
+    WriteBins(refEntry.ID, refEntry.Bins);
+
+    // linear offsets section
+    WriteLinearOffsets(refEntry.ID, refEntry.LinearOffsets);
+}
--- /dev/null
+// ***************************************************************************
+// BamStandardIndex.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides index operations for the standardized BAM index format (".bai")
+// ***************************************************************************
+
+#ifndef BAM_STANDARD_INDEX_FORMAT_H
+#define BAM_STANDARD_INDEX_FORMAT_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to
+// version without notice, or even be removed.
+//
+// We mean it.
+
+#include "api/BamAux.h"
+#include "api/BamIndex.h"
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace BamTools {
+namespace Internal {
+
+// -----------------------------------------------------------------------------
+// BamStandardIndex data structures
+
+// Describes a contiguous run of alignments by the offsets
+// at which the run starts and stops.
+struct BaiAlignmentChunk {
+
+    uint64_t Start;  // offset where the run begins
+    uint64_t Stop;   // offset where the run ends
+
+    // both offsets default to zero
+    BaiAlignmentChunk(const uint64_t& start = 0, const uint64_t& stop = 0)
+        : Start(start), Stop(stop)
+    { }
+};
+
+// Orders alignment chunks by their Start offset only (used for sorting).
+inline
+bool operator<(const BaiAlignmentChunk& lhs, const BaiAlignmentChunk& rhs) {
+    const bool lhsStartsFirst = ( rhs.Start > lhs.Start );
+    return lhsStartsFirst;
+}
+
+// convenience typedef for a list of all alignment 'chunks' in a BAI bin
+typedef std::vector<BaiAlignmentChunk> BaiAlignmentChunkVector;
+
+// convenience typedef for a map of all BAI bins in a reference (bin ID => chunks)
+// (std::map, so iteration visits bins in ascending bin ID order)
+typedef std::map<uint32_t, BaiAlignmentChunkVector> BaiBinMap;
+
+// convenience typedef for a list of all 'linear offsets' in a reference
+typedef std::vector<uint64_t> BaiLinearOffsetVector;
+
+// Holds the complete BAI index data for a single reference:
+// everything needed for building, loading, & writing it.
+struct BaiReferenceEntry {
+
+    int32_t ID;                           // reference ID (-1 unless given to the ctor)
+    BaiBinMap Bins;                       // bin ID => alignment chunks
+    BaiLinearOffsetVector LinearOffsets;  // linear offsets for this reference
+
+    // starts with empty containers; ID defaults to -1
+    BaiReferenceEntry(const int32_t& id = -1)
+        : ID(id)
+    { }
+};
+
+// Lightweight (persistent) summary of one reference's index data:
+// how many bins & linear offsets it has, and where each section
+// starts in the index file.
+struct BaiReferenceSummary {
+
+    int NumBins;                             // number of bins
+    int NumLinearOffsets;                    // number of linear offsets
+    uint64_t FirstBinFilePosition;           // index file position of first bin
+    uint64_t FirstLinearOffsetFilePosition;  // index file position of first linear offset
+
+    // everything starts at zero
+    BaiReferenceSummary(void)
+        : NumBins(0), NumLinearOffsets(0)
+        , FirstBinFilePosition(0), FirstLinearOffsetFilePosition(0)
+    { }
+};
+
+// convenience typedef for describing a full BAI index file summary
+// (one BaiReferenceSummary per reference; indexed by reference ID elsewhere in this class)
+typedef std::vector<BaiReferenceSummary> BaiFileSummary;
+
+// end BamStandardIndex data structures
+// -----------------------------------------------------------------------------
+
+// BamIndex implementation for the standardized BAM index format (".bai").
+// Keeps only a per-reference summary of the index file in memory
+// (m_indexFileSummary) plus an open stream to the index file.
+class BamStandardIndex : public BamIndex {
+
+ // ctor & dtor
+ public:
+ BamStandardIndex(Internal::BamReaderPrivate* reader);
+ ~BamStandardIndex(void);
+
+ // BamIndex implementation
+ public:
+ // builds index from associated BAM file & writes out to index file
+ bool Create(void);
+ // returns whether reference has alignments or not
+ bool HasAlignments(const int& referenceID) const;
+ // attempts to use index data to jump to @region, returns success/fail
+ // a "successful" jump indicates no error, but not whether this region has data
+ // * thus, the method sets a flag to indicate whether there are alignments
+ // available after the jump position
+ bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
+ // loads existing data from file into memory
+ bool Load(const std::string& filename);
+ public:
+ // returns format's file extension
+ static const std::string Extension(void);
+
+ // internal methods
+ // (these report failure by throwing BamException, unless noted otherwise)
+ private:
+
+ // index file ops
+ void CheckMagicNumber(void);
+ void CloseFile(void);
+ bool IsFileOpen(void) const;
+ void OpenFile(const std::string& filename, const char* mode);
+ void Seek(const int64_t& position, const int& origin);
+ int64_t Tell(void) const;
+
+ // BAI index building methods
+ void ClearReferenceEntry(BaiReferenceEntry& refEntry);
+ void SaveAlignmentChunkToBin(BaiBinMap& binMap,
+ const uint32_t& currentBin,
+ const uint64_t& currentOffset,
+ const uint64_t& lastOffset);
+ void SaveLinearOffsetEntry(BaiLinearOffsetVector& offsets,
+ const int& alignmentStartPosition,
+ const int& alignmentStopPosition,
+ const uint64_t& lastOffset);
+
+ // random-access methods
+ void AdjustRegion(const BamRegion& region, uint32_t& begin, uint32_t& end);
+ void CalculateCandidateBins(const uint32_t& begin,
+ const uint32_t& end,
+ std::set<uint16_t>& candidateBins);
+ void CalculateCandidateOffsets(const BaiReferenceSummary& refSummary,
+ const uint64_t& minOffset,
+ std::set<uint16_t>& candidateBins,
+ std::vector<int64_t>& offsets);
+ uint64_t CalculateMinOffset(const BaiReferenceSummary& refSummary, const uint32_t& begin);
+ void GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion);
+ uint64_t LookupLinearOffset(const BaiReferenceSummary& refSummary, const int& index);
+
+ // BAI summary (create/load) methods
+ void ReserveForSummary(const int& numReferences);
+ void SaveBinsSummary(const int& refId, const int& numBins);
+ void SaveLinearOffsetsSummary(const int& refId, const int& numLinearOffsets);
+ void SkipBins(const int& numBins);
+ void SkipLinearOffsets(const int& numLinearOffsets);
+ void SummarizeBins(BaiReferenceSummary& refSummary);
+ void SummarizeIndexFile(void);
+ void SummarizeLinearOffsets(BaiReferenceSummary& refSummary);
+ void SummarizeReference(BaiReferenceSummary& refSummary);
+
+ // BAI full index input methods
+ void ReadBinID(uint32_t& binId);
+ void ReadBinIntoBuffer(uint32_t& binId, int32_t& numAlignmentChunks);
+ void ReadIntoBuffer(const unsigned int& bytesRequested);
+ void ReadLinearOffset(uint64_t& linearOffset);
+ void ReadNumAlignmentChunks(int& numAlignmentChunks);
+ void ReadNumBins(int& numBins);
+ void ReadNumLinearOffsets(int& numLinearOffsets);
+ void ReadNumReferences(int& numReferences);
+
+ // BAI full index output methods
+ // (MergeAlignmentChunks & SortLinearOffsets modify the container passed in)
+ void MergeAlignmentChunks(BaiAlignmentChunkVector& chunks);
+ void SortLinearOffsets(BaiLinearOffsetVector& linearOffsets);
+ void WriteAlignmentChunk(const BaiAlignmentChunk& chunk);
+ void WriteAlignmentChunks(BaiAlignmentChunkVector& chunks);
+ void WriteBin(const uint32_t& binId, BaiAlignmentChunkVector& chunks);
+ void WriteBins(const int& refId, BaiBinMap& bins);
+ void WriteHeader(void);
+ void WriteLinearOffsets(const int& refId, BaiLinearOffsetVector& linearOffsets);
+ void WriteReferenceEntry(BaiReferenceEntry& refEntry);
+
+ // data members
+ private:
+ // whether values read from/written to the index file need byte-swapping
+ bool m_isBigEndian;
+ // per-reference summary of the index file
+ BaiFileSummary m_indexFileSummary;
+
+ // our input buffer
+ unsigned int m_bufferLength;
+
+ // bundles the index file stream & input buffer
+ // NOTE(review): cleanup presumably happens in ~RaiiWrapper(), which is defined elsewhere - confirm
+ struct RaiiWrapper {
+ FILE* IndexStream;
+ char* Buffer;
+ RaiiWrapper(void);
+ ~RaiiWrapper(void);
+ };
+ RaiiWrapper Resources;
+
+ // static methods
+ private:
+ // checks if the buffer is large enough to accommodate the requested size
+ static void CheckBufferSize(char*& buffer,
+ unsigned int& bufferLength,
+ const unsigned int& requestedBytes);
+ // checks if the buffer is large enough to accommodate the requested size
+ static void CheckBufferSize(unsigned char*& buffer,
+ unsigned int& bufferLength,
+ const unsigned int& requestedBytes);
+ // static constants
+ private:
+ static const int MAX_BIN;
+ static const int BAM_LIDX_SHIFT;
+ static const std::string BAI_EXTENSION;
+ static const char* const BAI_MAGIC;
+ static const int SIZEOF_ALIGNMENTCHUNK;
+ static const int SIZEOF_BINCORE;
+ static const int SIZEOF_LINEAROFFSET;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAM_STANDARD_INDEX_FORMAT_H
--- /dev/null
+// ***************************************************************************
+// BamToolsIndex.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides index operations for the BamTools index format (".bti")
+// ***************************************************************************
+
+#include "api/BamAlignment.h"
+#include "api/internal/bam/BamReader_p.h"
+#include "api/internal/index/BamToolsIndex_p.h"
+#include "api/internal/io/BgzfStream_p.h"
+#include "api/internal/utils/BamException_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <algorithm>
+#include <iostream>
+#include <iterator>
+#include <map>
+using namespace std;
+
+// --------------------------------
+// static BamToolsIndex constants
+// --------------------------------
+
+// initial value of m_blockSize (see BamToolsIndex ctor)
+const uint32_t BamToolsIndex::DEFAULT_BLOCK_LENGTH = 1000;
+// file extension for this index format
+const string BamToolsIndex::BTI_EXTENSION = ".bti";
+// 4-character magic number expected at the start of a BTI file (see CheckMagicNumber)
+const char* const BamToolsIndex::BTI_MAGIC = "BTI\1";
+// size in bytes of two 32-bit values plus one 64-bit value
+const int BamToolsIndex::SIZEOF_BLOCK = sizeof(int32_t)*2 + sizeof(int64_t);
+
+// ----------------------------
+// RaiiWrapper implementation
+// ----------------------------
+
+// Starts out with no index file stream (null pointer).
+BamToolsIndex::RaiiWrapper::RaiiWrapper(void)
+    : IndexStream(0)
+{
+    // nothing else to set up
+}
+
+// Closes the index file stream, if one is still held.
+BamToolsIndex::RaiiWrapper::~RaiiWrapper(void) {
+    if ( IndexStream == 0 )
+        return;
+    fclose(IndexStream);
+}
+
+// ------------------------------
+// BamToolsIndex implementation
+// ------------------------------
+
+// ctor
+// Starts with the default block size, no input version read yet (0),
+// and the latest format version (BTI_2_0) as the version used for writing
+// new index files; host endianness is detected once here.
+BamToolsIndex::BamToolsIndex(Internal::BamReaderPrivate* reader)
+    : BamIndex(reader)
+    , m_blockSize(BamToolsIndex::DEFAULT_BLOCK_LENGTH)
+    , m_inputVersion(0)
+    , m_outputVersion(BTI_2_0)
+{
+    // remember whether index values need byte-swapping on this host
+    m_isBigEndian = BamTools::SystemIsBigEndian();
+}
+
+// dtor
+// Releases the index file (and the cached file summary) via CloseFile().
+BamToolsIndex::~BamToolsIndex(void) {
+    CloseFile();
+}
+
+// Reads the first 4 characters at the current index file position and compares
+// them against BTI_MAGIC. Throws BamException if they cannot be read or do not match.
+void BamToolsIndex::CheckMagicNumber(void) {
+
+    // fetch the magic number from the file
+    char magic[4];
+    const bool readOk = ( fread(magic, sizeof(char), 4, Resources.IndexStream) == 4 );
+    if ( !readOk )
+        throw BamException("BamToolsIndex::CheckMagicNumber", "could not read BTI magic number");
+
+    // compare against the expected value
+    const bool magicMatches = ( strncmp(magic, BamToolsIndex::BTI_MAGIC, 4) == 0 );
+    if ( !magicMatches )
+        throw BamException("BamToolsIndex::CheckMagicNumber", "invalid BTI magic number");
+}
+
+// Reads the index file's format version into m_inputVersion and validates it.
+// Returns normally if the version is usable; otherwise throws BamException
+// (unreadable, non-positive, newer than this build can handle, or older than BTI_2_0).
+void BamToolsIndex::CheckVersion(void) {
+
+    // read version from file (byte-swapped when m_isBigEndian is set)
+    const bool readOk = ( fread(&m_inputVersion, sizeof(m_inputVersion), 1, Resources.IndexStream) == 1 );
+    if ( !readOk )
+        throw BamException("BamToolsIndex::CheckVersion", "could not read format version");
+    if ( m_isBigEndian )
+        SwapEndian_32(m_inputVersion);
+
+    // negative or zero versions are never valid
+    if ( m_inputVersion <= 0 )
+        throw BamException("BamToolsIndex::CheckVersion", "invalid format version");
+
+    // index was written by a newer bamtools than this one
+    if ( m_inputVersion > m_outputVersion ) {
+        const string message = "unsupported format: this index was created by a newer version of BamTools. "
+                               "Update your local version of BamTools to use the index file.";
+        throw BamException("BamToolsIndex::CheckVersion", message);
+    }
+
+    // ------------------------------------------------------------------
+    // deprecated, unsupported versions
+    // (the format had to be modified to accommodate a particular bug fix)
+
+    // Version 2.0 introduced half-open intervals, replacing the old closed intervals.
+    // Older BTI files are rejected outright rather than handled.
+    if ( (Version)m_inputVersion < BamToolsIndex::BTI_2_0 ) {
+        const string message = "unsupported format: this version of the index may not properly handle "
+                               "coordinate intervals. Please run 'bamtools index -bti -in yourData.bam' "
+                               "to generate an up-to-date, fixed BTI file.";
+        throw BamException("BamToolsIndex::CheckVersion", message);
+    }
+}
+
+// Resets @refEntry for reuse: drops all of its blocks and marks its ID as unset (-1).
+void BamToolsIndex::ClearReferenceEntry(BtiReferenceEntry& refEntry) {
+    refEntry.Blocks.clear();
+    refEntry.ID = -1;
+}
+
+// Closes the index file if one is open, and always clears the cached file summary.
+void BamToolsIndex::CloseFile(void) {
+
+    // release the stream and null the pointer so it is not closed a second time
+    const bool haveOpenFile = IsFileOpen();
+    if ( haveOpenFile ) {
+        fclose(Resources.IndexStream);
+        Resources.IndexStream = 0;
+    }
+
+    // summary data described the file we just let go of
+    m_indexFileSummary.clear();
+}
+
+// builds index from associated BAM file & writes out to index file
+//
+// Walks every alignment from the rewound reader, grouping alignments into blocks of up to
+// m_blockSize entries per reference. Each reference's blocks are written as soon as the
+// next reference is encountered; references without alignments get an empty entry.
+//
+// Returns true on success. Returns false (with the error string set) if the reader is
+// missing/closed, if either Rewind() fails, or if any helper throws a BamException.
+bool BamToolsIndex::Create(void) {
+
+ // skip if BamReader is invalid or not open
+ if ( m_reader == 0 || !m_reader->IsOpen() ) {
+ SetErrorString("BamToolsIndex::Create", "could not create index: reader is not open");
+ return false;
+ }
+
+ // rewind BamReader
+ if ( !m_reader->Rewind() ) {
+ const string readerError = m_reader->GetErrorString();
+ const string message = "could not create index: \n\t" + readerError;
+ SetErrorString("BamToolsIndex::Create", message);
+ return false;
+ }
+
+ try {
+ // open new index file (read & write)
+ // (file name is the reader's file name with this format's extension appended)
+ const string indexFilename = m_reader->Filename() + Extension();
+ OpenFile(indexFilename, "w+b");
+
+ // initialize BtiFileSummary with number of references
+ const int& numReferences = m_reader->GetReferenceCount();
+ InitializeFileSummary(numReferences);
+
+ // initialize output file header
+ WriteHeader();
+
+ // index building markers
+ // currentBlockCount : alignments seen so far in the block being built
+ // currentAlignmentOffset : reader offset (Tell) of the alignment about to be read
+ // blockRefId : RefID recorded at the start of the current block (-1 until data is read)
+ // blockMaxEndPosition : largest GetEndPosition() seen in the current block
+ // blockStartOffset : reader offset at which the current block begins
+ // blockStartPosition : Position of the first alignment in the current block
+ uint32_t currentBlockCount = 0;
+ int64_t currentAlignmentOffset = m_reader->Tell();
+ int32_t blockRefId = -1;
+ int32_t blockMaxEndPosition = -1;
+ int64_t blockStartOffset = currentAlignmentOffset;
+ int32_t blockStartPosition = -1;
+
+ // plow through alignments, storing index entries
+ BamAlignment al;
+ BtiReferenceEntry refEntry;
+ while ( m_reader->LoadNextAlignment(al) ) {
+
+ // if moved to new reference
+ if ( al.RefID != blockRefId ) {
+
+ // if first pass, check:
+ // NOTE(review): currentBlockCount is also reset to 0 right after a full block is
+ // stored (see "if block is full" below), so a reference change immediately after a
+ // full block appears to take this branch too, without writing refEntry - confirm.
+ if ( currentBlockCount == 0 ) {
+
+ // write any empty references up to (but not including) al.RefID
+ for ( int i = 0; i < al.RefID; ++i )
+ WriteReferenceEntry( BtiReferenceEntry(i) );
+ }
+
+ // not first pass:
+ else {
+
+ // store previous BTI block data in reference entry
+ const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
+ refEntry.Blocks.push_back(block);
+
+ // write reference entry, then clear
+ WriteReferenceEntry(refEntry);
+ ClearReferenceEntry(refEntry);
+
+ // write any empty references between (but not including)
+ // the last blockRefID and current al.RefID
+ for ( int i = blockRefId+1; i < al.RefID; ++i )
+ WriteReferenceEntry( BtiReferenceEntry(i) );
+
+ // reset block count
+ currentBlockCount = 0;
+ }
+
+ // set ID for new reference entry
+ refEntry.ID = al.RefID;
+ }
+
+ // if beginning of block, update counters
+ if ( currentBlockCount == 0 ) {
+ blockRefId = al.RefID;
+ blockStartOffset = currentAlignmentOffset;
+ blockStartPosition = al.Position;
+ blockMaxEndPosition = al.GetEndPosition();
+ }
+
+ // increment block counter
+ ++currentBlockCount;
+
+ // check end position
+ const int32_t alignmentEndPosition = al.GetEndPosition();
+ if ( alignmentEndPosition > blockMaxEndPosition )
+ blockMaxEndPosition = alignmentEndPosition;
+
+ // if block is full, get offset for next block, reset currentBlockCount
+ if ( currentBlockCount == m_blockSize ) {
+
+ // store previous block data in reference entry
+ const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
+ refEntry.Blocks.push_back(block);
+
+ // update markers
+ blockStartOffset = m_reader->Tell();
+ currentBlockCount = 0;
+ }
+
+ // not the best name, but for the next iteration, this value will be the offset of the
+ // *current* alignment. this is necessary because we won't know if this next alignment
+ // is on a new reference until we actually read it
+ currentAlignmentOffset = m_reader->Tell();
+ }
+
+ // after finishing alignments, if any data was read, check:
+ // NOTE(review): blockRefId is also -1 when the most recent block started on an alignment
+ // with RefID -1; in that case this final flush (including the trailing empty references)
+ // is skipped - confirm whether such input can occur & whether that is intended.
+ if ( blockRefId >= 0 ) {
+
+ // store last BTI block data in reference entry
+ const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
+ refEntry.Blocks.push_back(block);
+
+ // write last reference entry, then clear
+ WriteReferenceEntry(refEntry);
+ ClearReferenceEntry(refEntry);
+
+ // then write any empty references remaining at end of file
+ for ( int i = blockRefId+1; i < numReferences; ++i )
+ WriteReferenceEntry( BtiReferenceEntry(i) );
+ }
+
+ } catch ( BamException& e ) {
+ // any helper failure is reported through the error string, not rethrown
+ m_errorString = e.what();
+ return false;
+ }
+
+ // rewind BamReader
+ if ( !m_reader->Rewind() ) {
+ const string readerError = m_reader->GetErrorString();
+ const string message = "could not create index: \n\t" + readerError;
+ SetErrorString("BamToolsIndex::Create", message);
+ return false;
+ }
+
+ // return success
+ return true;
+}
+
+// returns format's file extension (a copy of the BTI_EXTENSION constant)
+const std::string BamToolsIndex::Extension(void) {
+    const std::string extension(BamToolsIndex::BTI_EXTENSION);
+    return extension;
+}
+
+// Looks up the BTI blocks for the region's left-bound reference and picks a block offset
+// from which the reader should start scanning for alignments overlapping @region.
+//
+// @offset is assigned only when a candidate block is found.
+// *hasAlignmentsInRegion is set to true in that case and false otherwise.
+// Throws BamException if region.LeftRefID is outside the loaded file summary
+// (ReadReferenceEntry may also throw while reading the blocks).
+void BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) {
+
+ // throw if left-bound ref ID is not a valid index in file summary data
+ if ( region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size() )
+ throw BamException("BamToolsIndex::GetOffset", "invalid region requested");
+
+ // retrieve reference index data for left bound reference
+ BtiReferenceEntry refEntry(region.LeftRefID);
+ ReadReferenceEntry(refEntry);
+
+ // binary search for an overlapping block (may not be first one though)
+ // [blockFirst, blockFirst+count) is the remaining search window; blockIter is the probe
+ bool found = false;
+ typedef BtiBlockVector::const_iterator BtiBlockConstIterator;
+ BtiBlockConstIterator blockFirst = refEntry.Blocks.begin();
+ BtiBlockConstIterator blockIter = blockFirst;
+ BtiBlockConstIterator blockLast = refEntry.Blocks.end();
+ iterator_traits<BtiBlockConstIterator>::difference_type count = distance(blockFirst, blockLast);
+ iterator_traits<BtiBlockConstIterator>::difference_type step;
+ while ( count > 0 ) {
+ blockIter = blockFirst;
+ step = count/2;
+ advance(blockIter, step);
+
+ const BtiBlock& block = (*blockIter);
+ if ( block.StartPosition <= region.RightPosition ) {
+ // block starts at or before the region's right bound; it overlaps if it also ends past the left bound
+ if ( block.MaxEndPosition > region.LeftPosition ) {
+ offset = block.StartOffset;
+ break;
+ }
+ // block lies entirely left of the region: continue in the upper half
+ blockFirst = ++blockIter;
+ count -= step+1;
+ }
+ // block starts right of the region: continue in the lower half
+ else count = step;
+ }
+
+ // if we didn't search "off the end" of the blocks
+ if ( blockIter != blockLast ) {
+
+ // "walk back" until we've gone too far
+ // (blockFirst here is whatever lower bound the search above ended with)
+ while ( blockIter != blockFirst ) {
+ const BtiBlock& currentBlock = (*blockIter);
+
+ --blockIter;
+ const BtiBlock& previousBlock = (*blockIter);
+ if ( previousBlock.MaxEndPosition <= region.LeftPosition ) {
+ offset = currentBlock.StartOffset;
+ found = true;
+ break;
+ }
+ }
+
+ // if we walked all the way to first block, just return that and let the reader's
+ // region overlap parsing do the rest
+ // (this also runs when the loop above stopped via 'break' with blockIter on blockFirst,
+ // in which case the earlier block's offset replaces the one just chosen)
+ if ( blockIter == blockFirst ) {
+ const BtiBlock& block = (*blockIter);
+ offset = block.StartOffset;
+ found = true;
+ }
+ }
+
+
+ // sets to false if blocks container is empty, or if no matching block could be found
+ *hasAlignmentsInRegion = found;
+}
+
+// returns whether reference has alignments or no
+// (an ID outside the loaded file summary is reported as having none)
+bool BamToolsIndex::HasAlignments(const int& referenceID) const {
+
+    const int numReferences = (int)m_indexFileSummary.size();
+    const bool isKnownReference = ( referenceID >= 0 && referenceID < numReferences );
+    if ( !isKnownReference )
+        return false;
+
+    // a reference has alignments when at least one BTI block was recorded for it
+    return ( m_indexFileSummary.at(referenceID).NumBlocks > 0 );
+}
+
+// pre-allocates space for each reference's summary data
+// (any previous summary is discarded; a non-positive count leaves the summary empty)
+void BamToolsIndex::InitializeFileSummary(const int& numReferences) {
+    m_indexFileSummary.clear();
+    if ( numReferences > 0 )
+        m_indexFileSummary.resize(numReferences);
+}
+
+// returns true if the index stream is open (i.e. the stored FILE pointer is non-null)
+bool BamToolsIndex::IsFileOpen(void) const {
+    const FILE* const stream = Resources.IndexStream;
+    return ( stream != 0 );
+}
+
+// attempts to use index data to jump to @region, returns success/fail
+// a "successful" jump indicates no error, but not whether this region has data
+// * thus, the method sets a flag to indicate whether there are alignments
+// available after the jump position
+//
+// Returns false (with the error string set) if the reader is missing/closed, if the
+// region's left bound is not valid for the reader's reference data, or if the offset
+// lookup throws a BamException. When the lookup finds no candidate block, the reader
+// is left where it is and true is returned with *hasAlignmentsInRegion == false.
+bool BamToolsIndex::Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion) {
+
+    // clear flag
+    *hasAlignmentsInRegion = false;
+
+    // skip if invalid reader or not open
+    if ( m_reader == 0 || !m_reader->IsOpen() ) {
+        SetErrorString("BamToolsIndex::Jump", "could not jump: reader is not open");
+        return false;
+    }
+
+    // make sure left-bound reference ID is a valid index before using it
+    // (at() would otherwise throw std::out_of_range, which is not caught below)
+    const RefVector& references = m_reader->GetReferenceData();
+    if ( region.LeftRefID < 0 || region.LeftRefID >= (int)references.size() ) {
+        SetErrorString("BamToolsIndex::Jump", "could not jump: invalid region requested");
+        return false;
+    }
+
+    // make sure left-bound position is valid
+    if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) {
+        SetErrorString("BamToolsIndex::Jump", "could not jump: invalid region requested");
+        return false;
+    }
+
+    // calculate nearest offset to jump to
+    int64_t offset = 0;
+    try {
+        GetOffset(region, offset, hasAlignmentsInRegion);
+    } catch ( BamException& e ) {
+        m_errorString = e.what();
+        return false;
+    }
+
+    // GetOffset() assigns @offset only when it found a candidate block;
+    // without one there is nowhere meaningful to seek, so report "no error, no data"
+    if ( !(*hasAlignmentsInRegion) )
+        return true;
+
+    // return success/failure of seek
+    return m_reader->Seek(offset);
+}
+
+// loads existing data from file into memory
+// returns false (storing the exception text as the error string) if any step throws
+bool BamToolsIndex::Load(const std::string& filename) {
+
+    try {
+        // open read-only, then read the header followed by the per-reference summary
+        OpenFile(filename, "rb");
+        LoadHeader();
+        LoadFileSummary();
+    }
+    catch ( BamException& e ) {
+        m_errorString = e.what();
+        return false;
+    }
+
+    // every step completed without throwing
+    return true;
+}
+
+// reads the reference count, sizes the in-memory summary to match,
+// then fills in one summary per reference in file order
+void BamToolsIndex::LoadFileSummary(void) {
+
+    // number of reference sequences stored in the file
+    int referenceCount;
+    LoadNumReferences(referenceCount);
+
+    // one (default-constructed) summary slot per reference
+    InitializeFileSummary(referenceCount);
+
+    // populate each slot from the stream
+    const size_t summaryCount = m_indexFileSummary.size();
+    for ( size_t i = 0; i < summaryCount; ++i )
+        LoadReferenceSummary( m_indexFileSummary[i] );
+}
+
+// validates the file's magic number & version, then reads the BTI block size into m_blockSize
+// throws BamException if the block size cannot be read
+void BamToolsIndex::LoadHeader(void) {
+
+    // check BTI file metadata
+    CheckMagicNumber();
+    CheckVersion();
+
+    // the block size stored in the file replaces the current member value
+    FILE* stream = Resources.IndexStream;
+    const size_t itemsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, stream);
+    if ( m_isBigEndian )
+        SwapEndian_32(m_blockSize);
+
+    if ( itemsRead == 1 )
+        return;
+    throw BamException("BamToolsIndex::LoadHeader", "could not read BTI block size");
+}
+
+// reads the number of BTI blocks for one reference from the index stream into @numBlocks
+// throws BamException if the value cannot be read or is negative
+// (@numBlocks is left untouched when an exception is thrown)
+void BamToolsIndex::LoadNumBlocks(int& numBlocks) {
+
+    // the count is written as a 32-bit field, so read it through a fixed-width value
+    int32_t storedCount = 0;
+    const size_t elementsRead = fread(&storedCount, sizeof(storedCount), 1, Resources.IndexStream);
+    if ( elementsRead != 1 )
+        throw BamException("BamToolsIndex::LoadNumBlocks", "could not read number of BTI blocks");
+    if ( m_isBigEndian ) SwapEndian_32(storedCount);
+
+    // a negative count can only come from a damaged file; reject it here, before it is used
+    // to reserve container space or to compute a seek distance
+    if ( storedCount < 0 )
+        throw BamException("BamToolsIndex::LoadNumBlocks", "invalid number of BTI blocks");
+
+    numBlocks = storedCount;
+}
+
+// reads the number of reference sequences from the index stream into @numReferences
+// throws BamException if the value cannot be read or is negative
+// (@numReferences is left untouched when an exception is thrown)
+void BamToolsIndex::LoadNumReferences(int& numReferences) {
+
+    // the count is written as a 32-bit field, so read it through a fixed-width value
+    int32_t storedCount = 0;
+    const size_t elementsRead = fread(&storedCount, sizeof(storedCount), 1, Resources.IndexStream);
+    if ( elementsRead != 1 )
+        throw BamException("BamToolsIndex::LoadNumReferences", "could not read number of references");
+    if ( m_isBigEndian ) SwapEndian_32(storedCount);
+
+    // a negative count can only come from a damaged file; fail loudly instead of
+    // silently producing an empty summary
+    if ( storedCount < 0 )
+        throw BamException("BamToolsIndex::LoadNumReferences", "invalid number of references");
+
+    numReferences = storedCount;
+}
+
+// fills @refSummary for the reference at the current stream position:
+// its block count & the file position of its first block, then skips past its blocks
+void BamToolsIndex::LoadReferenceSummary(BtiReferenceSummary& refSummary) {
+
+    // the block count comes first in the stream
+    int blockCount;
+    LoadNumBlocks(blockCount);
+
+    // after the count has been read, the stream sits at the first block entry
+    refSummary.FirstBlockFilePosition = Tell();
+    refSummary.NumBlocks = blockCount;
+
+    // leave the stream just past this reference's blocks
+    SkipBlocks(blockCount);
+}
+
+// closes any currently open index file, then opens @filename using fopen() @mode
+// throws BamException if the file cannot be opened
+void BamToolsIndex::OpenFile(const std::string& filename, const char* mode) {
+
+    // make sure any previous index file is closed
+    CloseFile();
+
+    // attempt to open file; done if we now hold a stream
+    Resources.IndexStream = fopen(filename.c_str(), mode);
+    if ( IsFileOpen() )
+        return;
+
+    const string message = string("could not open file: ") + filename;
+    throw BamException("BamToolsIndex::OpenFile", message);
+}
+
+// reads one block from the index stream into @block
+// (fields are stored in the order: max end position, start offset, start position)
+// throws BamException if any of the three fields cannot be read
+void BamToolsIndex::ReadBlock(BtiBlock& block) {
+
+    FILE* stream = Resources.IndexStream;
+
+    // read in block data members, counting how many fields were read
+    size_t fieldsRead = fread(&block.MaxEndPosition, sizeof(block.MaxEndPosition), 1, stream);
+    fieldsRead += fread(&block.StartOffset, sizeof(block.StartOffset), 1, stream);
+    fieldsRead += fread(&block.StartPosition, sizeof(block.StartPosition), 1, stream);
+
+    // swap endian-ness if necessary
+    if ( m_isBigEndian ) {
+        SwapEndian_32(block.MaxEndPosition);
+        SwapEndian_64(block.StartOffset);
+        SwapEndian_32(block.StartPosition);
+    }
+
+    if ( fieldsRead == 3 )
+        return;
+    throw BamException("BamToolsIndex::ReadBlock", "could not read block");
+}
+
+// replaces the contents of @blocks with the blocks described by @refSummary,
+// read from the index stream starting at the summary's first-block file position
+void BamToolsIndex::ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks) {
+
+    // prep blocks container
+    blocks.clear();
+    blocks.reserve(refSummary.NumBlocks);
+
+    // skip to first block entry
+    Seek( refSummary.FirstBlockFilePosition, SEEK_SET );
+
+    // read & store block entries, one per recorded block
+    BtiBlock currentBlock;
+    for ( int remaining = refSummary.NumBlocks; remaining > 0; --remaining ) {
+        ReadBlock(currentBlock);
+        blocks.push_back(currentBlock);
+    }
+}
+
+// loads the blocks for the reference identified by @refEntry.ID into @refEntry.Blocks
+// throws BamException if that ID is not a valid index into the file summary
+void BamToolsIndex::ReadReferenceEntry(BtiReferenceEntry& refEntry) {
+
+    const int numReferences = (int)m_indexFileSummary.size();
+    const bool isValidId = ( refEntry.ID >= 0 && refEntry.ID < numReferences );
+    if ( !isValidId )
+        throw BamException("BamToolsIndex::ReadReferenceEntry", "invalid reference requested");
+
+    // use index summary to assist reading the reference's BTI blocks
+    ReadBlocks( m_indexFileSummary.at(refEntry.ID), refEntry.Blocks );
+}
+
+// moves the index stream to @position, interpreted relative to @origin
+// (@origin is passed straight to fseek64, e.g. SEEK_SET or SEEK_CUR)
+// throws BamException if the seek fails
+void BamToolsIndex::Seek(const int64_t& position, const int& origin) {
+    if ( fseek64(Resources.IndexStream, position, origin) != 0 )
+        throw BamException("BamToolsIndex::Seek", "could not seek in BTI file");
+}
+
+// advances the index stream past @numBlocks block entries, relative to the current position
+void BamToolsIndex::SkipBlocks(const int& numBlocks) {
+    // widen before multiplying so a large block count cannot overflow int arithmetic
+    const int64_t numBytes = static_cast<int64_t>(numBlocks) * BamToolsIndex::SIZEOF_BLOCK;
+    Seek( numBytes, SEEK_CUR );
+}
+
+// returns the current position in the index stream, as reported by ftell64
+int64_t BamToolsIndex::Tell(void) const {
+    const int64_t position = ftell64(Resources.IndexStream);
+    return position;
+}
+
+// writes one block to the index stream
+// (fields are written in the order: max end position, start offset, start position)
+// throws BamException if any of the three fields cannot be written
+void BamToolsIndex::WriteBlock(const BtiBlock& block) {
+
+    // work on copies so @block itself is never byte-swapped
+    int32_t outMaxEndPosition = block.MaxEndPosition;
+    int64_t outStartOffset = block.StartOffset;
+    int32_t outStartPosition = block.StartPosition;
+
+    // swap endian-ness if necessary
+    if ( m_isBigEndian ) {
+        SwapEndian_32(outMaxEndPosition);
+        SwapEndian_64(outStartOffset);
+        SwapEndian_32(outStartPosition);
+    }
+
+    // write the fields, counting how many were written
+    FILE* stream = Resources.IndexStream;
+    size_t fieldsWritten = fwrite(&outMaxEndPosition, sizeof(outMaxEndPosition), 1, stream);
+    fieldsWritten += fwrite(&outStartOffset, sizeof(outStartOffset), 1, stream);
+    fieldsWritten += fwrite(&outStartPosition, sizeof(outStartPosition), 1, stream);
+
+    if ( fieldsWritten == 3 )
+        return;
+    throw BamException("BamToolsIndex::WriteBlock", "could not write BTI block");
+}
+
+// writes every block in @blocks to the index stream, in container order
+void BamToolsIndex::WriteBlocks(const BtiBlockVector& blocks) {
+    const size_t blockCount = blocks.size();
+    for ( size_t i = 0; i < blockCount; ++i )
+        WriteBlock( blocks[i] );
+}
+
+// writes the BTI header to the index stream:
+// 4-byte magic number, format version, block size, then number of references
+// (the reference count is taken from the current size of the in-memory file summary)
+// throws BamException if any part of the header cannot be written
+void BamToolsIndex::WriteHeader(void) {
+
+    size_t elementsWritten = 0;
+
+    // write BTI index format 'magic number'
+    // (written as 4 single-byte elements, so this contributes 4 to the count)
+    elementsWritten += fwrite(BamToolsIndex::BTI_MAGIC, 1, 4, Resources.IndexStream);
+
+    // write BTI index format version
+    int32_t currentVersion = (int32_t)m_outputVersion;
+    if ( m_isBigEndian ) SwapEndian_32(currentVersion);
+    elementsWritten += fwrite(&currentVersion, sizeof(currentVersion), 1, Resources.IndexStream);
+
+    // write block size
+    uint32_t blockSize = m_blockSize;
+    if ( m_isBigEndian ) SwapEndian_32(blockSize);
+    elementsWritten += fwrite(&blockSize, sizeof(blockSize), 1, Resources.IndexStream);
+
+    // write number of references
+    int32_t numReferences = m_indexFileSummary.size();
+    if ( m_isBigEndian ) SwapEndian_32(numReferences);
+    elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, Resources.IndexStream);
+
+    // 4 magic bytes + 3 single-element fields
+    if ( elementsWritten != 7 )
+        throw BamException("BamToolsIndex::WriteHeader", "could not write BTI header");
+}
+
+// writes one reference's index data: its block count followed by the blocks themselves
+// throws BamException if the block count cannot be written
+void BamToolsIndex::WriteReferenceEntry(const BtiReferenceEntry& refEntry) {
+
+    // the count goes out first, byte-swapped on big-endian hosts
+    uint32_t blockCount = refEntry.Blocks.size();
+    if ( m_isBigEndian )
+        SwapEndian_32(blockCount);
+
+    const size_t itemsWritten = fwrite(&blockCount, sizeof(blockCount), 1, Resources.IndexStream);
+    if ( itemsWritten != 1 )
+        throw BamException("BamToolsIndex::WriteReferenceEntry", "could not write number of blocks");
+
+    // write actual block entries
+    WriteBlocks(refEntry.Blocks);
+}
--- /dev/null
+// ***************************************************************************
+// BamToolsIndex.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides index operations for the BamTools index format (".bti")
+// ***************************************************************************
+
+#ifndef BAMTOOLS_INDEX_FORMAT_H
+#define BAMTOOLS_INDEX_FORMAT_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to
+// version without notice, or even be removed.
+//
+// We mean it.
+
+#include "api/BamAux.h"
+#include "api/BamIndex.h"
+#include <map>
+#include <string>
+#include <vector>
+
+namespace BamTools {
+namespace Internal {
+
+// contains data for each 'block' in a BTI index
+struct BtiBlock {
+
+    // data members (declared in the same order the constructor takes them)
+    int32_t MaxEndPosition;
+    int64_t StartOffset;
+    int32_t StartPosition;
+
+    // ctor - every field defaults to 0
+    BtiBlock(const int32_t& maxEndPosition = 0, const int64_t& startOffset = 0, const int32_t& startPosition = 0)
+        : MaxEndPosition(maxEndPosition), StartOffset(startOffset), StartPosition(startPosition)
+    { }
+};
+
+// convenience typedef for describing a list of BTI blocks on a reference
+typedef std::vector<BtiBlock> BtiBlockVector;
+
+// contains all fields necessary for building, loading, & writing
+// full BTI index data for a single reference
+struct BtiReferenceEntry {
+
+    // data members
+    int32_t ID;
+    BtiBlockVector Blocks;
+
+    // ctor - ID defaults to -1, Blocks starts out empty
+    BtiReferenceEntry(const int& id = -1) : ID(id) { }
+};
+
+// provides (persistent) summary of BtiReferenceEntry's index data
+struct BtiReferenceSummary {
+
+    // data members
+    int NumBlocks;
+    uint64_t FirstBlockFilePosition;
+
+    // ctor - both fields start at 0
+    BtiReferenceSummary(void) : NumBlocks(0), FirstBlockFilePosition(0) { }
+};
+
+// convenience typedef for describing a full BTI index file summary
+typedef std::vector<BtiReferenceSummary> BtiFileSummary;
+
+// BamIndex implementation for the BamTools index format (".bti")
+class BamToolsIndex : public BamIndex {
+
+ // keep a list of any supported versions here
+ // (might be useful later to handle any 'legacy' versions if the format changes)
+ // listed for example like: BTI_1_0 = 1, BTI_1_1 = 2, BTI_1_2 = 3, BTI_2_0 = 4, and so on
+ //
+ // so a change introduced in BTI_1_2 may be handled from then on by:
+ //
+ // if ( indexVersion >= BTI_1_2 )
+ // do something new
+ // else
+ // do the old thing
+ enum Version { BTI_1_0 = 1
+ , BTI_1_1
+ , BTI_1_2
+ , BTI_2_0
+ };
+
+ // ctor & dtor
+ public:
+ BamToolsIndex(Internal::BamReaderPrivate* reader);
+ ~BamToolsIndex(void);
+
+ // BamIndex implementation
+ public:
+ // builds index from associated BAM file & writes out to index file
+ bool Create(void);
+ // returns whether reference has alignments or no
+ bool HasAlignments(const int& referenceID) const;
+ // attempts to use index data to jump to @region, returns success/fail
+ // a "successful" jump indicates no error, but not whether this region has data
+ // * thus, the method sets a flag to indicate whether there are alignments
+ // available after the jump position
+ bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
+ // loads existing data from file into memory
+ bool Load(const std::string& filename);
+ public:
+ // returns format's file extension
+ static const std::string Extension(void);
+
+ // internal methods
+ // (the void-returning helpers below report failure by throwing BamException)
+ private:
+
+ // index file ops
+ void CheckMagicNumber(void);
+ void CheckVersion(void);
+ void CloseFile(void);
+ bool IsFileOpen(void) const;
+ void OpenFile(const std::string& filename, const char* mode);
+ void Seek(const int64_t& position, const int& origin);
+ int64_t Tell(void) const;
+
+ // index-creation methods
+ void ClearReferenceEntry(BtiReferenceEntry& refEntry);
+ void WriteBlock(const BtiBlock& block);
+ void WriteBlocks(const BtiBlockVector& blocks);
+ void WriteHeader(void);
+ void WriteReferenceEntry(const BtiReferenceEntry& refEntry);
+
+ // random-access methods
+ void GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion);
+ void ReadBlock(BtiBlock& block);
+ void ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks);
+ void ReadReferenceEntry(BtiReferenceEntry& refEntry);
+
+ // BTI summary data methods
+ void InitializeFileSummary(const int& numReferences);
+ void LoadFileSummary(void);
+ void LoadHeader(void);
+ void LoadNumBlocks(int& numBlocks);
+ void LoadNumReferences(int& numReferences);
+ void LoadReferenceSummary(BtiReferenceSummary& refSummary);
+ void SkipBlocks(const int& numBlocks);
+
+ // data members
+ private:
+ bool m_isBigEndian; // when true, multi-byte fields are byte-swapped on read & write
+ BtiFileSummary m_indexFileSummary; // one summary per reference
+ uint32_t m_blockSize; // max alignments per BTI block; overwritten by LoadHeader()
+ int32_t m_inputVersion; // Version is serialized as int
+ Version m_outputVersion; // version value written by WriteHeader()
+
+ // holds the index file's FILE handle
+ // NOTE(review): presumably the destructor closes IndexStream - its definition is not shown here
+ struct RaiiWrapper {
+ FILE* IndexStream;
+ RaiiWrapper(void);
+ ~RaiiWrapper(void);
+ };
+ RaiiWrapper Resources;
+
+ // static constants
+ private:
+ static const uint32_t DEFAULT_BLOCK_LENGTH;
+ static const std::string BTI_EXTENSION;
+ static const char* const BTI_MAGIC;
+ static const int SIZEOF_BLOCK;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMTOOLS_INDEX_FORMAT_H
--- /dev/null
+// ***************************************************************************
+// BamDeviceFactory_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 September 2011 (DB)
+// ---------------------------------------------------------------------------
+// Creates built-in concrete implementations of IBamIODevices
+// ***************************************************************************
+
+#include "api/internal/io/BamDeviceFactory_p.h"
+#include "api/internal/io/BamFile_p.h"
+#include "api/internal/io/BamFtp_p.h"
+#include "api/internal/io/BamHttp_p.h"
+#include "api/internal/io/BamPipe_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <iostream>
+using namespace std;
+
+// picks the IBamIODevice implementation matching @source & returns a newly allocated instance:
+// "-", "stdin" or "stdout" -> BamPipe; "http://" prefix -> BamHttp;
+// "ftp://" prefix -> BamFtp; anything else -> BamFile
+IBamIODevice* BamDeviceFactory::CreateDevice(const string& source) {
+
+    // check for requested pipe
+    const bool isPipe = ( source == "-" || source == "stdin" || source == "stdout" );
+    if ( isPipe )
+        return new BamPipe;
+
+    // check for HTTP prefix
+    const string httpPrefix("http://");
+    if ( source.compare(0, httpPrefix.size(), httpPrefix) == 0 )
+        return new BamHttp(source);
+
+    // check for FTP prefix
+    const string ftpPrefix("ftp://");
+    if ( source.compare(0, ftpPrefix.size(), ftpPrefix) == 0 )
+        return new BamFtp(source);
+
+    // otherwise assume a "normal" file
+    return new BamFile(source);
+}
--- /dev/null
+// ***************************************************************************
+// BamDeviceFactory_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Creates built-in concrete implementations of IBamIODevices
+// ***************************************************************************
+
+#ifndef BAMDEVICEFACTORY_P_H
+#define BAMDEVICEFACTORY_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include "api/IBamIODevice.h"
+#include <string>
+
+namespace BamTools {
+namespace Internal {
+
+// factory for the built-in IBamIODevice implementations
+class BamDeviceFactory {
+
+    // device creation
+    public:
+        // returns a new device chosen from the form of @source
+        static IBamIODevice* CreateDevice(const std::string& source);
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMDEVICEFACTORY_P_H
--- /dev/null
+// ***************************************************************************
+// BamFile_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides BAM file-specific IO behavior
+// ***************************************************************************
+
+#include "api/internal/io/BamFile_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cstdio>
+#include <iostream>
+using namespace std;
+
+// stores @filename for a later Open(); no file is opened here
+BamFile::BamFile(const string& filename) : ILocalIODevice(), m_filename(filename) { }
+
+// nothing to release in this class itself
+BamFile::~BamFile(void)
+{ }
+
+// closes the device if it is open; does nothing otherwise
+// (the stored filename is cleared as part of closing)
+void BamFile::Close(void) {
+    if ( !IsOpen() )
+        return;
+
+    m_filename.clear();
+    ILocalIODevice::Close();
+}
+
+// files always report random access as available
+bool BamFile::IsRandomAccess(void) const {
+    const bool supportsRandomAccess = true;
+    return supportsRandomAccess;
+}
+
+// opens the stored filename for binary reading (ReadOnly) or binary writing (WriteOnly)
+// returns false (with the error string set) for any other mode or if fopen() fails
+bool BamFile::Open(const IBamIODevice::OpenMode mode) {
+
+    // make sure we're starting with a fresh file stream
+    Close();
+
+    // only the two single-direction modes are supported
+    const bool reading = ( mode == IBamIODevice::ReadOnly );
+    const bool writing = ( mode == IBamIODevice::WriteOnly );
+    if ( !reading && !writing ) {
+        SetErrorString("BamFile::Open", "unknown open mode requested");
+        return false;
+    }
+
+    // attempt to open FILE* with the matching fopen() mode
+    m_stream = fopen(m_filename.c_str(), ( reading ? "rb" : "wb" ));
+
+    // check that we obtained a valid FILE*
+    if ( m_stream == 0 ) {
+        const string target = ( m_filename.empty() ? string("empty filename") : m_filename );
+        const string message = string("could not open file handle for ") + target;
+        SetErrorString("BamFile::Open", message);
+        return false;
+    }
+
+    // store current IO mode & return success
+    m_mode = mode;
+    return true;
+}
+
+// seeks to @position measured from the start of the file; returns true if fseek64 reports success
+bool BamFile::Seek(const int64_t& position) {
+    BT_ASSERT_X( m_stream, "BamFile::Seek() - null stream" );
+    const int seekStatus = fseek64(m_stream, position, SEEK_SET);
+    return ( seekStatus == 0 );
+}
--- /dev/null
+// ***************************************************************************
+// BamFile_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides BAM file-specific IO behavior
+// ***************************************************************************
+
+#ifndef BAMFILE_P_H
+#define BAMFILE_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include "api/internal/io/ILocalIODevice_p.h"
+#include <string>
+
+namespace BamTools {
+namespace Internal {
+
+// ILocalIODevice implementation backed by a named file
+class BamFile : public ILocalIODevice {
+
+    // ctor & dtor
+    public:
+        BamFile(const std::string& filename);
+        ~BamFile(void);
+
+    // ILocalIODevice implementation
+    public:
+        void Close(void);
+        bool IsRandomAccess(void) const;
+        bool Open(const IBamIODevice::OpenMode mode);
+        bool Seek(const int64_t& position);
+
+    // data members
+    private:
+        std::string m_filename; // name passed to fopen() by Open()
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMFILE_P_H
--- /dev/null
+// ***************************************************************************
+// BamFtp_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides reading/writing of BAM files on FTP server
+// ***************************************************************************
+
+#include "api/internal/io/BamFtp_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+using namespace std;
+
+// placeholder: FTP support is not implemented, so construction trips an assertion
+BamFtp::BamFtp(const string& url) : IBamIODevice() {
+    BT_ASSERT_X(false, "BamFtp not yet implemented");
+}
+
+// nothing to release
+BamFtp::~BamFtp(void)
+{ }
+
+// placeholder: nothing to close
+void BamFtp::Close(void) { }
+
+// placeholder: always reports random access as available
+bool BamFtp::IsRandomAccess(void) const {
+    const bool supportsRandomAccess = true;
+    return supportsRandomAccess;
+}
+
+// placeholder: ignores @mode & always reports success
+bool BamFtp::Open(const IBamIODevice::OpenMode mode) {
+    (void)mode; // unused
+    const bool opened = true;
+    return opened;
+}
+
+// placeholder: reads nothing & always returns 0
+size_t BamFtp::Read(char* data, const unsigned int numBytes) {
+    (void)numBytes; // unused
+    (void)data;     // unused
+    const size_t bytesRead = 0;
+    return bytesRead;
+}
+
+// placeholder: ignores @position & always reports success
+bool BamFtp::Seek(const int64_t& position) {
+    (void)position; // unused
+    const bool seekSucceeded = true;
+    return seekSucceeded;
+}
+
+// placeholder: no position is tracked, always returns -1
+int64_t BamFtp::Tell(void) const {
+    const int64_t position = -1;
+    return position;
+}
+
+// placeholder: writes nothing & always returns 0
+size_t BamFtp::Write(const char* data, const unsigned int numBytes) {
+    (void)numBytes; // unused
+    (void)data;     // unused
+    const size_t bytesWritten = 0;
+    return bytesWritten;
+}
--- /dev/null
+// ***************************************************************************
+// BamFtp_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides reading/writing of BAM files on FTP server
+// ***************************************************************************
+
+#ifndef BAMFTP_P_H
+#define BAMFTP_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include "api/IBamIODevice.h"
+#include <string>
+
+namespace BamTools {
+namespace Internal {
+
+// IBamIODevice implementation intended for BAM files on an FTP server
+class BamFtp : public IBamIODevice {
+
+    // ctor & dtor
+    public:
+        BamFtp(const std::string& url);
+        ~BamFtp(void);
+
+    // IBamIODevice implementation
+    public:
+        void Close(void);
+        bool IsRandomAccess(void) const;
+        bool Open(const IBamIODevice::OpenMode mode);
+        size_t Read(char* data, const unsigned int numBytes);
+        bool Seek(const int64_t& position);
+        int64_t Tell(void) const;
+        size_t Write(const char* data, const unsigned int numBytes);
+
+    // internal methods (none yet)
+    private:
+
+    // data members (none yet)
+    private:
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMFTP_P_H
--- /dev/null
+// ***************************************************************************
+// BamHttp_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides reading/writing of BAM files on HTTP server
+// ***************************************************************************
+
+#include "api/internal/io/BamHttp_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+using namespace std;
+
+// placeholder: HTTP support is not implemented, so construction trips an assertion
+BamHttp::BamHttp(const string& url) : IBamIODevice() {
+    BT_ASSERT_X(false, "BamHttp not yet implemented");
+}
+
+// nothing to release
+BamHttp::~BamHttp(void)
+{ }
+
+// placeholder: nothing to close
+void BamHttp::Close(void) { }
+
+// placeholder: always reports random access as available
+bool BamHttp::IsRandomAccess(void) const {
+    const bool supportsRandomAccess = true;
+    return supportsRandomAccess;
+}
+
+// placeholder: ignores @mode & always reports success
+bool BamHttp::Open(const IBamIODevice::OpenMode mode) {
+    (void)mode; // unused
+    const bool opened = true;
+    return opened;
+}
+
+// placeholder: reads nothing & always returns 0
+size_t BamHttp::Read(char* data, const unsigned int numBytes) {
+    (void)numBytes; // unused
+    (void)data;     // unused
+    const size_t bytesRead = 0;
+    return bytesRead;
+}
+
+// placeholder: ignores @position & always reports success
+bool BamHttp::Seek(const int64_t& position) {
+    (void)position; // unused
+    const bool seekSucceeded = true;
+    return seekSucceeded;
+}
+
+// placeholder: no position is tracked, always returns -1
+int64_t BamHttp::Tell(void) const {
+    const int64_t position = -1;
+    return position;
+}
+
+// placeholder: writes nothing & always returns 0
+size_t BamHttp::Write(const char* data, const unsigned int numBytes) {
+    (void)numBytes; // unused
+    (void)data;     // unused
+    const size_t bytesWritten = 0;
+    return bytesWritten;
+}
--- /dev/null
+// ***************************************************************************
+// BamHttp_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides reading/writing of BAM files on HTTP server
+// ***************************************************************************
+
+#ifndef BAMHTTP_P_H
+#define BAMHTTP_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include "api/IBamIODevice.h"
+#include <string>
+
+namespace BamTools {
+namespace Internal {
+
+// IBamIODevice implementation intended for BAM files on an HTTP server
+class BamHttp : public IBamIODevice {
+
+    // ctor & dtor
+    public:
+        BamHttp(const std::string& url);
+        ~BamHttp(void);
+
+    // IBamIODevice implementation
+    public:
+        void Close(void);
+        bool IsRandomAccess(void) const;
+        bool Open(const IBamIODevice::OpenMode mode);
+        size_t Read(char* data, const unsigned int numBytes);
+        bool Seek(const int64_t& position);
+        int64_t Tell(void) const;
+        size_t Write(const char* data, const unsigned int numBytes);
+
+    // internal methods (none yet)
+    private:
+
+    // data members (none yet)
+    private:
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMHTTP_P_H
--- /dev/null
+// ***************************************************************************
+// BamPipe_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides BAM pipe-specific IO behavior
+// ***************************************************************************
+
+#include "api/internal/io/BamPipe_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cstdio>
+#include <iostream>
+using namespace std;
+
+// Default constructor; all setup is delegated to the ILocalIODevice base.
+BamPipe::BamPipe(void)
+    : ILocalIODevice()
+{ }
+
+// Destructor; BamPipe adds no cleanup of its own.
+BamPipe::~BamPipe(void)
+{ }
+
+// Pipes cannot be repositioned, so random access is never available.
+bool BamPipe::IsRandomAccess(void) const
+{
+    const bool supportsSeek = false;
+    return supportsSeek;
+}
+
+// Binds this device to stdin (ReadOnly) or stdout (WriteOnly), reopening
+// the standard stream in binary mode.
+//
+// Returns true on success. On an unsupported mode or a failed freopen(),
+// records an error string and returns false.
+bool BamPipe::Open(const IBamIODevice::OpenMode mode) {
+
+    // drop whatever stream a previous Open() may have left behind
+    Close();
+
+    const bool wantsRead  = ( mode == IBamIODevice::ReadOnly );
+    const bool wantsWrite = ( mode == IBamIODevice::WriteOnly );
+
+    // anything other than plain reading or writing is rejected
+    if ( !wantsRead && !wantsWrite ) {
+        SetErrorString("BamPipe::Open", "unknown open mode requested");
+        return false;
+    }
+
+    // reopen the matching standard stream in binary mode
+    if ( wantsRead )
+        m_stream = freopen(0, "rb", stdin);
+    else
+        m_stream = freopen(0, "wb", stdout);
+
+    // freopen() yields a null FILE* on failure
+    if ( m_stream == 0 ) {
+        string message("could not open handle on ");
+        message += ( wantsRead ? "stdin" : "stdout" );
+        SetErrorString("BamPipe::Open", message);
+        return false;
+    }
+
+    // remember the active IO mode
+    m_mode = mode;
+    return true;
+}
+
+bool BamPipe::Seek(const int64_t& ) {
+ SetErrorString("BamPipe::Seek", "random access not allowed in FIFO pipe");
+ return false;
+}
--- /dev/null
+// ***************************************************************************
+// BamPipe_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides BAM pipe-specific IO behavior
+// ***************************************************************************
+
+#ifndef BAMPIPE_P_H
+#define BAMPIPE_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include "api/internal/io/ILocalIODevice_p.h"
+#include <string>
+
+namespace BamTools {
+namespace Internal {
+
+// Local IO device bound to the process's standard input or output.
+// Close/Read/Tell/Write are inherited from ILocalIODevice.
+class BamPipe : public ILocalIODevice {
+
+    public:
+        // ctor & dtor
+        BamPipe(void);
+        ~BamPipe(void);
+
+        // IBamIODevice implementation
+        bool IsRandomAccess(void) const;
+        bool Open(const IBamIODevice::OpenMode mode);
+        bool Seek(const int64_t& position);
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMPIPE_P_H
--- /dev/null
+// ***************************************************************************
+// BgzfStream_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011(DB)
+// ---------------------------------------------------------------------------
+// Based on BGZF routines developed at the Broad Institute.
+// Provides the basic functionality for reading & writing BGZF files
+// Replaces the old BGZF.* files to avoid clashing with other toolkits
+// ***************************************************************************
+
+#include "api/BamAux.h"
+#include "api/BamConstants.h"
+#include "api/internal/io/BamDeviceFactory_p.h"
+#include "api/internal/io/BgzfStream_p.h"
+#include "api/internal/utils/BamException_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include "zlib.h"
+
+#include <cstring>
+#include <algorithm>
+#include <iostream>
+#include <sstream>
+using namespace std;
+
+// ----------------------------
+// RaiiWrapper implementation
+// ----------------------------
+
+// Allocates the compressed and uncompressed block buffers.
+//
+// If the second allocation throws, the destructor will not run (the object
+// was never fully constructed), so the first buffer is released here before
+// the exception is propagated.
+BgzfStream::RaiiWrapper::RaiiWrapper(void)
+    : UncompressedBlock(0)
+    , CompressedBlock(0)
+{
+    CompressedBlock = new char[Constants::BGZF_MAX_BLOCK_SIZE];
+    try {
+        UncompressedBlock = new char[Constants::BGZF_DEFAULT_BLOCK_SIZE];
+    } catch ( ... ) {
+        delete[] CompressedBlock;
+        CompressedBlock = 0;
+        throw;
+    }
+}
+
+// Releases both block buffers and nulls the pointers.
+BgzfStream::RaiiWrapper::~RaiiWrapper(void) {
+
+    delete[] CompressedBlock;
+    CompressedBlock = 0;
+
+    delete[] UncompressedBlock;
+    UncompressedBlock = 0;
+}
+
+// ---------------------------
+// BgzfStream implementation
+// ---------------------------
+
+// Constructs a stream with no device attached: block bookkeeping is zeroed
+// and compressed output is enabled.
+BgzfStream::BgzfStream(void)
+    : m_blockLength(0),
+      m_blockOffset(0),
+      m_blockAddress(0),
+      m_isWriteCompressed(true),
+      m_device(0)
+{
+}
+
+// Destructor: closes the stream (flushing pending output when writing).
+// Close() calls FlushBlock(), which can throw BamException.
+BgzfStream::~BgzfStream(void)
+{
+    Close();
+}
+
+// Validates the fixed fields of a BGZF block header.
+//
+// 'header' must point at a buffer of at least 16 bytes (bytes 0-15 are
+// inspected). Returns true only if every check passes; checks run in byte
+// order and stop at the first mismatch.
+bool BgzfStream::CheckBlockHeader(char* header) {
+
+    // gzip identification, compression method & 'extra field' flag
+    if ( header[0] != Constants::GZIP_ID1 ) return false;
+    if ( header[1] != Constants::GZIP_ID2 ) return false;
+    if ( header[2] != Z_DEFLATED ) return false;
+    if ( (header[3] & Constants::FLG_FEXTRA) == 0 ) return false;
+
+    // length of the extra field
+    if ( BamTools::UnpackUnsignedShort(&header[10]) != Constants::BGZF_XLEN ) return false;
+
+    // BGZF subfield identifier
+    if ( header[12] != Constants::BGZF_ID1 ) return false;
+    if ( header[13] != Constants::BGZF_ID2 ) return false;
+
+    // BGZF subfield length
+    return ( BamTools::UnpackUnsignedShort(&header[14]) == Constants::BGZF_LEN );
+}
+
+// Closes the BGZF stream and destroys its IO device.
+//
+// When the device is open for writing, buffered data is flushed first and
+// one extra block is deflated from the (now empty) buffer and written as
+// the EOF marker. Does nothing if no device is attached.
+void BgzfStream::Close(void) {
+
+    // nothing to close
+    if ( m_device == 0 )
+        return;
+
+    // finish pending output: remaining data, then the empty EOF block
+    const bool isWriting = m_device->IsOpen() &&
+                           (m_device->Mode() == IBamIODevice::WriteOnly);
+    if ( isWriting ) {
+        FlushBlock();
+        const size_t eofBlockLength = DeflateBlock();
+        m_device->Write(Resources.CompressedBlock, eofBlockLength);
+    }
+
+    // shut down & release the device
+    m_device->Close();
+    delete m_device;
+    m_device = 0;
+
+    // return to the freshly-constructed state
+    m_blockAddress = 0;
+    m_blockOffset = 0;
+    m_blockLength = 0;
+    m_isWriteCompressed = true;
+}
+
+// compresses the current block
+//
+// Deflates the first m_blockOffset bytes of Resources.UncompressedBlock into
+// Resources.CompressedBlock as one BGZF block: gzip/BGZF header, deflate
+// payload, then a footer holding the CRC32 and the uncompressed length.
+//
+// If the compressed data does not fit in the output buffer, the input length
+// is reduced by 1024 bytes and compression is retried. Any input that was
+// left out is moved to the front of the uncompressed buffer, and
+// m_blockOffset is set to the number of leftover bytes.
+//
+// Returns the total size in bytes of the compressed block (header + payload
+// + footer). Throws BamException on any zlib failure, if the input cannot be
+// reduced any further, if the result exceeds BGZF_MAX_BLOCK_SIZE, or if the
+// leftover input is larger than the part that was compressed.
+size_t BgzfStream::DeflateBlock(void) {
+
+ // initialize the gzip header
+ // (first 18 bytes zeroed, then the fixed gzip & BGZF fields are filled in)
+ char* buffer = Resources.CompressedBlock;
+ memset(buffer, 0, 18);
+ buffer[0] = Constants::GZIP_ID1;
+ buffer[1] = Constants::GZIP_ID2;
+ buffer[2] = Constants::CM_DEFLATE;
+ buffer[3] = Constants::FLG_FEXTRA;
+ buffer[9] = Constants::OS_UNKNOWN;
+ buffer[10] = Constants::BGZF_XLEN;
+ buffer[12] = Constants::BGZF_ID1;
+ buffer[13] = Constants::BGZF_ID2;
+ buffer[14] = Constants::BGZF_LEN;
+
+ // set compression level
+ // (level 0 is used when compressed output has been disabled)
+ const int compressionLevel = ( m_isWriteCompressed ? Z_DEFAULT_COMPRESSION : 0 );
+
+ // loop to retry for blocks that do not compress enough
+ int inputLength = m_blockOffset;
+ size_t compressedLength = 0;
+ const unsigned int bufferSize = Constants::BGZF_MAX_BLOCK_SIZE;
+
+ while ( true ) {
+
+ // initialize zstream values
+ // output starts right after the header; room is reserved for the footer
+ // NOTE(review): zs.opaque is not set here; the zlib manual lists it among
+ // the fields to initialize before deflateInit2 - confirm whether to add it
+ z_stream zs;
+ zs.zalloc = NULL;
+ zs.zfree = NULL;
+ zs.next_in = (Bytef*)Resources.UncompressedBlock;
+ zs.avail_in = inputLength;
+ zs.next_out = (Bytef*)&buffer[Constants::BGZF_BLOCK_HEADER_LENGTH];
+ zs.avail_out = bufferSize -
+ Constants::BGZF_BLOCK_HEADER_LENGTH -
+ Constants::BGZF_BLOCK_FOOTER_LENGTH;
+
+ // initialize the zlib compression algorithm
+ int status = deflateInit2(&zs,
+ compressionLevel,
+ Z_DEFLATED,
+ Constants::GZIP_WINDOW_BITS,
+ Constants::Z_DEFAULT_MEM_LEVEL,
+ Z_DEFAULT_STRATEGY);
+ if ( status != Z_OK )
+ throw BamException("BgzfStream::DeflateBlock", "zlib deflateInit2 failed");
+
+ // compress the data
+ status = deflate(&zs, Z_FINISH);
+
+ // if not at stream end
+ if ( status != Z_STREAM_END ) {
+
+ // release zlib state before retrying or throwing
+ deflateEnd(&zs);
+
+ // there was not enough space available in buffer
+ // try to reduce the input length & re-start loop
+ if ( status == Z_OK ) {
+ inputLength -= 1024;
+ if ( inputLength < 0 )
+ throw BamException("BgzfStream::DeflateBlock", "input reduction failed");
+ continue;
+ }
+
+ throw BamException("BgzfStream::DeflateBlock", "zlib deflate failed");
+ }
+
+ // finalize the compression routine
+ status = deflateEnd(&zs);
+ if ( status != Z_OK )
+ throw BamException("BgzfStream::DeflateBlock", "zlib deflateEnd failed");
+
+ // update compressedLength
+ // (payload plus the header and footer that surround it)
+ compressedLength = zs.total_out +
+ Constants::BGZF_BLOCK_HEADER_LENGTH +
+ Constants::BGZF_BLOCK_FOOTER_LENGTH;
+ if ( compressedLength > Constants::BGZF_MAX_BLOCK_SIZE )
+ throw BamException("BgzfStream::DeflateBlock", "deflate overflow");
+
+ // quit while loop
+ break;
+ }
+
+ // store the compressed length
+ // (written as total block size minus one, at header offset 16)
+ BamTools::PackUnsignedShort(&buffer[16], static_cast<uint16_t>(compressedLength - 1));
+
+ // store the CRC32 checksum
+ // footer = CRC32 of the compressed input, followed by its length
+ uint32_t crc = crc32(0, NULL, 0);
+ crc = crc32(crc, (Bytef*)Resources.UncompressedBlock, inputLength);
+ BamTools::PackUnsignedInt(&buffer[compressedLength - 8], crc);
+ BamTools::PackUnsignedInt(&buffer[compressedLength - 4], inputLength);
+
+ // ensure that we have less than a block of data left
+ // (remaining <= inputLength also guarantees the memcpy ranges don't overlap)
+ int remaining = m_blockOffset - inputLength;
+ if ( remaining > 0 ) {
+ if ( remaining > inputLength )
+ throw BamException("BgzfStream::DeflateBlock", "after deflate, remainder too large");
+ memcpy(Resources.UncompressedBlock, Resources.UncompressedBlock + inputLength, remaining);
+ }
+
+ // update block data
+ m_blockOffset = remaining;
+
+ // return result
+ return compressedLength;
+}
+
+// Compresses and writes buffered data until the uncompressed buffer is empty.
+//
+// Each pass deflates one block, writes it to the device and advances
+// m_blockAddress by the compressed size. Throws BamException if the device
+// accepts fewer bytes than the block contains.
+void BgzfStream::FlushBlock(void) {
+
+    BT_ASSERT_X( m_device, "BgzfStream::FlushBlock() - attempting to flush to null device" );
+
+    // DeflateBlock() lowers m_blockOffset as it consumes buffered input
+    while ( m_blockOffset > 0 ) {
+
+        const size_t compressedLength = DeflateBlock();
+        const size_t bytesOut = m_device->Write(Resources.CompressedBlock, compressedLength);
+
+        // a short write leaves the output corrupt - report it
+        if ( bytesOut != compressedLength ) {
+            stringstream s;
+            s << "expected to write " << compressedLength
+              << " bytes during flushing, but wrote " << bytesOut;
+            throw BamException("BgzfStream::FlushBlock", s.str());
+        }
+
+        // next block starts right after the one just written
+        m_blockAddress += compressedLength;
+    }
+}
+
+// Decompresses the block held in Resources.CompressedBlock into
+// Resources.UncompressedBlock.
+//
+// 'blockLength' is the total size of the compressed block, header included.
+// Returns the number of uncompressed bytes produced. Throws BamException if
+// the block is shorter than a BGZF header or if any zlib call fails.
+size_t BgzfStream::InflateBlock(const size_t& blockLength) {
+
+    // a block shorter than its own header cannot be valid, and would make
+    // the unsigned 'blockLength - 16' below wrap around
+    if ( blockLength < static_cast<size_t>(Constants::BGZF_BLOCK_HEADER_LENGTH) )
+        throw BamException("BgzfStream::InflateBlock", "invalid block length");
+
+    // setup zlib stream object
+    // (zalloc, zfree & opaque must all be initialized before inflateInit2)
+    z_stream zs;
+    zs.zalloc    = NULL;
+    zs.zfree     = NULL;
+    zs.opaque    = NULL;
+    zs.next_in   = (Bytef*)Resources.CompressedBlock + 18;  // skip block header
+    zs.avail_in  = blockLength - 16;
+    zs.next_out  = (Bytef*)Resources.UncompressedBlock;
+    zs.avail_out = Constants::BGZF_DEFAULT_BLOCK_SIZE;
+
+    // initialize
+    int status = inflateInit2(&zs, Constants::GZIP_WINDOW_BITS);
+    if ( status != Z_OK )
+        throw BamException("BgzfStream::InflateBlock", "zlib inflateInit failed");
+
+    // decompress - the whole block must be consumed in one call
+    status = inflate(&zs, Z_FINISH);
+    if ( status != Z_STREAM_END ) {
+        inflateEnd(&zs);
+        throw BamException("BgzfStream::InflateBlock", "zlib inflate failed");
+    }
+
+    // finalize (state is already released here, so no second inflateEnd on failure)
+    status = inflateEnd(&zs);
+    if ( status != Z_OK )
+        throw BamException("BgzfStream::InflateBlock", "zlib inflateEnd failed");
+
+    // return result
+    return zs.total_out;
+}
+
+// Returns true when a device is attached and that device reports itself open.
+bool BgzfStream::IsOpen(void) const {
+    return ( m_device != 0 ) && m_device->IsOpen();
+}
+
+// Opens a BGZF stream on 'filename' in the requested mode.
+//
+// Any previously attached device is closed first. The new device is obtained
+// from BamDeviceFactory. Throws BamException, carrying the device's own
+// error string, if the device cannot be opened.
+void BgzfStream::Open(const string& filename, const IBamIODevice::OpenMode mode) {
+
+    // start from a clean slate
+    Close();
+    BT_ASSERT_X( (m_device == 0), "BgzfStream::Open() - unable to properly close previous IO device" );
+
+    // let the factory choose the device type for this filename
+    m_device = BamDeviceFactory::CreateDevice(filename);
+    BT_ASSERT_X( m_device, "BgzfStream::Open() - unable to create IO device from filename" );
+
+    // success - nothing more to do
+    if ( m_device->Open(mode) )
+        return;
+
+    // failure - surface the device's explanation
+    const string deviceError = m_device->GetErrorString();
+    const string message = string("could not open BGZF stream: \n\t") + deviceError;
+    throw BamException("BgzfStream::Open", message);
+}
+
+// Copies up to 'dataLength' uncompressed bytes into 'data', reading and
+// inflating further blocks as the current one runs out.
+//
+// Returns the number of bytes actually copied. This is less than requested
+// when no more block data is available, and 0 when nothing was requested or
+// the device is not open in read-only mode.
+size_t BgzfStream::Read(char* data, const size_t dataLength) {
+
+    // nothing requested
+    if ( dataLength == 0 )
+        return 0;
+
+    // stream must be open for reading
+    BT_ASSERT_X( m_device, "BgzfStream::Read() - trying to read from null device");
+    const bool isReadable = m_device->IsOpen() &&
+                            (m_device->Mode() == IBamIODevice::ReadOnly);
+    if ( !isReadable )
+        return 0;
+
+    // keep pulling from blocks until the request is satisfied
+    size_t numBytesRead = 0;
+    for ( char* destination = data; numBytesRead < dataLength; ) {
+
+        // bytes still unread in the current block
+        int bytesAvailable = m_blockLength - m_blockOffset;
+
+        // current block exhausted - fetch (and decompress) the next one
+        if ( bytesAvailable <= 0 ) {
+            ReadBlock();
+            bytesAvailable = m_blockLength - m_blockOffset;
+            if ( bytesAvailable <= 0 )
+                break;
+        }
+
+        // take whichever is smaller: what's wanted or what's on hand
+        const size_t bytesWanted = dataLength - numBytesRead;
+        const size_t copyLength  = min( bytesWanted, (size_t)bytesAvailable );
+        memcpy(destination, Resources.UncompressedBlock + m_blockOffset, copyLength);
+
+        // advance all cursors
+        m_blockOffset += copyLength;
+        destination   += copyLength;
+        numBytesRead  += copyLength;
+    }
+
+    // block fully consumed: point at the next block's start
+    if ( m_blockOffset == m_blockLength ) {
+        m_blockAddress = m_device->Tell();
+        m_blockOffset = 0;
+        m_blockLength = 0;
+    }
+
+    // report how much was delivered
+    return numBytesRead;
+}
+
+// Reads the next BGZF block from the device and inflates it into
+// Resources.UncompressedBlock.
+//
+// If the device has no more data, m_blockLength is set to 0 and the function
+// returns. Otherwise m_blockAddress and m_blockLength describe the new block
+// on return. Throws BamException on a truncated or malformed header, an
+// impossible block length, a short read, or a decompression failure.
+void BgzfStream::ReadBlock(void) {
+
+    BT_ASSERT_X( m_device, "BgzfStream::ReadBlock() - trying to read from null IO device");
+
+    // store block's starting address
+    const int64_t blockAddress = m_device->Tell();
+
+    // read block header from file
+    char header[Constants::BGZF_BLOCK_HEADER_LENGTH];
+    size_t numBytesRead = m_device->Read(header, Constants::BGZF_BLOCK_HEADER_LENGTH);
+
+    // if block header empty
+    if ( numBytesRead == 0 ) {
+        m_blockLength = 0;
+        return;
+    }
+
+    // if block header invalid size
+    if ( numBytesRead != Constants::BGZF_BLOCK_HEADER_LENGTH )
+        throw BamException("BgzfStream::ReadBlock", "invalid block header size");
+
+    // validate block header contents
+    if ( !BgzfStream::CheckBlockHeader(header) )
+        throw BamException("BgzfStream::ReadBlock", "invalid block header contents");
+
+    // total block size is stored (minus one) at header offset 16
+    const size_t blockLength = BamTools::UnpackUnsignedShort(&header[16]) + 1;
+
+    // every block carries at least a header and a footer; rejecting anything
+    // smaller keeps the unsigned subtraction below from wrapping, which would
+    // request a huge read into the fixed-size compressed buffer
+    const size_t minBlockLength = static_cast<size_t>(Constants::BGZF_BLOCK_HEADER_LENGTH) +
+                                  static_cast<size_t>(Constants::BGZF_BLOCK_FOOTER_LENGTH);
+    if ( blockLength < minBlockLength )
+        throw BamException("BgzfStream::ReadBlock", "invalid block length");
+
+    // copy header contents to compressed buffer
+    memcpy(Resources.CompressedBlock, header, Constants::BGZF_BLOCK_HEADER_LENGTH);
+
+    // read remainder of block
+    const size_t remaining = blockLength - Constants::BGZF_BLOCK_HEADER_LENGTH;
+    numBytesRead = m_device->Read(&Resources.CompressedBlock[Constants::BGZF_BLOCK_HEADER_LENGTH], remaining);
+    if ( numBytesRead != remaining )
+        throw BamException("BgzfStream::ReadBlock", "could not read data from block");
+
+    // decompress block data
+    numBytesRead = InflateBlock(blockLength);
+
+    // update block data
+    // (the offset is reset only when a block was already loaded; with
+    //  m_blockLength == 0, e.g. after Seek(), the stored offset is kept)
+    if ( m_blockLength != 0 )
+        m_blockOffset = 0;
+    m_blockAddress = blockAddress;
+    m_blockLength = numBytesRead;
+}
+
+// Moves the stream to a BGZF virtual file position.
+//
+// 'position' packs the in-block offset into its low 16 bits and the block's
+// device address into the bits above. Does nothing if the stream is not
+// open. Throws BamException if the device is not random-access or its own
+// seek fails.
+void BgzfStream::Seek(const int64_t& position) {
+
+    BT_ASSERT_X( m_device, "BgzfStream::Seek() - trying to seek on null IO device");
+
+    // nothing to do on a closed stream
+    if ( !IsOpen() )
+        return;
+
+    // unpack the virtual offset
+    const int blockOffset = (position & 0xFFFF);
+    const int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL;
+
+    // reposition the underlying device
+    const bool seekSucceeded = m_device->IsRandomAccess() && m_device->Seek(blockAddress);
+    if ( !seekSucceeded ) {
+        stringstream s;
+        s << "unable to seek to position: " << position;
+        throw BamException("BgzfStream::Seek", s.str());
+    }
+
+    // no block is loaded any more; the offset applies to the next block read
+    m_blockLength = 0;
+    m_blockAddress = blockAddress;
+    m_blockOffset = blockOffset;
+}
+
+// Enables (true) or disables (false) compression of written blocks.
+void BgzfStream::SetWriteCompressed(bool ok)
+{
+    m_isWriteCompressed = ok;
+}
+
+// Returns the current BGZF virtual file position: block address in the upper
+// bits, in-block offset in the low 16 bits. Returns 0 if the stream is not open.
+int64_t BgzfStream::Tell(void) const {
+    return IsOpen() ? ( (m_blockAddress << 16) | (m_blockOffset & 0xFFFF) )
+                    : 0;
+}
+
+// Appends 'dataLength' bytes from 'data' to the uncompressed block buffer,
+// flushing (compressing & writing) the buffer each time it fills up.
+//
+// Returns the number of bytes accepted, or 0 if the stream is not open.
+size_t BgzfStream::Write(const char* data, const size_t dataLength) {
+
+    BT_ASSERT_X( m_device, "BgzfStream::Write() - trying to write to null IO device");
+    BT_ASSERT_X( (m_device->Mode() == IBamIODevice::WriteOnly),
+                 "BgzfStream::Write() - trying to write to non-writable IO device");
+
+    // nothing can be written to a closed stream
+    if ( !IsOpen() )
+        return 0;
+
+    // feed the input through the block buffer, one chunk at a time
+    const size_t blockCapacity = Constants::BGZF_DEFAULT_BLOCK_SIZE;
+    size_t numBytesWritten = 0;
+    for ( const char* source = data; numBytesWritten < dataLength; ) {
+
+        // chunk size: free room in the block, or what's left of the input
+        const size_t roomInBlock  = blockCapacity - m_blockOffset;
+        const size_t bytesPending = dataLength - numBytesWritten;
+        unsigned int copyLength = min(roomInBlock, bytesPending);
+        memcpy(Resources.UncompressedBlock + m_blockOffset, source, copyLength);
+
+        // advance all cursors
+        m_blockOffset   += copyLength;
+        source          += copyLength;
+        numBytesWritten += copyLength;
+
+        // block buffer full - compress & write it out
+        if ( m_blockOffset == blockCapacity )
+            FlushBlock();
+    }
+
+    // report how much was accepted
+    return numBytesWritten;
+}
--- /dev/null
+// ***************************************************************************
+// BgzfStream_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011(DB)
+// ---------------------------------------------------------------------------
+// Based on BGZF routines developed at the Broad Institute.
+// Provides the basic functionality for reading & writing BGZF files
+// Replaces the old BGZF.* files to avoid clashing with other toolkits
+// ***************************************************************************
+
+#ifndef BGZFSTREAM_P_H
+#define BGZFSTREAM_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include "api/api_global.h"
+#include "api/IBamIODevice.h"
+#include <string>
+
+namespace BamTools {
+namespace Internal {
+
+// Reads and writes BGZF-compressed data through an IBamIODevice.
+class BgzfStream {
+
+    // constructor & destructor
+    public:
+        BgzfStream(void);
+        ~BgzfStream(void);
+
+    // main interface methods
+    public:
+        void    Close(void);                        // closes BGZF file
+        bool    IsOpen(void) const;                 // returns true if BgzfStream open for IO
+        void    Open(const std::string& filename,   // opens the BGZF file
+                     const IBamIODevice::OpenMode mode);
+        size_t  Read(char* data,                    // reads BGZF data into a byte buffer
+                     const size_t dataLength);
+        void    Seek(const int64_t& position);      // seek to position in BGZF file
+        void    SetIODevice(IBamIODevice* device);  // sets IO device (closes previous, if any,
+                                                    // but does not attempt to open)
+        void    SetWriteCompressed(bool ok);        // enable/disable compressed output
+        int64_t Tell(void) const;                   // get file position in BGZF file
+        size_t  Write(const char* data,             // writes the supplied data into the BGZF buffer
+                      const size_t dataLength);
+
+    // internal methods
+    private:
+        size_t DeflateBlock(void);                      // compresses the current block
+        void   FlushBlock(void);                        // flushes the data in the BGZF block
+        size_t InflateBlock(const size_t& blockLength); // de-compresses the current block
+        void   ReadBlock(void);                         // reads a BGZF block
+
+    // static 'utility' methods
+    public:
+        static bool CheckBlockHeader(char* header);     // checks BGZF block header
+
+    // data members
+    public:
+        unsigned int m_blockLength;   // size of the current uncompressed block
+        unsigned int m_blockOffset;   // read/write cursor within the current block
+        uint64_t     m_blockAddress;  // device position where the current block starts
+
+        bool          m_isWriteCompressed;  // false => blocks are written at compression level 0
+        IBamIODevice* m_device;             // attached IO device (0 when none)
+
+        // owns the two block buffers: allocated on construction, freed on destruction
+        struct RaiiWrapper {
+            RaiiWrapper(void);
+            ~RaiiWrapper(void);
+            char* UncompressedBlock;
+            char* CompressedBlock;
+        };
+        RaiiWrapper Resources;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BGZFSTREAM_P_H
--- /dev/null
+// ***************************************************************************
+// ILocalIODevice_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides shared behavior for files & pipes
+// ***************************************************************************
+
+#include "api/internal/io/ILocalIODevice_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cstdio>
+using namespace std;
+
+// Constructs a device with no stream attached.
+ILocalIODevice::ILocalIODevice(void)
+    : IBamIODevice(),
+      m_stream(0)
+{
+}
+
+// Destructor: closes the stream if it is still open.
+ILocalIODevice::~ILocalIODevice(void)
+{
+    Close();
+}
+
+// Flushes and closes the underlying FILE*, then marks the device as not
+// open. Does nothing if the device is not currently open.
+void ILocalIODevice::Close(void) {
+
+    if ( IsOpen() ) {
+
+        // push out buffered data, then release the FILE*
+        fflush(m_stream);
+        fclose(m_stream);
+        m_stream = 0;
+
+        // device is back in its unopened state
+        m_mode = IBamIODevice::NotOpen;
+    }
+}
+
+// Reads up to 'numBytes' bytes from the stream into 'data'.
+// Returns the number of bytes fread() actually delivered.
+size_t ILocalIODevice::Read(char* data, const unsigned int numBytes) {
+    BT_ASSERT_X( m_stream, "ILocalIODevice::Read: trying to read from null stream" );
+    BT_ASSERT_X( (m_mode == IBamIODevice::ReadOnly), "ILocalIODevice::Read: device not in read-only mode");
+    const size_t bytesRead = fread(data, sizeof(char), numBytes, m_stream);
+    return bytesRead;
+}
+
+// Returns the current position in the stream, as reported by ftell64().
+int64_t ILocalIODevice::Tell(void) const {
+    BT_ASSERT_X( m_stream, "ILocalIODevice::Tell: trying to get file position from null stream" );
+    return ftell64(m_stream);
+}
+
+// Writes 'numBytes' bytes from 'data' to the stream.
+// Returns the number of bytes fwrite() actually accepted.
+size_t ILocalIODevice::Write(const char* data, const unsigned int numBytes) {
+    BT_ASSERT_X( m_stream, "ILocalIODevice::Write: trying to write to null stream" );
+    BT_ASSERT_X( (m_mode == IBamIODevice::WriteOnly), "ILocalIODevice::Write: device not in write-only mode" );
+    return fwrite(data, sizeof(char), numBytes, m_stream);
+}
--- /dev/null
+// ***************************************************************************
+// ILocalIODevice_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides shared behavior for files & pipes
+// ***************************************************************************
+
+#ifndef ILOCALIODEVICE_P_H
+#define ILOCALIODEVICE_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include "api/IBamIODevice.h"
+
+namespace BamTools {
+namespace Internal {
+
+// Shared FILE*-based behavior for local IO devices (files & pipes).
+// Open(), Seek() and IsRandomAccess() are not declared here and are left
+// to the derived classes.
+class ILocalIODevice : public IBamIODevice {
+
+    public:
+        // ctor & dtor
+        ILocalIODevice(void);
+        virtual ~ILocalIODevice(void);
+
+        // IBamIODevice implementation
+        virtual void    Close(void);
+        virtual size_t  Read(char* data, const unsigned int numBytes);
+        virtual int64_t Tell(void) const;
+        virtual size_t  Write(const char* data, const unsigned int numBytes);
+
+    protected:
+        // data members
+        FILE* m_stream;   // underlying C stream (0 when no stream is attached)
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // ILOCALIODEVICE_P_H
--- /dev/null
+// ***************************************************************************
+// SamFormatParser.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for parsing SAM header text into SamHeader object
+// ***************************************************************************
+
+#include "api/SamConstants.h"
+#include "api/SamHeader.h"
+#include "api/internal/sam/SamFormatParser_p.h"
+#include "api/internal/utils/BamException_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <iostream>
+#include <sstream>
+#include <vector>
+using namespace std;
+
+// Binds the parser to the header object it will populate.
+SamFormatParser::SamFormatParser(SamHeader& header)
+    : m_header(header)
+{
+}
+
+// Destructor; the parser only holds a reference, so nothing is released.
+SamFormatParser::~SamFormatParser(void)
+{ }
+
+// Replaces the bound header's contents with the data parsed from
+// 'headerText'. Empty text just leaves the header cleared.
+void SamFormatParser::Parse(const string& headerText) {
+
+    // discard whatever the header held before
+    m_header.Clear();
+
+    // an empty header is valid - there is simply nothing to parse
+    if ( !headerText.empty() ) {
+
+        // otherwise hand each line of the text to the line parser
+        istringstream headerStream(headerText);
+        for ( string headerLine(""); getline(headerStream, headerLine); )
+            ParseSamLine(headerLine);
+    }
+}
+
+// Dispatches one header line to the parser for its record type, chosen by
+// the line's first three characters. Lines shorter than 5 characters are
+// ignored. Throws BamException for an unrecognized leading token.
+void SamFormatParser::ParseSamLine(const string& line) {
+
+    // too short to hold a record token plus any content
+    if ( line.length() < 5 )
+        return;
+
+    // leading token, and everything after the separator that follows it
+    const string firstToken = line.substr(0,3);
+    const string restOfLine = line.substr(4);
+
+    if ( firstToken == Constants::SAM_HD_BEGIN_TOKEN ) { ParseHDLine(restOfLine); return; }
+    if ( firstToken == Constants::SAM_SQ_BEGIN_TOKEN ) { ParseSQLine(restOfLine); return; }
+    if ( firstToken == Constants::SAM_RG_BEGIN_TOKEN ) { ParseRGLine(restOfLine); return; }
+    if ( firstToken == Constants::SAM_PG_BEGIN_TOKEN ) { ParsePGLine(restOfLine); return; }
+    if ( firstToken == Constants::SAM_CO_BEGIN_TOKEN ) { ParseCOLine(restOfLine); return; }
+
+    // no known record type matched
+    const string message = string("unknown token: ") + firstToken;
+    throw BamException("SamFormatParser::ParseSamLine", message);
+}
+
+// Parses the body of an @HD line (tab-separated TAG:VALUE fields) into the
+// header's Version, SortOrder and GroupOrder.
+//
+// Throws BamException for a field too short to hold a tag and separator,
+// for an unknown tag, or if the VN tag is missing.
+void SamFormatParser::ParseHDLine(const string& line) {
+
+    // split HD lines into tokens
+    vector<string> tokens = Split(line, Constants::SAM_TAB);
+
+    // iterate over tokens
+    vector<string>::const_iterator tokenIter = tokens.begin();
+    vector<string>::const_iterator tokenEnd  = tokens.end();
+    for ( ; tokenIter != tokenEnd; ++tokenIter ) {
+
+        // a field needs a 2-char tag plus a separator; without this check,
+        // substr(3) below would throw std::out_of_range on a short field
+        const string& token = (*tokenIter);
+        if ( token.length() < 3 ) {
+            const string message = string("malformed HD tag: ") + token;
+            throw BamException("SamFormatParser::ParseHDLine", message);
+        }
+
+        // get tag/value
+        const string tokenTag   = token.substr(0,2);
+        const string tokenValue = token.substr(3);
+
+        // set header contents
+        if      ( tokenTag == Constants::SAM_HD_VERSION_TAG    ) m_header.Version    = tokenValue;
+        else if ( tokenTag == Constants::SAM_HD_SORTORDER_TAG  ) m_header.SortOrder  = tokenValue;
+        else if ( tokenTag == Constants::SAM_HD_GROUPORDER_TAG ) m_header.GroupOrder = tokenValue;
+        else {
+            const string message = string("unknown HD tag: ") + tokenTag;
+            throw BamException("SamFormatParser::ParseHDLine", message);
+        }
+    }
+
+    // check for required tags
+    if ( !m_header.HasVersion() )
+        throw BamException("SamFormatParser::ParseHDLine", "@HD line is missing VN tag");
+}
+
+// Parses the body of an @SQ line (tab-separated TAG:VALUE fields) into a
+// SamSequence and adds it to the header's sequence list.
+//
+// Throws BamException for a field too short to hold a tag and separator,
+// for an unknown tag, or if the SN or LN tag is missing.
+void SamFormatParser::ParseSQLine(const string& line) {
+
+    SamSequence seq;
+
+    // split SQ line into tokens
+    vector<string> tokens = Split(line, Constants::SAM_TAB);
+
+    // iterate over tokens
+    vector<string>::const_iterator tokenIter = tokens.begin();
+    vector<string>::const_iterator tokenEnd  = tokens.end();
+    for ( ; tokenIter != tokenEnd; ++tokenIter ) {
+
+        // a field needs a 2-char tag plus a separator; without this check,
+        // substr(3) below would throw std::out_of_range on a short field
+        const string& token = (*tokenIter);
+        if ( token.length() < 3 ) {
+            const string message = string("malformed SQ tag: ") + token;
+            throw BamException("SamFormatParser::ParseSQLine", message);
+        }
+
+        // get tag/value
+        const string tokenTag   = token.substr(0,2);
+        const string tokenValue = token.substr(3);
+
+        // set sequence contents
+        if      ( tokenTag == Constants::SAM_SQ_NAME_TAG       ) seq.Name       = tokenValue;
+        else if ( tokenTag == Constants::SAM_SQ_LENGTH_TAG     ) seq.Length     = tokenValue;
+        else if ( tokenTag == Constants::SAM_SQ_ASSEMBLYID_TAG ) seq.AssemblyID = tokenValue;
+        else if ( tokenTag == Constants::SAM_SQ_CHECKSUM_TAG   ) seq.Checksum   = tokenValue;
+        else if ( tokenTag == Constants::SAM_SQ_SPECIES_TAG    ) seq.Species    = tokenValue;
+        else if ( tokenTag == Constants::SAM_SQ_URI_TAG        ) seq.URI        = tokenValue;
+        else {
+            const string message = string("unknown SQ tag: ") + tokenTag;
+            throw BamException("SamFormatParser::ParseSQLine", message);
+        }
+    }
+
+    // check for required tags
+    if ( !seq.HasName() )
+        throw BamException("SamFormatParser::ParseSQLine", "@SQ line is missing SN tag");
+    if ( !seq.HasLength() )
+        throw BamException("SamFormatParser::ParseSQLine", "@SQ line is missing LN tag");
+
+    // store SAM sequence entry
+    m_header.Sequences.Add(seq);
+}
+
+// Parses the body of an @RG line (tab-separated TAG:VALUE fields) into a
+// SamReadGroup and adds it to the header's read group list.
+//
+// Throws BamException for a field too short to hold a tag and separator,
+// for an unknown tag, or if the ID tag is missing.
+void SamFormatParser::ParseRGLine(const string& line) {
+
+    SamReadGroup rg;
+
+    // split string into tokens
+    vector<string> tokens = Split(line, Constants::SAM_TAB);
+
+    // iterate over tokens
+    vector<string>::const_iterator tokenIter = tokens.begin();
+    vector<string>::const_iterator tokenEnd  = tokens.end();
+    for ( ; tokenIter != tokenEnd; ++tokenIter ) {
+
+        // a field needs a 2-char tag plus a separator; without this check,
+        // substr(3) below would throw std::out_of_range on a short field
+        const string& token = (*tokenIter);
+        if ( token.length() < 3 ) {
+            const string message = string("malformed RG tag: ") + token;
+            throw BamException("SamFormatParser::ParseRGLine", message);
+        }
+
+        // get token tag/value
+        const string tokenTag   = token.substr(0,2);
+        const string tokenValue = token.substr(3);
+
+        // set read group contents
+        if      ( tokenTag == Constants::SAM_RG_ID_TAG                  ) rg.ID                   = tokenValue;
+        else if ( tokenTag == Constants::SAM_RG_DESCRIPTION_TAG         ) rg.Description          = tokenValue;
+        else if ( tokenTag == Constants::SAM_RG_FLOWORDER_TAG           ) rg.FlowOrder            = tokenValue;
+        else if ( tokenTag == Constants::SAM_RG_KEYSEQUENCE_TAG         ) rg.KeySequence          = tokenValue;
+        else if ( tokenTag == Constants::SAM_RG_LIBRARY_TAG             ) rg.Library              = tokenValue;
+        else if ( tokenTag == Constants::SAM_RG_PLATFORMUNIT_TAG        ) rg.PlatformUnit         = tokenValue;
+        else if ( tokenTag == Constants::SAM_RG_PREDICTEDINSERTSIZE_TAG ) rg.PredictedInsertSize  = tokenValue;
+        else if ( tokenTag == Constants::SAM_RG_PRODUCTIONDATE_TAG      ) rg.ProductionDate       = tokenValue;
+        else if ( tokenTag == Constants::SAM_RG_PROGRAM_TAG             ) rg.Program              = tokenValue;
+        else if ( tokenTag == Constants::SAM_RG_SAMPLE_TAG              ) rg.Sample               = tokenValue;
+        else if ( tokenTag == Constants::SAM_RG_SEQCENTER_TAG           ) rg.SequencingCenter     = tokenValue;
+        else if ( tokenTag == Constants::SAM_RG_SEQTECHNOLOGY_TAG       ) rg.SequencingTechnology = tokenValue;
+        else {
+            const string message = string("unknown RG tag: ") + tokenTag;
+            throw BamException("SamFormatParser::ParseRGLine", message);
+        }
+    }
+
+    // check for required tags
+    if ( !rg.HasID() )
+        throw BamException("SamFormatParser::ParseRGLine", "@RG line is missing ID tag");
+
+    // store SAM read group entry
+    m_header.ReadGroups.Add(rg);
+}
+
+// Parses the body of a @PG line (tab-separated TAG:VALUE fields) into a
+// SamProgram and adds it to the header's program list.
+//
+// Throws BamException for a field too short to hold a tag and separator,
+// for an unknown tag, or if the ID tag is missing.
+void SamFormatParser::ParsePGLine(const string& line) {
+
+    SamProgram pg;
+
+    // split string into tokens
+    vector<string> tokens = Split(line, Constants::SAM_TAB);
+
+    // iterate over tokens
+    vector<string>::const_iterator tokenIter = tokens.begin();
+    vector<string>::const_iterator tokenEnd  = tokens.end();
+    for ( ; tokenIter != tokenEnd; ++tokenIter ) {
+
+        // a field needs a 2-char tag plus a separator; without this check,
+        // substr(3) below would throw std::out_of_range on a short field
+        const string& token = (*tokenIter);
+        if ( token.length() < 3 ) {
+            const string message = string("malformed PG tag: ") + token;
+            throw BamException("SamFormatParser::ParsePGLine", message);
+        }
+
+        // get token tag/value
+        const string tokenTag   = token.substr(0,2);
+        const string tokenValue = token.substr(3);
+
+        // set program record contents
+        if      ( tokenTag == Constants::SAM_PG_ID_TAG              ) pg.ID                = tokenValue;
+        else if ( tokenTag == Constants::SAM_PG_NAME_TAG            ) pg.Name              = tokenValue;
+        else if ( tokenTag == Constants::SAM_PG_COMMANDLINE_TAG     ) pg.CommandLine       = tokenValue;
+        else if ( tokenTag == Constants::SAM_PG_PREVIOUSPROGRAM_TAG ) pg.PreviousProgramID = tokenValue;
+        else if ( tokenTag == Constants::SAM_PG_VERSION_TAG         ) pg.Version           = tokenValue;
+        else {
+            const string message = string("unknown PG tag: ") + tokenTag;
+            throw BamException("SamFormatParser::ParsePGLine", message);
+        }
+    }
+
+    // check for required tags
+    if ( !pg.HasID() )
+        throw BamException("SamFormatParser::ParsePGLine", "@PG line is missing ID tag");
+
+    // store SAM program entry
+    m_header.Programs.Add(pg);
+}
+
+// Stores the body of a @CO line, unparsed, in the header's comment list.
+void SamFormatParser::ParseCOLine(const string& line)
+{
+    m_header.Comments.push_back(line);
+}
+
+// Splits 'line' on 'delim' and returns the pieces in order of appearance.
+// Empty pieces between consecutive delimiters are kept.
+const vector<string> SamFormatParser::Split(const string& line, const char delim) {
+
+    vector<string> fields;
+    stringstream source(line);
+
+    // getline() hands back one delimited field per call until the text runs out
+    for ( string field; getline(source, field, delim); )
+        fields.push_back(field);
+
+    return fields;
+}
--- /dev/null
+// ***************************************************************************
+// SamFormatParser.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for parsing SAM header text into SamHeader object
+// ***************************************************************************
+
+#ifndef SAM_FORMAT_PARSER_H
+#define SAM_FORMAT_PARSER_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+class SamHeader;
+
+namespace Internal {
+
+class SamFormatParser {
+
+    // construction & destruction
+    public:
+        SamFormatParser(BamTools::SamHeader& header);
+        ~SamFormatParser();
+
+    // public interface
+    public:
+        // parses SAM header text; parsed data is stored in the SamHeader
+        // supplied at construction
+        void Parse(const std::string& headerText);
+
+    // implementation helpers
+    private:
+        // per-line handlers (presumably one per SAM header record type:
+        // @HD, @SQ, @RG, @PG, @CO)
+        void ParseSamLine(const std::string& line);
+        void ParseHDLine(const std::string& line);
+        void ParseSQLine(const std::string& line);
+        void ParseRGLine(const std::string& line);
+        void ParsePGLine(const std::string& line);
+        void ParseCOLine(const std::string& line);
+
+        // splits 'line' into tokens separated by 'delim'
+        const std::vector<std::string> Split(const std::string& line, const char delim);
+
+    // data
+    private:
+        // header object being populated (not owned)
+        SamHeader& m_header;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // SAM_FORMAT_PARSER_H
--- /dev/null
+// ***************************************************************************
+// SamFormatPrinter.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for printing formatted SAM header to string
+// ***************************************************************************
+
+#include "api/SamConstants.h"
+#include "api/SamHeader.h"
+#include "api/internal/sam/SamFormatPrinter_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <iostream>
+#include <sstream>
+#include <vector>
+using namespace std;
+
+// ------------------------
+// static utility methods
+// ------------------------
+
+// Builds one header field: <TAB><tag><COLON><value>
+static inline
+const string FormatTag(const string& tag, const string& value) {
+    string field;
+    field += Constants::SAM_TAB;
+    field += tag;
+    field += Constants::SAM_COLON;
+    field += value;
+    return field;
+}
+
+// ---------------------------------
+// SamFormatPrinter implementation
+// ---------------------------------
+
+// Keeps a reference to the header that will be printed; no copy is made.
+SamFormatPrinter::SamFormatPrinter(const SamHeader& header) : m_header(header) {}
+
+// Nothing to release: the printer only holds a reference to the header.
+SamFormatPrinter::~SamFormatPrinter() {}
+
+// Renders the whole header as SAM text.
+// Record types are written in a fixed order: HD, SQ, RG, PG, CO.
+const string SamFormatPrinter::ToString(void) const {
+
+    // start from an empty buffer
+    stringstream headerText;
+
+    // append each record type in turn
+    PrintHD(headerText);
+    PrintSQ(headerText);
+    PrintRG(headerText);
+    PrintPG(headerText);
+    PrintCO(headerText);
+
+    // hand back the accumulated text
+    const string result = headerText.str();
+    return result;
+}
+
+// Writes the @HD line to 'out'. Nothing is written if the header has no version.
+void SamFormatPrinter::PrintHD(std::stringstream& out) const {
+
+    // no version, no @HD line
+    if ( !m_header.HasVersion() )
+        return;
+
+    // @HD VN:<Version>
+    out << Constants::SAM_HD_BEGIN_TOKEN;
+    out << FormatTag(Constants::SAM_HD_VERSION_TAG, m_header.Version);
+
+    // optional SO:<SortOrder>
+    if ( m_header.HasSortOrder() ) {
+        out << FormatTag(Constants::SAM_HD_SORTORDER_TAG, m_header.SortOrder);
+    }
+
+    // optional GO:<GroupOrder>
+    if ( m_header.HasGroupOrder() ) {
+        out << FormatTag(Constants::SAM_HD_GROUPORDER_TAG, m_header.GroupOrder);
+    }
+
+    // terminate the line
+    out << endl;
+}
+
+// Writes one @SQ line to 'out' for every entry in the header's sequence dictionary.
+// SN and LN are always written; AS, M5, SP and UR only when present.
+void SamFormatPrinter::PrintSQ(std::stringstream& out) const {
+
+    for ( SamSequenceConstIterator entry = m_header.Sequences.ConstBegin(),
+                                   stop  = m_header.Sequences.ConstEnd();
+          entry != stop; ++entry )
+    {
+        const SamSequence& sequence = *entry;
+
+        // @SQ SN:<Name> LN:<Length>
+        out << Constants::SAM_SQ_BEGIN_TOKEN;
+        out << FormatTag(Constants::SAM_SQ_NAME_TAG, sequence.Name);
+        out << FormatTag(Constants::SAM_SQ_LENGTH_TAG, sequence.Length);
+
+        // optional AS:<AssemblyID>
+        if ( sequence.HasAssemblyID() ) {
+            out << FormatTag(Constants::SAM_SQ_ASSEMBLYID_TAG, sequence.AssemblyID);
+        }
+
+        // optional M5:<Checksum>
+        if ( sequence.HasChecksum() ) {
+            out << FormatTag(Constants::SAM_SQ_CHECKSUM_TAG, sequence.Checksum);
+        }
+
+        // optional SP:<Species>
+        if ( sequence.HasSpecies() ) {
+            out << FormatTag(Constants::SAM_SQ_SPECIES_TAG, sequence.Species);
+        }
+
+        // optional UR:<URI>
+        if ( sequence.HasURI() ) {
+            out << FormatTag(Constants::SAM_SQ_URI_TAG, sequence.URI);
+        }
+
+        // terminate the line
+        out << endl;
+    }
+}
+
+// Writes one @RG line to 'out' for every entry in the header's read group dictionary.
+// ID is always written; every other tag only when the read group has it.
+void SamFormatPrinter::PrintRG(std::stringstream& out) const {
+
+    for ( SamReadGroupConstIterator entry = m_header.ReadGroups.ConstBegin(),
+                                    stop  = m_header.ReadGroups.ConstEnd();
+          entry != stop; ++entry )
+    {
+        const SamReadGroup& group = *entry;
+
+        // @RG ID:<ID>
+        out << Constants::SAM_RG_BEGIN_TOKEN;
+        out << FormatTag(Constants::SAM_RG_ID_TAG, group.ID);
+
+        // optional CN:<SequencingCenter>
+        if ( group.HasSequencingCenter() ) {
+            out << FormatTag(Constants::SAM_RG_SEQCENTER_TAG, group.SequencingCenter);
+        }
+
+        // optional DS:<Description>
+        if ( group.HasDescription() ) {
+            out << FormatTag(Constants::SAM_RG_DESCRIPTION_TAG, group.Description);
+        }
+
+        // optional DT:<ProductionDate>
+        if ( group.HasProductionDate() ) {
+            out << FormatTag(Constants::SAM_RG_PRODUCTIONDATE_TAG, group.ProductionDate);
+        }
+
+        // optional FO:<FlowOrder>
+        if ( group.HasFlowOrder() ) {
+            out << FormatTag(Constants::SAM_RG_FLOWORDER_TAG, group.FlowOrder);
+        }
+
+        // optional KS:<KeySequence>
+        if ( group.HasKeySequence() ) {
+            out << FormatTag(Constants::SAM_RG_KEYSEQUENCE_TAG, group.KeySequence);
+        }
+
+        // optional LB:<Library>
+        if ( group.HasLibrary() ) {
+            out << FormatTag(Constants::SAM_RG_LIBRARY_TAG, group.Library);
+        }
+
+        // optional PG:<Program>
+        if ( group.HasProgram() ) {
+            out << FormatTag(Constants::SAM_RG_PROGRAM_TAG, group.Program);
+        }
+
+        // optional PI:<PredictedInsertSize>
+        if ( group.HasPredictedInsertSize() ) {
+            out << FormatTag(Constants::SAM_RG_PREDICTEDINSERTSIZE_TAG, group.PredictedInsertSize);
+        }
+
+        // optional PL:<SequencingTechnology>
+        if ( group.HasSequencingTechnology() ) {
+            out << FormatTag(Constants::SAM_RG_SEQTECHNOLOGY_TAG, group.SequencingTechnology);
+        }
+
+        // optional PU:<PlatformUnit>
+        if ( group.HasPlatformUnit() ) {
+            out << FormatTag(Constants::SAM_RG_PLATFORMUNIT_TAG, group.PlatformUnit);
+        }
+
+        // optional SM:<Sample>
+        if ( group.HasSample() ) {
+            out << FormatTag(Constants::SAM_RG_SAMPLE_TAG, group.Sample);
+        }
+
+        // terminate the line
+        out << endl;
+    }
+}
+
+// Writes one @PG line to 'out' for every program record in the header.
+// ID is always written; PN, CL, PP and VN only when present.
+void SamFormatPrinter::PrintPG(std::stringstream& out) const {
+
+    for ( SamProgramConstIterator entry = m_header.Programs.ConstBegin(),
+                                  stop  = m_header.Programs.ConstEnd();
+          entry != stop; ++entry )
+    {
+        const SamProgram& program = *entry;
+
+        // @PG ID:<ID>
+        out << Constants::SAM_PG_BEGIN_TOKEN;
+        out << FormatTag(Constants::SAM_PG_ID_TAG, program.ID);
+
+        // optional PN:<Name>
+        if ( program.HasName() ) {
+            out << FormatTag(Constants::SAM_PG_NAME_TAG, program.Name);
+        }
+
+        // optional CL:<CommandLine>
+        if ( program.HasCommandLine() ) {
+            out << FormatTag(Constants::SAM_PG_COMMANDLINE_TAG, program.CommandLine);
+        }
+
+        // optional PP:<PreviousProgramID>
+        if ( program.HasPreviousProgramID() ) {
+            out << FormatTag(Constants::SAM_PG_PREVIOUSPROGRAM_TAG, program.PreviousProgramID);
+        }
+
+        // optional VN:<Version>
+        if ( program.HasVersion() ) {
+            out << FormatTag(Constants::SAM_PG_VERSION_TAG, program.Version);
+        }
+
+        // terminate the line
+        out << endl;
+    }
+}
+
+// Writes one @CO line to 'out' for every comment stored in the header.
+void SamFormatPrinter::PrintCO(std::stringstream& out) const {
+
+    const vector<string>& comments = m_header.Comments;
+    for ( vector<string>::const_iterator comment = comments.begin(); comment != comments.end(); ++comment ) {
+
+        // @CO <Comment>
+        out << Constants::SAM_CO_BEGIN_TOKEN;
+        out << Constants::SAM_TAB;
+        out << *comment;
+        out << endl;
+    }
+}
--- /dev/null
+// ***************************************************************************
+// SamFormatPrinter.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 6 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for printing formatted SAM header to string
+// ***************************************************************************
+
+#ifndef SAM_FORMAT_PRINTER_H
+#define SAM_FORMAT_PRINTER_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <sstream>
+#include <string>
+
+namespace BamTools {
+
+class SamHeader;
+
+namespace Internal {
+
+class SamFormatPrinter {
+
+    // construction & destruction
+    public:
+        SamFormatPrinter(const BamTools::SamHeader& header);
+        ~SamFormatPrinter();
+
+    // public interface
+    public:
+        // returns the header data rendered as SAM-formatted text
+        const std::string ToString() const;
+
+    // implementation helpers
+    private:
+        // each helper appends the lines for one record type to 'out'
+        void PrintHD(std::stringstream& out) const;
+        void PrintSQ(std::stringstream& out) const;
+        void PrintRG(std::stringstream& out) const;
+        void PrintPG(std::stringstream& out) const;
+        void PrintCO(std::stringstream& out) const;
+
+    // data
+    private:
+        // header being printed (not owned, never modified)
+        const SamHeader& m_header;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // SAM_FORMAT_PRINTER_H
--- /dev/null
+// ***************************************************************************
+// SamHeaderValidator.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for validating SamHeader data
+// ***************************************************************************
+
+#include "api/SamConstants.h"
+#include "api/SamHeader.h"
+#include "api/internal/sam/SamHeaderValidator_p.h"
+#include "api/internal/sam/SamHeaderVersion_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cctype>
+#include <set>
+#include <sstream>
+using namespace std;
+
+// ------------------------
+// static utility methods
+// -------------------------
+
+// Returns true if 'lhs' and 'rhs' have the same length and are equal
+// character-by-character after a *basic* toupper() conversion.
+static
+bool caseInsensitiveCompare(const std::string& lhs, const std::string& rhs) {
+
+    // can omit checking chars if lengths not equal
+    const std::string::size_type length = lhs.length();
+    if ( length != rhs.length() )
+        return false;
+
+    // toupper() is only defined for values representable as unsigned char
+    // (or EOF), so go through unsigned char before widening to int;
+    // a plain (int) cast is negative for bytes >= 0x80 when char is signed
+    for ( std::string::size_type i = 0; i < length; ++i ) {
+        const int lhsUpper = std::toupper( static_cast<unsigned char>(lhs[i]) );
+        const int rhsUpper = std::toupper( static_cast<unsigned char>(rhs[i]) );
+        if ( lhsUpper != rhsUpper )
+            return false;
+    }
+
+    // otherwise OK
+    return true;
+}
+
+// ------------------------------------------------------------------------
+// Allow validation rules to vary, as needed, between SAM header versions
+//
+// use SAM_VERSION_X_Y to tag important changes
+//
+// Together with the SamHeaderVersion comparison operators, these tags allow
+// for comparisons like:
+//     if ( m_version < SAM_VERSION_2_0 ) {
+//         // use some older rule
+//     } else {
+//         // use rule introduced with version 2.0
+//     }
+
+// one tag per SAM header version
+static const SamHeaderVersion SAM_VERSION_1_0(1, 0);
+static const SamHeaderVersion SAM_VERSION_1_1(1, 1);
+static const SamHeaderVersion SAM_VERSION_1_2(1, 2);
+static const SamHeaderVersion SAM_VERSION_1_3(1, 3);
+static const SamHeaderVersion SAM_VERSION_1_4(1, 4);
+
+// TODO: This functionality is currently unused.
+// Make validation "version-aware."
+//
+// ------------------------------------------------------------------------
+
+// text placed before / after every stored validation message
+const string SamHeaderValidator::ERROR_PREFIX("ERROR: ");
+const string SamHeaderValidator::WARN_PREFIX("WARNING: ");
+const string SamHeaderValidator::NEWLINE("\n");
+
+// Keeps a reference to the header to be validated; no copy is made.
+SamHeaderValidator::SamHeaderValidator(const SamHeader& header) : m_header(header) {}
+
+// Nothing to release explicitly.
+SamHeaderValidator::~SamHeaderValidator() {}
+
+// Stores 'message' as an error, wrapped as ERROR_PREFIX + message + NEWLINE.
+void SamHeaderValidator::AddError(const string& message) {
+    string entry = ERROR_PREFIX;
+    entry += message;
+    entry += NEWLINE;
+    m_errorMessages.push_back(entry);
+}
+
+// Stores 'message' as a warning, wrapped as WARN_PREFIX + message + NEWLINE.
+void SamHeaderValidator::AddWarning(const string& message) {
+    string entry = WARN_PREFIX;
+    entry += message;
+    entry += NEWLINE;
+    m_warningMessages.push_back(entry);
+}
+
+// Writes a summary line followed by every stored error message to 'stream'.
+// Writes nothing at all when there are no errors.
+void SamHeaderValidator::PrintErrorMessages(ostream& stream) {
+
+    const size_t errorCount = m_errorMessages.size();
+    if ( errorCount == 0 )
+        return;
+
+    // summary line
+    stream << "* SAM header has " << errorCount << " errors:" << endl;
+
+    // messages already end with NEWLINE, so no separator is added here
+    for ( size_t i = 0; i < errorCount; ++i )
+        stream << m_errorMessages[i];
+}
+
+// Writes all stored messages to 'stream': errors first, then warnings.
+void SamHeaderValidator::PrintMessages(ostream& stream) {
+    // errors
+    PrintErrorMessages(stream);
+    // warnings
+    PrintWarningMessages(stream);
+}
+
+// Writes a summary line followed by every stored warning message to 'stream'.
+// Writes nothing at all when there are no warnings.
+void SamHeaderValidator::PrintWarningMessages(ostream& stream) {
+
+    const size_t warningCount = m_warningMessages.size();
+    if ( warningCount == 0 )
+        return;
+
+    // summary line
+    stream << "* SAM header has " << warningCount << " warnings:" << endl;
+
+    // messages already end with NEWLINE, so no separator is added here
+    for ( size_t i = 0; i < warningCount; ++i )
+        stream << m_warningMessages[i];
+}
+
+// Validation entry point.
+// Every stage is always run (no short-circuiting), so that all problems are
+// recorded; returns true only if every stage passed.
+bool SamHeaderValidator::Validate(void) {
+    const bool metadataOk   = ValidateMetadata();
+    const bool sequencesOk  = ValidateSequenceDictionary();
+    const bool readGroupsOk = ValidateReadGroupDictionary();
+    const bool programsOk   = ValidateProgramChain();
+    return metadataOk && sequencesOk && readGroupsOk && programsOk;
+}
+
+// Checks the header 'metadata': version, sort order and group order.
+// All three checks are always run; returns true only if all passed.
+bool SamHeaderValidator::ValidateMetadata(void) {
+    const bool versionOk    = ValidateVersion();
+    const bool sortOrderOk  = ValidateSortOrder();
+    const bool groupOrderOk = ValidateGroupOrder();
+    return versionOk && sortOrderOk && groupOrderOk;
+}
+
+// Checks the format of the header version (VN).
+// A missing version only produces a warning. A version that is present must
+// look like <digits><period><digits>, otherwise a single error is recorded.
+bool SamHeaderValidator::ValidateVersion(void) {
+
+    const string& version = m_header.Version;
+
+    // missing version: warn, but still valid
+    if ( version.empty() ) {
+        AddWarning("Version (VN) missing. Not required, but strongly recommended");
+        return true;
+    }
+
+    // a period must separate the major & minor parts, and both parts
+    // must be non-empty and made up of digits only
+    bool wellFormed = false;
+    const size_t periodFound = version.find(Constants::SAM_PERIOD);
+    if ( periodFound != string::npos ) {
+        const string majorVersion = version.substr(0, periodFound);
+        const string minorVersion = version.substr(periodFound + 1);
+        wellFormed = !majorVersion.empty() && ContainsOnlyDigits(majorVersion) &&
+                     !minorVersion.empty() && ContainsOnlyDigits(minorVersion);
+    }
+
+    if ( !wellFormed ) {
+        AddError("Invalid version (VN) format: " + version);
+        return false;
+    }
+
+    // TODO: check if version is not just syntactically OK,
+    // but is also a valid SAM version ( 1.0 .. CURRENT )
+
+    // syntactically OK
+    return true;
+}
+
+// Returns true if no character of 's' lies outside Constants::SAM_DIGITS.
+// Callers are expected to pass a non-empty string (an empty one yields true).
+bool SamHeaderValidator::ContainsOnlyDigits(const string& s) {
+    return s.find_first_not_of(Constants::SAM_DIGITS) == string::npos;
+}
+
+// Checks the header sort order (SO).
+// A missing sort order only produces a warning; a present one must be one of
+// the coordinate / queryname / unsorted keywords.
+bool SamHeaderValidator::ValidateSortOrder(void) {
+
+    const string& sortOrder = m_header.SortOrder;
+
+    // missing sort order: warn, but still valid
+    if ( sortOrder.empty() ) {
+        AddWarning("Sort order (SO) missing. Not required, but strongly recommended");
+        return true;
+    }
+
+    // compare against the accepted keywords
+    const bool isKnownKeyword = ( sortOrder == Constants::SAM_HD_SORTORDER_COORDINATE ) ||
+                                ( sortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME  ) ||
+                                ( sortOrder == Constants::SAM_HD_SORTORDER_UNSORTED   );
+    if ( !isKnownKeyword ) {
+        AddError("Invalid sort order (SO): " + sortOrder);
+        return false;
+    }
+
+    return true;
+}
+
+// Checks the header group order (GO).
+// A missing group order is fine; a present one must be one of the
+// none / query / reference keywords.
+bool SamHeaderValidator::ValidateGroupOrder(void) {
+
+    const string& groupOrder = m_header.GroupOrder;
+
+    // tag is optional
+    if ( groupOrder.empty() )
+        return true;
+
+    // compare against the accepted keywords
+    const bool isKnownKeyword = ( groupOrder == Constants::SAM_HD_GROUPORDER_NONE      ) ||
+                                ( groupOrder == Constants::SAM_HD_GROUPORDER_QUERY     ) ||
+                                ( groupOrder == Constants::SAM_HD_GROUPORDER_REFERENCE );
+    if ( !isKnownKeyword ) {
+        AddError("Invalid group order (GO): " + groupOrder);
+        return false;
+    }
+
+    return true;
+}
+
+// Checks the sequence dictionary: names must be unique, and every entry must
+// pass ValidateSequence(). All entries are checked even after a failure.
+bool SamHeaderValidator::ValidateSequenceDictionary(void) {
+
+    // uniqueness of names across the whole dictionary
+    bool allValid = ContainsUniqueSequenceNames();
+
+    // per-entry checks
+    const SamSequenceDictionary& dictionary = m_header.Sequences;
+    for ( SamSequenceConstIterator entry = dictionary.ConstBegin(), stop = dictionary.ConstEnd();
+          entry != stop; ++entry )
+    {
+        if ( !ValidateSequence(*entry) )
+            allValid = false;
+    }
+
+    return allValid;
+}
+
+// Checks that no sequence name (SN) occurs more than once.
+// Records one error for every repeated occurrence.
+bool SamHeaderValidator::ContainsUniqueSequenceNames(void) {
+
+    set<string> seenNames;
+    bool allUnique = true;
+
+    const SamSequenceDictionary& dictionary = m_header.Sequences;
+    for ( SamSequenceConstIterator entry = dictionary.ConstBegin(), stop = dictionary.ConstEnd();
+          entry != stop; ++entry )
+    {
+        const string& name = (*entry).Name;
+
+        // insert() reports (via .second) whether the name was new;
+        // if it was not, this entry is a duplicate
+        if ( !seenNames.insert(name).second ) {
+            AddError("Sequence name (SN): " + name + " is not unique");
+            allUnique = false;
+        }
+    }
+
+    return allUnique;
+}
+
+// Checks a single sequence entry: name format and length range.
+// Both checks are always run; returns true only if both passed.
+bool SamHeaderValidator::ValidateSequence(const SamSequence& seq) {
+    const bool nameOk   = CheckNameFormat(seq.Name);
+    const bool lengthOk = CheckLengthInRange(seq.Length);
+    return nameOk && lengthOk;
+}
+
+// Checks that a sequence name (SN) is present and does not begin with one of
+// the reserved characters (Constants::SAM_EQUAL, Constants::SAM_STAR).
+bool SamHeaderValidator::CheckNameFormat(const string& name) {
+
+    // name is required
+    if ( name.empty() ) {
+        AddError("Sequence entry (@SQ) is missing SN tag");
+        return false;
+    }
+
+    // first character must not be reserved
+    const char leading = name[0];
+    const bool startsWithReserved = ( leading == Constants::SAM_EQUAL ) ||
+                                    ( leading == Constants::SAM_STAR  );
+    if ( startsWithReserved ) {
+        AddError("Invalid sequence name (SN): " + name);
+        return false;
+    }
+
+    return true;
+}
+
+// Checks that a sequence length (LN) is present, numeric, and within
+// [Constants::SAM_SQ_LENGTH_MIN, Constants::SAM_SQ_LENGTH_MAX].
+// Records an error and returns false otherwise.
+bool SamHeaderValidator::CheckLengthInRange(const string& length) {
+
+    // invalid if empty
+    if ( length.empty() ) {
+        AddError("Sequence entry (@SQ) is missing LN tag");
+        return false;
+    }
+
+    // convert string length to numeric
+    // (initialized, and the extraction result is checked, so that a
+    //  non-numeric LN can never be compared as an indeterminate value)
+    unsigned int sequenceLength = 0;
+    stringstream lengthStream(length);
+    const bool parsed = !(lengthStream >> sequenceLength).fail();
+
+    // invalid if not a number, or if length outside accepted range
+    if ( !parsed ||
+         sequenceLength < Constants::SAM_SQ_LENGTH_MIN ||
+         sequenceLength > Constants::SAM_SQ_LENGTH_MAX )
+    {
+        AddError("Sequence length (LN): " + length + " out of range");
+        return false;
+    }
+
+    // otherwise OK
+    return true;
+}
+
+// Checks the read group dictionary: IDs & platform units must be unique, and
+// every entry must pass ValidateReadGroup(). All entries are checked even
+// after a failure.
+bool SamHeaderValidator::ValidateReadGroupDictionary(void) {
+
+    // uniqueness across the whole dictionary
+    bool allValid = ContainsUniqueIDsAndPlatformUnits();
+
+    // per-entry checks
+    const SamReadGroupDictionary& dictionary = m_header.ReadGroups;
+    for ( SamReadGroupConstIterator entry = dictionary.ConstBegin(), stop = dictionary.ConstEnd();
+          entry != stop; ++entry )
+    {
+        if ( !ValidateReadGroup(*entry) )
+            allValid = false;
+    }
+
+    return allValid;
+}
+
+// Checks that no read group ID (ID) and no platform unit (PU) occurs more
+// than once. Records one error for every repeated occurrence.
+// Read groups without a platform unit are skipped for the PU check, so that
+// several read groups lacking the tag are not reported as duplicates of
+// each other.
+bool SamHeaderValidator::ContainsUniqueIDsAndPlatformUnits(void) {
+
+    bool isValid = true;
+    set<string> readGroupIds;
+    set<string> platformUnits;
+
+    // iterate over read groups
+    const SamReadGroupDictionary& readGroups = m_header.ReadGroups;
+    SamReadGroupConstIterator rgIter = readGroups.ConstBegin();
+    SamReadGroupConstIterator rgEnd  = readGroups.ConstEnd();
+    for ( ; rgIter != rgEnd; ++rgIter ) {
+        const SamReadGroup& rg = (*rgIter);
+
+        // --------------------------------
+        // check for unique ID
+
+        // insert() reports (via .second) whether the ID was new;
+        // error if it was already stored (duplicate entry)
+        const string& id = rg.ID;
+        if ( !readGroupIds.insert(id).second ) {
+            AddError("Read group ID (ID): " + id + " is not unique");
+            isValid = false;
+        }
+
+        // --------------------------------
+        // check for unique platform unit
+
+        // nothing to compare if this read group has no platform unit
+        const string& pu = rg.PlatformUnit;
+        if ( pu.empty() )
+            continue;
+
+        // error if it was already stored (duplicate entry)
+        if ( !platformUnits.insert(pu).second ) {
+            AddError("Platform unit (PU): " + pu + " is not unique");
+            isValid = false;
+        }
+    }
+
+    // return validation state
+    return isValid;
+}
+
+// Checks a single read group entry: ID presence and sequencing technology.
+// Both checks are always run; returns true only if both passed.
+bool SamHeaderValidator::ValidateReadGroup(const SamReadGroup& rg) {
+    const bool idOk         = CheckReadGroupID(rg.ID);
+    const bool technologyOk = CheckSequencingTechnology(rg.SequencingTechnology);
+    return idOk && technologyOk;
+}
+
+// Checks that a read group ID is present; records an error if it is empty.
+bool SamHeaderValidator::CheckReadGroupID(const string& id) {
+
+    const bool hasId = !id.empty();
+    if ( !hasId )
+        AddError("Read group entry (@RG) is missing ID tag");
+    return hasId;
+}
+
+// Checks that a read group's sequencing technology (PL), when present,
+// matches one of the accepted keywords (compared case-insensitively).
+bool SamHeaderValidator::CheckSequencingTechnology(const string& technology) {
+
+    // tag is optional
+    if ( technology.empty() )
+        return true;
+
+    // accepted keywords
+    const string acceptedKeywords[] = {
+        Constants::SAM_RG_SEQTECHNOLOGY_CAPILLARY,
+        Constants::SAM_RG_SEQTECHNOLOGY_HELICOS,
+        Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA,
+        Constants::SAM_RG_SEQTECHNOLOGY_IONTORRENT,
+        Constants::SAM_RG_SEQTECHNOLOGY_LS454,
+        Constants::SAM_RG_SEQTECHNOLOGY_PACBIO,
+        Constants::SAM_RG_SEQTECHNOLOGY_SOLID
+    };
+    const size_t keywordCount = sizeof(acceptedKeywords) / sizeof(acceptedKeywords[0]);
+
+    // OK as soon as any keyword matches
+    for ( size_t i = 0; i < keywordCount; ++i ) {
+        if ( caseInsensitiveCompare(technology, acceptedKeywords[i]) )
+            return true;
+    }
+
+    // no keyword matched
+    AddError("Invalid read group sequencing platform (PL): " + technology);
+    return false;
+}
+
+// Checks the program "chain": IDs must be unique and every PP tag must refer
+// to a known ID. Both checks are always run; returns true only if both passed.
+bool SamHeaderValidator::ValidateProgramChain(void) {
+    const bool idsUnique     = ContainsUniqueProgramIds();
+    const bool previousIdsOk = ValidatePreviousProgramIds();
+    return idsUnique && previousIdsOk;
+}
+
+// Checks that no program ID (ID) occurs more than once.
+// Records one error for every repeated occurrence.
+bool SamHeaderValidator::ContainsUniqueProgramIds(void) {
+
+    set<string> seenIds;
+    bool allUnique = true;
+
+    const SamProgramChain& chain = m_header.Programs;
+    for ( SamProgramConstIterator entry = chain.ConstBegin(), stop = chain.ConstEnd();
+          entry != stop; ++entry )
+    {
+        const string& pgId = (*entry).ID;
+
+        // insert() reports (via .second) whether the ID was new;
+        // if it was not, this entry is a duplicate
+        if ( !seenIds.insert(pgId).second ) {
+            AddError("Program ID (ID): " + pgId + " is not unique");
+            allUnique = false;
+        }
+    }
+
+    return allUnique;
+}
+
+// Checks that every non-empty PreviousProgramID (PP) names a program record
+// contained in the chain. Records one error per unknown reference.
+bool SamHeaderValidator::ValidatePreviousProgramIds(void) {
+
+    bool allKnown = true;
+
+    const SamProgramChain& chain = m_header.Programs;
+    for ( SamProgramConstIterator entry = chain.ConstBegin(), stop = chain.ConstEnd();
+          entry != stop; ++entry )
+    {
+        const string& ppId = (*entry).PreviousProgramID;
+
+        // only records that actually carry a PP tag are checked
+        if ( !ppId.empty() && !chain.Contains(ppId) ) {
+            AddError("PreviousProgramID (PP): " + ppId + " is not a known ID");
+            allKnown = false;
+        }
+    }
+
+    return allKnown;
+}
--- /dev/null
+// ***************************************************************************
+// SamHeaderValidator.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 6 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for validating SamHeader data
+// ***************************************************************************
+
+#ifndef SAM_HEADER_VALIDATOR_P_H
+#define SAM_HEADER_VALIDATOR_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+class SamHeader;
+class SamReadGroup;
+class SamSequence;
+
+namespace Internal {
+
+class SamHeaderValidator {
+
+    // construction & destruction
+    public:
+        SamHeaderValidator(const SamHeader& header);
+        ~SamHeaderValidator();
+
+    // public interface
+    public:
+
+        // writes stored error messages, then stored warning messages, to 'stream'
+        void PrintMessages(std::ostream& stream);
+
+        // runs all validation steps on the header; returns true if none failed
+        bool Validate();
+
+    // implementation helpers
+    private:
+
+        // header metadata (version, sort order, group order)
+        bool ValidateMetadata();
+        bool ValidateVersion();
+        bool ContainsOnlyDigits(const std::string& s);
+        bool ValidateSortOrder();
+        bool ValidateGroupOrder();
+
+        // sequence dictionary
+        bool ValidateSequenceDictionary();
+        bool ContainsUniqueSequenceNames();
+        bool CheckNameFormat(const std::string& name);
+        bool ValidateSequence(const SamSequence& seq);
+        bool CheckLengthInRange(const std::string& length);
+
+        // read group dictionary
+        bool ValidateReadGroupDictionary();
+        bool ContainsUniqueIDsAndPlatformUnits();
+        bool ValidateReadGroup(const SamReadGroup& rg);
+        bool CheckReadGroupID(const std::string& id);
+        bool CheckSequencingTechnology(const std::string& technology);
+
+        // program records
+        bool ValidateProgramChain();
+        bool ContainsUniqueProgramIds();
+        bool ValidatePreviousProgramIds();
+
+        // message bookkeeping & output
+        void AddError(const std::string& message);
+        void AddWarning(const std::string& message);
+        void PrintErrorMessages(std::ostream& stream);
+        void PrintWarningMessages(std::ostream& stream);
+
+    // data
+    private:
+
+        // header under validation (not owned, never modified)
+        const SamHeader& m_header;
+
+        // text added around each stored message
+        static const std::string ERROR_PREFIX;
+        static const std::string WARN_PREFIX;
+        static const std::string NEWLINE;
+
+        // messages collected so far
+        std::vector<std::string> m_errorMessages;
+        std::vector<std::string> m_warningMessages;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // SAM_HEADER_VALIDATOR_P_H
--- /dev/null
+// ***************************************************************************
+// SamHeaderVersion.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 10 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for comparing SAM header versions
+// *************************************************************************
+
+#ifndef SAM_HEADERVERSION_P_H
+#define SAM_HEADERVERSION_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include "api/SamConstants.h"
+#include <sstream>
+#include <string>
+
+namespace BamTools {
+namespace Internal {
+
+class SamHeaderVersion {
+
+    // construction & destruction
+    public:
+
+        // version 0.0
+        SamHeaderVersion() : m_majorVersion(0), m_minorVersion(0) {}
+
+        // starts at 0.0, then applies SetVersion() to the version text
+        explicit SamHeaderVersion(const std::string& version) : m_majorVersion(0), m_minorVersion(0) {
+            SetVersion(version);
+        }
+
+        // version given directly as numbers
+        SamHeaderVersion(const unsigned int& major, const unsigned int& minor)
+            : m_majorVersion(major), m_minorVersion(minor) {}
+
+        // resets both fields on destruction
+        ~SamHeaderVersion() {
+            m_majorVersion = 0;
+            m_minorVersion = 0;
+        }
+
+    // access data
+    public:
+        unsigned int MajorVersion() const { return m_majorVersion; }
+        unsigned int MinorVersion() const { return m_minorVersion; }
+
+        void SetVersion(const std::string& version);
+        std::string ToString() const;
+
+    // data
+    private:
+        unsigned int m_majorVersion;
+        unsigned int m_minorVersion;
+};
+
+// Parses version text of the form <major><period><minor>.
+// Nothing is changed if 'version' is empty or contains no period.
+// The major and minor parts are handled independently: each one is stored
+// only if it is non-empty and consists of digits only; otherwise the
+// corresponding field keeps its current value.
+inline
+void SamHeaderVersion::SetVersion(const std::string& version) {
+
+    // do nothing if version is empty
+    if ( version.empty() )
+        return;
+
+    // do nothing if period not found
+    const size_t periodFound = version.find(Constants::SAM_PERIOD);
+    if ( periodFound == std::string::npos )
+        return;
+
+    // store major version if non-empty and contains only digits
+    const std::string majorVersion = version.substr(0, periodFound);
+    if ( !majorVersion.empty() &&
+         majorVersion.find_first_not_of(Constants::SAM_DIGITS) == std::string::npos )
+    {
+        std::stringstream majorStream(majorVersion);
+        majorStream >> m_majorVersion;
+    }
+
+    // store minor version if non-empty and contains only digits
+    // (a separate stream is used on purpose: reading the major part runs the
+    //  stream to end-of-input, and str() does not reset the EOF state, so a
+    //  shared stream would refuse the second extraction)
+    const std::string minorVersion = version.substr(periodFound + 1);
+    if ( !minorVersion.empty() &&
+         minorVersion.find_first_not_of(Constants::SAM_DIGITS) == std::string::npos )
+    {
+        std::stringstream minorStream(minorVersion);
+        minorStream >> m_minorVersion;
+    }
+}
+
+// -----------------------------------------------------
+// printing
+
+// Returns the version as text: <major><period><minor>
+inline std::string SamHeaderVersion::ToString(void) const {
+    std::stringstream text;
+    text << m_majorVersion;
+    text << Constants::SAM_PERIOD;
+    text << m_minorVersion;
+    return text.str();
+}
+
+// -----------------------------------------------------
+// comparison operators
+
+// equal when both the major and the minor numbers match
+inline bool operator==(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) {
+    if ( lhs.MajorVersion() != rhs.MajorVersion() )
+        return false;
+    return lhs.MinorVersion() == rhs.MinorVersion();
+}
+
+// ordered by major number first; minor number breaks ties
+inline bool operator<(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) {
+    if ( lhs.MajorVersion() != rhs.MajorVersion() )
+        return lhs.MajorVersion() < rhs.MajorVersion();
+    return lhs.MinorVersion() < rhs.MinorVersion();
+}
+
+// remaining comparisons are all expressed through operator<
+inline bool operator> (const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) { return (rhs < lhs);  }
+inline bool operator<=(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) { return !(rhs < lhs); }
+inline bool operator>=(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) { return !(lhs < rhs); }
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // SAM_HEADERVERSION_P_H
--- /dev/null
+// ***************************************************************************
+// BamException_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides a basic exception class for BamTools internals
+// ***************************************************************************
+
+#include "api/internal/utils/BamException_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+using namespace std;
+
+// text placed between the 'where' and 'message' parts of an error string
+const string BamException::SEPARATOR(": ");
--- /dev/null
+// ***************************************************************************
+// BamException_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 6 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides a basic exception class for BamTools internals
+// ***************************************************************************
+
+#ifndef BAMEXCEPTION_P_H
+#define BAMEXCEPTION_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <exception>
+#include <string>
+
+namespace BamTools {
+namespace Internal {
+
+class BamException : public std::exception {
+
+    public:
+        // error text is assembled as: where + SEPARATOR + message
+        BamException(const std::string& where, const std::string& message)
+            : std::exception()
+            , m_errorString(std::string(where).append(SEPARATOR).append(message))
+        { }
+
+        ~BamException() throw() { }
+
+        // returns the assembled error text; the pointer refers to storage
+        // held by this exception object
+        const char* what() const throw() {
+            return m_errorString.c_str();
+        }
+
+    private:
+        std::string m_errorString;
+        static const std::string SEPARATOR;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMEXCEPTION_P_H