From a15dba1bdfe5a1a61e175cb18b1e2694cfcd1746 Mon Sep 17 00:00:00 2001 From: derek Date: Tue, 5 Apr 2011 12:43:31 -0400 Subject: [PATCH] Major performance boost to startup & random-access - especially for the use cases involving multiple (hundreds) of BAMs with BAI index files. * This did require some changes to the BamIndex interface. I doubt man y people are writing custom index format classes, but if you are one of them and have any problems, feel free to contact me with questions. --- src/api/BamConstants.h | 18 +- src/api/BamIndex.cpp | 190 --- src/api/BamIndex.h | 86 +- src/api/CMakeLists.txt | 1 - src/api/internal/BamIndexFactory_p.cpp | 27 +- src/api/internal/BamIndexFactory_p.h | 8 +- src/api/internal/BamMultiReader_p.cpp | 8 +- .../internal/BamRandomAccessController_p.cpp | 33 +- .../internal/BamRandomAccessController_p.h | 4 +- src/api/internal/BamReader_p.cpp | 15 +- src/api/internal/BamReader_p.h | 15 +- src/api/internal/BamStandardIndex_p.cpp | 1259 +++++++++-------- src/api/internal/BamStandardIndex_p.h | 296 ++-- src/api/internal/BamToolsIndex_p.cpp | 736 +++++----- src/api/internal/BamToolsIndex_p.h | 197 ++- src/api/internal/BgzfStream_p.cpp | 17 +- src/api/internal/BgzfStream_p.h | 6 +- 17 files changed, 1339 insertions(+), 1577 deletions(-) delete mode 100644 src/api/BamIndex.cpp diff --git a/src/api/BamConstants.h b/src/api/BamConstants.h index 6a97980..6a1695a 100644 --- a/src/api/BamConstants.h +++ b/src/api/BamConstants.h @@ -1,10 +1,20 @@ +// *************************************************************************** +// BamConstants.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 5 April 2011 (DB) +// --------------------------------------------------------------------------- +// Provides basic constants for handling BAM files. +// *************************************************************************** + #ifndef BAM_CONSTANTS_H #define BAM_CONSTANTS_H #include /*! \namespace BamTools::Constants - \brief Contains most of the constants used throughout BamTools. + \brief Provides basic constants for handling BAM files. */ namespace BamTools { @@ -43,6 +53,8 @@ const int BAM_CIGAR_SOFTCLIP = 4; const int BAM_CIGAR_HARDCLIP = 5; const int BAM_CIGAR_PAD = 6; +// TODO: Add support for 'X' and '=' ops + const char BAM_CIGAR_MATCH_CHAR = 'M'; const char BAM_CIGAR_INS_CHAR = 'I'; const char BAM_CIGAR_DEL_CHAR = 'D'; @@ -103,8 +115,8 @@ const int Z_DEFAULT_MEM_LEVEL = 8; // BZGF constants const int BGZF_BLOCK_HEADER_LENGTH = 18; const int BGZF_BLOCK_FOOTER_LENGTH = 8; -const int BGZF_MAX_BLOCK_SIZE = 65536; -const int BGZF_DEFAULT_BLOCK_SIZE = 65536; +const int BGZF_MAX_BLOCK_SIZE = 262144; +const int BGZF_DEFAULT_BLOCK_SIZE = 262144; } // namespace Constants } // namespace BamTools diff --git a/src/api/BamIndex.cpp b/src/api/BamIndex.cpp deleted file mode 100644 index 3e5f86e..0000000 --- a/src/api/BamIndex.cpp +++ /dev/null @@ -1,190 +0,0 @@ -// *************************************************************************** -// BamIndex.cpp (c) 2009 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 23 March 2011 (DB) -// --------------------------------------------------------------------------- -// Provides index functionality - both for the default (standardized) BAM -// index format (.bai) as well as a BamTools-specific (nonstandard) index -// format (.bti). -// *************************************************************************** - -#include -#include -#include -#include -using namespace BamTools; -using namespace BamTools::Internal; - -#include -#include -#include -#include -#include -using namespace std; - -/*! \class BamTools::BamIndex - \brief Provides methods for generating & loading BAM index files. - - This class straddles the line between public API and internal - implementation detail. Most client code should never have to use this - class directly. - - It is exposed to the public API to allow advanced users to implement - their own custom indexing schemes. - - N.B. - Please note that if you wish to derive your own subclass, you are - entering waters that are not well-documented at the moment and are - likely to be changing soon anyway. Implementing a custom index is - technically do-able at the moment, but the learning curve is (at the - moment) overly steep. Changes will be coming soon to alleviate some - of this headache. -*/ - -// ctor -BamIndex::BamIndex(void) - : m_indexStream(0) - , m_indexFilename("") - , m_cacheMode(BamIndex::LimitedIndexCaching) -{ } - -// dtor -BamIndex::~BamIndex(void) { - if ( IsOpen() ) fclose(m_indexStream); - m_indexFilename = ""; -} - -// return true if FILE* is open -bool BamIndex::IsOpen(void) const { - return ( m_indexStream != 0 ); -} - -// loads existing data from file into memory -bool BamIndex::Load(const string& filename) { - - // open index file, abort on error - if ( !OpenIndexFile(filename, "rb") ) { - fprintf(stderr, "BamIndex ERROR: unable to open the BAM index file %s for reading.\n", filename.c_str()); - return false; - } - - // check magic number - if ( !LoadHeader() ) { - fclose(m_indexStream); - return false; - } - - // load reference data (but only keep in memory if full caching requested) - bool saveInitialLoad = ( m_cacheMode == BamIndex::FullIndexCaching ); - if ( !LoadAllReferences(saveInitialLoad) ) { - fclose(m_indexStream); - return false; - } - - // update index cache based on selected mode - UpdateCache(); - - // return success - return true; -} - -// opens index file for reading/writing, return true if opened OK -bool BamIndex::OpenIndexFile(const string& filename, const string& mode) { - - // attempt to open file, return false if error - m_indexStream = fopen(filename.c_str(), mode.c_str()); - if ( m_indexStream == 0 ) return false; - - // otherwise save filename & return sucess - m_indexFilename = filename; - return true; -} - -// rewind index file to beginning of index data, return true if rewound OK -bool BamIndex::Rewind(void) { - return ( fseek64(m_indexStream, DataBeginOffset(), SEEK_SET) == 0 ); -} - -// change the index caching behavior -void BamIndex::SetCacheMode(const BamIndex::IndexCacheMode& mode) { - if ( mode != m_cacheMode ) { - m_cacheMode = mode; - UpdateCache(); - } -} - -// updates in-memory cache of index data, depending on current cache mode -void BamIndex::UpdateCache(void) { - - // skip if file not open - if ( !IsOpen() ) return; - - // reflect requested cache mode behavior - switch ( m_cacheMode ) { - - case (BamIndex::FullIndexCaching) : - Rewind(); - LoadAllReferences(true); - break; - - case (BamIndex::LimitedIndexCaching) : - if ( HasFullDataCache() ) - KeepOnlyFirstReferenceOffsets(); - else { - ClearAllData(); - SkipToFirstReference(); - LoadFirstReference(true); - } - break; - - case(BamIndex::NoIndexCaching) : - ClearAllData(); - break; - - default : - // unreachable - ; - } -} - -// writes in-memory index data out to file -bool BamIndex::Write(const string& bamFilename) { - - // open index file for writing - string indexFilename = bamFilename + Extension(); - if ( !OpenIndexFile(indexFilename, "wb") ) { - fprintf(stderr, "BamIndex ERROR: could not open file to save index data.\n"); - return false; - } - - // write index header data - if ( !WriteHeader() ) { - fprintf(stderr, "BamIndex ERROR: there was a problem writing index metadata to the new index file.\n"); - fflush(m_indexStream); - fclose(m_indexStream); - exit(1); - } - - // write main index data - if ( !WriteAllReferences() ) { - fprintf(stderr, "BamIndex ERROR: there was a problem writing index data to the new index file.\n"); - fflush(m_indexStream); - fclose(m_indexStream); - exit(1); - } - - // flush any remaining output - fflush(m_indexStream); - fclose(m_indexStream); - - // re-open index file for later reading - if ( !OpenIndexFile(indexFilename, "rb") ) { - fprintf(stderr, "BamIndex ERROR: could not open newly created index file for reading.\n"); - return false; - } - - // save index filename & return success - m_indexFilename = indexFilename; - return true; -} diff --git a/src/api/BamIndex.h b/src/api/BamIndex.h index 5ba1469..00a8f01 100644 --- a/src/api/BamIndex.h +++ b/src/api/BamIndex.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 24 February 2011 (DB) +// Last modified: 5 April 2011 (DB) // --------------------------------------------------------------------------- // Provides basic BAM index interface // *************************************************************************** @@ -13,20 +13,27 @@ #include #include -#include #include -#include namespace BamTools { -class BamReader; - namespace Internal { class BamReaderPrivate; } // namespace Internal -// -------------------------------------------------- -// BamIndex base class +/*! \class BamTools::BamIndex + \brief Provides methods for generating & loading BAM index files. + + This class straddles the line between public API and internal + implementation detail. Most client code should never have to use this + class directly. + + It is exposed to the public API to allow advanced users to implement + their own custom indexing schemes. + + More documentation on methods & enums coming soon. +*/ + class API_EXPORT BamIndex { // enums @@ -44,75 +51,28 @@ class API_EXPORT BamIndex { // ctor & dtor public: - BamIndex(void); - virtual ~BamIndex(void); + BamIndex(Internal::BamReaderPrivate* reader) : m_reader(reader) { } + virtual ~BamIndex(void) { } // index interface public: - // creates index data (in-memory) from @reader data - virtual bool Build(Internal::BamReaderPrivate* reader) =0; - // returns supported file extension - virtual const std::string Extension(void) =0; + // builds index from associated BAM file & writes out to index file + virtual bool Create(void) =0; // creates index file from BAM file // returns whether reference has alignments or no virtual bool HasAlignments(const int& referenceID) const =0; - // attempts to use index data to jump to @region in @reader; returns success/fail + // attempts to use index data to jump to @region, returns success/fail // a "successful" jump indicates no error, but not whether this region has data // * thus, the method sets a flag to indicate whether there are alignments // available after the jump position - virtual bool Jump(Internal::BamReaderPrivate* reader, - const BamTools::BamRegion& region, - bool* hasAlignmentsInRegion) =0; + virtual bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion) =0; // loads existing data from file into memory - virtual bool Load(const std::string& filename); + virtual bool Load(const std::string& filename) =0; // change the index caching behavior - virtual void SetCacheMode(const BamIndex::IndexCacheMode& mode); - // writes in-memory index data out to file - // N.B. - (this is the original BAM filename, method will modify it to use applicable extension) - virtual bool Write(const std::string& bamFilename); - - // derived-classes MUST provide implementation - protected: - // clear all current index offset data in memory - virtual void ClearAllData(void) =0; - // return file position after header metadata - virtual off_t DataBeginOffset(void) const =0; - // return true if all index data is cached - virtual bool HasFullDataCache(void) const =0; - // clears index data from all references except the first - virtual void KeepOnlyFirstReferenceOffsets(void) =0; - // load index data for all references, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - virtual bool LoadAllReferences(bool saveData = true) =0; - // load first reference from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - virtual bool LoadFirstReference(bool saveData = true) =0; - // load header data from index file, return true if loaded OK - virtual bool LoadHeader(void) =0; - // position file pointer to first reference begin, return true if skipped OK - virtual bool SkipToFirstReference(void) =0; - // write index reference data - virtual bool WriteAllReferences(void) =0; - // write index header data - virtual bool WriteHeader(void) =0; - - // internal methods (but available to derived classes) - protected: - // rewind index file to beginning of index data, return true if rewound OK - bool Rewind(void); - - private: - // return true if FILE* is open - bool IsOpen(void) const; - // opens index file according to requested mode, return true if opened OK - bool OpenIndexFile(const std::string& filename, const std::string& mode); - // updates in-memory cache of index data, depending on current cache mode - void UpdateCache(void); + virtual void SetCacheMode(const BamIndex::IndexCacheMode& mode) =0; // data members protected: - FILE* m_indexStream; - std::string m_indexFilename; - BamIndex::IndexCacheMode m_cacheMode; + Internal::BamReaderPrivate* m_reader; // copy, not ownedprivate: }; } // namespace BamTools diff --git a/src/api/CMakeLists.txt b/src/api/CMakeLists.txt index 57efba2..a09d9a1 100644 --- a/src/api/CMakeLists.txt +++ b/src/api/CMakeLists.txt @@ -14,7 +14,6 @@ add_definitions( -DBAMTOOLS_API_LIBRARY ) # (for proper exporting of library sym # list of all BamTools API source (.cpp) files set( BamToolsAPISources BamAlignment.cpp - BamIndex.cpp BamMultiReader.cpp BamReader.cpp BamWriter.cpp diff --git a/src/api/internal/BamIndexFactory_p.cpp b/src/api/internal/BamIndexFactory_p.cpp index dcc11ce..69b372b 100644 --- a/src/api/internal/BamIndexFactory_p.cpp +++ b/src/api/internal/BamIndexFactory_p.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 21 March 2011 (DB) +// Last modified: 5 April 2011 (DB) // --------------------------------------------------------------------------- // Provides interface for generating BamIndex implementations // *************************************************************************** @@ -24,16 +24,16 @@ const string BamIndexFactory::CreateIndexFilename(const string& bamFilename, const BamIndex::IndexType& type) { switch ( type ) { - case ( BamIndex::STANDARD ) : return ( bamFilename + BAI_EXTENSION ); - case ( BamIndex::BAMTOOLS ) : return ( bamFilename + BTI_EXTENSION ); + case ( BamIndex::STANDARD ) : return ( bamFilename + BamStandardIndex::Extension() ); + case ( BamIndex::BAMTOOLS ) : return ( bamFilename + BamToolsIndex::Extension() ); default : - fprintf(stderr, "BamIndexFactory ERROR: unknown index type %u\n", type); + cerr << "BamIndexFactory ERROR: unknown index type" << type << endl; return string(); } } // creates a new BamIndex object, depending on extension of @indexFilename -BamIndex* BamIndexFactory::CreateIndexFromFilename(const string& indexFilename) { +BamIndex* BamIndexFactory::CreateIndexFromFilename(const string& indexFilename, BamReaderPrivate* reader) { // if file doesn't exist, return null index if ( !BamTools::FileExists(indexFilename) ) @@ -46,18 +46,21 @@ BamIndex* BamIndexFactory::CreateIndexFromFilename(const string& indexFilename) return 0; // create index based on extension - if ( extension == BAI_EXTENSION ) return new BamStandardIndex; - else if ( extension == BTI_EXTENSION ) return new BamToolsIndex; - else return 0; + if ( extension == BamStandardIndex::Extension() ) return new BamStandardIndex(reader); + else if ( extension == BamToolsIndex::Extension() ) return new BamToolsIndex(reader); + else + return 0; } // creates a new BamIndex, object of requested @type -BamIndex* BamIndexFactory::CreateIndexOfType(const BamIndex::IndexType& type) { +BamIndex* BamIndexFactory::CreateIndexOfType(const BamIndex::IndexType& type, + BamReaderPrivate* reader) +{ switch ( type ) { - case ( BamIndex::STANDARD ) : return new BamStandardIndex; - case ( BamIndex::BAMTOOLS ) : return new BamToolsIndex; + case ( BamIndex::STANDARD ) : return new BamStandardIndex(reader); + case ( BamIndex::BAMTOOLS ) : return new BamToolsIndex(reader); default : - fprintf(stderr, "BamIndexFactory ERROR: unknown index type %u\n", type); + cerr << "BamIndexFactory ERROR: unknown index type " << type << endl; return 0; } } diff --git a/src/api/internal/BamIndexFactory_p.h b/src/api/internal/BamIndexFactory_p.h index 4ef9585..f060d2c 100644 --- a/src/api/internal/BamIndexFactory_p.h +++ b/src/api/internal/BamIndexFactory_p.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 26 January 2011 (DB) +// Last modified: 5 April 2011 (DB) // --------------------------------------------------------------------------- // Provides interface for generating BamIndex implementations // *************************************************************************** @@ -22,9 +22,11 @@ class BamIndexFactory { // static interface methods public: // creates a new BamIndex object, depending on extension of @indexFilename - static BamIndex* CreateIndexFromFilename(const std::string& indexFilename); + static BamIndex* CreateIndexFromFilename(const std::string& indexFilename, + BamReaderPrivate* reader); // creates a new BamIndex object, of requested @type - static BamIndex* CreateIndexOfType(const BamIndex::IndexType& type); + static BamIndex* CreateIndexOfType(const BamIndex::IndexType& type, + BamReaderPrivate* reader); // returns name of existing index file that corresponds to @bamFilename // will defer to @preferredType if possible // if @preferredType not found, will attempt to load any supported index type diff --git a/src/api/internal/BamMultiReader_p.cpp b/src/api/internal/BamMultiReader_p.cpp index 583085c..e789d06 100644 --- a/src/api/internal/BamMultiReader_p.cpp +++ b/src/api/internal/BamMultiReader_p.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 21 March 2011 (DB) +// Last modified: 5 April 2011 (DB) // --------------------------------------------------------------------------- // Functionality for simultaneously reading multiple BAM files // ************************************************************************* @@ -180,6 +180,8 @@ SamHeader BamMultiReaderPrivate::GetHeader(void) const { // makes a virtual, unified header for all the bam files in the multireader string BamMultiReaderPrivate::GetHeaderText(void) const { + // TODO: merge SamHeader objects instead of parsing string data (again) + // if only one reader is open if ( m_readers.size() == 1 ) { @@ -545,7 +547,7 @@ BamReader* BamMultiReaderPrivate::OpenReader(const std::string& filename) { // reader could not open else { - cerr << "BamMultiReader WARNING: Could not open: " + cerr << "BamMultiReader WARNING: Could not open " << filename << ", ignoring file" << endl; } @@ -646,7 +648,7 @@ bool BamMultiReaderPrivate::SetRegion(const BamRegion& region) { // attempt to set BamReader's region of interest if ( !reader->SetRegion(region) ) { - cerr << "BamMultiReader ERROR: could not jump " << reader->GetFilename() << " to " + cerr << "BamMultiReader WARNING: could not jump " << reader->GetFilename() << " to " << region.LeftRefID << ":" << region.LeftPosition << ".." << region.RightRefID << ":" << region.RightPosition << endl; } diff --git a/src/api/internal/BamRandomAccessController_p.cpp b/src/api/internal/BamRandomAccessController_p.cpp index a785610..89636bb 100644 --- a/src/api/internal/BamRandomAccessController_p.cpp +++ b/src/api/internal/BamRandomAccessController_p.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 21 March 2011(DB) +// Last modified: 5 April 2011(DB) // --------------------------------------------------------------------------- // Manages random access operations in a BAM file // ************************************************************************** @@ -151,27 +151,22 @@ bool BamRandomAccessController::CreateIndex(BamReaderPrivate* reader, return false; // create new index of requested type - BamIndex* newIndex = BamIndexFactory::CreateIndexOfType(type); + BamIndex* newIndex = BamIndexFactory::CreateIndexOfType(type, reader); if ( newIndex == 0 ) { cerr << "BamRandomAccessController ERROR: could not create index of type " << type << endl; return false; } // attempt to build index from current BamReader file - if ( !newIndex->Build(reader) ) { - cerr << "BamRandomAccessController ERROR: could not build index on BAM file: " << reader->Filename() << endl; + if ( !newIndex->Create() ) { + cerr << "BamRandomAccessController ERROR: could not create index for BAM file: " + << reader->Filename() << endl; return false; } // save new index SetIndex(newIndex); - // attempt to write new index file - if ( newIndex->Write(reader->Filename()) ) { - cerr << "BamRandomAccessController ERROR: could not save new index for BAM file: " << reader->Filename() << endl; - return false; - } - // set new index's cache mode & return success newIndex->SetCacheMode(m_indexCacheMode); return true; @@ -189,24 +184,28 @@ bool BamRandomAccessController::IndexHasAlignmentsForReference(const int& refId) return m_index->HasAlignments(refId); } -bool BamRandomAccessController::LocateIndex(const string& bamFilename, +bool BamRandomAccessController::LocateIndex(BamReaderPrivate* reader, const BamIndex::IndexType& preferredType) { // look up index filename, deferring to preferredType if possible - const string& indexFilename = BamIndexFactory::FindIndexFilename(bamFilename, preferredType); + const string& indexFilename = BamIndexFactory::FindIndexFilename(reader->Filename(), preferredType); // if no index file found (of any type) - if ( indexFilename.empty() ) + if ( indexFilename.empty() ) { + cerr << "BamRandomAccessController WARNING: " + << "could not find index file for BAM: " + << reader->Filename() << endl; return false; + } // otherwise open & use index file that was found - return OpenIndex(indexFilename); + return OpenIndex(indexFilename, reader); } -bool BamRandomAccessController::OpenIndex(const string& indexFilename) { +bool BamRandomAccessController::OpenIndex(const string& indexFilename, BamReaderPrivate* reader) { // attempt create new index of type based on filename - BamIndex* index = BamIndexFactory::CreateIndexFromFilename(indexFilename); + BamIndex* index = BamIndexFactory::CreateIndexFromFilename(indexFilename, reader); if ( index == 0 ) { cerr << "BamRandomAccessController ERROR: could not create index for file: " << indexFilename << endl; return false; @@ -270,5 +269,5 @@ bool BamRandomAccessController::SetRegion(BamReaderPrivate* reader, // alignment on a reference. If this occurs, any subsequent calls to GetNextAlignment[Core] // will not return data. BamMultiReader will still be able to successfully pull alignments // from a region from multiple files even if one or more have no data. - return m_index->Jump(reader, m_region, &m_hasAlignmentsInRegion); + return m_index->Jump(m_region, &m_hasAlignmentsInRegion); } diff --git a/src/api/internal/BamRandomAccessController_p.h b/src/api/internal/BamRandomAccessController_p.h index e541cdf..372ea4a 100644 --- a/src/api/internal/BamRandomAccessController_p.h +++ b/src/api/internal/BamRandomAccessController_p.h @@ -56,8 +56,8 @@ class BamRandomAccessController { bool CreateIndex(BamReaderPrivate* reader, const BamIndex::IndexType& type); bool HasIndex(void) const; bool IndexHasAlignmentsForReference(const int& refId); - bool LocateIndex(const std::string& bamFilename, const BamIndex::IndexType& preferredType); - bool OpenIndex(const std::string& indexFilename); + bool LocateIndex(BamReaderPrivate* reader, const BamIndex::IndexType& preferredType); + bool OpenIndex(const std::string& indexFilename, BamReaderPrivate* reader); void SetIndex(BamIndex* index); void SetIndexCacheMode(const BamIndex::IndexCacheMode& mode); diff --git a/src/api/internal/BamReader_p.cpp b/src/api/internal/BamReader_p.cpp index 441e8c0..c2afb4c 100644 --- a/src/api/internal/BamReader_p.cpp +++ b/src/api/internal/BamReader_p.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 21 March 2011 (DB) +// Last modified: 5 April 2011 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for reading BAM files // *************************************************************************** @@ -295,7 +295,7 @@ bool BamReaderPrivate::LoadReferenceData(void) { } bool BamReaderPrivate::LocateIndex(const BamIndex::IndexType& preferredType) { - return m_randomAccessController.LocateIndex(m_filename, preferredType); + return m_randomAccessController.LocateIndex(this, preferredType); } // opens BAM file (and index) @@ -326,7 +326,7 @@ bool BamReaderPrivate::Open(const string& filename) { } bool BamReaderPrivate::OpenIndex(const std::string& indexFilename) { - return m_randomAccessController.OpenIndex(indexFilename); + return m_randomAccessController.OpenIndex(indexFilename, this); } // returns BAM file pointer to beginning of alignment data @@ -349,6 +349,10 @@ bool BamReaderPrivate::Rewind(void) { return m_stream.Seek(m_alignmentsBeginOffset); } +bool BamReaderPrivate::Seek(const int64_t& position) { + return m_stream.Seek(position); +} + void BamReaderPrivate::SetIndex(BamIndex* index) { m_randomAccessController.SetIndex(index); } @@ -364,7 +368,6 @@ bool BamReaderPrivate::SetRegion(const BamRegion& region) { return m_randomAccessController.SetRegion(this, region, m_references.size()); } -// returns handle to internal BgzfStream -BgzfStream* BamReaderPrivate::Stream(void) { - return &m_stream; +int64_t BamReaderPrivate::Tell(void) const { + return m_stream.Tell(); } diff --git a/src/api/internal/BamReader_p.h b/src/api/internal/BamReader_p.h index 7dda67f..c0d07d8 100644 --- a/src/api/internal/BamReader_p.h +++ b/src/api/internal/BamReader_p.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 24 February 2011 (DB) +// Last modified: 5 April 2011 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for reading BAM files // *************************************************************************** @@ -70,11 +70,10 @@ class BamReaderPrivate { void SetIndex(BamIndex* index); void SetIndexCacheMode(const BamIndex::IndexCacheMode& mode); - // BamReaderPrivate interface - public: - BgzfStream* Stream(void); - - // 'internal' methods + // internal methods, but available as a BamReaderPrivate 'interface' + // + // these methods should only be used by BamTools::Internal classes + // (currently only used by the BamIndex subclasses) public: // retrieves header text from BAM file bool LoadHeaderData(void); @@ -83,6 +82,10 @@ class BamReaderPrivate { bool LoadNextAlignment(BamAlignment& alignment); // builds reference data structure from BAM file bool LoadReferenceData(void); + // seek reader to file position + bool Seek(const int64_t& position); + // return reader's file position + int64_t Tell(void) const; // data members public: diff --git a/src/api/internal/BamStandardIndex_p.cpp b/src/api/internal/BamStandardIndex_p.cpp index cf0e2c1..df4145d 100644 --- a/src/api/internal/BamStandardIndex_p.cpp +++ b/src/api/internal/BamStandardIndex_p.cpp @@ -3,16 +3,14 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 21 March 2011 (DB) +// Last modified: 5 April 2011 (DB) // --------------------------------------------------------------------------- // Provides index operations for the standardized BAM index format (".bai") // *************************************************************************** #include -#include #include #include -#include using namespace BamTools; using namespace BamTools::Internal; @@ -21,399 +19,464 @@ using namespace BamTools::Internal; #include #include #include -#include using namespace std; -BamStandardIndex::BamStandardIndex(void) - : BamIndex() - , m_dataBeginOffset(0) - , m_hasFullDataCache(false) +// static BamStandardIndex constants +const int BamStandardIndex::MAX_BIN = 37450; // =(8^6-1)/7+1 +const int BamStandardIndex::BAM_LIDX_SHIFT = 14; +const string BamStandardIndex::BAI_EXTENSION = ".bai"; +const char* const BamStandardIndex::BAI_MAGIC = "BAI\1"; +const int BamStandardIndex::SIZEOF_ALIGNMENTCHUNK = sizeof(uint64_t)*2; +const int BamStandardIndex::SIZEOF_BINCORE = sizeof(uint32_t) + sizeof(int32_t); +const int BamStandardIndex::SIZEOF_LINEAROFFSET = sizeof(uint64_t); + +// ctor +BamStandardIndex::BamStandardIndex(Internal::BamReaderPrivate* reader) + : BamIndex(reader) + , m_indexStream(0) + , m_cacheMode(BamIndex::LimitedIndexCaching) + , m_buffer(0) + , m_bufferLength(0) { - m_isBigEndian = BamTools::SystemIsBigEndian(); + m_isBigEndian = BamTools::SystemIsBigEndian(); } +// dtor BamStandardIndex::~BamStandardIndex(void) { - ClearAllData(); + CloseFile(); } -// calculate bins that overlap region -int BamStandardIndex::BinsFromRegion(const BamRegion& region, - const RefVector& references, - const bool isRightBoundSpecified, - uint16_t bins[MAX_BIN]) -{ - // get region boundaries - uint32_t begin = (unsigned int)region.LeftPosition; - uint32_t end; +bool BamStandardIndex::AdjustRegion(const BamRegion& region, uint32_t& begin, uint32_t& end) { + + // retrieve references from reader + const RefVector& references = m_reader->GetReferenceData(); + + // make sure left-bound position is valid + if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) + return false; + + // set region 'begin' + begin = (unsigned int)region.LeftPosition; // if right bound specified AND left&right bounds are on same reference - // OK to use right bound position - if ( isRightBoundSpecified && ( region.LeftRefID == region.RightRefID ) ) + // OK to use right bound position as region 'end' + if ( region.isRightBoundSpecified() && ( region.LeftRefID == region.RightRefID ) ) end = (unsigned int)region.RightPosition; - // otherwise, use end of left bound reference as cutoff - else - end = (unsigned int)references.at(region.LeftRefID).RefLength - 1; + // otherwise, set region 'end' to last reference base + else end = (unsigned int)references.at(region.LeftRefID).RefLength - 1; - // initialize list, bin '0' always a valid bin - int i = 0; - bins[i++] = 0; + // return success + return true; +} + +void BamStandardIndex::CalculateCandidateBins(const uint32_t& begin, + const uint32_t& end, + set& candidateBins) +{ + // initialize list, bin '0' is always a valid bin + candidateBins.insert(0); // get rest of bins that contain this region unsigned int k; - for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { bins[i++] = k; } - for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { bins[i++] = k; } - for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { bins[i++] = k; } - for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { bins[i++] = k; } - for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { bins[i++] = k; } - - // return number of bins stored - return i; + for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { candidateBins.insert(k); } + for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { candidateBins.insert(k); } + for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { candidateBins.insert(k); } + for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { candidateBins.insert(k); } + for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { candidateBins.insert(k); } } -// creates index data (in-memory) from @reader data -bool BamStandardIndex::Build(Internal::BamReaderPrivate* reader) { - - // skip if invalid reader - if ( reader == 0 ) - return false; - - // skip if reader BgzfStream is invalid or not open - BgzfStream* bgzfStream = reader->Stream(); - if ( bgzfStream == 0 || !bgzfStream->IsOpen ) +bool BamStandardIndex::CalculateCandidateOffsets(const BaiReferenceSummary& refSummary, + const uint64_t& minOffset, + set& candidateBins, + vector& offsets) +{ + // attempt seek to first bin + if ( !Seek(refSummary.FirstBinFilePosition, SEEK_SET) ) return false; - // move reader's file pointer to beginning of alignments - reader->Rewind(); - - // get reference count, reserve index space - const int numReferences = reader->GetReferenceCount(); - m_indexData.clear(); - m_hasFullDataCache = false; - SetReferenceCount(numReferences); - - // sets default constant for bin, ID, offset, coordinate variables - const uint32_t defaultValue = 0xffffffffu; - - // bin data - uint32_t saveBin(defaultValue); - uint32_t lastBin(defaultValue); + // iterate over reference bins + uint32_t binId; + int32_t numAlignmentChunks; + set::iterator candidateBinIter; + for ( int i = 0; i < refSummary.NumBins; ++i ) { - // reference ID data - int32_t saveRefID(defaultValue); - int32_t lastRefID(defaultValue); + // read bin contents (if successful, alignment chunks are now in m_buffer) + if ( !ReadBinIntoBuffer(binId, numAlignmentChunks) ) + return false; - // offset data - uint64_t saveOffset = bgzfStream->Tell(); - uint64_t lastOffset = saveOffset; + // see if bin is a 'candidate bin' + candidateBinIter = candidateBins.find(binId); - // coordinate data - int32_t lastCoordinate = defaultValue; + // if not, move on to next bin + if ( candidateBinIter == candidateBins.end() ) + continue; - BamAlignment bAlignment; - while ( reader->LoadNextAlignment(bAlignment) ) { + // otherwise, check bin's contents against for overlap + else { - // change of chromosome, save ID, reset bin - if ( lastRefID != bAlignment.RefID ) { - lastRefID = bAlignment.RefID; - lastBin = defaultValue; - } + unsigned int offset = 0; + uint64_t chunkStart; + uint64_t chunkStop; - // if lastCoordinate greater than BAM position - file not sorted properly - else if ( lastCoordinate > bAlignment.Position ) { - fprintf(stderr, "BamStandardIndex ERROR: file not properly sorted:\n"); - fprintf(stderr, "Alignment %s : %d > %d on reference (id = %d)", - bAlignment.Name.c_str(), lastCoordinate, bAlignment.Position, bAlignment.RefID); - exit(1); - } + // iterate over alignment chunks + for (int j = 0; j < numAlignmentChunks; ++j ) { - // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions) - if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) { - - // save linear offset entry (matched to BAM entry refID) - BamStandardIndexData::iterator indexIter = m_indexData.find(bAlignment.RefID); - if ( indexIter == m_indexData.end() ) return false; // error - ReferenceIndex& refIndex = (*indexIter).second; - LinearOffsetVector& offsets = refIndex.Offsets; - SaveLinearOffset(offsets, bAlignment, lastOffset); - } + // read chunk start & stop from buffer + memcpy((char*)&chunkStart, m_buffer+offset, sizeof(uint64_t)); + offset += sizeof(uint64_t); + memcpy((char*)&chunkStop, m_buffer, sizeof(uint64_t)); + offset += sizeof(uint64_t); - // if current BamAlignment bin != lastBin, "then possibly write the binning index" - if ( bAlignment.Bin != lastBin ) { - - // if not first time through - if ( saveBin != defaultValue ) { + // swap endian-ness if necessary + if ( m_isBigEndian ) { + SwapEndian_64(chunkStart); + SwapEndian_64(chunkStop); + } - // save Bam bin entry - BamStandardIndexData::iterator indexIter = m_indexData.find(saveRefID); - if ( indexIter == m_indexData.end() ) return false; // error - ReferenceIndex& refIndex = (*indexIter).second; - BamBinMap& binMap = refIndex.Bins; - SaveBinEntry(binMap, saveBin, saveOffset, lastOffset); + // store alignment chunk's start offset + // if its stop offset is larger than our 'minOffset' + if ( chunkStop > minOffset ) + offsets.push_back(chunkStart); } - // update saveOffset - saveOffset = lastOffset; - - // update bin values - saveBin = bAlignment.Bin; - lastBin = bAlignment.Bin; + // 'pop' bin ID from candidate bins set + candidateBins.erase(candidateBinIter); - // update saveRefID - saveRefID = bAlignment.RefID; - - // if invalid RefID, break out - if ( saveRefID < 0 ) break; + // quit if no more candidates + if ( candidateBins.empty() ) + break; } - - // make sure that current file pointer is beyond lastOffset - if ( bgzfStream->Tell() <= (int64_t)lastOffset ) { - fprintf(stderr, "BamStandardIndex ERROR: could not build index - calculating offsets failed.\n"); - exit(1); - } - - // update lastOffset - lastOffset = bgzfStream->Tell(); - - // update lastCoordinate - lastCoordinate = bAlignment.Position; - } - - // save any leftover BAM data (as long as refID is valid) - if ( saveRefID >= 0 ) { - // save Bam bin entry - BamStandardIndexData::iterator indexIter = m_indexData.find(saveRefID); - if ( indexIter == m_indexData.end() ) return false; // error - ReferenceIndex& refIndex = (*indexIter).second; - BamBinMap& binMap = refIndex.Bins; - SaveBinEntry(binMap, saveBin, saveOffset, lastOffset); } - // simplify index by merging chunks - MergeChunks(); - - // iterate through references in index - // sort offsets in linear offset vector - BamStandardIndexData::iterator indexIter = m_indexData.begin(); - BamStandardIndexData::iterator indexEnd = m_indexData.end(); - for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) { + // return success/failure on calculating at least 1 offset + return ( !offsets.empty() ); +} - // get reference index data - ReferenceIndex& refIndex = (*indexIter).second; - LinearOffsetVector& offsets = refIndex.Offsets; +uint64_t BamStandardIndex::CalculateMinOffset(const BaiReferenceSummary& refSummary, + const uint32_t& begin) +{ + // if no linear offsets exist, return 0 + if ( refSummary.NumLinearOffsets == 0 ) + return 0; + + // if 'begin' starts beyond last linear offset, use the last linear offset as minimum + // else use the offset corresponding to the requested start position + const int shiftedBegin = begin>>BamStandardIndex::BAM_LIDX_SHIFT; + if ( shiftedBegin >= refSummary.NumLinearOffsets ) + return LookupLinearOffset( refSummary, refSummary.NumLinearOffsets-1 ); + else + return LookupLinearOffset( refSummary, shiftedBegin ); +} - // sort linear offsets - sort(offsets.begin(), offsets.end()); +void BamStandardIndex::CheckBufferSize(char*& buffer, + unsigned int& bufferLength, + const unsigned int& requestedBytes) +{ + try { + if ( requestedBytes > bufferLength ) { + bufferLength = requestedBytes + 10; + delete[] buffer; + buffer = new char[bufferLength]; + } + } catch ( std::bad_alloc ) { + cerr << "BamStandardIndex ERROR: out of memory when allocating " + << requestedBytes << " byes" << endl; + exit(1); } +} - // rewind reader's file pointer to beginning of alignments, return success/fail - return reader->Rewind(); +void BamStandardIndex::CheckBufferSize(unsigned char*& buffer, + unsigned int& bufferLength, + const unsigned int& requestedBytes) +{ + try { + if ( requestedBytes > bufferLength ) { + bufferLength = requestedBytes + 10; + delete[] buffer; + buffer = new unsigned char[bufferLength]; + } + } catch ( std::bad_alloc ) { + cerr << "BamStandardIndex ERROR: out of memory when allocating " + << requestedBytes << " byes" << endl; + exit(1); + } } -// check index file magic number, return true if OK bool BamStandardIndex::CheckMagicNumber(void) { - // read in magic number + // check 'magic number' to see if file is BAI index char magic[4]; size_t elementsRead = fread(magic, sizeof(char), 4, m_indexStream); + if ( elementsRead != 4 ) { + cerr << "BamStandardIndex ERROR: could not read format 'magic number'" << endl; + return false; + } // compare to expected value - if ( strncmp(magic, "BAI\1", 4) != 0 ) { - fprintf(stderr, "BamStandardIndex ERROR: could not load index file - invalid magic number.\n"); - fclose(m_indexStream); + if ( strncmp(magic, BamStandardIndex::BAI_MAGIC, 4) != 0 ) { + cerr << "BamStandardIndex ERROR: invalid format" << endl; return false; } - // return success/failure of load - return (elementsRead == 4); + // otherwise OK + return true; } -// clear all current index offset data in memory -void BamStandardIndex::ClearAllData(void) { - BamStandardIndexData::const_iterator indexIter = m_indexData.begin(); - BamStandardIndexData::const_iterator indexEnd = m_indexData.end(); - for ( ; indexIter != indexEnd; ++indexIter ) { - const int& refId = (*indexIter).first; - ClearReferenceOffsets(refId); - } +void BamStandardIndex::ClearReferenceEntry(BaiReferenceEntry& refEntry) { + refEntry.ID = -1; + refEntry.Bins.clear(); + refEntry.LinearOffsets.clear(); } -// clear all index offset data for desired reference -void BamStandardIndex::ClearReferenceOffsets(const int& refId) { +void BamStandardIndex::CloseFile(void) { - // look up refId, skip if not found - BamStandardIndexData::iterator indexIter = m_indexData.find(refId); - if ( indexIter == m_indexData.end() ) return ; + // close file stream + if ( IsFileOpen() ) + fclose(m_indexStream); - // clear reference data - ReferenceIndex& refEntry = (*indexIter).second; - refEntry.Bins.clear(); - refEntry.Offsets.clear(); + // clear index file summary data + m_indexFileSummary.clear(); - // set flag - m_hasFullDataCache = false; + // clean up I/O buffer + delete[] m_buffer; + m_buffer = 0; + m_bufferLength = 0; } -// return file position after header metadata -off_t BamStandardIndex::DataBeginOffset(void) const { - return m_dataBeginOffset; -} +// builds index from associated BAM file & writes out to index file +bool BamStandardIndex::Create(void) { -// calculates offset(s) for a given region -bool BamStandardIndex::GetOffsets(const BamRegion& region, - const RefVector& references, - const bool isRightBoundSpecified, - vector& offsets, - bool* hasAlignmentsInRegion) -{ - // return false if leftBound refID is not found in index data - if ( m_indexData.find(region.LeftRefID) == m_indexData.end() ) + // return false if BamReader is invalid or not open + if ( m_reader == 0 || !m_reader->IsOpen() ) { + cerr << "BamStandardIndex ERROR: BamReader is not open" + << ", aborting index creation" << endl; return false; + } - // load index data for region if not already cached - if ( !IsDataLoaded(region.LeftRefID) ) { - bool loadedOk = true; - loadedOk &= SkipToReference(region.LeftRefID); - loadedOk &= LoadReference(region.LeftRefID); - if ( !loadedOk ) return false; + // rewind BamReader + if ( !m_reader->Rewind() ) { + cerr << "BamStandardIndex ERROR: could not rewind BamReader to create index" + << ", aborting index creation" << endl; + return false; + } + + // open new index file (read & write) + string indexFilename = m_reader->Filename() + Extension(); + if ( !OpenFile(indexFilename, "w+b") ) { + cerr << "BamStandardIndex ERROR: could not open ouput index file: " << indexFilename + << ", aborting index creation" << endl; + return false; } - // calculate which bins overlap this region - uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2); - int numBins = BinsFromRegion(region, references, isRightBoundSpecified, bins); - - // get bins for this reference - BamStandardIndexData::const_iterator indexIter = m_indexData.find(region.LeftRefID); - if ( indexIter == m_indexData.end() ) return false; // error - const ReferenceIndex& refIndex = (*indexIter).second; - const BamBinMap& binMap = refIndex.Bins; - - // get minimum offset to consider - const LinearOffsetVector& linearOffsets = refIndex.Offsets; - const uint64_t minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() ) - ? 0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT); - - // store all alignment 'chunk' starts (file offsets) for bins in this region - for ( int i = 0; i < numBins; ++i ) { - - const uint16_t binKey = bins[i]; - map::const_iterator binIter = binMap.find(binKey); - if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) { - - // iterate over chunks - const ChunkVector& chunks = (*binIter).second; - std::vector::const_iterator chunksIter = chunks.begin(); - std::vector::const_iterator chunksEnd = chunks.end(); - for ( ; chunksIter != chunksEnd; ++chunksIter) { - - // if valid chunk found, store its file offset - const Chunk& chunk = (*chunksIter); - if ( chunk.Stop > minOffset ) - offsets.push_back( chunk.Start ); + // initialize BaiFileSummary with number of references + const int& numReferences = m_reader->GetReferenceCount(); + ReserveForSummary(numReferences); + + // initialize output file + bool createdOk = true; + createdOk &= WriteHeader(); + + // set up bin, ID, offset, & coordinate markers + const uint32_t defaultValue = 0xffffffffu; + uint32_t currentBin = defaultValue; + uint32_t lastBin = defaultValue; + int32_t currentRefID = defaultValue; + int32_t lastRefID = defaultValue; + uint64_t currentOffset = (uint64_t)m_reader->Tell(); + uint64_t lastOffset = currentOffset; + int32_t lastPosition = defaultValue; + + // iterate through alignments in BAM file + BamAlignment al; + BaiReferenceEntry refEntry; + while ( m_reader->LoadNextAlignment(al) ) { + + // changed to new reference + if ( lastRefID != al.RefID ) { + + // if not first reference, save previous reference data + if ( lastRefID != (int32_t)defaultValue ) { + + SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset); + createdOk &= WriteReferenceEntry(refEntry); + ClearReferenceEntry(refEntry); + + // update bin markers + currentOffset = lastOffset; + currentBin = al.Bin; + lastBin = al.Bin; + currentRefID = al.RefID; } + + // update reference markers + refEntry.ID = al.RefID; + lastRefID = al.RefID; + lastBin = defaultValue; } - } - // clean up memory - free(bins); + // if lastPosition greater than current alignment position - file not sorted properly + else if ( lastPosition > al.Position ) { + cerr << "BamStandardIndex ERROR: BAM file is not properly sorted by coordinate" + << ", aborting index creation" + << endl + << "At alignment: " << al.Name + << " : previous position " << lastPosition + << " > this alignment position " << al.Position + << " on reference id: " << al.RefID << endl; + return false; + } - // sort the offsets before returning - sort(offsets.begin(), offsets.end()); + // if alignment's ref ID is valid & its bin is not a 'leaf' + if ( (al.RefID >= 0) && (al.Bin < 4681) ) + SaveLinearOffsetEntry(refEntry.LinearOffsets, al.Position, al.GetEndPosition(), lastOffset); - // set flag & return success - *hasAlignmentsInRegion = (offsets.size() != 0 ); + // changed to new BAI bin + if ( al.Bin != lastBin ) { - // if cache mode set to none, dump the data we just loaded - if ( m_cacheMode == BamIndex::NoIndexCaching ) - ClearReferenceOffsets(region.LeftRefID); + // if not first bin on reference, save previous bin data + if ( currentBin != defaultValue ) + SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset); - // return succes - return true; -} + // update markers + currentOffset = lastOffset; + currentBin = al.Bin; + lastBin = al.Bin; + currentRefID = al.RefID; -// returns whether reference has alignments or no -bool BamStandardIndex::HasAlignments(const int& refId) const { - BamStandardIndexData::const_iterator indexIter = m_indexData.find(refId); - if ( indexIter == m_indexData.end() ) return false; // error - const ReferenceIndex& refEntry = (*indexIter).second; - return refEntry.HasAlignments; + // if invalid RefID, break out + if ( currentRefID < 0 ) + break; + } + + // make sure that current file pointer is beyond lastOffset + if ( m_reader->Tell() <= (int64_t)lastOffset ) { + cerr << "BamStandardIndex ERROR: calculating offsets failed" + << ", aborting index creation" << endl; + return false; + } + + // update lastOffset & lastPosition + lastOffset = m_reader->Tell(); + lastPosition = al.Position; + } + + // store last alignment chunk to its bin, then write last reference entry + if ( currentRefID >= 0 ) { + SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset); + createdOk &= WriteReferenceEntry(refEntry); + } + + // rewind reader now that we're done building + createdOk &= m_reader->Rewind(); + + // return result + return createdOk; } -// return true if all index data is cached -bool BamStandardIndex::HasFullDataCache(void) const { - return m_hasFullDataCache; +// returns format's file extension +const string BamStandardIndex::Extension(void) { + return BamStandardIndex::BAI_EXTENSION; } -// returns true if index cache has data for desired reference -bool BamStandardIndex::IsDataLoaded(const int& refId) const { +bool BamStandardIndex::GetOffsets(const BamRegion& region, vector& offsets) { - // look up refId, return false if not found - BamStandardIndexData::const_iterator indexIter = m_indexData.find(refId); - if ( indexIter == m_indexData.end() ) return false; + // cannot calculate offsets if unknown/invalid reference ID requested + if ( region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size() ) + return false; - // see if reference has alignments - // if not, it's not a problem to have no offset data - const ReferenceIndex& refEntry = (*indexIter).second; - if ( !refEntry.HasAlignments ) return true; + // retrieve index summary for left bound reference + const BaiReferenceSummary& refSummary = m_indexFileSummary.at(region.LeftRefID); - // return whether bin map contains data - return ( !refEntry.Bins.empty() ); -} + // set up region boundaries based on actual BamReader data + uint32_t begin; + uint32_t end; + if ( !AdjustRegion(region, begin, end) ) + return false; -// attempts to use index to jump to region; returns success/fail -bool BamStandardIndex::Jump(Internal::BamReaderPrivate* reader, - const BamTools::BamRegion& region, - bool *hasAlignmentsInRegion) -{ - // skip if invalid reader - if ( reader == 0 ) + // retrieve all candidate bin IDs for region + set candidateBins; + CalculateCandidateBins(begin, end, candidateBins); + + // use reference's linear offsets to calculate the minimum offset + // that must be considered to find overlap + const uint64_t& minOffset = CalculateMinOffset(refSummary, begin); + + // attempt to use reference summary, minOffset, & candidateBins to calculate offsets + if ( !CalculateCandidateOffsets(refSummary, minOffset, candidateBins, offsets) ) return false; - // skip if reader BgzfStream is invalid or not open - BgzfStream* bgzfStream = reader->Stream(); - if ( bgzfStream == 0 || !bgzfStream->IsOpen ) + // ensure that offsets are sorted before returning + sort( offsets.begin(), offsets.end() ); + + // return succes + return true; +} + +// returns whether reference has alignments or no +bool BamStandardIndex::HasAlignments(const int& referenceID) const { + if ( referenceID < 0 || referenceID >= (int)m_indexFileSummary.size() ) return false; + const BaiReferenceSummary& refSummary = m_indexFileSummary.at(referenceID); + return ( refSummary.NumBins > 0 ); +} - // retrieve references from reader - const RefVector references = reader->GetReferenceData(); +bool BamStandardIndex::IsFileOpen(void) const { + return ( m_indexStream != 0 ); +} - // make sure left-bound position is valid - if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) +// attempts to use index data to jump to @region, returns success/fail +// a "successful" jump indicates no error, but not whether this region has data +// * thus, the method sets a flag to indicate whether there are alignments +// available after the jump position +bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { + + // skip if reader is not valid or is not open + if ( m_reader == 0 || !m_reader->IsOpen() ) return false; // calculate offsets for this region // if failed, print message, set flag, and return failure vector offsets; - if ( !GetOffsets(region, references, region.isRightBoundSpecified(), offsets, hasAlignmentsInRegion) ) { - fprintf(stderr, "BamStandardIndex ERROR: could not jump - unable to calculate offset candidates for specified region.\n"); + if ( !GetOffsets(region, offsets) ) { + cerr << "BamStandardIndex ERROR: could not jump" + << ", unable to retrieve offsets for region" << endl; *hasAlignmentsInRegion = false; return false; } - // iterate through offsets - BamAlignment alignment; + // if no offsets retrieved, set flag + if ( offsets.empty() ) + *hasAlignmentsInRegion = false; + + // iterate through candidate offsets + BamAlignment al; bool result = true; - for ( vector::const_iterator o = offsets.begin(); o != offsets.end(); ++o) { + vector::const_iterator offsetIter = offsets.begin(); + vector::const_iterator offsetEnd = offsets.end(); + for ( ; offsetIter != offsetEnd; ++offsetIter) { // attempt seek & load first available alignment // set flag to true if data exists - result &= bgzfStream->Seek(*o); - *hasAlignmentsInRegion = reader->GetNextAlignmentCore(alignment); + result &= m_reader->Seek(*offsetIter); + *hasAlignmentsInRegion = m_reader->LoadNextAlignment(al); // if this alignment corresponds to desired position // return success of seeking back to the offset before the 'current offset' (to cover overlaps) - if ( ((alignment.RefID == region.LeftRefID) && - ((alignment.Position + alignment.Length) > region.LeftPosition)) || - (alignment.RefID > region.LeftRefID) ) + if ( ((al.RefID == region.LeftRefID) && + ((al.Position + al.Length) > region.LeftPosition)) || + (al.RefID > region.LeftRefID) ) { - if ( o != offsets.begin() ) --o; - return bgzfStream->Seek(*o); + if ( offsetIter != offsets.begin() ) + --offsetIter; + return m_reader->Seek(*offsetIter); } } // if error in jumping, print message & set flag if ( !result ) { - fprintf(stderr, "BamStandardIndex ERROR: could not jump - unable to determine correct offset for specified region.\n"); + cerr << "BamStandardIndex ERROR: could not jump" + << ", there was a problem seeking in BAM file" << endl; *hasAlignmentsInRegion = false; } @@ -421,313 +484,202 @@ bool BamStandardIndex::Jump(Internal::BamReaderPrivate* reader, return result; } -// clears index data from all references except the first -void BamStandardIndex::KeepOnlyFirstReferenceOffsets(void) { - BamStandardIndexData::const_iterator indexBegin = m_indexData.begin(); - KeepOnlyReferenceOffsets((*indexBegin).first); -} +// loads existing data from file into memory +bool BamStandardIndex::Load(const std::string& filename) { -// clears index data from all references except the one specified -void BamStandardIndex::KeepOnlyReferenceOffsets(const int& refId) { - BamStandardIndexData::iterator mapIter = m_indexData.begin(); - BamStandardIndexData::iterator mapEnd = m_indexData.end(); - for ( ; mapIter != mapEnd; ++mapIter ) { - const int entryRefId = (*mapIter).first; - if ( entryRefId != refId ) - ClearReferenceOffsets(entryRefId); + // attempt open index file (read-only) + if ( !OpenFile(filename, "rb") ) { + cerr << "BamStandardIndex ERROR: could not open input index file: " << filename + << ", aborting index load" << endl; + return false; } -} - -bool BamStandardIndex::LoadAllReferences(bool saveData) { - - // skip if data already loaded - if ( m_hasFullDataCache ) return true; - // get number of reference sequences - uint32_t numReferences; - if ( !LoadReferenceCount((int&)numReferences) ) + // if invalid format 'magic number', close & return failure + if ( !CheckMagicNumber() ) { + CloseFile(); return false; + } - // iterate over reference entries - bool loadedOk = true; - for ( int i = 0; i < (int)numReferences; ++i ) - loadedOk &= LoadReference(i, saveData); - - // set flag - if ( loadedOk && saveData ) - m_hasFullDataCache = true; - - // return success/failure of loading references - return loadedOk; + // attempt to load index file summary, return success/failure + return SummarizeIndexFile(); } -// load header data from index file, return true if loaded OK -bool BamStandardIndex::LoadHeader(void) { - - bool loadedOk = CheckMagicNumber(); +uint64_t BamStandardIndex::LookupLinearOffset(const BaiReferenceSummary& refSummary, const int& index) { - // store offset of beginning of data - m_dataBeginOffset = ftell64(m_indexStream); + // attempt seek to proper index file position + const int64_t linearOffsetFilePosition = (int64_t)refSummary.FirstLinearOffsetFilePosition + + index*BamStandardIndex::SIZEOF_LINEAROFFSET; + if ( !Seek(linearOffsetFilePosition, SEEK_SET) ) + return 0; - // return success/failure of load - return loadedOk; + // read linear offset from BAI file + uint64_t linearOffset(0); + if ( !ReadLinearOffset(linearOffset) ) + return 0; + return linearOffset; } -// load a single index bin entry from file, return true if loaded OK -// @saveData - save data in memory if true, just read & discard if false -bool BamStandardIndex::LoadBin(ReferenceIndex& refEntry, bool saveData) { +void BamStandardIndex::MergeAlignmentChunks(BaiAlignmentChunkVector& chunks) { - size_t elementsRead = 0; + // skip if chunks are empty, nothing to merge + if ( chunks.empty() ) + return; - // get bin ID - uint32_t binId; - elementsRead += fread(&binId, sizeof(binId), 1, m_indexStream); - if ( m_isBigEndian ) SwapEndian_32(binId); - - // load alignment chunks for this bin - ChunkVector chunks; - bool chunksOk = LoadChunks(chunks, saveData); + // set up merged alignment chunk container + BaiAlignmentChunkVector mergedChunks; + mergedChunks.push_back( chunks[0] ); - // store bin entry - if ( chunksOk && saveData ) - refEntry.Bins.insert(pair(binId, chunks)); - - // return success/failure of load - return ( (elementsRead == 1) && chunksOk ); -} + // iterate over chunks + int i = 0; + BaiAlignmentChunkVector::iterator chunkIter = chunks.begin(); + BaiAlignmentChunkVector::iterator chunkEnd = chunks.end(); + for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) { -bool BamStandardIndex::LoadBins(ReferenceIndex& refEntry, bool saveData) { + // get 'currentMergeChunk' based on numeric index + BaiAlignmentChunk& currentMergeChunk = mergedChunks[i]; - size_t elementsRead = 0; + // get sourceChunk based on source vector iterator + BaiAlignmentChunk& sourceChunk = (*chunkIter); - // get number of bins - int32_t numBins; - elementsRead += fread(&numBins, sizeof(numBins), 1, m_indexStream); - if ( m_isBigEndian ) SwapEndian_32(numBins); + // if currentMergeChunk ends where sourceChunk starts, then merge the two + if ( currentMergeChunk.Stop>>16 == sourceChunk.Start>>16 ) + currentMergeChunk.Stop = sourceChunk.Stop; - // set flag - refEntry.HasAlignments = ( numBins != 0 ); + // otherwise + else { + // append sourceChunk after currentMergeChunk + mergedChunks.push_back(sourceChunk); - // iterate over bins - bool binsOk = true; - for ( int i = 0; i < numBins; ++i ) - binsOk &= LoadBin(refEntry, saveData); + // update i, so the next iteration will consider the + // recently-appended sourceChunk as new mergeChunk candidate + ++i; + } + } - // return success/failure of load - return ( (elementsRead == 1) && binsOk ); + // saved newly-merged chunks into (parameter) chunks + chunks = mergedChunks; } -// load a single index bin entry from file, return true if loaded OK -// @saveData - save data in memory if true, just read & discard if false -bool BamStandardIndex::LoadChunk(ChunkVector& chunks, bool saveData) { +bool BamStandardIndex::OpenFile(const std::string& filename, const char* mode) { - size_t elementsRead = 0; - - // read in chunk data - uint64_t start; - uint64_t stop; - elementsRead += fread(&start, sizeof(start), 1, m_indexStream); - elementsRead += fread(&stop, sizeof(stop), 1, m_indexStream); + // make sure any previous index file is closed + CloseFile(); - // swap endian-ness if necessary - if ( m_isBigEndian ) { - SwapEndian_64(start); - SwapEndian_64(stop); - } - - // save data if requested - if ( saveData ) chunks.push_back( Chunk(start, stop) ); - - // return success/failure of load - return ( elementsRead == 2 ); + // attempt to open file + m_indexStream = fopen(filename.c_str(), mode); + return IsFileOpen(); } -bool BamStandardIndex::LoadChunks(ChunkVector& chunks, bool saveData) { - +bool BamStandardIndex::ReadBinID(uint32_t& binId) { size_t elementsRead = 0; - - // read in number of chunks - uint32_t numChunks; - elementsRead += fread(&numChunks, sizeof(numChunks), 1, m_indexStream); - if ( m_isBigEndian ) SwapEndian_32(numChunks); - - // initialize space for chunks if we're storing this data - if ( saveData ) chunks.reserve(numChunks); - - // iterate over chunks - bool chunksOk = true; - for ( int i = 0; i < (int)numChunks; ++i ) - chunksOk &= LoadChunk(chunks, saveData); - - // sort chunk vector - sort( chunks.begin(), chunks.end(), ChunkLessThan ); - - // return success/failure of load - return ( (elementsRead == 1) && chunksOk ); + elementsRead += fread(&binId, sizeof(binId), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(binId); + return ( elementsRead == 1 ); } -// load a single index linear offset entry from file, return true if loaded OK -// @saveData - save data in memory if true, just read & discard if false -bool BamStandardIndex::LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData) { +bool BamStandardIndex::ReadBinIntoBuffer(uint32_t& binId, int32_t& numAlignmentChunks) { - size_t elementsRead = 0; + bool readOk = true; - // read in number of linear offsets - int32_t numLinearOffsets; - elementsRead += fread(&numLinearOffsets, sizeof(numLinearOffsets), 1, m_indexStream); - if ( m_isBigEndian ) SwapEndian_32(numLinearOffsets); + // read bin header + readOk &= ReadBinID(binId); + readOk &= ReadNumAlignmentChunks(numAlignmentChunks); - // set up destination vector (if we're saving the data) - LinearOffsetVector linearOffsets; - if ( saveData ) linearOffsets.reserve(numLinearOffsets); + // read bin contents + const unsigned int bytesRequested = numAlignmentChunks*BamStandardIndex::SIZEOF_ALIGNMENTCHUNK; + readOk &= ReadIntoBuffer(bytesRequested); - // iterate over linear offsets - uint64_t linearOffset; - for ( int i = 0; i < numLinearOffsets; ++i ) { - elementsRead += fread(&linearOffset, sizeof(linearOffset), 1, m_indexStream); - if ( m_isBigEndian ) SwapEndian_64(linearOffset); - if ( saveData ) linearOffsets.push_back(linearOffset); - } + // return success/failure + return readOk; +} - // sort linear offsets - sort ( linearOffsets.begin(), linearOffsets.end() ); +bool BamStandardIndex::ReadIntoBuffer(const unsigned int& bytesRequested) { - // save in reference index entry if desired - if ( saveData ) refEntry.Offsets = linearOffsets; + // ensure that our buffer is big enough for request + BamStandardIndex::CheckBufferSize(m_buffer, m_bufferLength, bytesRequested); - // return success/failure of load - return ( elementsRead == (size_t)(numLinearOffsets + 1) ); + // read from BAI file stream + size_t bytesRead = fread( m_buffer, sizeof(char), bytesRequested, m_indexStream ); + return ( bytesRead == (size_t)bytesRequested ); } -bool BamStandardIndex::LoadFirstReference(bool saveData) { - BamStandardIndexData::const_iterator indexBegin = m_indexData.begin(); - return LoadReference((*indexBegin).first, saveData); +bool BamStandardIndex::ReadLinearOffset(uint64_t& linearOffset) { + size_t elementsRead = 0; + elementsRead += fread(&linearOffset, sizeof(linearOffset), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_64(linearOffset); + return ( elementsRead == 1 ); } -// load a single reference from file, return true if loaded OK -// @saveData - save data in memory if true, just read & discard if false -bool BamStandardIndex::LoadReference(const int& refId, bool saveData) { - - // look up refId - BamStandardIndexData::iterator indexIter = m_indexData.find(refId); - - // if reference not previously loaded, create new entry - if ( indexIter == m_indexData.end() ) { - ReferenceIndex newEntry; - newEntry.HasAlignments = false; - m_indexData.insert( pair(refId, newEntry) ); - } - - // load reference data - indexIter = m_indexData.find(refId); - ReferenceIndex& entry = (*indexIter).second; - bool loadedOk = true; - loadedOk &= LoadBins(entry, saveData); - loadedOk &= LoadLinearOffsets(entry, saveData); - return loadedOk; +bool BamStandardIndex::ReadNumAlignmentChunks(int& numAlignmentChunks) { + size_t elementsRead = 0; + elementsRead += fread(&numAlignmentChunks, sizeof(numAlignmentChunks), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numAlignmentChunks); + return ( elementsRead == 1 ); } -// loads number of references, return true if loaded OK -bool BamStandardIndex::LoadReferenceCount(int& numReferences) { +bool BamStandardIndex::ReadNumBins(int& numBins) { + size_t elementsRead = 0; + elementsRead += fread(&numBins, sizeof(numBins), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numBins); + return ( elementsRead == 1 ); +} +bool BamStandardIndex::ReadNumLinearOffsets(int& numLinearOffsets) { size_t elementsRead = 0; + elementsRead += fread(&numLinearOffsets, sizeof(numLinearOffsets), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numLinearOffsets); + return ( elementsRead == 1 ); +} - // read reference count +bool BamStandardIndex::ReadNumReferences(int& numReferences) { + size_t elementsRead = 0; elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_indexStream); if ( m_isBigEndian ) SwapEndian_32(numReferences); - - // return success/failure of load return ( elementsRead == 1 ); } -// merges 'alignment chunks' in BAM bin (used for index building) -void BamStandardIndex::MergeChunks(void) { - - // iterate over reference enties - BamStandardIndexData::iterator indexIter = m_indexData.begin(); - BamStandardIndexData::iterator indexEnd = m_indexData.end(); - for ( ; indexIter != indexEnd; ++indexIter ) { - - // get BAM bin map for this reference - ReferenceIndex& refIndex = (*indexIter).second; - BamBinMap& bamBinMap = refIndex.Bins; - - // iterate over BAM bins - BamBinMap::iterator binIter = bamBinMap.begin(); - BamBinMap::iterator binEnd = bamBinMap.end(); - for ( ; binIter != binEnd; ++binIter ) { - - // get chunk vector for this bin - ChunkVector& binChunks = (*binIter).second; - if ( binChunks.size() == 0 ) continue; - - ChunkVector mergedChunks; - mergedChunks.push_back( binChunks[0] ); - - // iterate over chunks - int i = 0; - ChunkVector::iterator chunkIter = binChunks.begin(); - ChunkVector::iterator chunkEnd = binChunks.end(); - for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) { - - // get 'currentChunk' based on numeric index - Chunk& currentChunk = mergedChunks[i]; - - // get iteratorChunk based on vector iterator - Chunk& iteratorChunk = (*chunkIter); - - // if chunk ends where (iterator) chunk starts, then merge - if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 ) - currentChunk.Stop = iteratorChunk.Stop; - - // otherwise - else { - // set currentChunk + 1 to iteratorChunk - mergedChunks.push_back(iteratorChunk); - ++i; - } - } - - // saved merged chunk vector - (*binIter).second = mergedChunks; - } - } +void BamStandardIndex::ReserveForSummary(const int& numReferences) { + m_indexFileSummary.clear(); + m_indexFileSummary.assign( numReferences, BaiReferenceSummary() ); } -// saves BAM bin entry for index -void BamStandardIndex::SaveBinEntry(BamBinMap& binMap, - const uint32_t& saveBin, - const uint64_t& saveOffset, - const uint64_t& lastOffset) +void BamStandardIndex::SaveAlignmentChunkToBin(BaiBinMap& binMap, + const uint32_t& currentBin, + const uint64_t& currentOffset, + const uint64_t& lastOffset) { - // look up saveBin - BamBinMap::iterator binIter = binMap.find(saveBin); - - // create new chunk - Chunk newChunk(saveOffset, lastOffset); + // create new alignment chunk + BaiAlignmentChunk newChunk(currentOffset, lastOffset); - // if entry doesn't exist + // if no entry exists yet for this bin, create one and store alignment chunk + BaiBinMap::iterator binIter = binMap.find(currentBin); if ( binIter == binMap.end() ) { - ChunkVector newChunks; + BaiAlignmentChunkVector newChunks; newChunks.push_back(newChunk); - binMap.insert( pair(saveBin, newChunks)); + binMap.insert( pair(currentBin, newChunks)); } - // otherwise + // otherwise, just append alignment chunk else { - ChunkVector& binChunks = (*binIter).second; + BaiAlignmentChunkVector& binChunks = (*binIter).second; binChunks.push_back( newChunk ); } } -// saves linear offset entry for index -void BamStandardIndex::SaveLinearOffset(LinearOffsetVector& offsets, - const BamAlignment& bAlignment, - const uint64_t& lastOffset) +void BamStandardIndex::SaveBinsSummary(const int& refId, const int& numBins) { + BaiReferenceSummary& refSummary = m_indexFileSummary.at(refId); + refSummary.NumBins = numBins; + refSummary.FirstBinFilePosition = Tell(); +} + +void BamStandardIndex::SaveLinearOffsetEntry(BaiLinearOffsetVector& offsets, + const int& alignmentStartPosition, + const int& alignmentStopPosition, + const uint64_t& lastOffset) { // get converted offsets - int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT; - int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT; + const int beginOffset = alignmentStartPosition >> BamStandardIndex::BAM_LIDX_SHIFT; + const int endOffset = (alignmentStopPosition - 1) >> BamStandardIndex::BAM_LIDX_SHIFT; // resize vector if necessary int oldSize = offsets.size(); @@ -739,120 +691,108 @@ void BamStandardIndex::SaveLinearOffset(LinearOffsetVector& offsets, for( int i = beginOffset + 1; i <= endOffset; ++i ) { if ( offsets[i] == 0 ) offsets[i] = lastOffset; - } + } } -// initializes index data structure to hold @count references -void BamStandardIndex::SetReferenceCount(const int& count) { - for ( int i = 0; i < count; ++i ) - m_indexData[i].HasAlignments = false; +void BamStandardIndex::SaveLinearOffsetsSummary(const int& refId, const int& numLinearOffsets) { + BaiReferenceSummary& refSummary = m_indexFileSummary.at(refId); + refSummary.NumLinearOffsets = numLinearOffsets; + refSummary.FirstLinearOffsetFilePosition = Tell(); } -bool BamStandardIndex::SkipToFirstReference(void) { - BamStandardIndexData::const_iterator indexBegin = m_indexData.begin(); - return SkipToReference( (*indexBegin).first ); +// seek to position in index file stream +bool BamStandardIndex::Seek(const int64_t& position, const int& origin) { + return ( fseek64(m_indexStream, position, origin) == 0 ); } -// position file pointer to desired reference begin, return true if skipped OK -bool BamStandardIndex::SkipToReference(const int& refId) { - - // attempt rewind - if ( !Rewind() ) return false; - - // read in number of references - uint32_t numReferences; - size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_indexStream); - if ( elementsRead != 1 ) return false; - if ( m_isBigEndian ) SwapEndian_32(numReferences); +// change the index caching behavior +void BamStandardIndex::SetCacheMode(const BamIndex::IndexCacheMode& mode) { + m_cacheMode = mode; + // do nothing else here ? cache mode will be ignored from now on, most likely +} - // iterate over reference entries +bool BamStandardIndex::SkipBins(const int& numBins) { + uint32_t binId; + int32_t numAlignmentChunks; bool skippedOk = true; - int currentRefId = 0; - while (currentRefId != refId) { - skippedOk &= LoadReference(currentRefId, false); - ++currentRefId; - } - - // return success + for (int i = 0; i < numBins; ++i) + skippedOk &= ReadBinIntoBuffer(binId, numAlignmentChunks); // results & buffer ignored return skippedOk; } -// write header to new index file -bool BamStandardIndex::WriteHeader(void) { - - size_t elementsWritten = 0; - - // write magic number - elementsWritten += fwrite("BAI\1", sizeof(char), 4, m_indexStream); - - // store offset of beginning of data - m_dataBeginOffset = ftell64(m_indexStream); - - // return success/failure of write - return (elementsWritten == 4); +bool BamStandardIndex::SkipLinearOffsets(const int& numLinearOffsets) { + const unsigned int bytesRequested = numLinearOffsets*BamStandardIndex::SIZEOF_LINEAROFFSET; + return ReadIntoBuffer(bytesRequested); } -// write index data for all references to new index file -bool BamStandardIndex::WriteAllReferences(void) { +void BamStandardIndex::SortLinearOffsets(BaiLinearOffsetVector& linearOffsets) { + sort( linearOffsets.begin(), linearOffsets.end() ); +} - size_t elementsWritten = 0; +bool BamStandardIndex::SummarizeBins(BaiReferenceSummary& refSummary) { - // write number of reference sequences - int32_t numReferenceSeqs = m_indexData.size(); - if ( m_isBigEndian ) SwapEndian_32(numReferenceSeqs); - elementsWritten += fwrite(&numReferenceSeqs, sizeof(numReferenceSeqs), 1, m_indexStream); + // load number of bins + int numBins; + if ( !ReadNumBins(numBins) ) + return false; - // iterate over reference sequences - bool refsOk = true; - BamStandardIndexData::const_iterator indexIter = m_indexData.begin(); - BamStandardIndexData::const_iterator indexEnd = m_indexData.end(); - for ( ; indexIter != indexEnd; ++ indexIter ) - refsOk &= WriteReference( (*indexIter).second ); + // store bins summary for this reference + refSummary.NumBins = numBins; + refSummary.FirstBinFilePosition = Tell(); - // return success/failure of write - return ( (elementsWritten == 1) && refsOk ); + // attempt skip reference bins, return success/failure + return SkipBins(numBins); } -// write index data for bin to new index file -bool BamStandardIndex::WriteBin(const uint32_t& binId, const ChunkVector& chunks) { +bool BamStandardIndex::SummarizeIndexFile(void) { - size_t elementsWritten = 0; + // load number of reference sequences + int numReferences; + if ( !ReadNumReferences(numReferences) ) + return false; - // write BAM bin ID - uint32_t binKey = binId; - if ( m_isBigEndian ) SwapEndian_32(binKey); - elementsWritten += fwrite(&binKey, sizeof(binKey), 1, m_indexStream); + // initialize file summary data + ReserveForSummary(numReferences); - // write chunks - bool chunksOk = WriteChunks(chunks); + // iterate over reference entries + bool loadedOk = true; + BaiFileSummary::iterator summaryIter = m_indexFileSummary.begin(); + BaiFileSummary::iterator summaryEnd = m_indexFileSummary.end(); + for ( int i = 0; summaryIter != summaryEnd; ++summaryIter, ++i ) + loadedOk &= SummarizeReference(*summaryIter); - // return success/failure of write - return ( (elementsWritten == 1) && chunksOk ); + // return result + return loadedOk; } -// write index data for bins to new index file -bool BamStandardIndex::WriteBins(const BamBinMap& bins) { +bool BamStandardIndex::SummarizeLinearOffsets(BaiReferenceSummary& refSummary) { - size_t elementsWritten = 0; + // load number of linear offsets + int numLinearOffsets; + if ( !ReadNumLinearOffsets(numLinearOffsets) ) + return false; - // write number of bins - int32_t binCount = bins.size(); - if ( m_isBigEndian ) SwapEndian_32(binCount); - elementsWritten += fwrite(&binCount, sizeof(binCount), 1, m_indexStream); + // store bin summary data for this reference + refSummary.NumLinearOffsets = numLinearOffsets; + refSummary.FirstLinearOffsetFilePosition = Tell(); - // iterate over bins - bool binsOk = true; - BamBinMap::const_iterator binIter = bins.begin(); - BamBinMap::const_iterator binEnd = bins.end(); - for ( ; binIter != binEnd; ++binIter ) - binsOk &= WriteBin( (*binIter).first, (*binIter).second ); + // skip linear offsets in index file + return SkipLinearOffsets(numLinearOffsets); +} - // return success/failure of write - return ( (elementsWritten == 1) && binsOk ); +bool BamStandardIndex::SummarizeReference(BaiReferenceSummary& refSummary) { + bool loadedOk = true; + loadedOk &= SummarizeBins(refSummary); + loadedOk &= SummarizeLinearOffsets(refSummary); + return loadedOk; } -// write index data for chunk entry to new index file -bool BamStandardIndex::WriteChunk(const Chunk& chunk) { +// return position of file pointer in index file stream +int64_t BamStandardIndex::Tell(void) const { + return ftell64(m_indexStream); +} + +bool BamStandardIndex::WriteAlignmentChunk(const BaiAlignmentChunk& chunk) { size_t elementsWritten = 0; @@ -874,8 +814,10 @@ bool BamStandardIndex::WriteChunk(const Chunk& chunk) { return ( elementsWritten == 2 ); } -// write index data for chunk entry to new index file -bool BamStandardIndex::WriteChunks(const ChunkVector& chunks) { +bool BamStandardIndex::WriteAlignmentChunks(BaiAlignmentChunkVector& chunks) { + + // make sure chunks are merged (simplified) before writing & saving summary + MergeAlignmentChunks(chunks); size_t elementsWritten = 0; @@ -886,28 +828,88 @@ bool BamStandardIndex::WriteChunks(const ChunkVector& chunks) { // iterate over chunks bool chunksOk = true; - ChunkVector::const_iterator chunkIter = chunks.begin(); - ChunkVector::const_iterator chunkEnd = chunks.end(); + BaiAlignmentChunkVector::const_iterator chunkIter = chunks.begin(); + BaiAlignmentChunkVector::const_iterator chunkEnd = chunks.end(); for ( ; chunkIter != chunkEnd; ++chunkIter ) - chunksOk &= WriteChunk( (*chunkIter) ); + chunksOk &= WriteAlignmentChunk( (*chunkIter) ); + + // return success/failure of write + return ( (elementsWritten == 1) && chunksOk ); +} + +bool BamStandardIndex::WriteBin(const uint32_t& binId, BaiAlignmentChunkVector& chunks) { + + size_t elementsWritten = 0; + + // write BAM bin ID + uint32_t binKey = binId; + if ( m_isBigEndian ) SwapEndian_32(binKey); + elementsWritten += fwrite(&binKey, sizeof(binKey), 1, m_indexStream); + + // write bin's alignment chunks + bool chunksOk = WriteAlignmentChunks(chunks); // return success/failure of write return ( (elementsWritten == 1) && chunksOk ); } -// write index data for linear offsets entry to new index file -bool BamStandardIndex::WriteLinearOffsets(const LinearOffsetVector& offsets) { +bool BamStandardIndex::WriteBins(const int& refId, BaiBinMap& bins) { + + size_t elementsWritten = 0; + + // write number of bins + int32_t binCount = bins.size(); + if ( m_isBigEndian ) SwapEndian_32(binCount); + elementsWritten += fwrite(&binCount, sizeof(binCount), 1, m_indexStream); + + // save summary for reference's bins + SaveBinsSummary(refId, bins.size()); + + // iterate over bins + bool binsOk = true; + BaiBinMap::iterator binIter = bins.begin(); + BaiBinMap::iterator binEnd = bins.end(); + for ( ; binIter != binEnd; ++binIter ) + binsOk &= WriteBin( (*binIter).first, (*binIter).second ); + + // return success/failure of write + return ( (elementsWritten == 1) && binsOk ); +} + +bool BamStandardIndex::WriteHeader(void) { + + size_t elementsWritten = 0; + + // write magic number + elementsWritten += fwrite(BamStandardIndex::BAI_MAGIC, sizeof(char), 4, m_indexStream); + + // write number of reference sequences + int32_t numReferences = m_indexFileSummary.size(); + if ( m_isBigEndian ) SwapEndian_32(numReferences); + elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, m_indexStream); + + // return success/failure of write + return (elementsWritten == 5); +} + +bool BamStandardIndex::WriteLinearOffsets(const int& refId, BaiLinearOffsetVector& linearOffsets) { + + // make sure linear offsets are sorted before writing & saving summary + SortLinearOffsets(linearOffsets); size_t elementsWritten = 0; // write number of linear offsets - int32_t offsetCount = offsets.size(); + int32_t offsetCount = linearOffsets.size(); if ( m_isBigEndian ) SwapEndian_32(offsetCount); elementsWritten += fwrite(&offsetCount, sizeof(offsetCount), 1, m_indexStream); + // save summary for reference's linear offsets + SaveLinearOffsetsSummary(refId, linearOffsets.size()); + // iterate over linear offsets - LinearOffsetVector::const_iterator offsetIter = offsets.begin(); - LinearOffsetVector::const_iterator offsetEnd = offsets.end(); + BaiLinearOffsetVector::const_iterator offsetIter = linearOffsets.begin(); + BaiLinearOffsetVector::const_iterator offsetEnd = linearOffsets.end(); for ( ; offsetIter != offsetEnd; ++offsetIter ) { // write linear offset @@ -917,13 +919,12 @@ bool BamStandardIndex::WriteLinearOffsets(const LinearOffsetVector& offsets) { } // return success/failure of write - return ( elementsWritten == (size_t)(offsetCount + 1) ); + return ( elementsWritten == (size_t)(linearOffsets.size() + 1) ); } -// write index data for a single reference to new index file -bool BamStandardIndex::WriteReference(const ReferenceIndex& refEntry) { +bool BamStandardIndex::WriteReferenceEntry(BaiReferenceEntry& refEntry) { bool refOk = true; - refOk &= WriteBins(refEntry.Bins); - refOk &= WriteLinearOffsets(refEntry.Offsets); + refOk &= WriteBins(refEntry.ID, refEntry.Bins); + refOk &= WriteLinearOffsets(refEntry.ID, refEntry.LinearOffsets); return refOk; } diff --git a/src/api/internal/BamStandardIndex_p.h b/src/api/internal/BamStandardIndex_p.h index 767606e..7c61a62 100644 --- a/src/api/internal/BamStandardIndex_p.h +++ b/src/api/internal/BamStandardIndex_p.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 19 January 2011 (DB) +// Last modified: 5 April 2011 (DB) // --------------------------------------------------------------------------- // Provides index operations for the standardized BAM index format (".bai") // *************************************************************************** @@ -24,193 +24,211 @@ #include #include #include +#include #include #include namespace BamTools { - -class BamAlignment; - namespace Internal { -// BAM index constants -const int MAX_BIN = 37450; // =(8^6-1)/7+1 -const int BAM_LIDX_SHIFT = 14; -const std::string BAI_EXTENSION = ".bai"; +// ----------------------------------------------------------------------------- +// BamStandardIndex data structures -// -------------------------------------------------- -// BamStandardIndex data structures & typedefs -struct Chunk { +// defines start and end of a contiguous run of alignments +struct BaiAlignmentChunk { // data members uint64_t Start; uint64_t Stop; // constructor - Chunk(const uint64_t& start = 0, - const uint64_t& stop = 0) + BaiAlignmentChunk(const uint64_t& start = 0, + const uint64_t& stop = 0) : Start(start) , Stop(stop) { } }; +// comparison operator (for sorting) inline -bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) { +bool operator<(const BaiAlignmentChunk& lhs, const BaiAlignmentChunk& rhs) { return lhs.Start < rhs.Start; } -typedef std::vector ChunkVector; -typedef std::map BamBinMap; -typedef std::vector LinearOffsetVector; +// convenience typedef for a list of all alignment 'chunks' in a BAI bin +typedef std::vector BaiAlignmentChunkVector; + +// convenience typedef for a map of all BAI bins in a reference (ID => chunks) +typedef std::map BaiBinMap; + +// convenience typedef for a list of all 'linear offsets' in a reference +typedef std::vector BaiLinearOffsetVector; -struct ReferenceIndex { +// contains all fields necessary for building, loading, & writing +// full BAI index data for a single reference +struct BaiReferenceEntry { // data members - BamBinMap Bins; - LinearOffsetVector Offsets; - bool HasAlignments; + int32_t ID; + BaiBinMap Bins; + BaiLinearOffsetVector LinearOffsets; - // constructor - ReferenceIndex(const BamBinMap& binMap = BamBinMap(), - const LinearOffsetVector& offsets = LinearOffsetVector(), - const bool hasAlignments = false) - : Bins(binMap) - , Offsets(offsets) - , HasAlignments(hasAlignments) + // ctor + BaiReferenceEntry(const int32_t& id = -1) + : ID(id) { } }; -typedef std::map BamStandardIndexData; +// provides (persistent) summary of BaiReferenceEntry's index data +struct BaiReferenceSummary { + + // data members + int NumBins; + int NumLinearOffsets; + uint64_t FirstBinFilePosition; + uint64_t FirstLinearOffsetFilePosition; + + // ctor + BaiReferenceSummary(void) + : NumBins(0) + , NumLinearOffsets(0) + , FirstBinFilePosition(0) + , FirstLinearOffsetFilePosition(0) + { } +}; + +// convenience typedef for describing a full BAI index file summary +typedef std::vector BaiFileSummary; + +// end BamStandardIndex data structures +// ----------------------------------------------------------------------------- class BamStandardIndex : public BamIndex { // ctor & dtor public: - BamStandardIndex(void); + BamStandardIndex(Internal::BamReaderPrivate* reader); ~BamStandardIndex(void); - // interface (implements BamIndex virtual methods) + // BamIndex implementation public: - // creates index data (in-memory) from @reader data - bool Build(Internal::BamReaderPrivate* reader); - // returns supported file extension - const std::string Extension(void) { return BAI_EXTENSION; } + // builds index from associated BAM file & writes out to index file + bool Create(void); // returns whether reference has alignments or no bool HasAlignments(const int& referenceID) const; - // attempts to use index to jump to @region in @reader; returns success/fail + // attempts to use index data to jump to @region, returns success/fail // a "successful" jump indicates no error, but not whether this region has data // * thus, the method sets a flag to indicate whether there are alignments // available after the jump position - bool Jump(Internal::BamReaderPrivate* reader, - const BamTools::BamRegion& region, - bool* hasAlignmentsInRegion); - - public: - // clear all current index offset data in memory - void ClearAllData(void); - // return file position after header metadata - off_t DataBeginOffset(void) const; - // return true if all index data is cached - bool HasFullDataCache(void) const; - // clears index data from all references except the first - void KeepOnlyFirstReferenceOffsets(void); - // load index data for all references, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadAllReferences(bool saveData = true); - // load first reference from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadFirstReference(bool saveData = true); - // load header data from index file, return true if loaded OK - bool LoadHeader(void); - // position file pointer to first reference begin, return true if skipped OK - bool SkipToFirstReference(void); - // write index reference data - bool WriteAllReferences(void); - // write index header data - bool WriteHeader(void); - - // 'internal' methods + bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); + // loads existing data from file into memory + bool Load(const std::string& filename); + // change the index caching behavior + void SetCacheMode(const BamIndex::IndexCacheMode& mode); public: + // returns format's file extension + static const std::string Extension(void); - // ----------------------- - // index file operations - - // check index file magic number, return true if OK + // internal file ops + private: bool CheckMagicNumber(void); - // check index file version, return true if OK - bool CheckVersion(void); - // load a single index bin entry from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadBin(ReferenceIndex& refEntry, bool saveData = true); - bool LoadBins(ReferenceIndex& refEntry, bool saveData = true); - // load a single index bin entry from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadChunk(ChunkVector& chunks, bool saveData = true); - bool LoadChunks(ChunkVector& chunks, bool saveData = true); - // load a single index linear offset entry from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData = true); - // load a single reference from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadReference(const int& refId, bool saveData = true); - // loads number of references, return true if loaded OK - bool LoadReferenceCount(int& numReferences); - // position file pointer to desired reference begin, return true if skipped OK - bool SkipToReference(const int& refId); - // write index data for bin to new index file - bool WriteBin(const uint32_t& binId, const ChunkVector& chunks); - // write index data for bins to new index file - bool WriteBins(const BamBinMap& bins); - // write index data for chunk entry to new index file - bool WriteChunk(const Chunk& chunk); - // write index data for chunk entry to new index file - bool WriteChunks(const ChunkVector& chunks); - // write index data for linear offsets entry to new index file - bool WriteLinearOffsets(const LinearOffsetVector& offsets); - // write index data single reference to new index file - bool WriteReference(const ReferenceIndex& refEntry); - - // ----------------------- - // index data operations - - // calculate bins that overlap region - int BinsFromRegion(const BamRegion& region, - const RefVector& references, - const bool isRightBoundSpecified, - uint16_t bins[MAX_BIN]); - // clear all index offset data for desired reference - void ClearReferenceOffsets(const int& refId); - // calculates offset(s) for a given region - bool GetOffsets(const BamRegion& region, - const RefVector& references, - const bool isRightBoundSpecified, - std::vector& offsets, - bool* hasAlignmentsInRegion); - // returns true if index cache has data for desired reference - bool IsDataLoaded(const int& refId) const; - // clears index data from all references except the one specified - void KeepOnlyReferenceOffsets(const int& refId); - // simplifies index by merging 'chunks' - void MergeChunks(void); - // saves BAM bin entry for index - void SaveBinEntry(BamBinMap& binMap, - const uint32_t& saveBin, - const uint64_t& saveOffset, - const uint64_t& lastOffset); - // saves linear offset entry for index - void SaveLinearOffset(LinearOffsetVector& offsets, - const BamAlignment& bAlignment, - const uint64_t& lastOffset); - // initializes index data structure to hold @count references - void SetReferenceCount(const int& count); + void CloseFile(void); + bool IsFileOpen(void) const; + bool OpenFile(const std::string& filename, const char* mode); + bool Seek(const int64_t& position, const int& origin); + int64_t Tell(void) const; - // data members + // internal BAI index building methods + private: + void ClearReferenceEntry(BaiReferenceEntry& refEntry); + void SaveAlignmentChunkToBin(BaiBinMap& binMap, + const uint32_t& currentBin, + const uint64_t& currentOffset, + const uint64_t& lastOffset); + void SaveLinearOffsetEntry(BaiLinearOffsetVector& offsets, + const int& alignmentStartPosition, + const int& alignmentStopPosition, + const uint64_t& lastOffset); + + // internal random-access methods + private: + bool AdjustRegion(const BamRegion& region, uint32_t& begin, uint32_t& end); + void CalculateCandidateBins(const uint32_t& begin, + const uint32_t& end, + std::set& candidateBins); + bool CalculateCandidateOffsets(const BaiReferenceSummary& refSummary, + const uint64_t& minOffset, + std::set& candidateBins, + std::vector& offsets); + uint64_t CalculateMinOffset(const BaiReferenceSummary& refSummary, const uint32_t& begin); + bool GetOffsets(const BamRegion& region, std::vector& offsets); + uint64_t LookupLinearOffset(const BaiReferenceSummary& refSummary, const int& index); + + // internal BAI summary (create/load) methods + private: + void ReserveForSummary(const int& numReferences); + void SaveBinsSummary(const int& refId, const int& numBins); + void SaveLinearOffsetsSummary(const int& refId, const int& numLinearOffsets); + bool SkipBins(const int& numBins); + bool SkipLinearOffsets(const int& numLinearOffsets); + bool SummarizeBins(BaiReferenceSummary& refSummary); + bool SummarizeIndexFile(void); + bool SummarizeLinearOffsets(BaiReferenceSummary& refSummary); + bool SummarizeReference(BaiReferenceSummary& refSummary); + + // internal BAI full index input methods + private: + bool ReadBinID(uint32_t& binId); + bool ReadBinIntoBuffer(uint32_t& binId, int32_t& numAlignmentChunks); + bool ReadIntoBuffer(const unsigned int& bytesRequested); + bool ReadLinearOffset(uint64_t& linearOffset); + bool ReadNumAlignmentChunks(int& numAlignmentChunks); + bool ReadNumBins(int& numBins); + bool ReadNumLinearOffsets(int& numLinearOffsets); + bool ReadNumReferences(int& numReferences); + + // internal BAI full index output methods private: + void MergeAlignmentChunks(BaiAlignmentChunkVector& chunks); + void SortLinearOffsets(BaiLinearOffsetVector& linearOffsets); + bool WriteAlignmentChunk(const BaiAlignmentChunk& chunk); + bool WriteAlignmentChunks(BaiAlignmentChunkVector& chunks); + bool WriteBin(const uint32_t& binId, BaiAlignmentChunkVector& chunks); + bool WriteBins(const int& refId, BaiBinMap& bins); + bool WriteHeader(void); + bool WriteLinearOffsets(const int& refId, BaiLinearOffsetVector& linearOffsets); + bool WriteReferenceEntry(BaiReferenceEntry& refEntry); - BamStandardIndexData m_indexData; - off_t m_dataBeginOffset; - bool m_hasFullDataCache; + // data members + private: + FILE* m_indexStream; bool m_isBigEndian; + BamIndex::IndexCacheMode m_cacheMode; + BaiFileSummary m_indexFileSummary; + + // our input buffer + char* m_buffer; + unsigned int m_bufferLength; + + // static methods + private: + // checks if the buffer is large enough to accomodate the requested size + static void CheckBufferSize(char*& buffer, + unsigned int& bufferLength, + const unsigned int& requestedBytes); + // checks if the buffer is large enough to accomodate the requested size + static void CheckBufferSize(unsigned char*& buffer, + unsigned int& bufferLength, + const unsigned int& requestedBytes); + // static constants + private: + static const int MAX_BIN; + static const int BAM_LIDX_SHIFT; + static const std::string BAI_EXTENSION; + static const char* const BAI_MAGIC; + static const int SIZEOF_ALIGNMENTCHUNK; + static const int SIZEOF_BINCORE; + static const int SIZEOF_LINEAROFFSET; }; } // namespace Internal diff --git a/src/api/internal/BamToolsIndex_p.cpp b/src/api/internal/BamToolsIndex_p.cpp index 954400e..7e6e4ca 100644 --- a/src/api/internal/BamToolsIndex_p.cpp +++ b/src/api/internal/BamToolsIndex_p.cpp @@ -3,13 +3,12 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 19 January 2011 (DB) +// Last modified: 5 April 2011 (DB) // --------------------------------------------------------------------------- // Provides index operations for the BamTools index format (".bti") // *************************************************************************** #include -#include #include #include #include @@ -24,11 +23,18 @@ using namespace BamTools::Internal; #include using namespace std; -BamToolsIndex::BamToolsIndex(void) - : BamIndex() - , m_blockSize(1000) - , m_dataBeginOffset(0) - , m_hasFullDataCache(false) +// static BamToolsIndex constants +const int BamToolsIndex::DEFAULT_BLOCK_LENGTH = 1000; +const string BamToolsIndex::BTI_EXTENSION = ".bti"; +const char* const BamToolsIndex::BTI_MAGIC = "BTI\1"; +const int BamToolsIndex::SIZEOF_BLOCK = sizeof(int32_t)*2 + sizeof(int64_t); + +// ctor +BamToolsIndex::BamToolsIndex(Internal::BamReaderPrivate* reader) + : BamIndex(reader) + , m_indexStream(0) + , m_cacheMode(BamIndex::LimitedIndexCaching) + , m_blockSize(BamToolsIndex::DEFAULT_BLOCK_LENGTH) , m_inputVersion(0) , m_outputVersion(BTI_1_2) // latest version - used for writing new index files { @@ -37,33 +43,123 @@ BamToolsIndex::BamToolsIndex(void) // dtor BamToolsIndex::~BamToolsIndex(void) { - ClearAllData(); + CloseFile(); } -// creates index data (in-memory) from @reader data -bool BamToolsIndex::Build(Internal::BamReaderPrivate* reader) { +bool BamToolsIndex::CheckMagicNumber(void) { + + // check 'magic number' to see if file is BTI index + char magic[4]; + size_t elementsRead = fread(magic, sizeof(char), 4, m_indexStream); + if ( elementsRead != 4 ) { + cerr << "BamToolsIndex ERROR: could not read format 'magic' number" << endl; + return false; + } + + if ( strncmp(magic, BamToolsIndex::BTI_MAGIC, 4) != 0 ) { + cerr << "BamToolsIndex ERROR: invalid format" << endl; + return false; + } + + // otherwise ok + return true; +} + +// check index file version, return true if OK +bool BamToolsIndex::CheckVersion(void) { + + // read version from file + size_t elementsRead = fread(&m_inputVersion, sizeof(m_inputVersion), 1, m_indexStream); + if ( elementsRead != 1 ) return false; + if ( m_isBigEndian ) SwapEndian_32(m_inputVersion); + + // if version is negative, or zero + if ( m_inputVersion <= 0 ) { + cerr << "BamToolsIndex ERROR: could not load index file: invalid version." + << endl; + return false; + } + + // if version is newer than can be supported by this version of bamtools + else if ( m_inputVersion > m_outputVersion ) { + cerr << "BamToolsIndex ERROR: could not load index file. This version of BamTools does not recognize new index file version" + << endl + << "Please update BamTools to a more recent version to support this index file." + << endl; + return false; + } + + // ------------------------------------------------------------------ + // check for deprecated, unsupported versions + // (typically whose format did not accomodate a particular bug fix) - // skip if invalid reader - if ( reader == 0 ) + else if ( (Version)m_inputVersion == BamToolsIndex::BTI_1_0 ) { + cerr << "BamToolsIndex ERROR: could not load index file. This version of the index contains a bug related to accessing data near reference ends." + << endl << endl + << "Please run 'bamtools index -bti -in yourData.bam' to generate an up-to-date, fixed BTI file." + << endl << endl; return false; + } - // skip if reader's BgzfStream is invalid or not open - BgzfStream* bgzfStream = reader->Stream(); - if ( bgzfStream == 0 || !bgzfStream->IsOpen ) + else if ( (Version)m_inputVersion == BamToolsIndex::BTI_1_1 ) { + cerr << "BamToolsIndex ERROR: could not load index file. This version of the index contains a bug related to handling empty references." + << endl << endl + << "Please run 'bamtools index -bti -in yourData.bam' to generate an up-to-date, fixed BTI file." + << endl << endl; return false; + } + + // otherwise ok + else return true; +} + +void BamToolsIndex::ClearReferenceEntry(BtiReferenceEntry& refEntry) { + refEntry.ID = -1; + refEntry.Blocks.clear(); +} + +void BamToolsIndex::CloseFile(void) { + if ( IsFileOpen() ) + fclose(m_indexStream); + m_indexFileSummary.clear(); +} + +// builds index from associated BAM file & writes out to index file +bool BamToolsIndex::Create(void) { + + // return false if BamReader is invalid or not open + if ( m_reader == 0 || !m_reader->IsOpen() ) { + cerr << "BamToolsIndex ERROR: BamReader is not open" + << ", aborting index creation" << endl; + return false; + } + + // rewind BamReader + if ( !m_reader->Rewind() ) { + cerr << "BamToolsIndex ERROR: could not rewind BamReader to create index" + << ", aborting index creation" << endl; + return false; + } + + // open new index file (read & write) + string indexFilename = m_reader->Filename() + Extension(); + if ( !OpenFile(indexFilename, "w+b") ) { + cerr << "BamStandardIndex ERROR: could not open ouput index file " << indexFilename + << ", aborting index creation" << endl; + return false; + } - // move reader's file pointer to beginning of alignments - if ( !reader->Rewind() ) return false; + // initialize BtiFileSummary with number of references + const int& numReferences = m_reader->GetReferenceCount(); + InitializeFileSummary(numReferences); - // initialize index data structure with space for all references - const int numReferences = reader->GetReferenceCount(); - m_indexData.clear(); - m_hasFullDataCache = false; - SetReferenceCount(numReferences); + // initialize output file + bool createdOk = true; + createdOk &= WriteHeader(); - // set up counters and markers + // index building markers int32_t currentBlockCount = 0; - int64_t currentAlignmentOffset = bgzfStream->Tell(); + int64_t currentAlignmentOffset = m_reader->Tell(); int32_t blockRefId = 0; int32_t blockMaxEndPosition = 0; int64_t blockStartOffset = currentAlignmentOffset; @@ -71,16 +167,21 @@ bool BamToolsIndex::Build(Internal::BamReaderPrivate* reader) { // plow through alignments, storing index entries BamAlignment al; - while ( reader->LoadNextAlignment(al) ) { + BtiReferenceEntry refEntry; + while ( m_reader->LoadNextAlignment(al) ) { // if block contains data (not the first time through) AND alignment is on a new reference if ( currentBlockCount > 0 && al.RefID != blockRefId ) { - // store previous data - BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition); - SaveOffsetEntry(blockRefId, entry); + // store previous BTI block data in reference entry + BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition); + refEntry.Blocks.push_back(block); - // intialize new block for current alignment's reference + // write reference entry, then clear + createdOk &= WriteReferenceEntry(refEntry); + ClearReferenceEntry(refEntry); + + // update markers currentBlockCount = 0; blockMaxEndPosition = al.GetEndPosition(); blockStartOffset = currentAlignmentOffset; @@ -102,420 +203,329 @@ bool BamToolsIndex::Build(Internal::BamReaderPrivate* reader) { // if block is full, get offset for next block, reset currentBlockCount if ( currentBlockCount == m_blockSize ) { - BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition); - SaveOffsetEntry(blockRefId, entry); - blockStartOffset = bgzfStream->Tell(); + + // store previous block data in reference entry + BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition); + refEntry.Blocks.push_back(block); + + // update markers + blockStartOffset = m_reader->Tell(); currentBlockCount = 0; } // not the best name, but for the next iteration, this value will be the offset of the *current* alignment // necessary because we won't know if this next alignment is on a new reference until we actually read it - currentAlignmentOffset = bgzfStream->Tell(); - } - - // store final block with data - BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition); - SaveOffsetEntry(blockRefId, entry); - - // set flag - m_hasFullDataCache = true; - - // return success/failure of rewind - return reader->Rewind(); -} - -// check index file magic number, return true if OK -bool BamToolsIndex::CheckMagicNumber(void) { - - // see if index is valid BAM index - char magic[4]; - size_t elementsRead = fread(magic, 1, 4, m_indexStream); - if ( elementsRead != 4 ) return false; - if ( strncmp(magic, "BTI\1", 4) != 0 ) { - fprintf(stderr, "BamToolsIndex ERROR: could not load index file - invalid magic number.\n"); - return false; - } - - // otherwise ok - return true; -} - -// check index file version, return true if OK -bool BamToolsIndex::CheckVersion(void) { - - // read version from file - size_t elementsRead = fread(&m_inputVersion, sizeof(m_inputVersion), 1, m_indexStream); - if ( elementsRead != 1 ) return false; - if ( m_isBigEndian ) SwapEndian_32(m_inputVersion); - - // if version is negative, or zero - if ( m_inputVersion <= 0 ) { - fprintf(stderr, "BamToolsIndex ERROR: could not load index file - invalid version.\n"); - return false; + currentAlignmentOffset = m_reader->Tell(); } - // if version is newer than can be supported by this version of bamtools - else if ( m_inputVersion > m_outputVersion ) { - fprintf(stderr, "BamToolsIndex ERROR: could not load index file - this version of BamTools does not recognize new index file version.\n"); - fprintf(stderr, "Please update BamTools to a more recent version to support this index file.\n"); - return false; - } + // store last BTI block data in reference entry + BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition); + refEntry.Blocks.push_back(block); - // ------------------------------------------------------------------ - // check for deprecated, unsupported versions - // (typically whose format did not accomodate a particular bug fix) + // write last reference entry, then clear + createdOk &= WriteReferenceEntry(refEntry); + ClearReferenceEntry(refEntry); - else if ( (Version)m_inputVersion == BTI_1_0 ) { - fprintf(stderr, "BamToolsIndex ERROR: could not load index file - this version of the index contains a bug related to accessing data near reference ends.\n"); - fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date, fixed BTI file.\n\n"); - return false; - } + // rewind reader & return result + createdOk &= m_reader->Rewind(); - else if ( (Version)m_inputVersion == BTI_1_1 ) { - fprintf(stderr, "BamToolsIndex ERROR: could not load index file - this version of the index contains a bug related to handling empty references.\n"); - fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date, fixed BTI file.\n\n"); - return false; - } - - // otherwise ok - else return true; + // return result + return createdOk; } -// clear all current index offset data in memory -void BamToolsIndex::ClearAllData(void) { - BamToolsIndexData::const_iterator indexIter = m_indexData.begin(); - BamToolsIndexData::const_iterator indexEnd = m_indexData.end(); - for ( ; indexIter != indexEnd; ++indexIter ) { - const int& refId = (*indexIter).first; - ClearReferenceOffsets(refId); - } +// returns format's file extension +const std::string BamToolsIndex::Extension(void) { + return BamToolsIndex::BTI_EXTENSION; } -// clear all index offset data for desired reference -void BamToolsIndex::ClearReferenceOffsets(const int& refId) { - if ( m_indexData.find(refId) == m_indexData.end() ) return; - vector& offsets = m_indexData[refId].Offsets; - offsets.clear(); - m_hasFullDataCache = false; -} - -// return file position after header metadata -off_t BamToolsIndex::DataBeginOffset(void) const { - return m_dataBeginOffset; -} - -// calculate BAM file offset for desired region -// return true if no error (*NOT* equivalent to "has alignments or valid offset") -// check @hasAlignmentsInRegion to determine this status -// @region - target region -// @offset - resulting seek target -// @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status -// N.B. - ignores isRightBoundSpecified bool BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) { - // return false if leftBound refID is not found in index data - BamToolsIndexData::const_iterator indexIter = m_indexData.find(region.LeftRefID); - if ( indexIter == m_indexData.end() ) - return false; - - // load index data for region if not already cached - if ( !IsDataLoaded(region.LeftRefID) ) { - - bool loadedOk = true; - loadedOk &= SkipToReference(region.LeftRefID); - loadedOk &= LoadReference(region.LeftRefID); - if ( !loadedOk ) return false; - } - - // localize index data for this reference (& sanity check that data actually exists) - indexIter = m_indexData.find(region.LeftRefID); - if ( indexIter == m_indexData.end() ) + // return false ref ID is not a valid index in file summary data + if ( region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size() ) return false; - const vector& referenceOffsets = (*indexIter).second.Offsets; - if ( referenceOffsets.empty() ) + // retrieve reference index data for left bound reference + BtiReferenceEntry refEntry(region.LeftRefID); + if ( !ReadReferenceEntry(refEntry) ) { + cerr << "BamToolsIndex ERROR: could not retrieve index data from BTI file" << endl; return false; + } - // ------------------------------------------------------- - // calculate nearest index to jump to - - // save first offset - offset = (*referenceOffsets.begin()).StartOffset; + // TODO: can this next loop be sped up? + // Binary search for overlap instead of linear search perhaps. - // iterate over offsets entries on this reference - vector::const_iterator offsetIter = referenceOffsets.begin(); - vector::const_iterator offsetEnd = referenceOffsets.end(); - for ( ; offsetIter != offsetEnd; ++offsetIter ) { - const BamToolsIndexEntry& entry = (*offsetIter); + // iterate over blocks on this reference + BtiBlockVector::const_iterator blockIter = refEntry.Blocks.begin(); + BtiBlockVector::const_iterator blockEnd = refEntry.Blocks.end(); + for ( ; blockIter != blockEnd; ++blockIter ) { + const BtiBlock& block = (*blockIter); - // break if alignment 'entry' overlaps region - if ( entry.MaxEndPosition >= region.LeftPosition ) break; - offset = (*offsetIter).StartOffset; + // store offset & break if block end overlaps region start + if ( block.MaxEndPosition >= region.LeftPosition ) { + offset = block.StartOffset; + break; + } } - // set flag based on whether an index entry was found for this region - *hasAlignmentsInRegion = ( offsetIter != offsetEnd ); - - // if cache mode set to none, dump the data we just loaded - if ( m_cacheMode == BamIndex::NoIndexCaching ) - ClearReferenceOffsets(region.LeftRefID); + // sets to false if blocks container is empty, or if no matching block could be found + *hasAlignmentsInRegion = ( blockIter != blockEnd ); // return success return true; } // returns whether reference has alignments or no -bool BamToolsIndex::HasAlignments(const int& refId) const { - BamToolsIndexData::const_iterator indexIter = m_indexData.find(refId); - if ( indexIter == m_indexData.end()) return false; - const BamToolsReferenceEntry& refEntry = (*indexIter).second; - return refEntry.HasAlignments; +bool BamToolsIndex::HasAlignments(const int& referenceID) const { + if ( referenceID < 0 || referenceID >= (int)m_indexFileSummary.size() ) + return false; + const BtiReferenceSummary& refSummary = m_indexFileSummary.at(referenceID); + return ( refSummary.NumBlocks > 0 ); } -// return true if all index data is cached -bool BamToolsIndex::HasFullDataCache(void) const { - return m_hasFullDataCache; +void BamToolsIndex::InitializeFileSummary(const int& numReferences) { + m_indexFileSummary.clear(); + for ( int i = 0; i < numReferences; ++i ) + m_indexFileSummary.push_back( BtiReferenceSummary() ); } -// returns true if index cache has data for desired reference -bool BamToolsIndex::IsDataLoaded(const int& refId) const { - - BamToolsIndexData::const_iterator indexIter = m_indexData.find(refId); - if ( indexIter == m_indexData.end()) return false; - const BamToolsReferenceEntry& refEntry = (*indexIter).second; - - if ( !refEntry.HasAlignments ) return true; // no data period - - // return whether offsets list contains data - return !refEntry.Offsets.empty(); +bool BamToolsIndex::IsFileOpen(void) const { + return ( m_indexStream != 0 ); } -// attempts to use index to jump to @region in @reader; returns success/fail -bool BamToolsIndex::Jump(Internal::BamReaderPrivate* reader, - const BamTools::BamRegion& region, - bool* hasAlignmentsInRegion) -{ +// attempts to use index data to jump to @region, returns success/fail +// a "successful" jump indicates no error, but not whether this region has data +// * thus, the method sets a flag to indicate whether there are alignments +// available after the jump position +bool BamToolsIndex::Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion) { + // clear flag *hasAlignmentsInRegion = false; - // skip if invalid reader - if ( reader == 0 ) return false; - - // skip if reader's BgzfStream is invalid or not open - BgzfStream* bgzfStream = reader->Stream(); - if ( bgzfStream == 0 || !bgzfStream->IsOpen ) + // skip if invalid reader or not open + if ( m_reader == 0 || !m_reader->IsOpen() ) return false; // make sure left-bound position is valid - const RefVector& references = reader->GetReferenceData(); + const RefVector& references = m_reader->GetReferenceData(); if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) return false; // calculate nearest offset to jump to int64_t offset; if ( !GetOffset(region, offset, hasAlignmentsInRegion) ) { - fprintf(stderr, "BamToolsIndex ERROR: could not jump - unable to calculate offset for specified region.\n"); + cerr << "BamToolsIndex ERROR: could not jump" + << ", unable to calculate offset for specified region" << endl; return false; } // return success/failure of seek - return bgzfStream->Seek(offset); + return m_reader->Seek(offset); } -// clears index data from all references except the first -void BamToolsIndex::KeepOnlyFirstReferenceOffsets(void) { - BamToolsIndexData::const_iterator indexBegin = m_indexData.begin(); - KeepOnlyReferenceOffsets( (*indexBegin).first ); -} +// loads existing data from file into memory +bool BamToolsIndex::Load(const std::string& filename) { -// clears index data from all references except the one specified -void BamToolsIndex::KeepOnlyReferenceOffsets(const int& refId) { - BamToolsIndexData::iterator mapIter = m_indexData.begin(); - BamToolsIndexData::iterator mapEnd = m_indexData.end(); - for ( ; mapIter != mapEnd; ++mapIter ) { - const int entryRefId = (*mapIter).first; - if ( entryRefId != refId ) - ClearReferenceOffsets(entryRefId); + // attempt open index file (read-only) + if ( !OpenFile(filename, "rb") ) { + cerr << "BamStandardIndex ERROR: could not open input index file " << filename + << ", aborting index load" << endl; + return false; } + + // attempt to load & validate BTI header data + if ( !LoadHeader() ) { + CloseFile(); + return false; + } + + // attempt to load index file summary, return success/failure + return LoadFileSummary(); } -// load index data for all references, return true if loaded OK -bool BamToolsIndex::LoadAllReferences(bool saveData) { +bool BamToolsIndex::LoadFileSummary(void) { - // skip if data already loaded - if ( m_hasFullDataCache ) return true; + // load number of reference sequences + int numReferences; + if ( !LoadNumReferences(numReferences) ) + return false; - // read in number of references - int32_t numReferences; - if ( !LoadReferenceCount(numReferences) ) return false; - //SetReferenceCount(numReferences); + // initialize file summary data + InitializeFileSummary(numReferences); // iterate over reference entries bool loadedOk = true; - for ( int i = 0; i < numReferences; ++i ) - loadedOk &= LoadReference(i, saveData); - - // set flag - if ( loadedOk && saveData ) - m_hasFullDataCache = true; + BtiFileSummary::iterator summaryIter = m_indexFileSummary.begin(); + BtiFileSummary::iterator summaryEnd = m_indexFileSummary.end(); + for ( ; summaryIter != summaryEnd; ++summaryIter ) + loadedOk &= LoadReferenceSummary(*summaryIter); - // return success/failure of load + // return result return loadedOk; } -// load header data from index file, return true if loaded OK bool BamToolsIndex::LoadHeader(void) { - // check magic number - if ( !CheckMagicNumber() ) return false; + // if invalid format 'magic number' + if ( !CheckMagicNumber() ) + return false; - // check BTI version - if ( !CheckVersion() ) return false; + // if invalid BTI version + if ( !CheckVersion() ) + return false; - // read in block size + // use file's BTI block size to set member variable size_t elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, m_indexStream); - if ( elementsRead != 1 ) return false; - - // swap endian-ness if necessary if ( m_isBigEndian ) SwapEndian_32(m_blockSize); - - // store offset of beginning of data - m_dataBeginOffset = ftell64(m_indexStream); - - // return success - return true; + return ( elementsRead == 1 ); } -// load a single index entry from file, return true if loaded OK -// @saveData - save data in memory if true, just read & discard if false -bool BamToolsIndex::LoadIndexEntry(const int& refId, bool saveData) { +bool BamToolsIndex::LoadNumBlocks(int& numBlocks) { + size_t elementsRead = 0; + elementsRead += fread(&numBlocks, sizeof(numBlocks), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numBlocks); + return ( elementsRead == 1 ); +} - // read in index entry data members +bool BamToolsIndex::LoadNumReferences(int& numReferences) { size_t elementsRead = 0; - BamToolsIndexEntry entry; - elementsRead += fread(&entry.MaxEndPosition, sizeof(entry.MaxEndPosition), 1, m_indexStream); - elementsRead += fread(&entry.StartOffset, sizeof(entry.StartOffset), 1, m_indexStream); - elementsRead += fread(&entry.StartPosition, sizeof(entry.StartPosition), 1, m_indexStream); - if ( elementsRead != 3 ) return false; + elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numReferences); + return ( elementsRead == 1 ); +} - // swap endian-ness if necessary - if ( m_isBigEndian ) { - SwapEndian_32(entry.MaxEndPosition); - SwapEndian_64(entry.StartOffset); - SwapEndian_32(entry.StartPosition); - } +bool BamToolsIndex::LoadReferenceSummary(BtiReferenceSummary& refSummary) { - // save data - if ( saveData ) - SaveOffsetEntry(refId, entry); + // load number of blocks + int numBlocks; + if ( !LoadNumBlocks(numBlocks) ) + return false; - // return success/failure of load - return true; + // store block summary data for this reference + refSummary.NumBlocks = numBlocks; + refSummary.FirstBlockFilePosition = Tell(); + + // skip blocks in index file (and return status) + return SkipBlocks(numBlocks); } -// load a single reference from file, return true if loaded OK -// @saveData - save data in memory if true, just read & discard if false -bool BamToolsIndex::LoadFirstReference(bool saveData) { - BamToolsIndexData::const_iterator indexBegin = m_indexData.begin(); - return LoadReference( (*indexBegin).first, saveData ); +bool BamToolsIndex::OpenFile(const std::string& filename, const char* mode) { + + // make sure any previous index file is closed + CloseFile(); + + // attempt to open file + m_indexStream = fopen(filename.c_str(), mode); + return IsFileOpen(); } -// load a single reference from file, return true if loaded OK -// @saveData - save data in memory if true, just read & discard if false -bool BamToolsIndex::LoadReference(const int& refId, bool saveData) { +bool BamToolsIndex::ReadBlock(BtiBlock& block) { - // read in number of offsets for this reference - uint32_t numOffsets; - size_t elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, m_indexStream); - if ( elementsRead != 1 ) return false; + // read in block data members + size_t elementsRead = 0; + elementsRead += fread(&block.MaxEndPosition, sizeof(block.MaxEndPosition), 1, m_indexStream); + elementsRead += fread(&block.StartOffset, sizeof(block.StartOffset), 1, m_indexStream); + elementsRead += fread(&block.StartPosition, sizeof(block.StartPosition), 1, m_indexStream); // swap endian-ness if necessary - if ( m_isBigEndian ) SwapEndian_32(numOffsets); + if ( m_isBigEndian ) { + SwapEndian_32(block.MaxEndPosition); + SwapEndian_64(block.StartOffset); + SwapEndian_32(block.StartPosition); + } - // initialize offsets container for this reference - SetOffsetCount(refId, (int)numOffsets); + // return success/failure + return ( elementsRead == 3 ); +} - // iterate over offset entries - for ( unsigned int j = 0; j < numOffsets; ++j ) - LoadIndexEntry(refId, saveData); +bool BamToolsIndex::ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks) { - // return success/failure of load - return true; -} + // prep blocks container + blocks.clear(); + blocks.reserve(refSummary.NumBlocks); -// loads number of references, return true if loaded OK -bool BamToolsIndex::LoadReferenceCount(int& numReferences) { + // skip to first block entry + if ( !Seek( refSummary.FirstBlockFilePosition, SEEK_SET ) ) { + cerr << "BamToolsIndex ERROR: could not seek to position " + << refSummary.FirstBlockFilePosition << endl; + return false; + } - size_t elementsRead = 0; + // read & store block entries + bool readOk = true; + BtiBlock block; + for ( int i = 0; i < refSummary.NumBlocks; ++i ) { + readOk &= ReadBlock(block); + blocks.push_back(block); + } + return readOk; +} - // read reference count - elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_indexStream); - if ( elementsRead != 1 ) return false; +bool BamToolsIndex::ReadReferenceEntry(BtiReferenceEntry& refEntry) { - // swap endian-ness if necessary - if ( m_isBigEndian ) SwapEndian_32(numReferences); + // return false if refId not valid index in file summary structure + if ( refEntry.ID < 0 || refEntry.ID >= (int)m_indexFileSummary.size() ) + return false; - // return success - return true; + // use index summary to assist reading the reference's BTI blocks + const BtiReferenceSummary& refSummary = m_indexFileSummary.at(refEntry.ID); + return ReadBlocks(refSummary, refEntry.Blocks); } -// saves an index offset entry in memory -void BamToolsIndex::SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry) { - BamToolsReferenceEntry& refEntry = m_indexData[refId]; - refEntry.HasAlignments = true; - refEntry.Offsets.push_back(entry); +bool BamToolsIndex::Seek(const int64_t& position, const int& origin) { + return ( fseek64(m_indexStream, position, origin) == 0 ); } -// pre-allocates size for offset vector -void BamToolsIndex::SetOffsetCount(const int& refId, const int& offsetCount) { - BamToolsReferenceEntry& refEntry = m_indexData[refId]; - refEntry.Offsets.reserve(offsetCount); - refEntry.HasAlignments = ( offsetCount > 0); +// change the index caching behavior +void BamToolsIndex::SetCacheMode(const BamIndex::IndexCacheMode& mode) { + m_cacheMode = mode; + // do nothing else here ? cache mode will be ignored from now on, most likely } -// initializes index data structure to hold @count references -void BamToolsIndex::SetReferenceCount(const int& count) { - for ( int i = 0; i < count; ++i ) - m_indexData[i].HasAlignments = false; +bool BamToolsIndex::SkipBlocks(const int& numBlocks) { + return Seek( numBlocks*BamToolsIndex::SIZEOF_BLOCK, SEEK_CUR ); } -// position file pointer to first reference begin, return true if skipped OK -bool BamToolsIndex::SkipToFirstReference(void) { - BamToolsIndexData::const_iterator indexBegin = m_indexData.begin(); - return SkipToReference( (*indexBegin).first ); +int64_t BamToolsIndex::Tell(void) const { + return ftell64(m_indexStream); } -// position file pointer to desired reference begin, return true if skipped OK -bool BamToolsIndex::SkipToReference(const int& refId) { - - // attempt rewind - if ( !Rewind() ) return false; +bool BamToolsIndex::WriteBlock(const BtiBlock& block) { - // read in number of references - int32_t numReferences; - size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_indexStream); - if ( elementsRead != 1 ) return false; - if ( m_isBigEndian ) SwapEndian_32(numReferences); + // copy entry data + int32_t maxEndPosition = block.MaxEndPosition; + int64_t startOffset = block.StartOffset; + int32_t startPosition = block.StartPosition; - // iterate over reference entries - bool skippedOk = true; - int currentRefId = 0; - while (currentRefId != refId) { - skippedOk &= LoadReference(currentRefId, false); - ++currentRefId; + // swap endian-ness if necessary + if ( m_isBigEndian ) { + SwapEndian_32(maxEndPosition); + SwapEndian_64(startOffset); + SwapEndian_32(startPosition); } - // return success/failure of skip - return skippedOk; + // write the reference index entry + size_t elementsWritten = 0; + elementsWritten += fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, m_indexStream); + elementsWritten += fwrite(&startOffset, sizeof(startOffset), 1, m_indexStream); + elementsWritten += fwrite(&startPosition, sizeof(startPosition), 1, m_indexStream); + return ( elementsWritten == 3 ); +} + +bool BamToolsIndex::WriteBlocks(const BtiBlockVector& blocks) { + bool writtenOk = true; + BtiBlockVector::const_iterator blockIter = blocks.begin(); + BtiBlockVector::const_iterator blockEnd = blocks.end(); + for ( ; blockIter != blockEnd; ++blockIter ) + writtenOk &= WriteBlock(*blockIter); + return writtenOk; } -// write header to new index file bool BamToolsIndex::WriteHeader(void) { size_t elementsWritten = 0; // write BTI index format 'magic number' - elementsWritten += fwrite("BTI\1", 1, 4, m_indexStream); + elementsWritten += fwrite(BamToolsIndex::BTI_MAGIC, 1, 4, m_indexStream); // write BTI index format version int32_t currentVersion = (int32_t)m_outputVersion; @@ -527,76 +537,28 @@ bool BamToolsIndex::WriteHeader(void) { if ( m_isBigEndian ) SwapEndian_32(blockSize); elementsWritten += fwrite(&blockSize, sizeof(blockSize), 1, m_indexStream); - // store offset of beginning of data - m_dataBeginOffset = ftell64(m_indexStream); - - // return success/failure of write - return ( elementsWritten == 6 ); -} - -// write index data for all references to new index file -bool BamToolsIndex::WriteAllReferences(void) { - - size_t elementsWritten = 0; - // write number of references - int32_t numReferences = (int32_t)m_indexData.size(); + int32_t numReferences = m_indexFileSummary.size(); + cerr << "Writing numReferences = " << numReferences << endl; if ( m_isBigEndian ) SwapEndian_32(numReferences); elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, m_indexStream); - if ( elementsWritten != 1 ) return false; - - // iterate through references in index - bool refOk = true; - BamToolsIndexData::const_iterator refIter = m_indexData.begin(); - BamToolsIndexData::const_iterator refEnd = m_indexData.end(); - for ( ; refIter != refEnd; ++refIter ) - refOk &= WriteReferenceEntry( (*refIter).second ); - // return success/fail - return refOk; + // return success/failure of write + return ( elementsWritten == 7 ); } -// write current reference index data to new index file -bool BamToolsIndex::WriteReferenceEntry(const BamToolsReferenceEntry& refEntry) { +bool BamToolsIndex::WriteReferenceEntry(const BtiReferenceEntry& refEntry) { size_t elementsWritten = 0; - // write number of offsets listed for this reference - uint32_t numOffsets = refEntry.Offsets.size(); - if ( m_isBigEndian ) SwapEndian_32(numOffsets); - elementsWritten += fwrite(&numOffsets, sizeof(numOffsets), 1, m_indexStream); - if ( elementsWritten != 1 ) return false; + // write number of blocks this reference + uint32_t numBlocks = refEntry.Blocks.size(); + if ( m_isBigEndian ) SwapEndian_32(numBlocks); + elementsWritten += fwrite(&numBlocks, sizeof(numBlocks), 1, m_indexStream); - // iterate over offset entries - bool entriesOk = true; - vector::const_iterator offsetIter = refEntry.Offsets.begin(); - vector::const_iterator offsetEnd = refEntry.Offsets.end(); - for ( ; offsetIter != offsetEnd; ++offsetIter ) - entriesOk &= WriteIndexEntry( (*offsetIter) ); + // write actual block entries + const bool blocksOk = WriteBlocks(refEntry.Blocks); // return success/fail - return entriesOk; -} - -// write current index offset entry to new index file -bool BamToolsIndex::WriteIndexEntry(const BamToolsIndexEntry& entry) { - - // copy entry data - int32_t maxEndPosition = entry.MaxEndPosition; - int64_t startOffset = entry.StartOffset; - int32_t startPosition = entry.StartPosition; - - // swap endian-ness if necessary - if ( m_isBigEndian ) { - SwapEndian_32(maxEndPosition); - SwapEndian_64(startOffset); - SwapEndian_32(startPosition); - } - - // write the reference index entry - size_t elementsWritten = 0; - elementsWritten += fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, m_indexStream); - elementsWritten += fwrite(&startOffset, sizeof(startOffset), 1, m_indexStream); - elementsWritten += fwrite(&startPosition, sizeof(startPosition), 1, m_indexStream); - return ( elementsWritten == 3 ); + return ( elementsWritten == 1) && blocksOk; } diff --git a/src/api/internal/BamToolsIndex_p.h b/src/api/internal/BamToolsIndex_p.h index ee5abbc..16aef8c 100644 --- a/src/api/internal/BamToolsIndex_p.h +++ b/src/api/internal/BamToolsIndex_p.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 19 January 2011 (DB) +// Last modified: 5 April 2011 (DB) // --------------------------------------------------------------------------- // Provides index operations for the BamTools index format (".bti") // *************************************************************************** @@ -30,11 +30,8 @@ namespace BamTools { namespace Internal { -// BTI constants -const std::string BTI_EXTENSION = ".bti"; - -// individual index offset entry -struct BamToolsIndexEntry { +// contains data for each 'block' in a BTI index +struct BtiBlock { // data members int32_t MaxEndPosition; @@ -42,30 +39,48 @@ struct BamToolsIndexEntry { int32_t StartPosition; // ctor - BamToolsIndexEntry(const int32_t& maxEndPosition = 0, - const int64_t& startOffset = 0, - const int32_t& startPosition = 0) + BtiBlock(const int32_t& maxEndPosition = 0, + const int64_t& startOffset = 0, + const int32_t& startPosition = 0) : MaxEndPosition(maxEndPosition) , StartOffset(startOffset) , StartPosition(startPosition) { } }; -// reference index entry -struct BamToolsReferenceEntry { +// convenience typedef for describing a a list of BTI blocks on a reference +typedef std::vector BtiBlockVector; + +// contains all fields necessary for building, loading, & writing +// full BTI index data for a single reference +struct BtiReferenceEntry { // data members - bool HasAlignments; - std::vector Offsets; + int32_t ID; + BtiBlockVector Blocks; // ctor - BamToolsReferenceEntry(void) - : HasAlignments(false) + BtiReferenceEntry(const int& id = -1) + : ID(id) { } }; -// the actual index data structure -typedef std::map BamToolsIndexData; +// provides (persistent) summary of BtiReferenceEntry's index data +struct BtiReferenceSummary { + + // data members + int NumBlocks; + uint64_t FirstBlockFilePosition; + + // ctor + BtiReferenceSummary(void) + : NumBlocks(0) + , FirstBlockFilePosition(0) + { } +}; + +// convenience typedef for describing a full BTI index file summary +typedef std::vector BtiFileSummary; class BamToolsIndex : public BamIndex { @@ -84,109 +99,87 @@ class BamToolsIndex : public BamIndex { , BTI_1_2 }; - // ctor & dtor public: - BamToolsIndex(void); + BamToolsIndex(Internal::BamReaderPrivate* reader); ~BamToolsIndex(void); - // interface (implements BamIndex virtual methods) + // BamIndex implementation public: - // creates index data (in-memory) from @reader data - bool Build(Internal::BamReaderPrivate* reader); - // returns supported file extension - const std::string Extension(void) { return BTI_EXTENSION; } + // builds index from associated BAM file & writes out to index file + bool Create(void); // returns whether reference has alignments or no bool HasAlignments(const int& referenceID) const; - // attempts to use index to jump to @region in @reader; returns success/fail + // attempts to use index data to jump to @region, returns success/fail // a "successful" jump indicates no error, but not whether this region has data // * thus, the method sets a flag to indicate whether there are alignments // available after the jump position - bool Jump(Internal::BamReaderPrivate* reader, - const BamTools::BamRegion& region, - bool *hasAlignmentsInRegion); - - public: - // clear all current index offset data in memory - void ClearAllData(void); - // return file position after header metadata - off_t DataBeginOffset(void) const; - // return true if all index data is cached - bool HasFullDataCache(void) const; - // clears index data from all references except the first - void KeepOnlyFirstReferenceOffsets(void); - // load index data for all references, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadAllReferences(bool saveData = true); - // load first reference from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadFirstReference(bool saveData = true); - // load header data from index file, return true if loaded OK - bool LoadHeader(void); - // position file pointer to first reference begin, return true if skipped OK - bool SkipToFirstReference(void); - // write index reference data - bool WriteAllReferences(void); - // write index header data - bool WriteHeader(void); - - // internal methods + bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); + // loads existing data from file into memory + bool Load(const std::string& filename); + // change the index caching behavior + void SetCacheMode(const BamIndex::IndexCacheMode& mode); public: + // returns format's file extension + static const std::string Extension(void); - // ----------------------- - // index file operations - - // check index file magic number, return true if OK + // internal file ops + private: bool CheckMagicNumber(void); - // check index file version, return true if OK bool CheckVersion(void); - // load a single index entry from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadIndexEntry(const int& refId, bool saveData = true); - // load a single reference from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadReference(const int& refId, bool saveData = true); - // loads number of references, return true if loaded OK - bool LoadReferenceCount(int& numReferences); - // position file pointer to desired reference begin, return true if skipped OK - bool SkipToReference(const int& refId); - // write current reference index data to new index file - bool WriteReferenceEntry(const BamToolsReferenceEntry& refEntry); - // write current index offset entry to new index file - bool WriteIndexEntry(const BamToolsIndexEntry& entry); - - // ----------------------- - // index data operations - - // clear all index offset data for desired reference - void ClearReferenceOffsets(const int& refId); - // calculate BAM file offset for desired region - // return true if no error (*NOT* equivalent to "has alignments or valid offset") - // check @hasAlignmentsInRegion to determine this status - // @region - target region - // @offset - resulting seek target - // @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status + void CloseFile(void); + bool IsFileOpen(void) const; + bool OpenFile(const std::string& filename, const char* mode); + bool Seek(const int64_t& position, const int& origin); + int64_t Tell(void) const; + + // internal BTI index building methods + private: + void ClearReferenceEntry(BtiReferenceEntry& refEntry); + + // internal random-access methods + private: bool GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion); - // returns true if index cache has data for desired reference - bool IsDataLoaded(const int& refId) const; - // clears index data from all references except the one specified - void KeepOnlyReferenceOffsets(const int& refId); - // saves an index offset entry in memory - void SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry); - // pre-allocates size for offset vector - void SetOffsetCount(const int& refId, const int& offsetCount); - // initializes index data structure to hold @count references - void SetReferenceCount(const int& count); + + // internal BTI summary data methods + private: + void InitializeFileSummary(const int& numReferences); + bool LoadFileSummary(void); + bool LoadHeader(void); + bool LoadNumBlocks(int& numBlocks); + bool LoadNumReferences(int& numReferences); + bool LoadReferenceSummary(BtiReferenceSummary& refSummary); + bool SkipBlocks(const int& numBlocks); + + // internal BTI full index input methods + private: + bool ReadBlock(BtiBlock& block); + bool ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks); + bool ReadReferenceEntry(BtiReferenceEntry& refEntry); + + // internal BTI full index output methods + private: + bool WriteBlock(const BtiBlock& block); + bool WriteBlocks(const BtiBlockVector& blocks); + bool WriteHeader(void); + bool WriteReferenceEntry(const BtiReferenceEntry& refEntry); // data members private: - int32_t m_blockSize; - BamToolsIndexData m_indexData; - off_t m_dataBeginOffset; - bool m_hasFullDataCache; - bool m_isBigEndian; - int32_t m_inputVersion; // Version is serialized as int - Version m_outputVersion; + FILE* m_indexStream; + bool m_isBigEndian; + BamIndex::IndexCacheMode m_cacheMode; + BtiFileSummary m_indexFileSummary; + int m_blockSize; + int32_t m_inputVersion; // Version is serialized as int + Version m_outputVersion; + + // static constants + private: + static const int DEFAULT_BLOCK_LENGTH; + static const std::string BTI_EXTENSION; + static const char* const BTI_MAGIC; + static const int SIZEOF_BLOCK; }; } // namespace Internal diff --git a/src/api/internal/BgzfStream_p.cpp b/src/api/internal/BgzfStream_p.cpp index fb67799..aba2a07 100644 --- a/src/api/internal/BgzfStream_p.cpp +++ b/src/api/internal/BgzfStream_p.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 21 March 2011(DB) +// Last modified: 5 April 2011(DB) // --------------------------------------------------------------------------- // Based on BGZF routines developed at the Broad Institute. // Provides the basic functionality for reading & writing BGZF files @@ -375,7 +375,7 @@ bool BgzfStream::ReadBlock(void) { } // seek to position in BGZF file -bool BgzfStream::Seek(int64_t position) { +bool BgzfStream::Seek(const int64_t& position) { // skip if not open if ( !IsOpen ) return false; @@ -390,12 +390,10 @@ bool BgzfStream::Seek(int64_t position) { return false; } - // update block data + // update block data & return success BlockLength = 0; BlockAddress = blockAddress; BlockOffset = blockOffset; - - // return success return true; } @@ -404,12 +402,9 @@ void BgzfStream::SetWriteCompressed(bool ok) { } // get file position in BGZF file -int64_t BgzfStream::Tell(void) { - - // skip if file not open - if ( !IsOpen ) return false; - - // otherwise return file pointer position +int64_t BgzfStream::Tell(void) const { + if ( !IsOpen ) + return 0; return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) ); } diff --git a/src/api/internal/BgzfStream_p.h b/src/api/internal/BgzfStream_p.h index 69473e6..c5e5d48 100644 --- a/src/api/internal/BgzfStream_p.h +++ b/src/api/internal/BgzfStream_p.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 24 February 2011(DB) +// Last modified: 5 April 2011(DB) // --------------------------------------------------------------------------- // Based on BGZF routines developed at the Broad Institute. // Provides the basic functionality for reading & writing BGZF files @@ -48,11 +48,11 @@ class BgzfStream { // reads BGZF data into a byte buffer int Read(char* data, const unsigned int dataLength); // seek to position in BGZF file - bool Seek(int64_t position); + bool Seek(const int64_t& position); // enable/disable compressed output void SetWriteCompressed(bool ok); // get file position in BGZF file - int64_t Tell(void); + int64_t Tell(void) const; // writes the supplied data into the BGZF buffer unsigned int Write(const char* data, const unsigned int dataLen); -- 2.39.2