From 3a975af617ffe4f1f8077e7e45fa484296f5e704 Mon Sep 17 00:00:00 2001 From: Derek Date: Thu, 21 Oct 2010 00:19:40 -0400 Subject: [PATCH] Implemented index cache mode for both BAI & BTI formats * Client code can now decide between 3 index cache modes: Full : save entire index data in memory Limited (default) : save only index data for current reference None : save no index data - only load data necessary for a single- * Required a major overhaul to BamIndex interface and derived classes. Lots of refactoring to move common code up to BamIndex. Derived classes now share much of the same method names & organization. Only implementation details differ, as needed. * Miscellaneous: moved BAMTOOLS_LFS definitions into BamAux.h & cleaned up BGZF.h --- src/api/BGZF.h | 7 +- src/api/BamAux.h | 15 +- src/api/BamIndex.cpp | 1874 +++++++++++++++++++++++++++++------------ src/api/BamIndex.h | 177 +++- src/api/BamReader.cpp | 138 +-- src/api/BamReader.h | 44 +- 6 files changed, 1577 insertions(+), 678 deletions(-) diff --git a/src/api/BGZF.h b/src/api/BGZF.h index 37bcff7..c1ff2f8 100644 --- a/src/api/BGZF.h +++ b/src/api/BGZF.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 16 August 2010 (DB) +// Last modified: 20 October 2010 (DB) // --------------------------------------------------------------------------- // BGZF routines were adapted from the bgzf.c code developed at the Broad // Institute. @@ -14,15 +14,10 @@ #ifndef BGZF_H #define BGZF_H -// 'C' includes #include #include #include - -// C++ includes #include - -// zlib includes #include "zlib.h" // Platform-specific large-file support diff --git a/src/api/BamAux.h b/src/api/BamAux.h index 9d38e7d..7bdc6c2 100644 --- a/src/api/BamAux.h +++ b/src/api/BamAux.h @@ -17,10 +17,19 @@ #include #include -// ---------------------------------------------------------------- -// ---------------------------------------------------------------- -// Platform-specific type definitions +// Platform-specific large-file support +#ifndef BAMTOOLS_LFS +#define BAMTOOLS_LFS + #ifdef WIN32 + #define ftell64(a) _ftelli64(a) + #define fseek64(a,b,c) _fseeki64(a,b,c) + #else + #define ftell64(a) ftello(a) + #define fseek64(a,b,c) fseeko(a,b,c) + #endif +#endif // BAMTOOLS_LFS +// Platform-specific type definitions #ifndef BAMTOOLS_TYPES #define BAMTOOLS_TYPES #ifdef _MSC_VER diff --git a/src/api/BamIndex.cpp b/src/api/BamIndex.cpp index 51d7850..222d96d 100644 --- a/src/api/BamIndex.cpp +++ b/src/api/BamIndex.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 9 October 2010 (DB) +// Last modified: 21 October 2010 (DB) // --------------------------------------------------------------------------- // Provides index functionality - both for the default (standardized) BAM // index format (.bai) as well as a BamTools-specific (nonstandard) index @@ -24,15 +24,148 @@ using namespace BamTools; // ------------------------------- // BamIndex implementation -BamIndex::BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader, bool isBigEndian) +// ctor +BamIndex::BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader) : m_BGZF(bgzf) , m_reader(reader) - , m_isBigEndian(isBigEndian) + , m_cacheMode(BamIndex::LimitedIndexCaching) + , m_indexStream(0) { if ( m_reader && m_reader->IsOpen() ) m_references = m_reader->GetReferenceData(); } +// dtor +BamIndex::~BamIndex(void) { + if ( IsOpen() ) + fclose(m_indexStream); +} + +// return true if FILE* is open +bool BamIndex::IsOpen(void) const { + return ( m_indexStream != 0 ); +} + +// loads existing data from file into memory +bool BamIndex::Load(const string& filename) { + + // open index file, abort on error + if ( !OpenIndexFile(filename, "rb") ) { + fprintf(stderr, "ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str()); + return false; + } + + // check magic number + if ( !LoadHeader() ) { + fclose(m_indexStream); + return false; + } + + // load reference data (but only keep in memory if full caching requested) + bool saveInitialLoad = ( m_cacheMode == BamIndex::FullIndexCaching ); + if ( !LoadAllReferences(saveInitialLoad) ) { + fclose(m_indexStream); + return false; + } + + // update index cache based on selected mode + UpdateCache(); + + // return success + return true; +} + +// opens index file for reading/writing, return true if opened OK +bool BamIndex::OpenIndexFile(const string& filename, const string& mode) { + m_indexStream = fopen(filename.c_str(), mode.c_str()); + return ( m_indexStream != 0 ); +} + +// rewind index file to beginning of index data, return true if rewound OK +bool BamIndex::Rewind(void) { + return ( fseek64(m_indexStream, DataBeginOffset(), SEEK_SET) == 0 ); +} + +// change the index caching behavior +void BamIndex::SetCacheMode(const BamIndexCacheMode mode) { + if ( mode != m_cacheMode ) { + m_cacheMode = mode; + UpdateCache(); + } +} + +// updates in-memory cache of index data, depending on current cache mode +void BamIndex::UpdateCache(void) { + + // skip if file not open + if ( !IsOpen() ) return; + + // reflect requested cache mode behavior + switch ( m_cacheMode ) { + + case (BamIndex::FullIndexCaching) : + Rewind(); + LoadAllReferences(true); + break; + + case (BamIndex::LimitedIndexCaching) : + if ( HasFullDataCache() ) + KeepOnlyFirstReferenceOffsets(); + else { + ClearAllData(); + SkipToFirstReference(); + LoadFirstReference(true); + } + break; + case(BamIndex::NoIndexCaching) : + ClearAllData(); + break; + default : + // unreachable + ; + } +} + +// writes in-memory index data out to file +bool BamIndex::Write(const string& bamFilename) { + + // open index file for writing + string indexFilename = bamFilename + Extension(); + if ( !OpenIndexFile(indexFilename, "wb") ) { + fprintf(stderr, "ERROR: Could not open file to save index.\n"); + return false; + } + + // write index header data + if ( !WriteHeader() ) { + fprintf(stderr, "ERROR: There was a problem writing index metadata to new index file.\n"); + fflush(m_indexStream); + fclose(m_indexStream); + exit(1); + } + + // write main index data + if ( !WriteAllReferences() ) { + fprintf(stderr, "ERROR: There was a problem writing index data to new index file.\n"); + fflush(m_indexStream); + fclose(m_indexStream); + exit(1); + } + + // flush any remaining output, rewind file, and return success + fflush(m_indexStream); + fclose(m_indexStream); + + // re-open index file for later reading + if ( !OpenIndexFile(indexFilename, "rb") ) { + fprintf(stderr, "ERROR: Could not open newly created index file for reading.\n"); + return false; + } + + // return success/failure of write + return true; +} + // ######################################################################################### // ######################################################################################### @@ -42,8 +175,8 @@ BamIndex::BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader, bool i namespace BamTools { // BAM index constants -const int MAX_BIN = 37450; // =(8^6-1)/7+1 -const int BAM_LIDX_SHIFT = 14; +const int MAX_BIN = 37450; // =(8^6-1)/7+1 +const int BAM_LIDX_SHIFT = 14; // -------------------------------------------------- // BamStandardIndex data structures & typedefs @@ -74,16 +207,19 @@ struct ReferenceIndex { // data members BamBinMap Bins; LinearOffsetVector Offsets; + bool HasAlignments; // constructor - ReferenceIndex(const BamBinMap& binMap = BamBinMap(), - const LinearOffsetVector& offsets = LinearOffsetVector()) + ReferenceIndex(const BamBinMap& binMap = BamBinMap(), + const LinearOffsetVector& offsets = LinearOffsetVector(), + const bool hasAlignments = false) : Bins(binMap) , Offsets(offsets) + , HasAlignments(hasAlignments) { } }; -typedef vector BamStandardIndexData; +typedef map BamStandardIndexData; } // namespace BamTools @@ -92,51 +228,143 @@ typedef vector BamStandardIndexData; struct BamStandardIndex::BamStandardIndexPrivate { - // ------------------------- - // data members + // parent object + BamStandardIndex* m_parent; + // data members BamStandardIndexData m_indexData; - BamStandardIndex* m_parent; - - // ------------------------- - // ctor & dtor - - BamStandardIndexPrivate(BamStandardIndex* parent) : m_parent(parent) { } - ~BamStandardIndexPrivate(void) { m_indexData.clear(); } - - // ------------------------- - // 'public' methods + off_t m_dataBeginOffset; + bool m_hasFullDataCache; + bool m_isBigEndian; + + // ctor & dtor + BamStandardIndexPrivate(BamStandardIndex* parent); + ~BamStandardIndexPrivate(void); - // creates index data (in-memory) from current reader data - bool Build(void); - // returns whether reference has alignments or no - bool HasAlignments(const int& referenceID) const; - // attempts to use index to jump to region; returns success/fail - bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); - // loads existing data from file into memory - bool Load(const std::string& filename); - // writes in-memory index data out to file - // N.B. - (this is the original BAM filename, method will modify it to use applicable extension) - bool Write(const std::string& bamFilename); + // parent interface methods + public: + + // creates index data (in-memory) from current reader data + bool Build(void); + // clear all current index offset data in memory + void ClearAllData(void); + // return file position after header metadata + const off_t DataBeginOffset(void) const; + // returns whether reference has alignments or no + bool HasAlignments(const int& referenceID) const; + // return true if all index data is cached + bool HasFullDataCache(void) const; + // attempts to use index to jump to region; returns success/fail + bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); + // clears index data from all references except the first reference + void KeepOnlyFirstReferenceOffsets(void); + // load index data for all references, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadAllReferences(bool saveData = true); + // load first reference from file, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadFirstReference(bool saveData = true); + // load header data from index file, return true if loaded OK + bool LoadHeader(void); + // position file pointer to first reference begin, return true if skipped OK + bool SkipToFirstReference(void); + // write header to new index file + bool WriteHeader(void); + // write index data for all references to new index file + bool WriteAllReferences(void); - // ------------------------- // internal methods - - // calculate bins that overlap region - int BinsFromRegion(const BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[BamTools::MAX_BIN]); - // calculates offset(s) for a given region - bool GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector& offsets, bool* hasAlignmentsInRegion); - // saves BAM bin entry for index - void InsertBinEntry(BamBinMap& binMap, const uint32_t& saveBin, const uint64_t& saveOffset, const uint64_t& lastOffset); - // saves linear offset entry for index - void InsertLinearOffset(LinearOffsetVector& offsets, const BamAlignment& bAlignment, const uint64_t& lastOffset); - // simplifies index by merging 'chunks' - void MergeChunks(void); + private: + + // ----------------------- + // index file operations + + // check index file magic number, return true if OK + bool CheckMagicNumber(void); + // check index file version, return true if OK + bool CheckVersion(void); + // load a single index bin entry from file, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadBin(ReferenceIndex& refEntry, bool saveData = true); + bool LoadBins(ReferenceIndex& refEntry, bool saveData = true); + // load a single index bin entry from file, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadChunk(ChunkVector& chunks, bool saveData = true); + bool LoadChunks(ChunkVector& chunks, bool saveData = true); + // load a single index linear offset entry from file, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData = true); + // load a single reference from file, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadReference(const int& refId, bool saveData = true); + // loads number of references, return true if loaded OK + bool LoadReferenceCount(int& numReferences); + // position file pointer to desired reference begin, return true if skipped OK + bool SkipToReference(const int& refId); + // write index data for bin to new index file + bool WriteBin(const uint32_t& binId, const ChunkVector& chunks); + // write index data for bins to new index file + bool WriteBins(const BamBinMap& bins); + // write index data for chunk entry to new index file + bool WriteChunk(const Chunk& chunk); + // write index data for chunk entry to new index file + bool WriteChunks(const ChunkVector& chunks); + // write index data for linear offsets entry to new index file + bool WriteLinearOffsets(const LinearOffsetVector& offsets); + // write index data single reference to new index file + bool WriteReference(const ReferenceIndex& refEntry); + + // ----------------------- + // index data operations + + // calculate bins that overlap region + int BinsFromRegion(const BamRegion& region, + const bool isRightBoundSpecified, + uint16_t bins[BamTools::MAX_BIN]); + // clear all index offset data for desired reference + void ClearReferenceOffsets(const int& refId); + // calculates offset(s) for a given region + bool GetOffsets(const BamRegion& region, + const bool isRightBoundSpecified, + vector& offsets, + bool* hasAlignmentsInRegion); + // returns true if index cache has data for desired reference + bool IsDataLoaded(const int& refId) const; + // clears index data from all references except the one specified + void KeepOnlyReferenceOffsets(const int& refId); + // simplifies index by merging 'chunks' + void MergeChunks(void); + // saves BAM bin entry for index + void SaveBinEntry(BamBinMap& binMap, + const uint32_t& saveBin, + const uint64_t& saveOffset, + const uint64_t& lastOffset); + // saves linear offset entry for index + void SaveLinearOffset(LinearOffsetVector& offsets, + const BamAlignment& bAlignment, + const uint64_t& lastOffset); + // initializes index data structure to hold @count references + void SetReferenceCount(const int& count); + }; +BamStandardIndex::BamStandardIndexPrivate::BamStandardIndexPrivate(BamStandardIndex* parent) + : m_parent(parent) + , m_dataBeginOffset(0) + , m_hasFullDataCache(false) +{ + m_isBigEndian = BamTools::SystemIsBigEndian(); +} + +BamStandardIndex::BamStandardIndexPrivate::~BamStandardIndexPrivate(void) { + ClearAllData(); +} + // calculate bins that overlap region -int BamStandardIndex::BamStandardIndexPrivate::BinsFromRegion(const BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[MAX_BIN]) { - +int BamStandardIndex::BamStandardIndexPrivate::BinsFromRegion(const BamRegion& region, + const bool isRightBoundSpecified, + uint16_t bins[MAX_BIN]) +{ // get region boundaries uint32_t begin = (unsigned int)region.LeftPosition; uint32_t end; @@ -171,8 +399,8 @@ bool BamStandardIndex::BamStandardIndexPrivate::Build(void) { // localize parent data if ( m_parent == 0 ) return false; - BamReader* reader = m_parent->m_reader; - BgzfData* mBGZF = m_parent->m_BGZF; + BamReader* reader = m_parent->m_reader; + BgzfData* mBGZF = m_parent->m_BGZF; // be sure reader & BGZF file are valid & open for reading if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen ) @@ -183,8 +411,9 @@ bool BamStandardIndex::BamStandardIndexPrivate::Build(void) { // get reference count, reserve index space const int numReferences = (int)m_parent->m_references.size(); - for ( int i = 0; i < numReferences; ++i ) - m_indexData.push_back(ReferenceIndex()); + m_indexData.clear(); + m_hasFullDataCache = false; + SetReferenceCount(numReferences); // sets default constant for bin, ID, offset, coordinate variables const uint32_t defaultValue = 0xffffffffu; @@ -216,7 +445,8 @@ bool BamStandardIndex::BamStandardIndexPrivate::Build(void) { // if lastCoordinate greater than BAM position - file not sorted properly else if ( lastCoordinate > bAlignment.Position ) { fprintf(stderr, "BAM file not properly sorted:\n"); - fprintf(stderr, "Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(), lastCoordinate, bAlignment.Position, bAlignment.RefID); + fprintf(stderr, "Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(), + lastCoordinate, bAlignment.Position, bAlignment.RefID); exit(1); } @@ -226,7 +456,7 @@ bool BamStandardIndex::BamStandardIndexPrivate::Build(void) { // save linear offset entry (matched to BAM entry refID) ReferenceIndex& refIndex = m_indexData.at(bAlignment.RefID); LinearOffsetVector& offsets = refIndex.Offsets; - InsertLinearOffset(offsets, bAlignment, lastOffset); + SaveLinearOffset(offsets, bAlignment, lastOffset); } // if current BamAlignment bin != lastBin, "then possibly write the binning index" @@ -238,7 +468,7 @@ bool BamStandardIndex::BamStandardIndexPrivate::Build(void) { // save Bam bin entry ReferenceIndex& refIndex = m_indexData.at(saveRefID); BamBinMap& binMap = refIndex.Bins; - InsertBinEntry(binMap, saveBin, saveOffset, lastOffset); + SaveBinEntry(binMap, saveBin, saveOffset, lastOffset); } // update saveOffset @@ -273,12 +503,12 @@ bool BamStandardIndex::BamStandardIndexPrivate::Build(void) { // save Bam bin entry ReferenceIndex& refIndex = m_indexData.at(saveRefID); BamBinMap& binMap = refIndex.Bins; - InsertBinEntry(binMap, saveBin, saveOffset, lastOffset); + SaveBinEntry(binMap, saveBin, saveOffset, lastOffset); } // simplify index by merging chunks MergeChunks(); - + // iterate through references in index // sort offsets in linear offset vector BamStandardIndexData::iterator indexIter = m_indexData.begin(); @@ -286,7 +516,7 @@ bool BamStandardIndex::BamStandardIndexPrivate::Build(void) { for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) { // get reference index data - ReferenceIndex& refIndex = (*indexIter); + ReferenceIndex& refIndex = (*indexIter).second; LinearOffsetVector& offsets = refIndex.Offsets; // sort linear offsets @@ -297,9 +527,72 @@ bool BamStandardIndex::BamStandardIndexPrivate::Build(void) { return reader->Rewind(); } +// check index file magic number, return true if OK +bool BamStandardIndex::BamStandardIndexPrivate::CheckMagicNumber(void) { + + // read in magic number + char magic[4]; + size_t elementsRead = fread(magic, sizeof(char), 4, m_parent->m_indexStream); + + // compare to expected value + if ( strncmp(magic, "BAI\1", 4) != 0 ) { + fprintf(stderr, "Problem with index file - invalid format.\n"); + fclose(m_parent->m_indexStream); + return false; + } + + // return success/failure of load + return (elementsRead == 4); +} + +// clear all current index offset data in memory +void BamStandardIndex::BamStandardIndexPrivate::ClearAllData(void) { + BamStandardIndexData::const_iterator indexIter = m_indexData.begin(); + BamStandardIndexData::const_iterator indexEnd = m_indexData.end(); + for ( ; indexIter != indexEnd; ++indexIter ) { + const int& refId = (*indexIter).first; + ClearReferenceOffsets(refId); + } +} + +// clear all index offset data for desired reference +void BamStandardIndex::BamStandardIndexPrivate::ClearReferenceOffsets(const int& refId) { + + // look up desired reference, skip if not found + if ( m_indexData.find(refId) == m_indexData.end() ) return; + + // clear reference data + ReferenceIndex& refEntry = m_indexData.at(refId); + refEntry.Bins.clear(); + refEntry.Offsets.clear(); + + // set flag + m_hasFullDataCache = false; +} + +// return file position after header metadata +const off_t BamStandardIndex::BamStandardIndexPrivate::DataBeginOffset(void) const { + return m_dataBeginOffset; +} + // calculates offset(s) for a given region -bool BamStandardIndex::BamStandardIndexPrivate::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector& offsets, bool* hasAlignmentsInRegion) { - +bool BamStandardIndex::BamStandardIndexPrivate::GetOffsets(const BamRegion& region, + const bool isRightBoundSpecified, + vector& offsets, + bool* hasAlignmentsInRegion) +{ + // return false if leftBound refID is not found in index data + if ( m_indexData.find(region.LeftRefID) == m_indexData.end() ) + return false; + + // load index data for region if not already cached + if ( !IsDataLoaded(region.LeftRefID) ) { + bool loadedOk = true; + loadedOk &= SkipToReference(region.LeftRefID); + loadedOk &= LoadReference(region.LeftRefID); + if ( !loadedOk ) return false; + } + // calculate which bins overlap this region uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2); int numBins = BinsFromRegion(region, isRightBoundSpecified, bins); @@ -310,7 +603,8 @@ bool BamStandardIndex::BamStandardIndexPrivate::GetOffsets(const BamRegion& regi // get minimum offset to consider const LinearOffsetVector& linearOffsets = refIndex.Offsets; - uint64_t minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() ) ? 0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT); + const uint64_t minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() ) + ? 0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT); // store all alignment 'chunk' starts (file offsets) for bins in this region for ( int i = 0; i < numBins; ++i ) { @@ -345,72 +639,33 @@ bool BamStandardIndex::BamStandardIndexPrivate::GetOffsets(const BamRegion& regi } // returns whether reference has alignments or no -bool BamStandardIndex::BamStandardIndexPrivate::HasAlignments(const int& referenceID) const { - - // ID not found - if ( referenceID < 0 || referenceID >= (int)m_indexData.size() ) - return false; - - // return whether reference contains data - const ReferenceIndex& refIndex = m_indexData.at(referenceID); - return !( refIndex.Bins.empty() ); +bool BamStandardIndex::BamStandardIndexPrivate::HasAlignments(const int& refId) const { + if ( m_indexData.find(refId) == m_indexData.end()) return false; + const ReferenceIndex& refEntry = m_indexData.at(refId); + return refEntry.HasAlignments; } -// saves BAM bin entry for index -void BamStandardIndex::BamStandardIndexPrivate::InsertBinEntry(BamBinMap& binMap, - const uint32_t& saveBin, - const uint64_t& saveOffset, - const uint64_t& lastOffset) -{ - // look up saveBin - BamBinMap::iterator binIter = binMap.find(saveBin); - - // create new chunk - Chunk newChunk(saveOffset, lastOffset); - - // if entry doesn't exist - if ( binIter == binMap.end() ) { - ChunkVector newChunks; - newChunks.push_back(newChunk); - binMap.insert( pair(saveBin, newChunks)); - } - - // otherwise - else { - ChunkVector& binChunks = (*binIter).second; - binChunks.push_back( newChunk ); - } +// return true if all index data is cached +bool BamStandardIndex::BamStandardIndexPrivate::HasFullDataCache(void) const { + return m_hasFullDataCache; } -// saves linear offset entry for index -void BamStandardIndex::BamStandardIndexPrivate::InsertLinearOffset(LinearOffsetVector& offsets, - const BamAlignment& bAlignment, - const uint64_t& lastOffset) -{ - // get converted offsets - int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT; - int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT; - - // resize vector if necessary - int oldSize = offsets.size(); - int newSize = endOffset + 1; - if ( oldSize < newSize ) - offsets.resize(newSize, 0); - - // store offset - for( int i = beginOffset + 1; i <= endOffset; ++i ) { - if ( offsets[i] == 0 ) - offsets[i] = lastOffset; - } -} +// returns true if index cache has data for desired reference +bool BamStandardIndex::BamStandardIndexPrivate::IsDataLoaded(const int& refId) const { + if ( m_indexData.find(refId) == m_indexData.end() ) return false; // unknown reference + const ReferenceIndex& refEntry = m_indexData.at(refId); + if ( !refEntry.HasAlignments ) return true; // no data period + // return whether bin map contains data + return ( !refEntry.Bins.empty() ); +} // attempts to use index to jump to region; returns success/fail bool BamStandardIndex::BamStandardIndexPrivate::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { // localize parent data if ( m_parent == 0 ) return false; - BamReader* reader = m_parent->m_reader; - BgzfData* mBGZF = m_parent->m_BGZF; + BamReader* reader = m_parent->m_reader; + BgzfData* mBGZF = m_parent->m_BGZF; RefVector& references = m_parent->m_references; // be sure reader & BGZF file are valid & open for reading @@ -418,7 +673,8 @@ bool BamStandardIndex::BamStandardIndexPrivate::Jump(const BamRegion& region, bo return false; // make sure left-bound position is valid - if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) return false; + if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) + return false; // calculate offsets for this region // if failed, print message, set flag, and return failure @@ -441,7 +697,10 @@ bool BamStandardIndex::BamStandardIndexPrivate::Jump(const BamRegion& region, bo // if this alignment corresponds to desired position // return success of seeking back to the offset before the 'current offset' (to cover overlaps) - if ( (bAlignment.RefID == region.LeftRefID && bAlignment.Position + bAlignment.Length > region.LeftPosition) || (bAlignment.RefID > region.LeftRefID) ) { + if ( ((bAlignment.RefID == region.LeftRefID) && + ((bAlignment.Position + bAlignment.Length) > region.LeftPosition)) || + (bAlignment.RefID > region.LeftRefID) ) + { if ( o != offsets.begin() ) --o; return mBGZF->Seek(*o); } @@ -457,127 +716,219 @@ bool BamStandardIndex::BamStandardIndexPrivate::Jump(const BamRegion& region, bo return result; } -// loads existing data from file into memory -bool BamStandardIndex::BamStandardIndexPrivate::Load(const string& filename) { - - // localize parent data - if ( m_parent == 0 ) return false; - const bool isBigEndian = m_parent->m_isBigEndian; - - // open index file, abort on error - FILE* indexStream = fopen(filename.c_str(), "rb"); - if( !indexStream ) { - fprintf(stderr, "ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str()); - return false; +// clears index data from all references except the first +void BamStandardIndex::BamStandardIndexPrivate::KeepOnlyFirstReferenceOffsets(void) { + BamStandardIndexData::const_iterator indexBegin = m_indexData.begin(); + KeepOnlyReferenceOffsets((*indexBegin).first); +} + +// clears index data from all references except the one specified +void BamStandardIndex::BamStandardIndexPrivate::KeepOnlyReferenceOffsets(const int& refId) { + BamStandardIndexData::iterator mapIter = m_indexData.begin(); + BamStandardIndexData::iterator mapEnd = m_indexData.end(); + for ( ; mapIter != mapEnd; ++mapIter ) { + const int entryRefId = (*mapIter).first; + if ( entryRefId != refId ) + ClearReferenceOffsets(entryRefId); } +} - // set placeholder to receive input byte count (suppresses compiler warnings) - size_t elementsRead = 0; - - // see if index is valid BAM index - char magic[4]; - elementsRead = fread(magic, 1, 4, indexStream); - if ( strncmp(magic, "BAI\1", 4) ) { - fprintf(stderr, "Problem with index file - invalid format.\n"); - fclose(indexStream); +bool BamStandardIndex::BamStandardIndexPrivate::LoadAllReferences(bool saveData) { + + // skip if data already loaded + if ( m_hasFullDataCache ) return true; + + // get number of reference sequences + uint32_t numReferences; + if ( !LoadReferenceCount((int&)numReferences) ) return false; + + // iterate over reference entries + bool loadedOk = true; + for ( int i = 0; i < (int)numReferences; ++i ) + loadedOk &= LoadReference(i, saveData); + + // set flag + if ( loadedOk && saveData ) + m_hasFullDataCache = true; + + // return success/failure of loading references + return loadedOk; +} + +// load header data from index file, return true if loaded OK +bool BamStandardIndex::BamStandardIndexPrivate::LoadHeader(void) { + + bool loadedOk = CheckMagicNumber(); + + // store offset of beginning of data + m_dataBeginOffset = ftell64(m_parent->m_indexStream); + + // return success/failure of load + return loadedOk; +} + +// load a single index bin entry from file, return true if loaded OK +// @saveData - save data in memory if true, just read & discard if false +bool BamStandardIndex::BamStandardIndexPrivate::LoadBin(ReferenceIndex& refEntry, bool saveData) { + + size_t elementsRead = 0; + + // get bin ID + uint32_t binId; + elementsRead += fread(&binId, sizeof(binId), 1, m_parent->m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(binId); + + // load alignment chunks for this bin + ChunkVector chunks; + bool chunksOk = LoadChunks(chunks, saveData); + + // store bin entry + if ( chunksOk && saveData ) + refEntry.Bins.insert(pair(binId, chunks)); + + // return success/failure of load + return ( (elementsRead == 1) && chunksOk ); +} + +bool BamStandardIndex::BamStandardIndexPrivate::LoadBins(ReferenceIndex& refEntry, bool saveData) { + + size_t elementsRead = 0; + + // get number of bins + int32_t numBins; + elementsRead += fread(&numBins, sizeof(numBins), 1, m_parent->m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numBins); + + // set flag + refEntry.HasAlignments = ( numBins != 0 ); + + // iterate over bins + bool binsOk = true; + for ( int i = 0; i < numBins; ++i ) + binsOk &= LoadBin(refEntry, saveData); + + // return success/failure of load + return ( (elementsRead == 1) && binsOk ); +} + +// load a single index bin entry from file, return true if loaded OK +// @saveData - save data in memory if true, just read & discard if false +bool BamStandardIndex::BamStandardIndexPrivate::LoadChunk(ChunkVector& chunks, bool saveData) { + + size_t elementsRead = 0; + + // read in chunk data + uint64_t start; + uint64_t stop; + elementsRead += fread(&start, sizeof(start), 1, m_parent->m_indexStream); + elementsRead += fread(&stop, sizeof(stop), 1, m_parent->m_indexStream); + + // swap endian-ness if necessary + if ( m_isBigEndian ) { + SwapEndian_64(start); + SwapEndian_64(stop); } - // get number of reference sequences - uint32_t numRefSeqs; - elementsRead = fread(&numRefSeqs, 4, 1, indexStream); - if ( isBigEndian ) SwapEndian_32(numRefSeqs); - - // intialize space for BamStandardIndexData data structure - m_indexData.reserve(numRefSeqs); + // save data if requested + if ( saveData ) chunks.push_back( Chunk(start, stop) ); - // iterate over reference sequences - for ( unsigned int i = 0; i < numRefSeqs; ++i ) { + // return success/failure of load + return ( elementsRead == 2 ); +} - // get number of bins for this reference sequence - int32_t numBins; - elementsRead = fread(&numBins, 4, 1, indexStream); - if ( isBigEndian ) SwapEndian_32(numBins); +bool BamStandardIndex::BamStandardIndexPrivate::LoadChunks(ChunkVector& chunks, bool saveData) { - // intialize BinVector - BamBinMap binMap; + size_t elementsRead = 0; - // iterate over bins for that reference sequence - for ( int j = 0; j < numBins; ++j ) { + // read in number of chunks + uint32_t numChunks; + elementsRead += fread(&numChunks, sizeof(numChunks), 1, m_parent->m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numChunks); - // get binID - uint32_t binID; - elementsRead = fread(&binID, 4, 1, indexStream); + // initialize space for chunks if we're storing this data + if ( saveData ) chunks.reserve(numChunks); - // get number of regionChunks in this bin - uint32_t numChunks; - elementsRead = fread(&numChunks, 4, 1, indexStream); + // iterate over chunks + bool chunksOk = true; + for ( int i = 0; i < (int)numChunks; ++i ) + chunksOk &= LoadChunk(chunks, saveData); - if ( isBigEndian ) { - SwapEndian_32(binID); - SwapEndian_32(numChunks); - } - - // intialize ChunkVector - ChunkVector regionChunks; - regionChunks.reserve(numChunks); - - // iterate over regionChunks in this bin - for ( unsigned int k = 0; k < numChunks; ++k ) { - - // get chunk boundaries (left, right) - uint64_t left; - uint64_t right; - elementsRead = fread(&left, 8, 1, indexStream); - elementsRead = fread(&right, 8, 1, indexStream); - - if ( isBigEndian ) { - SwapEndian_64(left); - SwapEndian_64(right); - } - - // save ChunkPair - regionChunks.push_back( Chunk(left, right) ); - } + // sort chunk vector + sort( chunks.begin(), chunks.end(), ChunkLessThan ); - // sort chunks for this bin - sort( regionChunks.begin(), regionChunks.end(), ChunkLessThan ); + // return success/failure of load + return ( (elementsRead == 1) && chunksOk ); +} - // save binID, chunkVector for this bin - binMap.insert( pair(binID, regionChunks) ); - } +// load a single index linear offset entry from file, return true if loaded OK +// @saveData - save data in memory if true, just read & discard if false +bool BamStandardIndex::BamStandardIndexPrivate::LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData) { - // ----------------------------------------------------- - // load linear index for this reference sequence - - // get number of linear offsets - int32_t numLinearOffsets; - elementsRead = fread(&numLinearOffsets, 4, 1, indexStream); - if ( isBigEndian ) SwapEndian_32(numLinearOffsets); - - // intialize LinearOffsetVector - LinearOffsetVector offsets; - offsets.reserve(numLinearOffsets); - - // iterate over linear offsets for this reference sequeence - uint64_t linearOffset; - for ( int j = 0; j < numLinearOffsets; ++j ) { - // read a linear offset & store - elementsRead = fread(&linearOffset, 8, 1, indexStream); - if ( isBigEndian ) SwapEndian_64(linearOffset); - offsets.push_back(linearOffset); - } + size_t elementsRead = 0; - // sort linear offsets - sort( offsets.begin(), offsets.end() ); + // read in number of linear offsets + int32_t numLinearOffsets; + elementsRead += fread(&numLinearOffsets, sizeof(numLinearOffsets), 1, m_parent->m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numLinearOffsets); + + // set up destination vector (if we're saving the data) + LinearOffsetVector linearOffsets; + if ( saveData ) linearOffsets.reserve(numLinearOffsets); + + // iterate over linear offsets + uint64_t linearOffset; + for ( int i = 0; i < numLinearOffsets; ++i ) { + elementsRead += fread(&linearOffset, sizeof(linearOffset), 1, m_parent->m_indexStream); + if ( m_isBigEndian ) SwapEndian_64(linearOffset); + if ( saveData ) linearOffsets.push_back(linearOffset); + } + + // sort linear offsets + sort ( linearOffsets.begin(), linearOffsets.end() ); + + // save in reference index entry if desired + if ( saveData ) refEntry.Offsets = linearOffsets; + + // return success/failure of load + return ( elementsRead == (size_t)(numLinearOffsets + 1) ); +} - // store index data for that reference sequence - m_indexData.push_back( ReferenceIndex(binMap, offsets) ); +bool BamStandardIndex::BamStandardIndexPrivate::LoadFirstReference(bool saveData) { + BamStandardIndexData::const_iterator indexBegin = m_indexData.begin(); + return LoadReference((*indexBegin).first, saveData); +} + +// load a single reference from file, return true if loaded OK +// @saveData - save data in memory if true, just read & discard if false +bool BamStandardIndex::BamStandardIndexPrivate::LoadReference(const int& refId, bool saveData) { + + // if reference not previously loaded, create new entry + if ( m_indexData.find(refId) == m_indexData.end() ) { + ReferenceIndex newEntry; + newEntry.HasAlignments = false; + m_indexData.insert( pair(refId, newEntry) ); } - // close index file (.bai) and return - fclose(indexStream); - return true; + // load reference data + ReferenceIndex& entry = m_indexData.at(refId); + bool loadedOk = true; + loadedOk &= LoadBins(entry, saveData); + loadedOk &= LoadLinearOffsets(entry, saveData); + return loadedOk; +} + +// loads number of references, return true if loaded OK +bool BamStandardIndex::BamStandardIndexPrivate::LoadReferenceCount(int& numReferences) { + + size_t elementsRead = 0; + + // read reference count + elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_parent->m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numReferences); + + // return success/failure of load + return ( elementsRead == 1 ); } // merges 'alignment chunks' in BAM bin (used for index building) @@ -589,7 +940,7 @@ void BamStandardIndex::BamStandardIndexPrivate::MergeChunks(void) { for ( ; indexIter != indexEnd; ++indexIter ) { // get BAM bin map for this reference - ReferenceIndex& refIndex = (*indexIter); + ReferenceIndex& refIndex = (*indexIter).second; BamBinMap& bamBinMap = refIndex.Bins; // iterate over BAM bins @@ -616,12 +967,9 @@ void BamStandardIndex::BamStandardIndexPrivate::MergeChunks(void) { // get iteratorChunk based on vector iterator Chunk& iteratorChunk = (*chunkIter); - // if currentChunk.Stop(shifted) == iterator Chunk.Start(shifted) - if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 ) { - - // set currentChunk.Stop to iteratorChunk.Stop + // if chunk ends where (iterator) chunk starts, then merge + if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 ) currentChunk.Stop = iteratorChunk.Stop; - } // otherwise else { @@ -637,112 +985,245 @@ void BamStandardIndex::BamStandardIndexPrivate::MergeChunks(void) { } } -// writes in-memory index data out to file -// N.B. - (this is the original BAM filename, method will modify it to use applicable extension) -bool BamStandardIndex::BamStandardIndexPrivate::Write(const std::string& bamFilename) { +// saves BAM bin entry for index +void BamStandardIndex::BamStandardIndexPrivate::SaveBinEntry(BamBinMap& binMap, + const uint32_t& saveBin, + const uint64_t& saveOffset, + const uint64_t& lastOffset) +{ + // look up saveBin + BamBinMap::iterator binIter = binMap.find(saveBin); - // localize parent data - if ( m_parent == 0 ) return false; - const bool isBigEndian = m_parent->m_isBigEndian; - - // open index file - string indexFilename = bamFilename + ".bai"; - FILE* indexStream = fopen(indexFilename.c_str(), "wb"); - if ( indexStream == 0 ) { - fprintf(stderr, "ERROR: Could not open file to save index.\n"); - return false; - } + // create new chunk + Chunk newChunk(saveOffset, lastOffset); - // write BAM index header - fwrite("BAI\1", 1, 4, indexStream); + // if entry doesn't exist + if ( binIter == binMap.end() ) { + ChunkVector newChunks; + newChunks.push_back(newChunk); + binMap.insert( pair(saveBin, newChunks)); + } - // write number of reference sequences - int32_t numReferenceSeqs = m_indexData.size(); - if ( isBigEndian ) SwapEndian_32(numReferenceSeqs); - fwrite(&numReferenceSeqs, 4, 1, indexStream); + // otherwise + else { + ChunkVector& binChunks = (*binIter).second; + binChunks.push_back( newChunk ); + } +} - // iterate over reference sequences - BamStandardIndexData::const_iterator indexIter = m_indexData.begin(); - BamStandardIndexData::const_iterator indexEnd = m_indexData.end(); - for ( ; indexIter != indexEnd; ++ indexIter ) { +// saves linear offset entry for index +void BamStandardIndex::BamStandardIndexPrivate::SaveLinearOffset(LinearOffsetVector& offsets, + const BamAlignment& bAlignment, + const uint64_t& lastOffset) +{ + // get converted offsets + int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT; + int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT; - // get reference index data - const ReferenceIndex& refIndex = (*indexIter); - const BamBinMap& binMap = refIndex.Bins; - const LinearOffsetVector& offsets = refIndex.Offsets; - - // write number of bins - int32_t binCount = binMap.size(); - if ( isBigEndian ) SwapEndian_32(binCount); - fwrite(&binCount, 4, 1, indexStream); - - // iterate over bins - BamBinMap::const_iterator binIter = binMap.begin(); - BamBinMap::const_iterator binEnd = binMap.end(); - for ( ; binIter != binEnd; ++binIter ) { + // resize vector if necessary + int oldSize = offsets.size(); + int newSize = endOffset + 1; + if ( oldSize < newSize ) + offsets.resize(newSize, 0); - // get bin data (key and chunk vector) - uint32_t binKey = (*binIter).first; - const ChunkVector& binChunks = (*binIter).second; + // store offset + for( int i = beginOffset + 1; i <= endOffset; ++i ) { + if ( offsets[i] == 0 ) + offsets[i] = lastOffset; + } +} - // save BAM bin key - if ( isBigEndian ) SwapEndian_32(binKey); - fwrite(&binKey, 4, 1, indexStream); +// initializes index data structure to hold @count references +void BamStandardIndex::BamStandardIndexPrivate::SetReferenceCount(const int& count) { + for ( int i = 0; i < count; ++i ) + m_indexData[i].HasAlignments = false; +} - // save chunk count - int32_t chunkCount = binChunks.size(); - if ( isBigEndian ) SwapEndian_32(chunkCount); - fwrite(&chunkCount, 4, 1, indexStream); +bool BamStandardIndex::BamStandardIndexPrivate::SkipToFirstReference(void) { + BamStandardIndexData::const_iterator indexBegin = m_indexData.begin(); + return SkipToReference( (*indexBegin).first ); +} - // iterate over chunks - ChunkVector::const_iterator chunkIter = binChunks.begin(); - ChunkVector::const_iterator chunkEnd = binChunks.end(); - for ( ; chunkIter != chunkEnd; ++chunkIter ) { - - // get current chunk data - const Chunk& chunk = (*chunkIter); - uint64_t start = chunk.Start; - uint64_t stop = chunk.Stop; - - if ( isBigEndian ) { - SwapEndian_64(start); - SwapEndian_64(stop); - } - - // save chunk offsets - fwrite(&start, 8, 1, indexStream); - fwrite(&stop, 8, 1, indexStream); - } - } +// position file pointer to desired reference begin, return true if skipped OK +bool BamStandardIndex::BamStandardIndexPrivate::SkipToReference(const int& refId) { - // write linear offsets size - int32_t offsetSize = offsets.size(); - if ( isBigEndian ) SwapEndian_32(offsetSize); - fwrite(&offsetSize, 4, 1, indexStream); + // attempt rewind + if ( !m_parent->Rewind() ) return false; - // iterate over linear offsets - LinearOffsetVector::const_iterator offsetIter = offsets.begin(); - LinearOffsetVector::const_iterator offsetEnd = offsets.end(); - for ( ; offsetIter != offsetEnd; ++offsetIter ) { + // read in number of references + uint32_t numReferences; + size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_parent->m_indexStream); + if ( elementsRead != 1 ) return false; + if ( m_isBigEndian ) SwapEndian_32(numReferences); - // write linear offset value - uint64_t linearOffset = (*offsetIter); - if ( isBigEndian ) SwapEndian_64(linearOffset); - fwrite(&linearOffset, 8, 1, indexStream); - } + // iterate over reference entries + bool skippedOk = true; + int currentRefId = 0; + while (currentRefId != refId) { + skippedOk &= LoadReference(currentRefId, false); + ++currentRefId; } - // flush buffer, close file, and return success - fflush(indexStream); - fclose(indexStream); - return true; + // return success + return skippedOk; } +// write header to new index file +bool BamStandardIndex::BamStandardIndexPrivate::WriteHeader(void) { + + size_t elementsWritten = 0; + + // write magic number + elementsWritten += fwrite("BAI\1", sizeof(char), 4, m_parent->m_indexStream); + + // store offset of beginning of data + m_dataBeginOffset = ftell64(m_parent->m_indexStream); + + // return success/failure of write + return (elementsWritten == 4); +} + +// write index data for all references to new index file +bool BamStandardIndex::BamStandardIndexPrivate::WriteAllReferences(void) { + + size_t elementsWritten = 0; + + // write number of reference sequences + int32_t numReferenceSeqs = m_indexData.size(); + if ( m_isBigEndian ) SwapEndian_32(numReferenceSeqs); + elementsWritten += fwrite(&numReferenceSeqs, sizeof(numReferenceSeqs), 1, m_parent->m_indexStream); + + // iterate over reference sequences + bool refsOk = true; + BamStandardIndexData::const_iterator indexIter = m_indexData.begin(); + BamStandardIndexData::const_iterator indexEnd = m_indexData.end(); + for ( ; indexIter != indexEnd; ++ indexIter ) + refsOk &= WriteReference( (*indexIter).second ); + + // return success/failure of write + return ( (elementsWritten == 1) && refsOk ); +} + +// write index data for bin to new index file +bool BamStandardIndex::BamStandardIndexPrivate::WriteBin(const uint32_t& binId, const ChunkVector& chunks) { + + size_t elementsWritten = 0; + + // write BAM bin ID + uint32_t binKey = binId; + if ( m_isBigEndian ) SwapEndian_32(binKey); + elementsWritten += fwrite(&binKey, sizeof(binKey), 1, m_parent->m_indexStream); + + // write chunks + bool chunksOk = WriteChunks(chunks); + + // return success/failure of write + return ( (elementsWritten == 1) && chunksOk ); +} + +// write index data for bins to new index file +bool BamStandardIndex::BamStandardIndexPrivate::WriteBins(const BamBinMap& bins) { + + size_t elementsWritten = 0; + + // write number of bins + int32_t binCount = bins.size(); + if ( m_isBigEndian ) SwapEndian_32(binCount); + elementsWritten += fwrite(&binCount, sizeof(binCount), 1, m_parent->m_indexStream); + + // iterate over bins + bool binsOk = true; + BamBinMap::const_iterator binIter = bins.begin(); + BamBinMap::const_iterator binEnd = bins.end(); + for ( ; binIter != binEnd; ++binIter ) + binsOk &= WriteBin( (*binIter).first, (*binIter).second ); + + // return success/failure of write + return ( (elementsWritten == 1) && binsOk ); +} + +// write index data for chunk entry to new index file +bool BamStandardIndex::BamStandardIndexPrivate::WriteChunk(const Chunk& chunk) { + + size_t elementsWritten = 0; + + // localize alignment chunk offsets + uint64_t start = chunk.Start; + uint64_t stop = chunk.Stop; + + // swap endian-ness if necessary + if ( m_isBigEndian ) { + SwapEndian_64(start); + SwapEndian_64(stop); + } + + // write to index file + elementsWritten += fwrite(&start, sizeof(start), 1, m_parent->m_indexStream); + elementsWritten += fwrite(&stop, sizeof(stop), 1, m_parent->m_indexStream); + + // return success/failure of write + return ( elementsWritten == 2 ); +} + +// write index data for chunk entry to new index file +bool BamStandardIndex::BamStandardIndexPrivate::WriteChunks(const ChunkVector& chunks) { + + size_t elementsWritten = 0; + + // write chunks + int32_t chunkCount = chunks.size(); + if ( m_isBigEndian ) SwapEndian_32(chunkCount); + elementsWritten += fwrite(&chunkCount, sizeof(chunkCount), 1, m_parent->m_indexStream); + + // iterate over chunks + bool chunksOk = true; + ChunkVector::const_iterator chunkIter = chunks.begin(); + ChunkVector::const_iterator chunkEnd = chunks.end(); + for ( ; chunkIter != chunkEnd; ++chunkIter ) + chunksOk &= WriteChunk( (*chunkIter) ); + + // return success/failure of write + return ( (elementsWritten == 1) && chunksOk ); +} + +// write index data for linear offsets entry to new index file +bool BamStandardIndex::BamStandardIndexPrivate::WriteLinearOffsets(const LinearOffsetVector& offsets) { + + size_t elementsWritten = 0; + + // write number of linear offsets + int32_t offsetCount = offsets.size(); + if ( m_isBigEndian ) SwapEndian_32(offsetCount); + elementsWritten += fwrite(&offsetCount, sizeof(offsetCount), 1, m_parent->m_indexStream); + + // iterate over linear offsets + LinearOffsetVector::const_iterator offsetIter = offsets.begin(); + LinearOffsetVector::const_iterator offsetEnd = offsets.end(); + for ( ; offsetIter != offsetEnd; ++offsetIter ) { + + // write linear offset + uint64_t linearOffset = (*offsetIter); + if ( m_isBigEndian ) SwapEndian_64(linearOffset); + elementsWritten += fwrite(&linearOffset, sizeof(linearOffset), 1, m_parent->m_indexStream); + } + + // return success/failure of write + return ( elementsWritten == (size_t)(offsetCount + 1) ); +} + +// write index data for a single reference to new index file +bool BamStandardIndex::BamStandardIndexPrivate::WriteReference(const ReferenceIndex& refEntry) { + bool refOk = true; + refOk &= WriteBins(refEntry.Bins); + refOk &= WriteLinearOffsets(refEntry.Offsets); + return refOk; +} + // --------------------------------------------------------------- // BamStandardIndex implementation -BamStandardIndex::BamStandardIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian) - : BamIndex(bgzf, reader, isBigEndian) +BamStandardIndex::BamStandardIndex(BgzfData* bgzf, BamReader* reader) + : BamIndex(bgzf, reader) { d = new BamStandardIndexPrivate(this); } @@ -752,21 +1233,20 @@ BamStandardIndex::~BamStandardIndex(void) { d = 0; } -// creates index data (in-memory) from current reader data +// BamIndex interface implementation bool BamStandardIndex::Build(void) { return d->Build(); } - -// returns whether reference has alignments or no +void BamStandardIndex::ClearAllData(void) { d->ClearAllData(); } +const off_t BamStandardIndex::DataBeginOffset(void) const { return d->DataBeginOffset(); } bool BamStandardIndex::HasAlignments(const int& referenceID) const { return d->HasAlignments(referenceID); } - -// attempts to use index to jump to region; returns success/fail +bool BamStandardIndex::HasFullDataCache(void) const { return d->HasFullDataCache(); } bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { return d->Jump(region, hasAlignmentsInRegion); } - -// loads existing data from file into memory -bool BamStandardIndex::Load(const string& filename) { return d->Load(filename); } - -// writes in-memory index data out to file -// N.B. - (this is the original BAM filename, method will modify it to use applicable extension) -bool BamStandardIndex::Write(const string& bamFilename) { return d->Write(bamFilename); } +void BamStandardIndex::KeepOnlyFirstReferenceOffsets(void) { d->KeepOnlyFirstReferenceOffsets(); } +bool BamStandardIndex::LoadAllReferences(bool saveData) { return d->LoadAllReferences(saveData); } +bool BamStandardIndex::LoadFirstReference(bool saveData) { return d->LoadFirstReference(saveData); } +bool BamStandardIndex::LoadHeader(void) { return d->LoadHeader(); } +bool BamStandardIndex::SkipToFirstReference(void) { return d->SkipToFirstReference(); } +bool BamStandardIndex::WriteAllReferences(void) { return d->WriteAllReferences(); } +bool BamStandardIndex::WriteHeader(void) { return d->WriteHeader(); } // ######################################################################################### // ######################################################################################### @@ -776,7 +1256,7 @@ bool BamStandardIndex::Write(const string& bamFilename) { return d->Write(bamFil namespace BamTools { -// individual index entry +// individual index offset entry struct BamToolsIndexEntry { // data members @@ -794,8 +1274,21 @@ struct BamToolsIndexEntry { { } }; +// reference index entry +struct BamToolsReferenceEntry { + + // data members + bool HasAlignments; + vector Offsets; + + // ctor + BamToolsReferenceEntry(void) + : HasAlignments(false) + { } +}; + // the actual index data structure -typedef map > BamToolsIndexData; +typedef map BamToolsIndexData; } // namespace BamTools @@ -814,77 +1307,148 @@ struct BamToolsIndex::BamToolsIndexPrivate { // do something new // else // do the old thing - enum Version { BTI_1_0 = 1 , BTI_1_1 , BTI_1_2 }; - // ------------------------- - // data members + // parent object + BamToolsIndex* m_parent; - BamToolsIndexData m_indexData; - BamToolsIndex* m_parent; + // data members int32_t m_blockSize; - Version m_version; - - // ------------------------- - // ctor & dtor - - BamToolsIndexPrivate(BamToolsIndex* parent) - : m_parent(parent) - , m_blockSize(1000) - , m_version(BTI_1_2) // latest version - used for writing new index files - { } - - ~BamToolsIndexPrivate(void) { } + BamToolsIndexData m_indexData; + off_t m_dataBeginOffset; + bool m_hasFullDataCache; + bool m_isBigEndian; + int32_t m_inputVersion; // Version is serialized as int + Version m_outputVersion; - // ------------------------- - // 'public' methods + // ctor & dtor + BamToolsIndexPrivate(BamToolsIndex* parent); + ~BamToolsIndexPrivate(void); - // creates index data (in-memory) from current reader data - bool Build(void); - // returns supported file extension - const string Extension(void) const { return string(".bti"); } - // returns whether reference has alignments or no - bool HasAlignments(const int& referenceID) const; - // attempts to use index to jump to region; returns success/fail - bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); - // loads existing data from file into memory - bool Load(const std::string& filename); - // writes in-memory index data out to file - // N.B. - (this is the original BAM filename, method will modify it to use applicable extension) - bool Write(const std::string& bamFilename); + // parent interface methods + public: + + // creates index data (in-memory) from current reader data + bool Build(void); + // clear all current index offset data in memory + void ClearAllData(void); + // return file position after header metadata + const off_t DataBeginOffset(void) const; + // returns whether reference has alignments or no + bool HasAlignments(const int& referenceID) const; + // return true if all index data is cached + bool HasFullDataCache(void) const; + // attempts to use index to jump to region; returns success/fail + bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); + // clears index data from all references except the first + void KeepOnlyFirstReferenceOffsets(void); + // load index data for all references, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadAllReferences(bool saveData = true); + // load first reference from file, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadFirstReference(bool saveData = true); + // load header data from index file, return true if loaded OK + bool LoadHeader(void); + // position file pointer to desired reference begin, return true if skipped OK + bool SkipToFirstReference(void); + // write header to new index file + bool WriteHeader(void); + // write index data for all references to new index file + bool WriteAllReferences(void); - // ------------------------- // internal methods - - // calculates offset for a given region - bool GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion); + private: + + // ----------------------- + // index file operations + + // check index file magic number, return true if OK + bool CheckMagicNumber(void); + // check index file version, return true if OK + bool CheckVersion(void); + // return true if FILE* is open + bool IsOpen(void) const; + // load a single index entry from file, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadIndexEntry(const int& refId, bool saveData = true); + // load a single reference from file, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadReference(const int& refId, bool saveData = true); + // loads number of references, return true if loaded OK + bool LoadReferenceCount(int& numReferences); + // position file pointer to desired reference begin, return true if skipped OK + bool SkipToReference(const int& refId); + // write current reference index data to new index file + bool WriteReferenceEntry(const BamToolsReferenceEntry& refEntry); + // write current index offset entry to new index file + bool WriteIndexEntry(const BamToolsIndexEntry& entry); + + // ----------------------- + // index data operations + + // clear all index offset data for desired reference + void ClearReferenceOffsets(const int& refId); + // calculate BAM file offset for desired region + // return true if no error (*NOT* equivalent to "has alignments or valid offset") + // check @hasAlignmentsInRegion to determine this status + // @region - target region + // @offset - resulting seek target + // @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status + bool GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion); + // returns true if index cache has data for desired reference + bool IsDataLoaded(const int& refId) const; + // clears index data from all references except the one specified + void KeepOnlyReferenceOffsets(const int& refId); + // saves an index offset entry in memory + void SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry); + // pre-allocates size for offset vector + void SetOffsetCount(const int& refId, const int& offsetCount); + // initializes index data structure to hold @count references + void SetReferenceCount(const int& count); }; +// ctor +BamToolsIndex::BamToolsIndexPrivate::BamToolsIndexPrivate(BamToolsIndex* parent) + : m_parent(parent) + , m_blockSize(1000) + , m_dataBeginOffset(0) + , m_hasFullDataCache(false) + , m_inputVersion(0) + , m_outputVersion(BTI_1_2) // latest version - used for writing new index files +{ + m_isBigEndian = BamTools::SystemIsBigEndian(); +} + +// dtor +BamToolsIndex::BamToolsIndexPrivate::~BamToolsIndexPrivate(void) { + ClearAllData(); +} + // creates index data (in-memory) from current reader data bool BamToolsIndex::BamToolsIndexPrivate::Build(void) { // localize parent data if ( m_parent == 0 ) return false; - BamReader* reader = m_parent->m_reader; - BgzfData* mBGZF = m_parent->m_BGZF; + BamReader* reader = m_parent->m_reader; + BgzfData* mBGZF = m_parent->m_BGZF; // be sure reader & BGZF file are valid & open for reading if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen ) return false; // move file pointer to beginning of alignments - // quit if failed - if ( !reader->Rewind() ) - return false; + if ( !reader->Rewind() ) return false; // initialize index data structure with space for all references const int numReferences = (int)m_parent->m_references.size(); - for ( int i = 0; i < numReferences; ++i ) - m_indexData[i].clear(); - + m_indexData.clear(); + m_hasFullDataCache = false; + SetReferenceCount(numReferences); + // set up counters and markers int32_t currentBlockCount = 0; int64_t currentAlignmentOffset = mBGZF->Tell(); @@ -901,7 +1465,8 @@ bool BamToolsIndex::BamToolsIndexPrivate::Build(void) { if ( currentBlockCount > 0 && al.RefID != blockRefId ) { // store previous data - m_indexData[blockRefId].push_back( BamToolsIndexEntry(blockMaxEndPosition, blockStartOffset, blockStartPosition) ); + BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition); + SaveOffsetEntry(blockRefId, entry); // intialize new block for current alignment's reference currentBlockCount = 0; @@ -925,7 +1490,8 @@ bool BamToolsIndex::BamToolsIndexPrivate::Build(void) { // if block is full, get offset for next block, reset currentBlockCount if ( currentBlockCount == m_blockSize ) { - m_indexData[blockRefId].push_back( BamToolsIndexEntry(blockMaxEndPosition, blockStartOffset, blockStartPosition) ); + BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition); + SaveOffsetEntry(blockRefId, entry); blockStartOffset = mBGZF->Tell(); currentBlockCount = 0; } @@ -936,22 +1502,120 @@ bool BamToolsIndex::BamToolsIndexPrivate::Build(void) { } // store final block with data - m_indexData[blockRefId].push_back( BamToolsIndexEntry(blockMaxEndPosition, blockStartOffset, blockStartPosition) ); + BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition); + SaveOffsetEntry(blockRefId, entry); - // attempt to rewind back to begnning of alignments - // return success/fail + // set flag + m_hasFullDataCache = true; + + // return success/failure of rewind return reader->Rewind(); } +// check index file magic number, return true if OK +bool BamToolsIndex::BamToolsIndexPrivate::CheckMagicNumber(void) { + + // see if index is valid BAM index + char magic[4]; + size_t elementsRead = fread(magic, 1, 4, m_parent->m_indexStream); + if ( elementsRead != 4 ) return false; + if ( strncmp(magic, "BTI\1", 4) != 0 ) { + fprintf(stderr, "Problem with index file - invalid format.\n"); + return false; + } + + // otherwise ok + return true; +} + +// check index file version, return true if OK +bool BamToolsIndex::BamToolsIndexPrivate::CheckVersion(void) { + + // read version from file + size_t elementsRead = fread(&m_inputVersion, sizeof(m_inputVersion), 1, m_parent->m_indexStream); + if ( elementsRead != 1 ) return false; + if ( m_isBigEndian ) SwapEndian_32(m_inputVersion); + + // if version is negative, or zero + if ( m_inputVersion <= 0 ) { + fprintf(stderr, "Problem with index file - invalid version.\n"); + return false; + } + + // if version is newer than can be supported by this version of bamtools + else if ( m_inputVersion > m_outputVersion ) { + fprintf(stderr, "Problem with index file - attempting to use an outdated version of BamTools with a newer index file.\n"); + fprintf(stderr, "Please update BamTools to a more recent version to support this index file.\n"); + return false; + } + + // ------------------------------------------------------------------ + // check for deprecated, unsupported versions + // (typically whose format did not accomodate a particular bug fix) + + else if ( (Version)m_inputVersion == BTI_1_0 ) { + fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to accessing data near reference ends.\n"); + fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n"); + return false; + } + + else if ( (Version)m_inputVersion == BTI_1_1 ) { + fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to handling empty references.\n"); + fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n"); + return false; + } + + // otherwise ok + else return true; +} + +// clear all current index offset data in memory +void BamToolsIndex::BamToolsIndexPrivate::ClearAllData(void) { + BamToolsIndexData::const_iterator indexIter = m_indexData.begin(); + BamToolsIndexData::const_iterator indexEnd = m_indexData.end(); + for ( ; indexIter != indexEnd; ++indexIter ) { + const int& refId = (*indexIter).first; + ClearReferenceOffsets(refId); + } +} + +// clear all index offset data for desired reference +void BamToolsIndex::BamToolsIndexPrivate::ClearReferenceOffsets(const int& refId) { + if ( m_indexData.find(refId) == m_indexData.end() ) return; + vector& offsets = m_indexData.at(refId).Offsets; + offsets.clear(); + m_hasFullDataCache = false; +} + +// return file position after header metadata +const off_t BamToolsIndex::BamToolsIndexPrivate::DataBeginOffset(void) const { + return m_dataBeginOffset; +} + +// calculate BAM file offset for desired region +// return true if no error (*NOT* equivalent to "has alignments or valid offset") +// check @hasAlignmentsInRegion to determine this status +// @region - target region +// @offset - resulting seek target +// @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status // N.B. - ignores isRightBoundSpecified bool BamToolsIndex::BamToolsIndexPrivate::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) { // return false if leftBound refID is not found in index data - if ( m_indexData.find(region.LeftRefID) == m_indexData.end() ) return false; + if ( m_indexData.find(region.LeftRefID) == m_indexData.end()) return false; - const vector referenceOffsets = m_indexData[region.LeftRefID]; + // load index data for region if not already cached + if ( !IsDataLoaded(region.LeftRefID) ) { + bool loadedOk = true; + loadedOk &= SkipToReference(region.LeftRefID); + loadedOk &= LoadReference(region.LeftRefID); + if ( !loadedOk ) return false; + } + + // localize index data for this reference (& sanity check that data actually exists) + const vector& referenceOffsets = m_indexData.at(region.LeftRefID).Offsets; if ( referenceOffsets.empty() ) return false; - + // ------------------------------------------------------- // calculate nearest index to jump to @@ -974,11 +1638,26 @@ bool BamToolsIndex::BamToolsIndexPrivate::GetOffset(const BamRegion& region, int } // returns whether reference has alignments or no -bool BamToolsIndex::BamToolsIndexPrivate::HasAlignments(const int& referenceID) const { - if ( m_indexData.find(referenceID) == m_indexData.end() ) - return false; - else - return ( !m_indexData.at(referenceID).empty() ); +bool BamToolsIndex::BamToolsIndexPrivate::HasAlignments(const int& refId) const { + if ( m_indexData.find(refId) == m_indexData.end()) return false; + const BamToolsReferenceEntry& refEntry = m_indexData.at(refId); + return refEntry.HasAlignments; +} + +// return true if all index data is cached +bool BamToolsIndex::BamToolsIndexPrivate::HasFullDataCache(void) const { + return m_hasFullDataCache; +} + +// returns true if index cache has data for desired reference +bool BamToolsIndex::BamToolsIndexPrivate::IsDataLoaded(const int& refId) const { + + if ( m_indexData.find(refId) == m_indexData.end() ) return false; // unknown reference + const BamToolsReferenceEntry& refEntry = m_indexData.at(refId); + if ( !refEntry.HasAlignments ) return true; // no data period + + // return whether offsets list contains data + return !refEntry.Offsets.empty(); } // attempts to use index to jump to region; returns success/fail @@ -989,8 +1668,8 @@ bool BamToolsIndex::BamToolsIndexPrivate::Jump(const BamRegion& region, bool* ha // localize parent data if ( m_parent == 0 ) return false; - BamReader* reader = m_parent->m_reader; - BgzfData* mBGZF = m_parent->m_BGZF; + BamReader* reader = m_parent->m_reader; + BgzfData* mBGZF = m_parent->m_BGZF; RefVector& references = m_parent->m_references; // check valid BamReader state @@ -1000,7 +1679,8 @@ bool BamToolsIndex::BamToolsIndexPrivate::Jump(const BamRegion& region, bool* ha } // make sure left-bound position is valid - if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) return false; + if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) + return false; // calculate nearest offset to jump to int64_t offset; @@ -1009,198 +1689,286 @@ bool BamToolsIndex::BamToolsIndexPrivate::Jump(const BamRegion& region, bool* ha return false; } - // attempt seek in file, return success/fail + // return success/failure of seek return mBGZF->Seek(offset); } -// loads existing data from file into memory -bool BamToolsIndex::BamToolsIndexPrivate::Load(const string& filename) { - - // localize parent data - if ( m_parent == 0 ) return false; - const bool isBigEndian = m_parent->m_isBigEndian; - - // open index file, abort on error - FILE* indexStream = fopen(filename.c_str(), "rb"); - if( !indexStream ) { - fprintf(stderr, "ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str()); - return false; - } +// clears index data from all references except the first +void BamToolsIndex::BamToolsIndexPrivate::KeepOnlyFirstReferenceOffsets(void) { + BamToolsIndexData::const_iterator indexBegin = m_indexData.begin(); + KeepOnlyReferenceOffsets( (*indexBegin).first ); +} - // set placeholder to receive input byte count (suppresses compiler warnings) - size_t elementsRead = 0; - - // see if index is valid BAM index - char magic[4]; - elementsRead = fread(magic, 1, 4, indexStream); - if ( strncmp(magic, "BTI\1", 4) ) { - fprintf(stderr, "Problem with index file - invalid format.\n"); - fclose(indexStream); - return false; +// clears index data from all references except the one specified +void BamToolsIndex::BamToolsIndexPrivate::KeepOnlyReferenceOffsets(const int& refId) { + BamToolsIndexData::iterator mapIter = m_indexData.begin(); + BamToolsIndexData::iterator mapEnd = m_indexData.end(); + for ( ; mapIter != mapEnd; ++mapIter ) { + const int entryRefId = (*mapIter).first; + if ( entryRefId != refId ) + ClearReferenceOffsets(entryRefId); } +} + +// load index data for all references, return true if loaded OK +bool BamToolsIndex::BamToolsIndexPrivate::LoadAllReferences(bool saveData) { + + // skip if data already loaded + if ( m_hasFullDataCache ) return true; + + // read in number of references + int32_t numReferences; + if ( !LoadReferenceCount(numReferences) ) return false; + //SetReferenceCount(numReferences); + + // iterate over reference entries + bool loadedOk = true; + for ( int i = 0; i < numReferences; ++i ) + loadedOk &= LoadReference(i, saveData); + + // set flag + if ( loadedOk && saveData ) + m_hasFullDataCache = true; + + // return success/failure of load + return loadedOk; +} + +// load header data from index file, return true if loaded OK +bool BamToolsIndex::BamToolsIndexPrivate::LoadHeader(void) { + + // check magic number + if ( !CheckMagicNumber() ) return false; + + // check BTI version + if ( !CheckVersion() ) return false; + + // read in block size + size_t elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, m_parent->m_indexStream); + if ( elementsRead != 1 ) return false; + if ( m_isBigEndian ) SwapEndian_32(m_blockSize); + + // store offset of beginning of data + m_dataBeginOffset = ftell64(m_parent->m_indexStream); + + // return success/failure of load + return (elementsRead == 1); +} + +// load a single index entry from file, return true if loaded OK +// @saveData - save data in memory if true, just read & discard if false +bool BamToolsIndex::BamToolsIndexPrivate::LoadIndexEntry(const int& refId, bool saveData) { - // read in version - int32_t indexVersion; - elementsRead = fread(&indexVersion, sizeof(indexVersion), 1, indexStream); - if ( isBigEndian ) SwapEndian_32(indexVersion); - if ( indexVersion <= 0 || indexVersion > m_version ) { - fprintf(stderr, "Problem with index file - unsupported version.\n"); - fclose(indexStream); + // read in index entry data members + size_t elementsRead = 0; + BamToolsIndexEntry entry; + elementsRead += fread(&entry.MaxEndPosition, sizeof(entry.MaxEndPosition), 1, m_parent->m_indexStream); + elementsRead += fread(&entry.StartOffset, sizeof(entry.StartOffset), 1, m_parent->m_indexStream); + elementsRead += fread(&entry.StartPosition, sizeof(entry.StartPosition), 1, m_parent->m_indexStream); + if ( elementsRead != 3 ) { + cerr << "Error reading index entry. Expected 3 elements, read in: " << elementsRead << endl; return false; } - if ( (Version)indexVersion == BTI_1_0 ) { - fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to accessing data near reference ends.\n"); - fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n"); - fclose(indexStream); - exit(1); - } - - if ( (Version)indexVersion == BTI_1_1 ) { - fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to handling empty references.\n"); - fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n"); - fclose(indexStream); - exit(1); + // swap endian-ness if necessary + if ( m_isBigEndian ) { + SwapEndian_32(entry.MaxEndPosition); + SwapEndian_64(entry.StartOffset); + SwapEndian_32(entry.StartPosition); } - // read in block size - elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, indexStream); - if ( isBigEndian ) SwapEndian_32(m_blockSize); + // save data + if ( saveData ) + SaveOffsetEntry(refId, entry); + + // return success/failure of load + return true; +} + +// load a single reference from file, return true if loaded OK +// @saveData - save data in memory if true, just read & discard if false +bool BamToolsIndex::BamToolsIndexPrivate::LoadFirstReference(bool saveData) { + BamToolsIndexData::const_iterator indexBegin = m_indexData.begin(); + return LoadReference( (*indexBegin).first, saveData ); +} + +// load a single reference from file, return true if loaded OK +// @saveData - save data in memory if true, just read & discard if false +bool BamToolsIndex::BamToolsIndexPrivate::LoadReference(const int& refId, bool saveData) { + + // read in number of offsets for this reference + uint32_t numOffsets; + size_t elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, m_parent->m_indexStream); + if ( elementsRead != 1 ) return false; + if ( m_isBigEndian ) SwapEndian_32(numOffsets); + + // initialize offsets container for this reference + SetOffsetCount(refId, (int)numOffsets); + // iterate over offset entries + for ( unsigned int j = 0; j < numOffsets; ++j ) + LoadIndexEntry(refId, saveData); + + // return success/failure of load + return true; +} + +// loads number of references, return true if loaded OK +bool BamToolsIndex::BamToolsIndexPrivate::LoadReferenceCount(int& numReferences) { + + size_t elementsRead = 0; + + // read reference count + elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_parent->m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numReferences); + + // return success/failure of load + return ( elementsRead == 1 ); +} + +// saves an index offset entry in memory +void BamToolsIndex::BamToolsIndexPrivate::SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry) { + BamToolsReferenceEntry& refEntry = m_indexData[refId]; + refEntry.HasAlignments = true; + refEntry.Offsets.push_back(entry); +} + +// pre-allocates size for offset vector +void BamToolsIndex::BamToolsIndexPrivate::SetOffsetCount(const int& refId, const int& offsetCount) { + BamToolsReferenceEntry& refEntry = m_indexData[refId]; + refEntry.Offsets.reserve(offsetCount); + refEntry.HasAlignments = ( offsetCount > 0); +} + +// initializes index data structure to hold @count references +void BamToolsIndex::BamToolsIndexPrivate::SetReferenceCount(const int& count) { + for ( int i = 0; i < count; ++i ) + m_indexData[i].HasAlignments = false; +} + +// position file pointer to first reference begin, return true if skipped OK +bool BamToolsIndex::BamToolsIndexPrivate::SkipToFirstReference(void) { + BamToolsIndexData::const_iterator indexBegin = m_indexData.begin(); + return SkipToReference( (*indexBegin).first ); +} + +// position file pointer to desired reference begin, return true if skipped OK +bool BamToolsIndex::BamToolsIndexPrivate::SkipToReference(const int& refId) { + + // attempt rewind + if ( !m_parent->Rewind() ) return false; + // read in number of references int32_t numReferences; - elementsRead = fread(&numReferences, sizeof(numReferences), 1, indexStream); - if ( isBigEndian ) SwapEndian_32(numReferences); + size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_parent->m_indexStream); + if ( elementsRead != 1 ) return false; + if ( m_isBigEndian ) SwapEndian_32(numReferences); // iterate over reference entries - for ( int i = 0; i < numReferences; ++i ) { - - // read in number of offsets for this reference - uint32_t numOffsets; - elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, indexStream); - if ( isBigEndian ) SwapEndian_32(numOffsets); - - // initialize offsets container for this reference - vector offsets; - offsets.reserve(numOffsets); - - // iterate over offset entries - for ( unsigned int j = 0; j < numOffsets; ++j ) { - - // copy entry data - int32_t maxEndPosition; - int64_t startOffset; - int32_t startPosition; - - // read in data - elementsRead = fread(&maxEndPosition, sizeof(maxEndPosition), 1, indexStream); - elementsRead = fread(&startOffset, sizeof(startOffset), 1, indexStream); - elementsRead = fread(&startPosition, sizeof(startPosition), 1, indexStream); - - // swap endian-ness if necessary - if ( isBigEndian ) { - SwapEndian_32(maxEndPosition); - SwapEndian_64(startOffset); - SwapEndian_32(startPosition); - } - - // save current index entry - offsets.push_back( BamToolsIndexEntry(maxEndPosition, startOffset, startPosition) ); - } - - // save refID, offsetVector entry into data structure - m_indexData.insert( make_pair(i, offsets) ); + bool skippedOk = true; + int currentRefId = 0; + while (currentRefId != refId) { + skippedOk &= LoadReference(currentRefId, false); + ++currentRefId; } - // close index file and return - fclose(indexStream); - return true; + // return success/failure of skip + return skippedOk; } -// writes in-memory index data out to file -// N.B. - (this is the original BAM filename, method will modify it to use applicable extension) -bool BamToolsIndex::BamToolsIndexPrivate::Write(const std::string& bamFilename) { - - // localize parent data - if ( m_parent == 0 ) return false; - const bool isBigEndian = m_parent->m_isBigEndian; - - // open index file for writing - string indexFilename = bamFilename + ".bti"; - FILE* indexStream = fopen(indexFilename.c_str(), "wb"); - if ( indexStream == 0 ) { - fprintf(stderr, "ERROR: Could not open file to save index.\n"); - return false; - } +// write header to new index file +bool BamToolsIndex::BamToolsIndexPrivate::WriteHeader(void) { + + size_t elementsWritten = 0; // write BTI index format 'magic number' - fwrite("BTI\1", 1, 4, indexStream); + elementsWritten += fwrite("BTI\1", 1, 4, m_parent->m_indexStream); // write BTI index format version - int32_t currentVersion = (int32_t)m_version; - if ( isBigEndian ) SwapEndian_32(currentVersion); - fwrite(¤tVersion, sizeof(int32_t), 1, indexStream); - + int32_t currentVersion = (int32_t)m_outputVersion; + if ( m_isBigEndian ) SwapEndian_32(currentVersion); + elementsWritten += fwrite(¤tVersion, sizeof(currentVersion), 1, m_parent->m_indexStream); + // write block size int32_t blockSize = m_blockSize; - if ( isBigEndian ) SwapEndian_32(blockSize); - fwrite(&blockSize, sizeof(blockSize), 1, indexStream); - + if ( m_isBigEndian ) SwapEndian_32(blockSize); + elementsWritten += fwrite(&blockSize, sizeof(blockSize), 1, m_parent->m_indexStream); + + // store offset of beginning of data + m_dataBeginOffset = ftell64(m_parent->m_indexStream); + + // return success/failure of write + return ( elementsWritten == 6 ); +} + +// write index data for all references to new index file +bool BamToolsIndex::BamToolsIndexPrivate::WriteAllReferences(void) { + + size_t elementsWritten = 0; + // write number of references int32_t numReferences = (int32_t)m_indexData.size(); - if ( isBigEndian ) SwapEndian_32(numReferences); - fwrite(&numReferences, sizeof(numReferences), 1, indexStream); - - // iterate through references in index + if ( m_isBigEndian ) SwapEndian_32(numReferences); + elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, m_parent->m_indexStream); + + // iterate through references in index + bool refOk = true; BamToolsIndexData::const_iterator refIter = m_indexData.begin(); BamToolsIndexData::const_iterator refEnd = m_indexData.end(); - for ( ; refIter != refEnd; ++refIter ) { - - const vector offsets = (*refIter).second; - - // write number of offsets listed for this reference - uint32_t numOffsets = offsets.size(); - if ( isBigEndian ) SwapEndian_32(numOffsets); - fwrite(&numOffsets, sizeof(numOffsets), 1, indexStream); - - // iterate over offset entries - vector::const_iterator offsetIter = offsets.begin(); - vector::const_iterator offsetEnd = offsets.end(); - for ( ; offsetIter != offsetEnd; ++offsetIter ) { - - // get reference index data - const BamToolsIndexEntry& entry = (*offsetIter); - - // copy entry data - int32_t maxEndPosition = entry.MaxEndPosition; - int64_t startOffset = entry.StartOffset; - int32_t startPosition = entry.StartPosition; - - // swap endian-ness if necessary - if ( isBigEndian ) { - SwapEndian_32(maxEndPosition); - SwapEndian_64(startOffset); - SwapEndian_32(startPosition); - } - - // write the reference index entry - fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, indexStream); - fwrite(&startOffset, sizeof(startOffset), 1, indexStream); - fwrite(&startPosition, sizeof(startPosition), 1, indexStream); - } + for ( ; refIter != refEnd; ++refIter ) + refOk &= WriteReferenceEntry( (*refIter).second ); + + return ( (elementsWritten == 1) && refOk ); +} + +// write current reference index data to new index file +bool BamToolsIndex::BamToolsIndexPrivate::WriteReferenceEntry(const BamToolsReferenceEntry& refEntry) { + + size_t elementsWritten = 0; + + // write number of offsets listed for this reference + uint32_t numOffsets = refEntry.Offsets.size(); + if ( m_isBigEndian ) SwapEndian_32(numOffsets); + elementsWritten += fwrite(&numOffsets, sizeof(numOffsets), 1, m_parent->m_indexStream); + + // iterate over offset entries + bool entriesOk = true; + vector::const_iterator offsetIter = refEntry.Offsets.begin(); + vector::const_iterator offsetEnd = refEntry.Offsets.end(); + for ( ; offsetIter != offsetEnd; ++offsetIter ) + entriesOk &= WriteIndexEntry( (*offsetIter) ); + + return ( (elementsWritten == 1) && entriesOk ); +} + +// write current index offset entry to new index file +bool BamToolsIndex::BamToolsIndexPrivate::WriteIndexEntry(const BamToolsIndexEntry& entry) { + + // copy entry data + int32_t maxEndPosition = entry.MaxEndPosition; + int64_t startOffset = entry.StartOffset; + int32_t startPosition = entry.StartPosition; + + // swap endian-ness if necessary + if ( m_isBigEndian ) { + SwapEndian_32(maxEndPosition); + SwapEndian_64(startOffset); + SwapEndian_32(startPosition); } - // flush any remaining output, close file, and return success - fflush(indexStream); - fclose(indexStream); - return true; + // write the reference index entry + size_t elementsWritten = 0; + elementsWritten += fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, m_parent->m_indexStream); + elementsWritten += fwrite(&startOffset, sizeof(startOffset), 1, m_parent->m_indexStream); + elementsWritten += fwrite(&startPosition, sizeof(startPosition), 1, m_parent->m_indexStream); + return ( elementsWritten == 3 ); } // --------------------------------------------------- // BamToolsIndex implementation -BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian) - : BamIndex(bgzf, reader, isBigEndian) +BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader) + : BamIndex(bgzf, reader) { d = new BamToolsIndexPrivate(this); } @@ -1210,21 +1978,17 @@ BamToolsIndex::~BamToolsIndex(void) { d = 0; } -// creates index data (in-memory) from current reader data +// BamIndex interface implementation bool BamToolsIndex::Build(void) { return d->Build(); } - -// returns whether reference has alignments or no +void BamToolsIndex::ClearAllData(void) { d->ClearAllData(); } +const off_t BamToolsIndex::DataBeginOffset(void) const { return d->DataBeginOffset(); } bool BamToolsIndex::HasAlignments(const int& referenceID) const { return d->HasAlignments(referenceID); } - -// attempts to use index to jump to region; returns success/fail -// a "successful" jump indicates no error, but not whether this region has data -// * thus, the method sets a flag to indicate whether there are alignments -// available after the jump position +bool BamToolsIndex::HasFullDataCache(void) const { return d->HasFullDataCache(); } bool BamToolsIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { return d->Jump(region, hasAlignmentsInRegion); } - -// loads existing data from file into memory -bool BamToolsIndex::Load(const string& filename) { return d->Load(filename); } - -// writes in-memory index data out to file -// N.B. - (this is the original BAM filename, method will modify it to use applicable extension) -bool BamToolsIndex::Write(const string& bamFilename) { return d->Write(bamFilename); } +void BamToolsIndex::KeepOnlyFirstReferenceOffsets(void) { d->KeepOnlyFirstReferenceOffsets(); } +bool BamToolsIndex::LoadAllReferences(bool saveData) { return d->LoadAllReferences(saveData); } +bool BamToolsIndex::LoadFirstReference(bool saveData) { return d->LoadFirstReference(saveData); } +bool BamToolsIndex::LoadHeader(void) { return d->LoadHeader(); } +bool BamToolsIndex::SkipToFirstReference(void) { return d->SkipToFirstReference(); } +bool BamToolsIndex::WriteAllReferences(void) { return d->WriteAllReferences(); } +bool BamToolsIndex::WriteHeader(void) { return d->WriteHeader(); } diff --git a/src/api/BamIndex.h b/src/api/BamIndex.h index 9da858f..89e9d2b 100644 --- a/src/api/BamIndex.h +++ b/src/api/BamIndex.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 8 October 2010 (DB) +// Last modified: 20 October 2010 (DB) // --------------------------------------------------------------------------- // Provides index functionality - both for the standardized BAM index format // (".bai") as well as a BamTools-specific (nonstandard) index format (".bti"). @@ -15,7 +15,7 @@ #include #include #include -#include "BamAlignment.h" +#include "BamAux.h" namespace BamTools { @@ -26,10 +26,22 @@ class BgzfData; // BamIndex base class class BamIndex { + // specify index-caching behavior + // + // @FullIndexCaching - store entire index file contents in memory + // @LimitedIndexCaching - store only index data for current reference + // being processed + // @NoIndexCaching - do not store any index data. Load as needed to + // calculate jump offset + public: enum BamIndexCacheMode { FullIndexCaching = 0 + , LimitedIndexCaching + , NoIndexCaching + }; + // ctor & dtor public: - BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader, bool isBigEndian); - virtual ~BamIndex(void) { } + BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader); + virtual ~BamIndex(void); // index interface public: @@ -37,19 +49,59 @@ class BamIndex { virtual bool Build(void) =0; // returns supported file extension virtual const std::string Extension(void) const =0; + // returns whether reference has alignments or no + virtual bool HasAlignments(const int& referenceID) const =0; // attempts to use index to jump to region; returns success/fail // a "successful" jump indicates no error, but not whether this region has data // * thus, the method sets a flag to indicate whether there are alignments // available after the jump position virtual bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion) =0; - // returns whether reference has alignments or no - virtual bool HasAlignments(const int& referenceID) const =0; // loads existing data from file into memory - virtual bool Load(const std::string& filename) =0; + virtual bool Load(const std::string& filename); + // change the index caching behavior + virtual void SetCacheMode(const BamIndexCacheMode mode); // writes in-memory index data out to file // N.B. - (this is the original BAM filename, method will modify it to use applicable extension) - virtual bool Write(const std::string& bamFilename) =0; + virtual bool Write(const std::string& bamFilename); + // derived-classes MUST provide implementation + protected: + // clear all current index offset data in memory + virtual void ClearAllData(void) =0; + // return file position after header metadata + virtual const off_t DataBeginOffset(void) const =0; + // return true if all index data is cached + virtual bool HasFullDataCache(void) const =0; + // clears index data from all references except the first + virtual void KeepOnlyFirstReferenceOffsets(void) =0; + // load index data for all references, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + virtual bool LoadAllReferences(bool saveData = true) =0; + // load first reference from file, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + virtual bool LoadFirstReference(bool saveData = true) =0; + // load header data from index file, return true if loaded OK + virtual bool LoadHeader(void) =0; + // position file pointer to first reference begin, return true if skipped OK + virtual bool SkipToFirstReference(void) =0; + // write index reference data + virtual bool WriteAllReferences(void) =0; + // write index header data + virtual bool WriteHeader(void) =0; + + // internal methods + protected: + // rewind index file to beginning of index data, return true if rewound OK + bool Rewind(void); + + private: + // return true if FILE* is open + bool IsOpen(void) const; + // opens index file according to requested mode, return true if opened OK + bool OpenIndexFile(const std::string& filename, const std::string& mode); + // updates in-memory cache of index data, depending on current cache mode + void UpdateCache(void); + // factory methods for returning proper BamIndex-derived type based on available index files public: @@ -59,24 +111,23 @@ class BamIndex { // // ** default preferred type is BamToolsIndex ** use this anytime it exists enum PreferredIndexType { BAMTOOLS = 0, STANDARD }; - static BamIndex* FromBamFilename(const std::string& bamFilename, - BamTools::BgzfData* bgzf, + static BamIndex* FromBamFilename(const std::string& bamFilename, + BamTools::BgzfData* bgzf, BamTools::BamReader* reader, - bool isBigEndian, const BamIndex::PreferredIndexType& type = BamIndex::BAMTOOLS); // returns index based on explicitly named index file (or 0 if not found) - static BamIndex* FromIndexFilename(const std::string& indexFilename, - BamTools::BgzfData* bgzf, - BamTools::BamReader* reader, - bool isBigEndian); + static BamIndex* FromIndexFilename(const std::string& indexFilename, + BamTools::BgzfData* bgzf, + BamTools::BamReader* reader); // data members protected: BamTools::BgzfData* m_BGZF; BamTools::BamReader* m_reader; BamTools::RefVector m_references; - bool m_isBigEndian; + BamIndex::BamIndexCacheMode m_cacheMode; + FILE* m_indexStream; }; // -------------------------------------------------- @@ -89,8 +140,7 @@ class BamStandardIndex : public BamIndex { // ctor & dtor public: BamStandardIndex(BamTools::BgzfData* bgzf, - BamTools::BamReader* reader, - bool isBigEndian); + BamTools::BamReader* reader); ~BamStandardIndex(void); // interface (implements BamIndex virtual methods) @@ -106,12 +156,30 @@ class BamStandardIndex : public BamIndex { // * thus, the method sets a flag to indicate whether there are alignments // available after the jump position bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); - // loads existing data from file into memory - bool Load(const std::string& filename); - // writes in-memory index data out to file - // N.B. - (this is the original BAM filename, method will modify it to use applicable extension) - bool Write(const std::string& bamFilename); - + protected: + // clear all current index offset data in memory + void ClearAllData(void); + // return file position after header metadata + const off_t DataBeginOffset(void) const; + // return true if all index data is cached + bool HasFullDataCache(void) const; + // clears index data from all references except the first + void KeepOnlyFirstReferenceOffsets(void); + // load index data for all references, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadAllReferences(bool saveData = true); + // load first reference from file, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadFirstReference(bool saveData = true); + // load header data from index file, return true if loaded OK + bool LoadHeader(void); + // position file pointer to first reference begin, return true if skipped OK + bool SkipToFirstReference(void); + // write index reference data + bool WriteAllReferences(void); + // write index header data + bool WriteHeader(void); + // internal implementation private: struct BamStandardIndexPrivate; @@ -127,8 +195,7 @@ class BamToolsIndex : public BamIndex { // ctor & dtor public: BamToolsIndex(BamTools::BgzfData* bgzf, - BamTools::BamReader* reader, - bool isBigEndian); + BamTools::BamReader* reader); ~BamToolsIndex(void); // interface (implements BamIndex virtual methods) @@ -144,12 +211,30 @@ class BamToolsIndex : public BamIndex { // * thus, the method sets a flag to indicate whether there are alignments // available after the jump position bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); - // loads existing data from file into memory - bool Load(const std::string& filename); - // writes in-memory index data out to file - // N.B. - (this is the original BAM filename, method will modify it to use applicable extension) - bool Write(const std::string& bamFilename); - + protected: + // clear all current index offset data in memory + void ClearAllData(void); + // return file position after header metadata + const off_t DataBeginOffset(void) const; + // return true if all index data is cached + bool HasFullDataCache(void) const; + // clears index data from all references except the first + void KeepOnlyFirstReferenceOffsets(void); + // load index data for all references, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadAllReferences(bool saveData = true); + // load first reference from file, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + bool LoadFirstReference(bool saveData = true); + // load header data from index file, return true if loaded OK + bool LoadHeader(void); + // position file pointer to first reference begin, return true if skipped OK + bool SkipToFirstReference(void); + // write index reference data + bool WriteAllReferences(void); + // write index header data + bool WriteHeader(void); + // internal implementation private: struct BamToolsIndexPrivate; @@ -158,14 +243,16 @@ class BamToolsIndex : public BamIndex { // -------------------------------------------------- // BamIndex factory methods -// -// return proper BamIndex-derived type based on available index files +// returns index based on BAM filename 'stub' +// checks first for preferred type, returns that type if found +// (if not found, attmempts to load other type(s), returns 0 if NONE found) +// +// ** default preferred type is BamToolsIndex ** use this anytime it exists inline BamIndex* BamIndex::FromBamFilename(const std::string& bamFilename, BamTools::BgzfData* bgzf, BamTools::BamReader* reader, - bool isBigEndian, const BamIndex::PreferredIndexType& type) { // --------------------------------------------------- @@ -174,27 +261,27 @@ BamIndex* BamIndex::FromBamFilename(const std::string& bamFilename, const std::string bamtoolsIndexFilename = bamFilename + ".bti"; const bool bamtoolsIndexExists = BamTools::FileExists(bamtoolsIndexFilename); if ( (type == BamIndex::BAMTOOLS) && bamtoolsIndexExists ) - return new BamToolsIndex(bgzf, reader, isBigEndian); + return new BamToolsIndex(bgzf, reader); const std::string standardIndexFilename = bamFilename + ".bai"; const bool standardIndexExists = BamTools::FileExists(standardIndexFilename); if ( (type == BamIndex::STANDARD) && standardIndexExists ) - return new BamStandardIndex(bgzf, reader, isBigEndian); + return new BamStandardIndex(bgzf, reader); // ---------------------------------------------------- // preferred type could not be found, try other (non-preferred) types // if none found, return 0 - if ( bamtoolsIndexExists ) return new BamToolsIndex(bgzf, reader, isBigEndian); - if ( standardIndexExists ) return new BamStandardIndex(bgzf, reader, isBigEndian); + if ( bamtoolsIndexExists ) return new BamToolsIndex(bgzf, reader); + if ( standardIndexExists ) return new BamStandardIndex(bgzf, reader); return 0; } +// returns index based on explicitly named index file (or 0 if not found) inline -BamIndex* BamIndex::FromIndexFilename(const std::string& indexFilename, - BamTools::BgzfData* bgzf, - BamTools::BamReader* reader, - bool isBigEndian) +BamIndex* BamIndex::FromIndexFilename(const std::string& indexFilename, + BamTools::BgzfData* bgzf, + BamTools::BamReader* reader) { // see if specified file exists const bool indexExists = BamTools::FileExists(indexFilename); @@ -205,11 +292,11 @@ BamIndex* BamIndex::FromIndexFilename(const std::string& indexFilename, // if has bamtoolsIndexExtension if ( indexFilename.find(bamtoolsIndexExtension) == (indexFilename.length() - bamtoolsIndexExtension.length()) ) - return new BamToolsIndex(bgzf, reader, isBigEndian); + return new BamToolsIndex(bgzf, reader); // if has standardIndexExtension if ( indexFilename.find(standardIndexExtension) == (indexFilename.length() - standardIndexExtension.length()) ) - return new BamStandardIndex(bgzf, reader, isBigEndian); + return new BamStandardIndex(bgzf, reader); // otherwise, unsupported file type return 0; diff --git a/src/api/BamReader.cpp b/src/api/BamReader.cpp index ce21fab..d7476db 100644 --- a/src/api/BamReader.cpp +++ b/src/api/BamReader.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 9 October 2010 (DB) +// Last modified: 13 October 2010 (DB) // --------------------------------------------------------------------------- // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad // Institute. @@ -19,7 +19,6 @@ #include #include "BGZF.h" #include "BamReader.h" -#include "BamIndex.h" using namespace BamTools; using namespace std; @@ -42,11 +41,14 @@ struct BamReader::BamReaderPrivate { string HeaderText; BamIndex* Index; RefVector References; - bool IsIndexLoaded; + bool HasIndex; int64_t AlignmentsBeginOffset; string Filename; string IndexFilename; + // index caching mode + BamIndex::BamIndexCacheMode IndexCacheMode; + // system data bool IsBigEndian; @@ -61,15 +63,12 @@ struct BamReader::BamReaderPrivate { const char* DNA_LOOKUP; const char* CIGAR_LOOKUP; - // ------------------------------- // constructor & destructor - // ------------------------------- BamReaderPrivate(BamReader* parent); ~BamReaderPrivate(void); // ------------------------------- // "public" interface - // ------------------------------- // file operations void Close(void); @@ -89,34 +88,37 @@ struct BamReader::BamReaderPrivate { // index operations bool CreateIndex(bool useStandardIndex); + void SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode); - // ------------------------------- - // internal methods - // ------------------------------- - // *** reading alignments and auxiliary data *** // - - // adjusts requested region if necessary (depending on where data actually begins) - void AdjustRegion(BamRegion& region); - // fills out character data for BamAlignment data - bool BuildCharData(BamAlignment& bAlignment); - // checks to see if alignment overlaps current region - RegionState IsOverlap(BamAlignment& bAlignment); - // retrieves header text from BAM file - void LoadHeaderData(void); - // retrieves BAM alignment under file pointer - bool LoadNextAlignment(BamAlignment& bAlignment); - // builds reference data structure from BAM file - void LoadReferenceData(void); - // mark references with 'HasAlignments' status - void MarkReferences(void); - - // *** index file handling *** // - - // clear out inernal index data structure - void ClearIndex(void); - // loads index from BAM index file - bool LoadIndex(const bool lookForIndex, const bool preferStandardIndex); + // internal methods + private: + + // --------------------------------------- + // reading alignments and auxiliary data + + // adjusts requested region if necessary (depending on where data actually begins) + void AdjustRegion(BamRegion& region); + // fills out character data for BamAlignment data + bool BuildCharData(BamAlignment& bAlignment); + // checks to see if alignment overlaps current region + RegionState IsOverlap(BamAlignment& bAlignment); + // retrieves header text from BAM file + void LoadHeaderData(void); + // retrieves BAM alignment under file pointer + bool LoadNextAlignment(BamAlignment& bAlignment); + // builds reference data structure from BAM file + void LoadReferenceData(void); + // mark references with 'HasAlignments' status + void MarkReferences(void); + + // --------------------------------- + // index file handling + + // clear out inernal index data structure + void ClearIndex(void); + // loads index from BAM index file + bool LoadIndex(const bool lookForIndex, const bool preferStandardIndex); }; // ----------------------------------------------------- @@ -135,7 +137,8 @@ BamReader::~BamReader(void) { // file operations void BamReader::Close(void) { d->Close(); } -bool BamReader::IsIndexLoaded(void) const { return d->IsIndexLoaded; } +bool BamReader::HasIndex(void) const { return d->HasIndex; } +bool BamReader::IsIndexLoaded(void) const { return HasIndex(); } bool BamReader::IsOpen(void) const { return d->mBGZF.IsOpen; } bool BamReader::Jump(int refID, int position) { return d->SetRegion( BamRegion(refID, position) ); } bool BamReader::Open(const std::string& filename, @@ -164,6 +167,7 @@ const std::string BamReader::GetFilename(void) const { return d->Filename; } // index operations bool BamReader::CreateIndex(bool useStandardIndex) { return d->CreateIndex(useStandardIndex); } +void BamReader::SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode) { d->SetIndexCacheMode(mode); } // ----------------------------------------------------- // BamReaderPrivate implementation @@ -172,8 +176,9 @@ bool BamReader::CreateIndex(bool useStandardIndex) { return d->CreateIndex(useSt // constructor BamReader::BamReaderPrivate::BamReaderPrivate(BamReader* parent) : Index(0) - , IsIndexLoaded(false) + , HasIndex(false) , AlignmentsBeginOffset(0) + , IndexCacheMode(BamIndex::LimitedIndexCaching) , HasAlignmentsInRegion(true) , Parent(parent) , DNA_LOOKUP("=ACMGRSVTWYHKDBN") @@ -217,17 +222,17 @@ void BamReader::BamReaderPrivate::AdjustRegion(BamRegion& region) { bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) { // calculate character lengths/offsets - const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE; - const unsigned int seqDataOffset = bAlignment.SupportData.QueryNameLength + (bAlignment.SupportData.NumCigarOperations * 4); - const unsigned int qualDataOffset = seqDataOffset + (bAlignment.SupportData.QuerySequenceLength+1)/2; - const unsigned int tagDataOffset = qualDataOffset + bAlignment.SupportData.QuerySequenceLength; - const unsigned int tagDataLength = dataLength - tagDataOffset; + const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE; + const unsigned int seqDataOffset = bAlignment.SupportData.QueryNameLength + (bAlignment.SupportData.NumCigarOperations * 4); + const unsigned int qualDataOffset = seqDataOffset + (bAlignment.SupportData.QuerySequenceLength+1)/2; + const unsigned int tagDataOffset = qualDataOffset + bAlignment.SupportData.QuerySequenceLength; + const unsigned int tagDataLength = dataLength - tagDataOffset; // set up char buffers - const char* allCharData = bAlignment.SupportData.AllCharData.data(); - const char* seqData = ((const char*)allCharData) + seqDataOffset; - const char* qualData = ((const char*)allCharData) + qualDataOffset; - char* tagData = ((char*)allCharData) + tagDataOffset; + const char* allCharData = bAlignment.SupportData.AllCharData.data(); + const char* seqData = ((const char*)allCharData) + seqDataOffset; + const char* qualData = ((const char*)allCharData) + qualDataOffset; + char* tagData = ((char*)allCharData) + tagDataOffset; // store alignment name (relies on null char in name as terminator) bAlignment.Name.assign((const char*)(allCharData)); @@ -236,7 +241,7 @@ bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) { bAlignment.QueryBases.clear(); bAlignment.QueryBases.reserve(bAlignment.SupportData.QuerySequenceLength); for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) { - char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ]; + char singleBase = DNA_LOOKUP[ ( (seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ]; bAlignment.QueryBases.append(1, singleBase); } @@ -363,7 +368,7 @@ bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) { void BamReader::BamReaderPrivate::ClearIndex(void) { delete Index; Index = 0; - IsIndexLoaded = false; + HasIndex = false; } // closes the BAM file @@ -392,15 +397,17 @@ bool BamReader::BamReaderPrivate::CreateIndex(bool useStandardIndex) { // create index based on type requested if ( useStandardIndex ) - Index = new BamStandardIndex(&mBGZF, Parent, IsBigEndian); - // create BamTools 'custom' index + Index = new BamStandardIndex(&mBGZF, Parent); else - Index = new BamToolsIndex(&mBGZF, Parent, IsBigEndian); + Index = new BamToolsIndex(&mBGZF, Parent); + + // set index cache mode to full for writing + Index->SetCacheMode(BamIndex::FullIndexCaching); // build new index bool ok = true; ok &= Index->Build(); - IsIndexLoaded = ok; + HasIndex = ok; // mark empty references MarkReferences(); @@ -408,6 +415,9 @@ bool BamReader::BamReaderPrivate::CreateIndex(bool useStandardIndex) { // attempt to save index data to file ok &= Index->Write(Filename); + // set client's desired index cache mode + Index->SetCacheMode(IndexCacheMode); + // return success/fail of both building & writing index return ok; } @@ -576,7 +586,7 @@ bool BamReader::BamReaderPrivate::LoadIndex(const bool lookForIndex, const bool // attempt to load BamIndex based on current Filename provided & preferStandardIndex flag const BamIndex::PreferredIndexType type = (preferStandardIndex ? BamIndex::STANDARD : BamIndex::BAMTOOLS); - Index = BamIndex::FromBamFilename(Filename, &mBGZF, Parent, IsBigEndian, type); + Index = BamIndex::FromBamFilename(Filename, &mBGZF, Parent, type); // if null, return failure if ( Index == 0 ) return false; @@ -588,21 +598,23 @@ bool BamReader::BamReaderPrivate::LoadIndex(const bool lookForIndex, const bool else { // attempt to load BamIndex based on IndexFilename provided by client - Index = BamIndex::FromIndexFilename(IndexFilename, &mBGZF, Parent, IsBigEndian); + Index = BamIndex::FromIndexFilename(IndexFilename, &mBGZF, Parent); // if null, return failure if ( Index == 0 ) return false; } - - // an index file was found - // return success of loading the index data from file - IsIndexLoaded = Index->Load(IndexFilename); + + // set cache mode for BamIndex + Index->SetCacheMode(IndexCacheMode); + + // loading the index data from file + HasIndex = Index->Load(IndexFilename); // mark empty references MarkReferences(); // return index status - return IsIndexLoaded; + return HasIndex; } // populates BamAlignment with alignment data under file pointer, returns success/fail @@ -725,7 +737,7 @@ void BamReader::BamReaderPrivate::LoadReferenceData(void) { void BamReader::BamReaderPrivate::MarkReferences(void) { // ensure index is available - if ( Index == 0 || !IsIndexLoaded ) return; + if ( !HasIndex ) return; // mark empty references for ( int i = 0; i < (int)References.size(); ++i ) @@ -784,6 +796,13 @@ bool BamReader::BamReaderPrivate::Rewind(void) { return mBGZF.Seek(AlignmentsBeginOffset); } +// change the index caching behavior +void BamReader::BamReaderPrivate::SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode) { + IndexCacheMode = mode; + if ( Index == 0 ) return; + Index->SetCacheMode(mode); +} + // asks Index to attempt a Jump() to specified region // returns success/failure bool BamReader::BamReaderPrivate::SetRegion(const BamRegion& region) { @@ -799,7 +818,7 @@ bool BamReader::BamReaderPrivate::SetRegion(const BamRegion& region) { Region.clear(); // check for existing index - if ( !IsIndexLoaded || Index == 0 ) return false; + if ( !HasIndex ) return false; // adjust region if necessary to reflect where data actually begins BamRegion adjustedRegion(region); @@ -822,8 +841,7 @@ bool BamReader::BamReaderPrivate::SetRegion(const BamRegion& region) { // If this occurs, any subsequent calls to GetNexAlignment[Core] simply return false // BamMultiReader is then able to successfully pull alignments from a region from multiple files // even if one or more have no data. - if ( !Index->Jump(adjustedRegion, &HasAlignmentsInRegion) ) - return false; + if ( !Index->Jump(adjustedRegion, &HasAlignmentsInRegion) ) return false; // save region and return success Region = adjustedRegion; diff --git a/src/api/BamReader.h b/src/api/BamReader.h index 132c95b..1160bd5 100644 --- a/src/api/BamReader.h +++ b/src/api/BamReader.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 18 September 2010 (DB) +// Last modified: 13 October 2010 (DB) // --------------------------------------------------------------------------- // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad // Institute. @@ -16,6 +16,7 @@ #include #include "BamAlignment.h" +#include "BamIndex.h" namespace BamTools { @@ -35,15 +36,15 @@ class BamReader { // close BAM file void Close(void); - // returns whether index data is loaded (i.e. reader is able to Jump() or not) - bool IsIndexLoaded(void) const; // returns whether reader is open for reading or not bool IsOpen(void) const; // performs random-access jump using (reference, position) as a left-bound bool Jump(int refID, int position = 0); // opens BAM file (and optional BAM index file, if provided) - // @lookForIndex - if no indexFilename provided, look for an existing index file - // @preferStandardIndex - if true, give priority in index file searching to standard BAM index + // @lookForIndex - if no indexFilename provided, look in BAM file's directory for an existing index file + // default behavior is to skip index file search if no index filename given + // @preferStandardIndex - if true, give priority in index file searching to standard BAM index (*.bai) + // default behavior is to prefer the BamToolsIndex (*.bti) if both are available bool Open(const std::string& filename, const std::string& indexFilename = "", const bool lookForIndex = false, @@ -51,8 +52,7 @@ class BamReader { // returns file pointer to beginning of alignments bool Rewind(void); // sets a region of interest (with left & right bound reference/position) - // attempts a Jump() to left bound as well - // returns success/failure of Jump() + // returns success/failure of seeking to left bound of region bool SetRegion(const BamRegion& region); bool SetRegion(const int& leftRefID, const int& leftBound, const int& rightRefID, const int& rightBound); @@ -62,10 +62,10 @@ class BamReader { // retrieves next available alignment (returns success/fail) bool GetNextAlignment(BamAlignment& bAlignment); - // retrieves next available alignment core data (returns success/fail) // ** DOES NOT parse any character data (read name, bases, qualities, tag data) ** - // useful for operations requiring ONLY aligner-related information (refId/position, alignment flags, CIGAR, mapQuality, etc) + // useful for operations requiring ONLY aligner-related information + // (refId/position, alignment flags, CIGAR, mapQuality, etc) bool GetNextAlignmentCore(BamAlignment& bAlignment); // ---------------------- @@ -91,6 +91,32 @@ class BamReader { // default behavior is to create the BAM standard index (".bai") // set flag to false to create the BamTools-specific index (".bti") bool CreateIndex(bool useStandardIndex = true); + // returns whether index data is available for reading + // (e.g. if true, BamReader should be able to seek to a region) + bool HasIndex(void) const; + // change the index caching behavior + // default BamReader/Index mode is LimitedIndexCaching + // @mode - can be either FullIndexCaching, LimitedIndexCaching, + // or NoIndexCaching. See BamIndex.h for more details + void SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode); + + // deprecated methods + public: + + // deprecated (but still available): prefer HasIndex() instead + // + // Deprecated purely for API semantic clarity - HasIndex() should be clearer + // than IsIndexLoaded() in light of the new caching modes that may clear the + // index data from memory, but leave the index file open for later random access + // seeks. + // + // For example, what would (IsIndexLoaded() == true) mean when cacheMode has been + // explicitly set to NoIndexCaching? This is confusing at best, misleading about + // current memory behavior at worst. + // + // returns whether index data is available + // (e.g. if true, BamReader should be able to seek to a region) + bool IsIndexLoaded(void) const; // private implementation private: -- 2.39.2