From 4bff07c63b54d5eeecfe9c0fa63489e3ac73883b Mon Sep 17 00:00:00 2001 From: Derek Date: Fri, 10 Sep 2010 17:38:03 -0400 Subject: [PATCH] Reimplemented BamToolsIndex for bug fix and performance upgrade. *** NOTE *** This commit invalidats any previous BamToolsIndex files (.bti). Please re-run 'bamtools index -bti -in ' to generate the new index files. --- src/api/BamAux.h | 4 +- src/api/BamIndex.cpp | 626 +++++++++++++++++++++++++++---------------- src/api/BamIndex.h | 13 +- 3 files changed, 395 insertions(+), 248 deletions(-) diff --git a/src/api/BamAux.h b/src/api/BamAux.h index 057568e..2c1c7ac 100644 --- a/src/api/BamAux.h +++ b/src/api/BamAux.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 3 September 2010 (DB) +// Last modified: 10 September 2010 (DB) // --------------------------------------------------------------------------- // Provides the basic constants, data structures, etc. for using BAM files // *************************************************************************** @@ -274,7 +274,7 @@ struct BamRegion { // member functions void clear(void) { LeftRefID = -1; LeftPosition = -1; RightRefID = -1; RightPosition = -1; } bool isLeftBoundSpecified(void) const { return ( LeftRefID != -1 && LeftPosition != -1 ); } - bool isNull(void) const { return !( isLeftBoundSpecified() && isRightBoundSpecified() ); } + bool isNull(void) const { return ( !isLeftBoundSpecified() && !isRightBoundSpecified() ); } bool isRightBoundSpecified(void) const { return ( RightRefID != -1 && RightPosition != -1 ); } }; diff --git a/src/api/BamIndex.cpp b/src/api/BamIndex.cpp index ab03d44..fabfbf1 100644 --- a/src/api/BamIndex.cpp +++ b/src/api/BamIndex.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 9 September 2010 (DB) +// Last modified: 10 September 2010 (DB) // --------------------------------------------------------------------------- // Provides index functionality - both for the default (standardized) BAM // index format (.bai) as well as a BamTools-specific (nonstandard) index @@ -95,7 +95,7 @@ typedef vector BamStandardIndexData; } // namespace BamTools // ------------------------------- -// BamStandardIndex implementation +// BamStandardIndexPrivate implementation struct BamStandardIndex::BamStandardIndexPrivate { @@ -111,30 +111,33 @@ struct BamStandardIndex::BamStandardIndexPrivate { BamStandardIndexPrivate(BamStandardIndex* parent) : m_parent(parent) { } ~BamStandardIndexPrivate(void) { m_indexData.clear(); } + // ------------------------- + // 'public' methods + + // creates index data (in-memory) from current reader data + bool Build(void); + // attempts to use index to jump to region; returns success/fail + bool Jump(const BamTools::BamRegion& region); + // loads existing data from file into memory + bool Load(const std::string& filename); + // writes in-memory index data out to file + // N.B. - (this is the original BAM filename, method will modify it to use applicable extension) + bool Write(const std::string& bamFilename); + // ------------------------- // internal methods // calculate bins that overlap region - int BinsFromRegion(const BamTools::BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[BamTools::MAX_BIN]); + int BinsFromRegion(const BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[BamTools::MAX_BIN]); + // calculates offset(s) for a given region + bool GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector& offsets); // saves BAM bin entry for index void InsertBinEntry(BamBinMap& binMap, const uint32_t& saveBin, const uint64_t& saveOffset, const uint64_t& lastOffset); // saves linear offset entry for index void InsertLinearOffset(LinearOffsetVector& offsets, const BamAlignment& bAlignment, const uint64_t& lastOffset); // simplifies index by merging 'chunks' void MergeChunks(void); - }; - -BamStandardIndex::BamStandardIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian) - : BamIndex(bgzf, reader, isBigEndian) -{ - d = new BamStandardIndexPrivate(this); -} - -BamStandardIndex::~BamStandardIndex(void) { - delete d; - d = 0; -} // calculate bins that overlap region int BamStandardIndex::BamStandardIndexPrivate::BinsFromRegion(const BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[MAX_BIN]) { @@ -168,20 +171,25 @@ int BamStandardIndex::BamStandardIndexPrivate::BinsFromRegion(const BamRegion& r return i; } -bool BamStandardIndex::Build(void) { +bool BamStandardIndex::BamStandardIndexPrivate::Build(void) { + + // localize parent data + if ( m_parent == 0 ) return false; + BamReader* reader = m_parent->m_reader; + BgzfData* mBGZF = m_parent->m_BGZF; + RefVector& references = m_parent->m_references; // be sure reader & BGZF file are valid & open for reading - if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) + if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen ) return false; // move file pointer to beginning of alignments - m_reader->Rewind(); + reader->Rewind(); // get reference count, reserve index space - int numReferences = (int)m_references.size(); - for ( int i = 0; i < numReferences; ++i ) { - d->m_indexData.push_back(ReferenceIndex()); - } + int numReferences = (int)references.size(); + for ( int i = 0; i < numReferences; ++i ) + m_indexData.push_back(ReferenceIndex()); // sets default constant for bin, ID, offset, coordinate variables const uint32_t defaultValue = 0xffffffffu; @@ -195,14 +203,14 @@ bool BamStandardIndex::Build(void) { int32_t lastRefID(defaultValue); // offset data - uint64_t saveOffset = m_BGZF->Tell(); + uint64_t saveOffset = mBGZF->Tell(); uint64_t lastOffset = saveOffset; // coordinate data int32_t lastCoordinate = defaultValue; BamAlignment bAlignment; - while ( m_reader->GetNextAlignmentCore(bAlignment) ) { + while ( reader->GetNextAlignmentCore(bAlignment) ) { // change of chromosome, save ID, reset bin if ( lastRefID != bAlignment.RefID ) { @@ -221,9 +229,9 @@ bool BamStandardIndex::Build(void) { if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) { // save linear offset entry (matched to BAM entry refID) - ReferenceIndex& refIndex = d->m_indexData.at(bAlignment.RefID); + ReferenceIndex& refIndex = m_indexData.at(bAlignment.RefID); LinearOffsetVector& offsets = refIndex.Offsets; - d->InsertLinearOffset(offsets, bAlignment, lastOffset); + InsertLinearOffset(offsets, bAlignment, lastOffset); } // if current BamAlignment bin != lastBin, "then possibly write the binning index" @@ -233,9 +241,9 @@ bool BamStandardIndex::Build(void) { if ( saveBin != defaultValue ) { // save Bam bin entry - ReferenceIndex& refIndex = d->m_indexData.at(saveRefID); + ReferenceIndex& refIndex = m_indexData.at(saveRefID); BamBinMap& binMap = refIndex.Bins; - d->InsertBinEntry(binMap, saveBin, saveOffset, lastOffset); + InsertBinEntry(binMap, saveBin, saveOffset, lastOffset); } // update saveOffset @@ -253,13 +261,13 @@ bool BamStandardIndex::Build(void) { } // make sure that current file pointer is beyond lastOffset - if ( m_BGZF->Tell() <= (int64_t)lastOffset ) { + if ( mBGZF->Tell() <= (int64_t)lastOffset ) { fprintf(stderr, "Error in BGZF offsets.\n"); exit(1); } // update lastOffset - lastOffset = m_BGZF->Tell(); + lastOffset = mBGZF->Tell(); // update lastCoordinate lastCoordinate = bAlignment.Position; @@ -268,19 +276,19 @@ bool BamStandardIndex::Build(void) { // save any leftover BAM data (as long as refID is valid) if ( saveRefID >= 0 ) { // save Bam bin entry - ReferenceIndex& refIndex = d->m_indexData.at(saveRefID); + ReferenceIndex& refIndex = m_indexData.at(saveRefID); BamBinMap& binMap = refIndex.Bins; - d->InsertBinEntry(binMap, saveBin, saveOffset, lastOffset); + InsertBinEntry(binMap, saveBin, saveOffset, lastOffset); } // simplify index by merging chunks - d->MergeChunks(); + MergeChunks(); // iterate through references in index // store whether reference has data & // sort offsets in linear offset vector - BamStandardIndexData::iterator indexIter = d->m_indexData.begin(); - BamStandardIndexData::iterator indexEnd = d->m_indexData.end(); + BamStandardIndexData::iterator indexIter = m_indexData.begin(); + BamStandardIndexData::iterator indexEnd = m_indexData.end(); for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) { // get reference index data @@ -289,24 +297,24 @@ bool BamStandardIndex::Build(void) { LinearOffsetVector& offsets = refIndex.Offsets; // store whether reference has alignments or no - m_references[i].RefHasAlignments = ( binMap.size() > 0 ); + references[i].RefHasAlignments = ( binMap.size() > 0 ); // sort linear offsets sort(offsets.begin(), offsets.end()); } // rewind file pointer to beginning of alignments, return success/fail - return m_reader->Rewind(); + return reader->Rewind(); } -bool BamStandardIndex::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector& offsets) { +bool BamStandardIndex::BamStandardIndexPrivate::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector& offsets) { // calculate which bins overlap this region uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2); - int numBins = d->BinsFromRegion(region, isRightBoundSpecified, bins); + int numBins = BinsFromRegion(region, isRightBoundSpecified, bins); // get bins for this reference - const ReferenceIndex& refIndex = d->m_indexData.at(region.LeftRefID); + const ReferenceIndex& refIndex = m_indexData.at(region.LeftRefID); const BamBinMap& binMap = refIndex.Bins; // get minimum offset to consider @@ -392,18 +400,23 @@ void BamStandardIndex::BamStandardIndexPrivate::InsertLinearOffset(LinearOffsetV } } -bool BamStandardIndex::Jump(const BamRegion& region) { +bool BamStandardIndex::BamStandardIndexPrivate::Jump(const BamRegion& region) { - if ( m_reader == 0 || m_BGZF == 0 || !m_reader->IsOpen() ) { - fprintf(stderr, "ERROR: Could not jump: invalid BamReader state.\n"); + // localize parent data + if ( m_parent == 0 ) return false; + BamReader* reader = m_parent->m_reader; + BgzfData* mBGZF = m_parent->m_BGZF; + RefVector& references = m_parent->m_references; + + // be sure reader & BGZF file are valid & open for reading + if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen ) return false; - } // see if left-bound reference of region has alignments - if ( !HasAlignments(region.LeftRefID) ) return false; + if ( !m_parent->HasAlignments(region.LeftRefID) ) return false; // make sure left-bound position is valid - if ( region.LeftPosition > m_references.at(region.LeftRefID).RefLength ) return false; + if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) return false; vector offsets; if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets) ) { @@ -417,14 +430,14 @@ bool BamStandardIndex::Jump(const BamRegion& region) { for ( vector::const_iterator o = offsets.begin(); o != offsets.end(); ++o) { // attempt seek & load first available alignment - result &= m_BGZF->Seek(*o); - m_reader->GetNextAlignmentCore(bAlignment); + result &= mBGZF->Seek(*o); + reader->GetNextAlignmentCore(bAlignment); // if this alignment corresponds to desired position // return success of seeking back to 'current offset' if ( (bAlignment.RefID == region.LeftRefID && bAlignment.Position + bAlignment.Length > region.LeftPosition) || (bAlignment.RefID > region.LeftRefID) ) { if ( o != offsets.begin() ) --o; - return m_BGZF->Seek(*o); + return mBGZF->Seek(*o); } } @@ -432,8 +445,13 @@ bool BamStandardIndex::Jump(const BamRegion& region) { return result; } -bool BamStandardIndex::Load(const string& filename) { +bool BamStandardIndex::BamStandardIndexPrivate::Load(const string& filename) { + // localize parent data + if ( m_parent == 0 ) return false; + const bool isBigEndian = m_parent->m_isBigEndian; + RefVector& references = m_parent->m_references; + // open index file, abort on error FILE* indexStream = fopen(filename.c_str(), "rb"); if( !indexStream ) { @@ -456,10 +474,10 @@ bool BamStandardIndex::Load(const string& filename) { // get number of reference sequences uint32_t numRefSeqs; elementsRead = fread(&numRefSeqs, 4, 1, indexStream); - if ( m_isBigEndian ) SwapEndian_32(numRefSeqs); + if ( isBigEndian ) SwapEndian_32(numRefSeqs); // intialize space for BamStandardIndexData data structure - d->m_indexData.reserve(numRefSeqs); + m_indexData.reserve(numRefSeqs); // iterate over reference sequences for ( unsigned int i = 0; i < numRefSeqs; ++i ) { @@ -467,10 +485,10 @@ bool BamStandardIndex::Load(const string& filename) { // get number of bins for this reference sequence int32_t numBins; elementsRead = fread(&numBins, 4, 1, indexStream); - if ( m_isBigEndian ) SwapEndian_32(numBins); + if ( isBigEndian ) SwapEndian_32(numBins); if ( numBins > 0 ) { - RefData& refEntry = m_references[i]; + RefData& refEntry = references[i]; refEntry.RefHasAlignments = true; } @@ -488,7 +506,7 @@ bool BamStandardIndex::Load(const string& filename) { uint32_t numChunks; elementsRead = fread(&numChunks, 4, 1, indexStream); - if ( m_isBigEndian ) { + if ( isBigEndian ) { SwapEndian_32(binID); SwapEndian_32(numChunks); } @@ -503,10 +521,10 @@ bool BamStandardIndex::Load(const string& filename) { // get chunk boundaries (left, right) uint64_t left; uint64_t right; - elementsRead = fread(&left, 8, 1, indexStream); + elementsRead = fread(&left, 8, 1, indexStream); elementsRead = fread(&right, 8, 1, indexStream); - if ( m_isBigEndian ) { + if ( isBigEndian ) { SwapEndian_64(left); SwapEndian_64(right); } @@ -528,7 +546,7 @@ bool BamStandardIndex::Load(const string& filename) { // get number of linear offsets int32_t numLinearOffsets; elementsRead = fread(&numLinearOffsets, 4, 1, indexStream); - if ( m_isBigEndian ) SwapEndian_32(numLinearOffsets); + if ( isBigEndian ) SwapEndian_32(numLinearOffsets); // intialize LinearOffsetVector LinearOffsetVector offsets; @@ -539,7 +557,7 @@ bool BamStandardIndex::Load(const string& filename) { for ( int j = 0; j < numLinearOffsets; ++j ) { // read a linear offset & store elementsRead = fread(&linearOffset, 8, 1, indexStream); - if ( m_isBigEndian ) SwapEndian_64(linearOffset); + if ( isBigEndian ) SwapEndian_64(linearOffset); offsets.push_back(linearOffset); } @@ -547,7 +565,7 @@ bool BamStandardIndex::Load(const string& filename) { sort( offsets.begin(), offsets.end() ); // store index data for that reference sequence - d->m_indexData.push_back( ReferenceIndex(binMap, offsets) ); + m_indexData.push_back( ReferenceIndex(binMap, offsets) ); } // close index file (.bai) and return @@ -614,8 +632,12 @@ void BamStandardIndex::BamStandardIndexPrivate::MergeChunks(void) { // writes in-memory index data out to file // N.B. - (this is the original BAM filename, method will modify it to use applicable extension) -bool BamStandardIndex::Write(const std::string& bamFilename) { +bool BamStandardIndex::BamStandardIndexPrivate::Write(const std::string& bamFilename) { + // localize parent data + if ( m_parent == 0 ) return false; + const bool isBigEndian = m_parent->m_isBigEndian; + string indexFilename = bamFilename + ".bai"; FILE* indexStream = fopen(indexFilename.c_str(), "wb"); if ( indexStream == 0 ) { @@ -627,13 +649,13 @@ bool BamStandardIndex::Write(const std::string& bamFilename) { fwrite("BAI\1", 1, 4, indexStream); // write number of reference sequences - int32_t numReferenceSeqs = d->m_indexData.size(); - if ( m_isBigEndian ) SwapEndian_32(numReferenceSeqs); + int32_t numReferenceSeqs = m_indexData.size(); + if ( isBigEndian ) SwapEndian_32(numReferenceSeqs); fwrite(&numReferenceSeqs, 4, 1, indexStream); // iterate over reference sequences - BamStandardIndexData::const_iterator indexIter = d->m_indexData.begin(); - BamStandardIndexData::const_iterator indexEnd = d->m_indexData.end(); + BamStandardIndexData::const_iterator indexIter = m_indexData.begin(); + BamStandardIndexData::const_iterator indexEnd = m_indexData.end(); for ( ; indexIter != indexEnd; ++ indexIter ) { // get reference index data @@ -643,7 +665,7 @@ bool BamStandardIndex::Write(const std::string& bamFilename) { // write number of bins int32_t binCount = binMap.size(); - if ( m_isBigEndian ) SwapEndian_32(binCount); + if ( isBigEndian ) SwapEndian_32(binCount); fwrite(&binCount, 4, 1, indexStream); // iterate over bins @@ -656,12 +678,12 @@ bool BamStandardIndex::Write(const std::string& bamFilename) { const ChunkVector& binChunks = (*binIter).second; // save BAM bin key - if ( m_isBigEndian ) SwapEndian_32(binKey); + if ( isBigEndian ) SwapEndian_32(binKey); fwrite(&binKey, 4, 1, indexStream); // save chunk count int32_t chunkCount = binChunks.size(); - if ( m_isBigEndian ) SwapEndian_32(chunkCount); + if ( isBigEndian ) SwapEndian_32(chunkCount); fwrite(&chunkCount, 4, 1, indexStream); // iterate over chunks @@ -674,7 +696,7 @@ bool BamStandardIndex::Write(const std::string& bamFilename) { uint64_t start = chunk.Start; uint64_t stop = chunk.Stop; - if ( m_isBigEndian ) { + if ( isBigEndian ) { SwapEndian_64(start); SwapEndian_64(stop); } @@ -687,7 +709,7 @@ bool BamStandardIndex::Write(const std::string& bamFilename) { // write linear offsets size int32_t offsetSize = offsets.size(); - if ( m_isBigEndian ) SwapEndian_32(offsetSize); + if ( isBigEndian ) SwapEndian_32(offsetSize); fwrite(&offsetSize, 4, 1, indexStream); // iterate over linear offsets @@ -697,7 +719,7 @@ bool BamStandardIndex::Write(const std::string& bamFilename) { // write linear offset value uint64_t linearOffset = (*offsetIter); - if ( m_isBigEndian ) SwapEndian_64(linearOffset); + if ( isBigEndian ) SwapEndian_64(linearOffset); fwrite(&linearOffset, 8, 1, indexStream); } } @@ -707,43 +729,88 @@ bool BamStandardIndex::Write(const std::string& bamFilename) { fclose(indexStream); return true; } + +// --------------------------------------------------------------- +// BamStandardIndex implementation + +BamStandardIndex::BamStandardIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian) + : BamIndex(bgzf, reader, isBigEndian) +{ + d = new BamStandardIndexPrivate(this); +} + +BamStandardIndex::~BamStandardIndex(void) { + delete d; + d = 0; +} + +// creates index data (in-memory) from current reader data +bool BamStandardIndex::Build(void) { return d->Build(); } + +// attempts to use index to jump to region; returns success/fail +bool BamStandardIndex::Jump(const BamRegion& region) { return d->Jump(region); } + +// loads existing data from file into memory +bool BamStandardIndex::Load(const string& filename) { return d->Load(filename); } + +// writes in-memory index data out to file +// N.B. - (this is the original BAM filename, method will modify it to use applicable extension) +bool BamStandardIndex::Write(const string& bamFilename) { return d->Write(bamFilename); } // ######################################################################################### // ######################################################################################### -// ------------------------------------- -// BamToolsIndex implementation +// --------------------------------------------------- +// BamToolsIndex structs & typedefs namespace BamTools { +// individual index entry struct BamToolsIndexEntry { // data members - int64_t Offset; - int32_t RefID; + int32_t MaxEndPosition; + int64_t StartOffset; int32_t StartPosition; // ctor - BamToolsIndexEntry(const int64_t& offset = 0, - const int32_t& id = -1, - const int32_t& position = -1) - : Offset(offset) - , RefID(id) - , StartPosition(position) + BamToolsIndexEntry(const int32_t& maxEndPosition = 0, + const int64_t& startOffset = 0, + const int32_t& startPosition = 0) + : MaxEndPosition(maxEndPosition) + , StartOffset(startOffset) + , StartPosition(startPosition) { } }; -typedef vector BamToolsIndexData; +// the actual index data structure +typedef map > BamToolsIndexData; } // namespace BamTools +// --------------------------------------------------- +// BamToolsIndexPrivate implementation + struct BamToolsIndex::BamToolsIndexPrivate { + + // keep a list of any supported versions here + // (might be useful later to handle any 'legacy' versions if the format changes) + // listed for example like: BTI_1_1 = 2, BTI_1_2 = 3, BTI_2_0 = 4, and so on + // + // so a change introduced in (hypothetical) BTI_1_2 would be handled from then on by: + // + // if ( indexVersion >= BTI_1_2 ) + // do something new + // else + // do the old thing + enum Version { BTI_1_0 = 1 }; // ------------------------- // data members BamToolsIndexData m_indexData; BamToolsIndex* m_parent; int32_t m_blockSize; + Version m_version; // ------------------------- // ctor & dtor @@ -751,158 +818,163 @@ struct BamToolsIndex::BamToolsIndexPrivate { BamToolsIndexPrivate(BamToolsIndex* parent) : m_parent(parent) , m_blockSize(1000) + , m_version(BTI_1_0) // latest version - used for writing new index files { } ~BamToolsIndexPrivate(void) { } + // ------------------------- + // 'public' methods + + // creates index data (in-memory) from current reader data + bool Build(void); + // returns supported file extension + const std::string Extension(void) const { return std::string(".bti"); } + // attempts to use index to jump to region; returns success/fail + bool Jump(const BamTools::BamRegion& region); + // loads existing data from file into memory + bool Load(const std::string& filename); + // writes in-memory index data out to file + // N.B. - (this is the original BAM filename, method will modify it to use applicable extension) + bool Write(const std::string& bamFilename); + // ------------------------- // internal methods + + // calculates offset(s) for a given region + bool GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector& offsets); }; -BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian) - : BamIndex(bgzf, reader, isBigEndian) -{ - d = new BamToolsIndexPrivate(this); -} - -BamToolsIndex::~BamToolsIndex(void) { - delete d; - d = 0; -} - -bool BamToolsIndex::Build(void) { +bool BamToolsIndex::BamToolsIndexPrivate::Build(void) { + + // localize parent data + if ( m_parent == 0 ) return false; + BamReader* reader = m_parent->m_reader; + BgzfData* mBGZF = m_parent->m_BGZF; + RefVector& references = m_parent->m_references; // be sure reader & BGZF file are valid & open for reading - if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) + if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen ) return false; // move file pointer to beginning of alignments - m_reader->Rewind(); + // quit if failed + if ( !reader->Rewind() ) + return false; - // plow through alignments, store block offsets - int32_t currentBlockCount = 0; - int64_t blockStartOffset = m_BGZF->Tell(); - int32_t blockStartId = -1; - int32_t blockStartPosition = -1; + // set up counters and markers + int32_t currentBlockCount = 0; + int64_t currentAlignmentOffset = mBGZF->Tell(); + int32_t blockRefId = 0; + int32_t blockMaxEndPosition = 0; + int64_t blockStartOffset = currentAlignmentOffset; + int32_t blockStartPosition = -1; + + // plow through alignments, storing index entries BamAlignment al; - while ( m_reader->GetNextAlignmentCore(al) ) { + while ( reader->GetNextAlignmentCore(al) ) { + + // if block contains data (not the first time through) AND alignment is on a new reference + if ( currentBlockCount > 0 && al.RefID != blockRefId ) { + + // make sure reference flag is set + references[blockRefId].RefHasAlignments = true; + + // store previous data + m_indexData[blockRefId].push_back( BamToolsIndexEntry(blockMaxEndPosition, blockStartOffset, blockStartPosition) ); + + // intialize new block + currentBlockCount = 0; + blockMaxEndPosition = al.GetEndPosition(); + blockStartOffset = currentAlignmentOffset; + } - // set reference flag - m_references[al.RefID].RefHasAlignments = true; - // if beginning of block, save first alignment's refID & position if ( currentBlockCount == 0 ) { - blockStartId = al.RefID; + blockRefId = al.RefID; blockStartPosition = al.Position; } // increment block counter ++currentBlockCount; + // check end position + int32_t alignmentEndPosition = al.GetEndPosition(); + if ( alignmentEndPosition > blockMaxEndPosition ) + blockMaxEndPosition = alignmentEndPosition; + // if block is full, get offset for next block, reset currentBlockCount - if ( currentBlockCount == d->m_blockSize ) { - d->m_indexData.push_back( BamToolsIndexEntry(blockStartOffset, blockStartId, blockStartPosition) ); - blockStartOffset = m_BGZF->Tell(); + if ( currentBlockCount == m_blockSize ) { + + // make sure reference flag is set + references[blockRefId].RefHasAlignments = true; + + m_indexData[blockRefId].push_back( BamToolsIndexEntry(blockMaxEndPosition, blockStartOffset, blockStartPosition) ); + blockStartOffset = mBGZF->Tell(); currentBlockCount = 0; } + + // not the best name, but for the next iteration, this value will be the offset of the *current* alignment + // necessary because we won't know if this next alignment is on a new reference until we actually read it + currentAlignmentOffset = mBGZF->Tell(); } - return m_reader->Rewind(); + // attempt to rewind back to begnning of alignments + // return success/fail + return reader->Rewind(); } // N.B. - ignores isRightBoundSpecified -bool BamToolsIndex::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector& offsets) { +bool BamToolsIndex::BamToolsIndexPrivate::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector& offsets) { - // return false if no index data present - if ( d->m_indexData.empty() ) return false; + // return false if leftBound refID is not found in index data + if ( m_indexData.find(region.LeftRefID) == m_indexData.end() ) return false; // clear any prior data offsets.clear(); + const vector referenceOffsets = m_indexData[region.LeftRefID]; + if ( referenceOffsets.empty() ) return false; -/* bool found = false; - int64_t previousOffset = -1; - BamToolsIndexData::const_iterator indexIter = d->m_indexData.end() - 1; - BamToolsIndexData::const_iterator indexBegin = d->m_indexData.begin(); - for ( ; indexIter >= indexBegin; --indexIter ) { - - const BamToolsIndexEntry& entry = (*indexIter); - - cerr << "Considering entry at " << entry.RefID << ":" << entry.Position << " => " << entry.Offset << endl; - - if ( entry.RefID < region.LeftRefID) { - found = true; - break; - } - - if ( (entry.RefID == region.LeftRefID) && (entry.Position < region.LeftPosition) ) { - found = true; - break; - } - - previousOffset = entry.Offset; - } - - if ( !found || previousOffset == -1 ) return false; - - // store offset & return success - cerr << endl; - cerr << "Using offset: " << previousOffset << endl; - cerr << endl; - - offsets.push_back(previousOffset); - return true;*/ - - -// cerr << "--------------------------------------------------" << endl; -// cerr << "Looking for " << region.LeftRefID << ":" << region.LeftPosition << endl; -// cerr << endl; - + // ------------------------------------------------------- // calculate nearest index to jump to - int64_t previousOffset = -1; - BamToolsIndexData::const_iterator indexIter = d->m_indexData.begin(); - BamToolsIndexData::const_iterator indexEnd = d->m_indexData.end(); + + // save first offset + int64_t offset = (*referenceOffsets.begin()).StartOffset; + vector::const_iterator indexIter = referenceOffsets.begin(); + vector::const_iterator indexEnd = referenceOffsets.end(); for ( ; indexIter != indexEnd; ++indexIter ) { - const BamToolsIndexEntry& entry = (*indexIter); - -// cerr << "Considering entry at " << entry.RefID << ":" << entry.StartPosition << " => " << entry.Offset << endl; - - // check if we are 'past' beginning of desired region - // if so, we will break out & use previously stored offset - if ( entry.RefID > region.LeftRefID ) break; - if ( (entry.RefID == region.LeftRefID) && (entry.StartPosition >= region.LeftPosition) ) break; - - // not past desired region, so store current entry offset in previousOffset - previousOffset = entry.Offset; + if ( entry.MaxEndPosition >= region.LeftPosition ) break; + offset = (*indexIter).StartOffset; } // no index found if ( indexIter == indexEnd ) return false; - - // first offset matches (so previousOffset was never set) - if ( previousOffset == -1 ) previousOffset = (*indexIter).Offset; - // store offset & return success -// cerr << endl; -// cerr << "Using offset: " << previousOffset << endl; -// cerr << endl; - offsets.push_back(previousOffset); + //store offset & return success + offsets.push_back(offset); return true; } -bool BamToolsIndex::Jump(const BamRegion& region) { +bool BamToolsIndex::BamToolsIndexPrivate::Jump(const BamRegion& region) { - if ( m_reader == 0 || m_BGZF == 0 || !m_reader->IsOpen() ) { + // localize parent data + if ( m_parent == 0 ) return false; + BamReader* reader = m_parent->m_reader; + BgzfData* mBGZF = m_parent->m_BGZF; + RefVector& references = m_parent->m_references; + + if ( reader == 0 || mBGZF == 0 || !reader->IsOpen() ) { fprintf(stderr, "ERROR: Could not jump: invalid BamReader state.\n"); return false; } // see if left-bound reference of region has alignments - if ( !HasAlignments(region.LeftRefID) ) return false; + if ( !m_parent->HasAlignments(region.LeftRefID) ) return false; // make sure left-bound position is valid - if ( region.LeftPosition > m_references.at(region.LeftRefID).RefLength ) return false; + if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) return false; vector offsets; if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets) ) { @@ -916,14 +988,14 @@ bool BamToolsIndex::Jump(const BamRegion& region) { for ( vector::const_iterator o = offsets.begin(); o != offsets.end(); ++o) { // attempt seek & load first available alignment - result &= m_BGZF->Seek(*o); - m_reader->GetNextAlignmentCore(bAlignment); + result &= mBGZF->Seek(*o); + reader->GetNextAlignmentCore(bAlignment); // if this alignment corresponds to desired position // return success of seeking back to 'current offset' if ( (bAlignment.RefID == region.LeftRefID && bAlignment.Position + bAlignment.Length > region.LeftPosition) || (bAlignment.RefID > region.LeftRefID) ) { if ( o != offsets.begin() ) --o; - return m_BGZF->Seek(*o); + return mBGZF->Seek(*o); } } @@ -931,7 +1003,12 @@ bool BamToolsIndex::Jump(const BamRegion& region) { return result; } -bool BamToolsIndex::Load(const string& filename) { +bool BamToolsIndex::BamToolsIndexPrivate::Load(const string& filename) { + + // localize parent data + if ( m_parent == 0 ) return false; + const bool isBigEndian = m_parent->m_isBigEndian; + RefVector& references = m_parent->m_references; // open index file, abort on error FILE* indexStream = fopen(filename.c_str(), "rb"); @@ -952,42 +1029,66 @@ bool BamToolsIndex::Load(const string& filename) { return false; } + // read in version + int32_t indexVersion; + elementsRead = fread(&indexVersion, sizeof(indexVersion), 1, indexStream); + if ( isBigEndian ) SwapEndian_32(indexVersion); + if ( indexVersion <= 0 || indexVersion > m_version ) { + fprintf(stderr, "Problem with index file - unsupported version.\n"); + fclose(indexStream); + return false; + } + // read in block size - elementsRead = fread(&d->m_blockSize, sizeof(d->m_blockSize), 1, indexStream); - if ( m_isBigEndian ) SwapEndian_32(d->m_blockSize); + elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, indexStream); + if ( isBigEndian ) SwapEndian_32(m_blockSize); - // read in number of offsets - uint32_t numOffsets; - elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, indexStream); - if ( m_isBigEndian ) SwapEndian_32(numOffsets); + // read in number of references + int32_t numReferences; + elementsRead = fread(&numReferences, sizeof(numReferences), 1, indexStream); + if ( isBigEndian ) SwapEndian_32(numReferences); - // reserve space for index data - d->m_indexData.reserve(numOffsets); - - // iterate over index entries - for ( unsigned int i = 0; i < numOffsets; ++i ) { + // iterate over reference entries + for ( int i = 0; i < numReferences; ++i ) { - int64_t offset; - int32_t id; - int32_t position; + // read in number of offsets for this reference + uint32_t numOffsets; + elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, indexStream); + if ( isBigEndian ) SwapEndian_32(numOffsets); - // read in data - elementsRead = fread(&offset, sizeof(offset), 1, indexStream); - elementsRead = fread(&id, sizeof(id), 1, indexStream); - elementsRead = fread(&position, sizeof(position), 1, indexStream); + // initialize offsets container for this reference + vector offsets; + offsets.reserve(numOffsets); - // swap endian-ness if necessary - if ( m_isBigEndian ) { - SwapEndian_64(offset); - SwapEndian_32(id); - SwapEndian_32(position); + // iterate over offset entries + for ( unsigned int j = 0; j < numOffsets; ++j ) { + + // copy entry data + int32_t maxEndPosition; + int64_t startOffset; + int32_t startPosition; + + // read in data + elementsRead = fread(&maxEndPosition, sizeof(maxEndPosition), 1, indexStream); + elementsRead = fread(&startOffset, sizeof(startOffset), 1, indexStream); + elementsRead = fread(&startPosition, sizeof(startPosition), 1, indexStream); + + // swap endian-ness if necessary + if ( isBigEndian ) { + SwapEndian_32(maxEndPosition); + SwapEndian_64(startOffset); + SwapEndian_32(startPosition); + } + + // save current index entry + offsets.push_back( BamToolsIndexEntry(maxEndPosition, startOffset, startPosition) ); } - // save reference index entry - d->m_indexData.push_back( BamToolsIndexEntry(offset, id, position) ); + // save refID, offsetVector entry into data structure + m_indexData.insert( make_pair(i, offsets) ); - // set reference flag - m_references[id].RefHasAlignments = true; // what about sparse references? wont be able to set flag? + // set ref.HasAlignments flag + references[i].RefHasAlignments = (numOffsets != 0); } // close index file and return @@ -997,8 +1098,12 @@ bool BamToolsIndex::Load(const string& filename) { // writes in-memory index data out to file // N.B. - (this is the original BAM filename, method will modify it to use applicable extension) -bool BamToolsIndex::Write(const std::string& bamFilename) { +bool BamToolsIndex::BamToolsIndexPrivate::Write(const std::string& bamFilename) { + // localize parent data + if ( m_parent == 0 ) return false; + const bool isBigEndian = m_parent->m_isBigEndian; + // open index file for writing string indexFilename = bamFilename + ".bti"; FILE* indexStream = fopen(indexFilename.c_str(), "wb"); @@ -1007,43 +1112,61 @@ bool BamToolsIndex::Write(const std::string& bamFilename) { return false; } - // write BAM index header + // write BTI index format 'magic number' fwrite("BTI\1", 1, 4, indexStream); + // write BTI index format version + int32_t currentVersion = (int32_t)m_version; + if ( isBigEndian ) SwapEndian_32(currentVersion); + fwrite(¤tVersion, sizeof(int32_t), 1, indexStream); + // write block size - int32_t blockSize = d->m_blockSize; - if ( m_isBigEndian ) SwapEndian_32(blockSize); + int32_t blockSize = m_blockSize; + if ( isBigEndian ) SwapEndian_32(blockSize); fwrite(&blockSize, sizeof(blockSize), 1, indexStream); - // write number of offset entries - uint32_t numOffsets = d->m_indexData.size(); - if ( m_isBigEndian ) SwapEndian_32(numOffsets); - fwrite(&numOffsets, sizeof(numOffsets), 1, indexStream); + // write number of references + int32_t numReferences = (int32_t)m_indexData.size(); + if ( isBigEndian ) SwapEndian_32(numReferences); + fwrite(&numReferences, sizeof(numReferences), 1, indexStream); - // iterate over offset entries - BamToolsIndexData::const_iterator indexIter = d->m_indexData.begin(); - BamToolsIndexData::const_iterator indexEnd = d->m_indexData.end(); - for ( ; indexIter != indexEnd; ++ indexIter ) { - - // get reference index data - const BamToolsIndexEntry& entry = (*indexIter); - - // copy entry data - int64_t offset = entry.Offset; - int32_t id = entry.RefID; - int32_t position = entry.StartPosition; + // iterate through references in index + BamToolsIndexData::const_iterator refIter = m_indexData.begin(); + BamToolsIndexData::const_iterator refEnd = m_indexData.end(); + for ( ; refIter != refEnd; ++refIter ) { + + const vector offsets = (*refIter).second; - // swap endian-ness if necessary - if ( m_isBigEndian ) { - SwapEndian_64(offset); - SwapEndian_32(id); - SwapEndian_32(position); + // write number of offsets listed for this reference + uint32_t numOffsets = offsets.size(); + if ( isBigEndian ) SwapEndian_32(numOffsets); + fwrite(&numOffsets, sizeof(numOffsets), 1, indexStream); + + // iterate over offset entries + vector::const_iterator offsetIter = offsets.begin(); + vector::const_iterator offsetEnd = offsets.end(); + for ( ; offsetIter != offsetEnd; ++offsetIter ) { + + // get reference index data + const BamToolsIndexEntry& entry = (*offsetIter); + + // copy entry data + int32_t maxEndPosition = entry.MaxEndPosition; + int64_t startOffset = entry.StartOffset; + int32_t startPosition = entry.StartPosition; + + // swap endian-ness if necessary + if ( isBigEndian ) { + SwapEndian_32(maxEndPosition); + SwapEndian_64(startOffset); + SwapEndian_32(startPosition); + } + + // write the reference index entry + fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, indexStream); + fwrite(&startOffset, sizeof(startOffset), 1, indexStream); + fwrite(&startPosition, sizeof(startPosition), 1, indexStream); } - - // write the reference index entry - fwrite(&offset, sizeof(offset), 1, indexStream); - fwrite(&id, sizeof(id), 1, indexStream); - fwrite(&position, sizeof(position), 1, indexStream); } // flush any remaining output, close file, and return success @@ -1051,3 +1174,30 @@ bool BamToolsIndex::Write(const std::string& bamFilename) { fclose(indexStream); return true; } + +// --------------------------------------------------- +// BamToolsIndex implementation + +BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian) + : BamIndex(bgzf, reader, isBigEndian) +{ + d = new BamToolsIndexPrivate(this); +} + +BamToolsIndex::~BamToolsIndex(void) { + delete d; + d = 0; +} + +// creates index data (in-memory) from current reader data +bool BamToolsIndex::Build(void) { return d->Build(); } + +// attempts to use index to jump to region; returns success/fail +bool BamToolsIndex::Jump(const BamRegion& region) { return d->Jump(region); } + +// loads existing data from file into memory +bool BamToolsIndex::Load(const string& filename) { return d->Load(filename); } + +// writes in-memory index data out to file +// N.B. - (this is the original BAM filename, method will modify it to use applicable extension) +bool BamToolsIndex::Write(const string& bamFilename) { return d->Write(bamFilename); } diff --git a/src/api/BamIndex.h b/src/api/BamIndex.h index dc95889..e45bc27 100644 --- a/src/api/BamIndex.h +++ b/src/api/BamIndex.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 3 September 2010 (DB) +// Last modified: 10 September 2010 (DB) // --------------------------------------------------------------------------- // Provides index functionality - both for the standardized BAM index format // (".bai") as well as a BamTools-specific (nonstandard) index format (".bti"). @@ -30,15 +30,14 @@ class BamIndex { public: BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader, bool isBigEndian); virtual ~BamIndex(void) { } - + // index interface public: // creates index data (in-memory) from current reader data virtual bool Build(void) =0; // returns supported file extension virtual const std::string Extension(void) const =0; - // calculates offset(s) for a given region - //virtual bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector& offsets) =0; + // attempts to use index to jump to region; returns success/fail virtual bool Jump(const BamTools::BamRegion& region) =0; // returns whether reference has alignments or no virtual bool HasAlignments(const int& referenceID); @@ -97,8 +96,7 @@ class BamStandardIndex : public BamIndex { bool Build(void); // returns supported file extension const std::string Extension(void) const { return std::string(".bai"); } - // calculates offset(s) for a given region - bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector& offsets); + // attempts to use index to jump to region; returns success/fail bool Jump(const BamTools::BamRegion& region); // loads existing data from file into memory bool Load(const std::string& filename); @@ -131,8 +129,7 @@ class BamToolsIndex : public BamIndex { bool Build(void); // returns supported file extension const std::string Extension(void) const { return std::string(".bti"); } - // calculates offset(s) for a given region - bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector& offsets); + // attempts to use index to jump to region; returns success/fail bool Jump(const BamTools::BamRegion& region); // loads existing data from file into memory bool Load(const std::string& filename); -- 2.39.2