X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=src%2Fapi%2Finternal%2FBamToolsIndex_p.cpp;h=567b5514428d515348282f1bddfbfed14a55f7a9;hb=9f1ce8c47aeadb6dc1320b52ee671c3341b97935;hp=cccb4841dc888db9bd2ad692250734b246bdd6b2;hpb=4423944dcf9ab108ba012bceb5d5db4eb4cb561d;p=bamtools.git diff --git a/src/api/internal/BamToolsIndex_p.cpp b/src/api/internal/BamToolsIndex_p.cpp index cccb484..567b551 100644 --- a/src/api/internal/BamToolsIndex_p.cpp +++ b/src/api/internal/BamToolsIndex_p.cpp @@ -1,576 +1,622 @@ // *************************************************************************** // BamToolsIndex.cpp (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College -// All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 13 January 2011 (DB) +// Last modified: 10 October 2011 (DB) // --------------------------------------------------------------------------- // Provides index operations for the BamTools index format (".bti") // *************************************************************************** -#include -#include -#include -#include +#include "api/BamAlignment.h" +#include "api/internal/BamException_p.h" +#include "api/internal/BamReader_p.h" +#include "api/internal/BamToolsIndex_p.h" +#include "api/internal/BgzfStream_p.h" using namespace BamTools; using namespace BamTools::Internal; #include #include +#include #include #include +#include #include using namespace std; -BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader) - : BamIndex(bgzf, reader) - , m_blockSize(1000) - , m_dataBeginOffset(0) - , m_hasFullDataCache(false) +// -------------------------------- +// static BamToolsIndex constants +// -------------------------------- + +const uint32_t BamToolsIndex::DEFAULT_BLOCK_LENGTH = 1000; +const string BamToolsIndex::BTI_EXTENSION = ".bti"; +const char* const BamToolsIndex::BTI_MAGIC = "BTI\1"; +const int BamToolsIndex::SIZEOF_BLOCK = sizeof(int32_t)*2 + sizeof(int64_t); + +// ---------------------------- +// RaiiWrapper implementation +// ---------------------------- + +BamToolsIndex::RaiiWrapper::RaiiWrapper(void) + : IndexStream(0) +{ } + +BamToolsIndex::RaiiWrapper::~RaiiWrapper(void) { + if ( IndexStream ) + fclose(IndexStream); +} + +// ------------------------------ +// BamToolsIndex implementation +// ------------------------------ + +// ctor +BamToolsIndex::BamToolsIndex(Internal::BamReaderPrivate* reader) + : BamIndex(reader) + , m_cacheMode(BamIndex::LimitedIndexCaching) + , m_blockSize(BamToolsIndex::DEFAULT_BLOCK_LENGTH) , m_inputVersion(0) - , m_outputVersion(BTI_1_2) // latest version - used for writing new index files + , m_outputVersion(BTI_2_0) // latest version - used for writing new index files { m_isBigEndian = BamTools::SystemIsBigEndian(); } // dtor BamToolsIndex::~BamToolsIndex(void) { - ClearAllData(); + CloseFile(); } -// creates index data (in-memory) from current reader data -bool BamToolsIndex::Build(void) { - - // be sure reader & BGZF file are valid & open for reading - if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) - return false; +void BamToolsIndex::CheckMagicNumber(void) { - // move file pointer to beginning of alignments - if ( !m_reader->Rewind() ) return false; - - // initialize index data structure with space for all references - const int numReferences = (int)m_references.size(); - m_indexData.clear(); - m_hasFullDataCache = false; - SetReferenceCount(numReferences); - - // set up counters and markers - int32_t currentBlockCount = 0; - int64_t currentAlignmentOffset = m_BGZF->Tell(); - int32_t blockRefId = 0; - int32_t blockMaxEndPosition = 0; - int64_t blockStartOffset = currentAlignmentOffset; - int32_t blockStartPosition = -1; - - // plow through alignments, storing index entries - BamAlignment al; - while ( m_reader->GetNextAlignmentCore(al) ) { - - // if block contains data (not the first time through) AND alignment is on a new reference - if ( currentBlockCount > 0 && al.RefID != blockRefId ) { - - // store previous data - BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition); - SaveOffsetEntry(blockRefId, entry); - - // intialize new block for current alignment's reference - currentBlockCount = 0; - blockMaxEndPosition = al.GetEndPosition(); - blockStartOffset = currentAlignmentOffset; - } + // read magic number + char magic[4]; + size_t elementsRead = fread(magic, sizeof(char), 4, Resources.IndexStream); + if ( elementsRead != 4 ) + throw BamException("BamToolsIndex::CheckMagicNumber", "could not read BTI magic number"); - // if beginning of block, save first alignment's refID & position - if ( currentBlockCount == 0 ) { - blockRefId = al.RefID; - blockStartPosition = al.Position; - } + // validate expected magic number + if ( strncmp(magic, BamToolsIndex::BTI_MAGIC, 4) != 0 ) + throw BamException("BamToolsIndex::CheckMagicNumber", "invalid BTI magic number"); +} - // increment block counter - ++currentBlockCount; +// check index file version, return true if OK +void BamToolsIndex::CheckVersion(void) { - // check end position - int32_t alignmentEndPosition = al.GetEndPosition(); - if ( alignmentEndPosition > blockMaxEndPosition ) - blockMaxEndPosition = alignmentEndPosition; + // read version from file + size_t elementsRead = fread(&m_inputVersion, sizeof(m_inputVersion), 1, Resources.IndexStream); + if ( elementsRead != 1 ) + throw BamException("BamToolsIndex::CheckVersion", "could not read format version"); + if ( m_isBigEndian ) SwapEndian_32(m_inputVersion); - // if block is full, get offset for next block, reset currentBlockCount - if ( currentBlockCount == m_blockSize ) { - BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition); - SaveOffsetEntry(blockRefId, entry); - blockStartOffset = m_BGZF->Tell(); - currentBlockCount = 0; - } + // if version is negative, or zero + if ( m_inputVersion <= 0 ) + throw BamException("BamToolsIndex::CheckVersion", "invalid format version"); - // not the best name, but for the next iteration, this value will be the offset of the *current* alignment - // necessary because we won't know if this next alignment is on a new reference until we actually read it - currentAlignmentOffset = m_BGZF->Tell(); + // if version is newer than can be supported by this version of bamtools + else if ( m_inputVersion > m_outputVersion ) { + const string message = "unsupported format: this index was created by a newer version of BamTools. " + "Update your local version of BamTools to use the index file."; + throw BamException("BamToolsIndex::CheckVersion", message); } - // store final block with data - BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition); - SaveOffsetEntry(blockRefId, entry); - - // set flag - m_hasFullDataCache = true; - - // return success/failure of rewind - return m_reader->Rewind(); + // ------------------------------------------------------------------ + // check for deprecated, unsupported versions + // (the format had to be modified to accomodate a particular bug fix) + + // Version 2.0: introduced support for half-open intervals, instead of the old closed intervals + // respondBy: throwing exception - we're not going to try to handle the old BTI files. + else if ( (Version)m_inputVersion < BamToolsIndex::BTI_2_0 ) { + const string message = "unsupported format: this version of the index may not properly handle " + "coordinate intervals. Please run 'bamtools index -bti -in yourData.bam' " + "to generate an up-to-date, fixed BTI file."; + throw BamException("BamToolsIndex::CheckVersion", message); + } } -// check index file magic number, return true if OK -bool BamToolsIndex::CheckMagicNumber(void) { +void BamToolsIndex::ClearReferenceEntry(BtiReferenceEntry& refEntry) { + refEntry.ID = -1; + refEntry.Blocks.clear(); +} - // see if index is valid BAM index - char magic[4]; - size_t elementsRead = fread(magic, 1, 4, m_indexStream); - if ( elementsRead != 4 ) return false; - if ( strncmp(magic, "BTI\1", 4) != 0 ) { - fprintf(stderr, "Problem with index file - invalid format.\n"); - return false; +void BamToolsIndex::CloseFile(void) { + if ( IsFileOpen() ) { + fclose(Resources.IndexStream); + Resources.IndexStream = 0; } - - // otherwise ok - return true; + m_indexFileSummary.clear(); } -// check index file version, return true if OK -bool BamToolsIndex::CheckVersion(void) { - - // read version from file - size_t elementsRead = fread(&m_inputVersion, sizeof(m_inputVersion), 1, m_indexStream); - if ( elementsRead != 1 ) return false; - if ( m_isBigEndian ) SwapEndian_32(m_inputVersion); +// builds index from associated BAM file & writes out to index file +bool BamToolsIndex::Create(void) { - // if version is negative, or zero - if ( m_inputVersion <= 0 ) { - fprintf(stderr, "Problem with index file - invalid version.\n"); + // skip if BamReader is invalid or not open + if ( m_reader == 0 || !m_reader->IsOpen() ) { + SetErrorString("BamToolsIndex::Create", "could not create index: reader is not open"); return false; } - // if version is newer than can be supported by this version of bamtools - else if ( m_inputVersion > m_outputVersion ) { - fprintf(stderr, "Problem with index file - attempting to use an outdated version of BamTools with a newer index file.\n"); - fprintf(stderr, "Please update BamTools to a more recent version to support this index file.\n"); + // rewind BamReader + if ( !m_reader->Rewind() ) { + const string readerError = m_reader->GetErrorString(); + const string message = "could not create index: \n\t" + readerError; + SetErrorString("BamToolsIndex::Create", message); return false; } - // ------------------------------------------------------------------ - // check for deprecated, unsupported versions - // (typically whose format did not accomodate a particular bug fix) + try { + // open new index file (read & write) + const string indexFilename = m_reader->Filename() + Extension(); + OpenFile(indexFilename, "w+b"); + + // initialize BtiFileSummary with number of references + const int& numReferences = m_reader->GetReferenceCount(); + InitializeFileSummary(numReferences); + + // intialize output file header + WriteHeader(); + + // index building markers + uint32_t currentBlockCount = 0; + int64_t currentAlignmentOffset = m_reader->Tell(); + int32_t blockRefId = -1; + int32_t blockMaxEndPosition = -1; + int64_t blockStartOffset = currentAlignmentOffset; + int32_t blockStartPosition = -1; + + // plow through alignments, storing index entries + BamAlignment al; + BtiReferenceEntry refEntry; + while ( m_reader->LoadNextAlignment(al) ) { + + // if moved to new reference + if ( al.RefID != blockRefId ) { + + // if first pass, check: + if ( currentBlockCount == 0 ) { + + // write any empty references up to (but not including) al.RefID + for ( int i = 0; i < al.RefID; ++i ) + WriteReferenceEntry( BtiReferenceEntry(i) ); + } + + // not first pass: + else { + + // store previous BTI block data in reference entry + const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition); + refEntry.Blocks.push_back(block); + + // write reference entry, then clear + WriteReferenceEntry(refEntry); + ClearReferenceEntry(refEntry); + + // write any empty references between (but not including) + // the last blockRefID and current al.RefID + for ( int i = blockRefId+1; i < al.RefID; ++i ) + WriteReferenceEntry( BtiReferenceEntry(i) ); + + // reset block count + currentBlockCount = 0; + } + + // set ID for new reference entry + refEntry.ID = al.RefID; + } + + // if beginning of block, update counters + if ( currentBlockCount == 0 ) { + blockRefId = al.RefID; + blockStartOffset = currentAlignmentOffset; + blockStartPosition = al.Position; + blockMaxEndPosition = al.GetEndPosition(); + } + + // increment block counter + ++currentBlockCount; + + // check end position + const int32_t alignmentEndPosition = al.GetEndPosition(); + if ( alignmentEndPosition > blockMaxEndPosition ) + blockMaxEndPosition = alignmentEndPosition; + + // if block is full, get offset for next block, reset currentBlockCount + if ( currentBlockCount == m_blockSize ) { + + // store previous block data in reference entry + const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition); + refEntry.Blocks.push_back(block); + + // update markers + blockStartOffset = m_reader->Tell(); + currentBlockCount = 0; + } + + // not the best name, but for the next iteration, this value will be the offset of the + // *current* alignment. this is necessary because we won't know if this next alignment + // is on a new reference until we actually read it + currentAlignmentOffset = m_reader->Tell(); + } + + // after finishing alignments, if any data was read, check: + if ( blockRefId >= 0 ) { + + // store last BTI block data in reference entry + const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition); + refEntry.Blocks.push_back(block); - else if ( (Version)m_inputVersion == BTI_1_0 ) { - fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to accessing data near reference ends.\n"); - fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n"); + // write last reference entry, then clear + WriteReferenceEntry(refEntry); + ClearReferenceEntry(refEntry); + + // then write any empty references remaining at end of file + for ( int i = blockRefId+1; i < numReferences; ++i ) + WriteReferenceEntry( BtiReferenceEntry(i) ); + } + + } catch ( BamException& e ) { + m_errorString = e.what(); return false; } - else if ( (Version)m_inputVersion == BTI_1_1 ) { - fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to handling empty references.\n"); - fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n"); + // rewind BamReader + if ( !m_reader->Rewind() ) { + const string readerError = m_reader->GetErrorString(); + const string message = "could not create index: \n\t" + readerError; + SetErrorString("BamToolsIndex::Create", message); return false; } - // otherwise ok - else return true; -} - -// clear all current index offset data in memory -void BamToolsIndex::ClearAllData(void) { - BamToolsIndexData::const_iterator indexIter = m_indexData.begin(); - BamToolsIndexData::const_iterator indexEnd = m_indexData.end(); - for ( ; indexIter != indexEnd; ++indexIter ) { - const int& refId = (*indexIter).first; - ClearReferenceOffsets(refId); - } + // return success + return true; } -// clear all index offset data for desired reference -void BamToolsIndex::ClearReferenceOffsets(const int& refId) { - if ( m_indexData.find(refId) == m_indexData.end() ) return; - vector& offsets = m_indexData[refId].Offsets; - offsets.clear(); - m_hasFullDataCache = false; +// returns format's file extension +const std::string BamToolsIndex::Extension(void) { + return BamToolsIndex::BTI_EXTENSION; } -// return file position after header metadata -const off_t BamToolsIndex::DataBeginOffset(void) const { - return m_dataBeginOffset; -} +void BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) { + + // return false ref ID is not a valid index in file summary data + if ( region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size() ) + throw BamException("BamToolsIndex::GetOffset", "invalid region requested"); + + // retrieve reference index data for left bound reference + BtiReferenceEntry refEntry(region.LeftRefID); + ReadReferenceEntry(refEntry); + + // binary search for an overlapping block (may not be first one though) + bool found = false; + typedef BtiBlockVector::const_iterator BtiBlockConstIterator; + BtiBlockConstIterator blockFirst = refEntry.Blocks.begin(); + BtiBlockConstIterator blockIter = blockFirst; + BtiBlockConstIterator blockLast = refEntry.Blocks.end(); + iterator_traits::difference_type count = distance(blockFirst, blockLast); + iterator_traits::difference_type step; + while ( count > 0 ) { + blockIter = blockFirst; + step = count/2; + advance(blockIter, step); + + const BtiBlock& block = (*blockIter); + if ( block.StartPosition <= region.RightPosition ) { + if ( block.MaxEndPosition > region.LeftPosition ) { + offset = block.StartOffset; + break; + } + blockFirst = ++blockIter; + count -= step+1; + } + else count = step; + } -// calculate BAM file offset for desired region -// return true if no error (*NOT* equivalent to "has alignments or valid offset") -// check @hasAlignmentsInRegion to determine this status -// @region - target region -// @offset - resulting seek target -// @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status -// N.B. - ignores isRightBoundSpecified -bool BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) { + // if we didn't search "off the end" of the blocks + if ( blockIter != blockLast ) { - // return false if leftBound refID is not found in index data - BamToolsIndexData::const_iterator indexIter = m_indexData.find(region.LeftRefID); - if ( indexIter == m_indexData.end()) return false; + // "walk back" until we've gone too far + while ( blockIter != blockFirst ) { + const BtiBlock& currentBlock = (*blockIter); - // load index data for region if not already cached - if ( !IsDataLoaded(region.LeftRefID) ) { - bool loadedOk = true; - loadedOk &= SkipToReference(region.LeftRefID); - loadedOk &= LoadReference(region.LeftRefID); - if ( !loadedOk ) return false; - } + --blockIter; + const BtiBlock& previousBlock = (*blockIter); + if ( previousBlock.MaxEndPosition <= region.LeftPosition ) { + offset = currentBlock.StartOffset; + found = true; + break; + } + } - // localize index data for this reference (& sanity check that data actually exists) - indexIter = m_indexData.find(region.LeftRefID); - if ( indexIter == m_indexData.end()) return false; - const vector& referenceOffsets = (*indexIter).second.Offsets; - if ( referenceOffsets.empty() ) return false; - - // ------------------------------------------------------- - // calculate nearest index to jump to - - // save first offset - offset = (*referenceOffsets.begin()).StartOffset; - - // iterate over offsets entries on this reference - vector::const_iterator offsetIter = referenceOffsets.begin(); - vector::const_iterator offsetEnd = referenceOffsets.end(); - for ( ; offsetIter != offsetEnd; ++offsetIter ) { - const BamToolsIndexEntry& entry = (*offsetIter); - // break if alignment 'entry' overlaps region - if ( entry.MaxEndPosition >= region.LeftPosition ) break; - offset = (*offsetIter).StartOffset; + // if we walked all the way to first block, just return that and let the reader's + // region overlap parsing do the rest + if ( blockIter == blockFirst ) { + const BtiBlock& block = (*blockIter); + offset = block.StartOffset; + found = true; + } } - // set flag based on whether an index entry was found for this region - *hasAlignmentsInRegion = ( offsetIter != offsetEnd ); - // if cache mode set to none, dump the data we just loaded - if (m_cacheMode == BamIndex::NoIndexCaching ) - ClearReferenceOffsets(region.LeftRefID); - - // return success - return true; + // sets to false if blocks container is empty, or if no matching block could be found + *hasAlignmentsInRegion = found; } // returns whether reference has alignments or no -bool BamToolsIndex::HasAlignments(const int& refId) const { - BamToolsIndexData::const_iterator indexIter = m_indexData.find(refId); - if ( indexIter == m_indexData.end()) return false; - const BamToolsReferenceEntry& refEntry = (*indexIter).second; - return refEntry.HasAlignments; +bool BamToolsIndex::HasAlignments(const int& referenceID) const { + if ( referenceID < 0 || referenceID >= (int)m_indexFileSummary.size() ) + return false; + const BtiReferenceSummary& refSummary = m_indexFileSummary.at(referenceID); + return ( refSummary.NumBlocks > 0 ); } -// return true if all index data is cached -bool BamToolsIndex::HasFullDataCache(void) const { - return m_hasFullDataCache; +// pre-allocates space for each reference's summary data +void BamToolsIndex::InitializeFileSummary(const int& numReferences) { + m_indexFileSummary.clear(); + for ( int i = 0; i < numReferences; ++i ) + m_indexFileSummary.push_back( BtiReferenceSummary() ); } -// returns true if index cache has data for desired reference -bool BamToolsIndex::IsDataLoaded(const int& refId) const { - - BamToolsIndexData::const_iterator indexIter = m_indexData.find(refId); - if ( indexIter == m_indexData.end()) return false; - const BamToolsReferenceEntry& refEntry = (*indexIter).second; - - if ( !refEntry.HasAlignments ) return true; // no data period - - // return whether offsets list contains data - return !refEntry.Offsets.empty(); +// returns true if the index stream is open +bool BamToolsIndex::IsFileOpen(void) const { + return ( Resources.IndexStream != 0 ); } -// attempts to use index to jump to region; returns success/fail -bool BamToolsIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { +// attempts to use index data to jump to @region, returns success/fail +// a "successful" jump indicates no error, but not whether this region has data +// * thus, the method sets a flag to indicate whether there are alignments +// available after the jump position +bool BamToolsIndex::Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion) { // clear flag *hasAlignmentsInRegion = false; - // check valid BamReader state - if ( m_reader == 0 || m_BGZF == 0 || !m_reader->IsOpen() ) { - fprintf(stderr, "ERROR: Could not jump: invalid BamReader state.\n"); + // skip if invalid reader or not open + if ( m_reader == 0 || !m_reader->IsOpen() ) { + SetErrorString("BamToolsIndex::Jump", "could not jump: reader is not open"); return false; } // make sure left-bound position is valid - if ( region.LeftPosition > m_references.at(region.LeftRefID).RefLength ) + const RefVector& references = m_reader->GetReferenceData(); + if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) { + SetErrorString("BamToolsIndex::Jump", "could not create index: invalid region requested"); return false; + } // calculate nearest offset to jump to int64_t offset; - if ( !GetOffset(region, offset, hasAlignmentsInRegion) ) { - fprintf(stderr, "ERROR: Could not jump - unable to calculate offset for specified region.\n"); + try { + GetOffset(region, offset, hasAlignmentsInRegion); + } catch ( BamException& e ) { + m_errorString = e.what(); return false; } // return success/failure of seek - return m_BGZF->Seek(offset); + return m_reader->Seek(offset); } -// clears index data from all references except the first -void BamToolsIndex::KeepOnlyFirstReferenceOffsets(void) { - BamToolsIndexData::const_iterator indexBegin = m_indexData.begin(); - KeepOnlyReferenceOffsets( (*indexBegin).first ); -} +// loads existing data from file into memory +bool BamToolsIndex::Load(const std::string& filename) { -// clears index data from all references except the one specified -void BamToolsIndex::KeepOnlyReferenceOffsets(const int& refId) { - BamToolsIndexData::iterator mapIter = m_indexData.begin(); - BamToolsIndexData::iterator mapEnd = m_indexData.end(); - for ( ; mapIter != mapEnd; ++mapIter ) { - const int entryRefId = (*mapIter).first; - if ( entryRefId != refId ) - ClearReferenceOffsets(entryRefId); + try { + + // attempt to open file (read-only) + OpenFile(filename, "rb"); + + // load metadata & generate in-memory summary + LoadHeader(); + LoadFileSummary(); + + // return success + return true; + + } catch ( BamException& e ) { + m_errorString = e.what(); + return false; } } -// load index data for all references, return true if loaded OK -bool BamToolsIndex::LoadAllReferences(bool saveData) { +void BamToolsIndex::LoadFileSummary(void) { - // skip if data already loaded - if ( m_hasFullDataCache ) return true; + // load number of reference sequences + int numReferences; + LoadNumReferences(numReferences); - // read in number of references - int32_t numReferences; - if ( !LoadReferenceCount(numReferences) ) return false; - //SetReferenceCount(numReferences); + // initialize file summary data + InitializeFileSummary(numReferences); - // iterate over reference entries - bool loadedOk = true; - for ( int i = 0; i < numReferences; ++i ) - loadedOk &= LoadReference(i, saveData); + // load summary for each reference + BtiFileSummary::iterator summaryIter = m_indexFileSummary.begin(); + BtiFileSummary::iterator summaryEnd = m_indexFileSummary.end(); + for ( ; summaryIter != summaryEnd; ++summaryIter ) + LoadReferenceSummary(*summaryIter); +} - // set flag - if ( loadedOk && saveData ) - m_hasFullDataCache = true; +void BamToolsIndex::LoadHeader(void) { - // return success/failure of load - return loadedOk; + // check BTI file metadata + CheckMagicNumber(); + CheckVersion(); + + // use file's BTI block size to set member variable + const size_t elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, Resources.IndexStream); + if ( m_isBigEndian ) SwapEndian_32(m_blockSize); + if ( elementsRead != 1 ) + throw BamException("BamToolsIndex::LoadHeader", "could not read BTI block size"); } -// load header data from index file, return true if loaded OK -bool BamToolsIndex::LoadHeader(void) { +void BamToolsIndex::LoadNumBlocks(int& numBlocks) { + const size_t elementsRead = fread(&numBlocks, sizeof(numBlocks), 1, Resources.IndexStream); + if ( m_isBigEndian ) SwapEndian_32(numBlocks); + if ( elementsRead != 1 ) + throw BamException("BamToolsIndex::LoadNumBlocks", "could not read number of BTI blocks"); +} - // check magic number - if ( !CheckMagicNumber() ) return false; +void BamToolsIndex::LoadNumReferences(int& numReferences) { + const size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, Resources.IndexStream); + if ( m_isBigEndian ) SwapEndian_32(numReferences); + if ( elementsRead != 1 ) + throw BamException("BamToolsIndex::LoadNumReferences", "could not read number of references"); +} - // check BTI version - if ( !CheckVersion() ) return false; +void BamToolsIndex::LoadReferenceSummary(BtiReferenceSummary& refSummary) { - // read in block size - size_t elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, m_indexStream); - if ( elementsRead != 1 ) return false; - if ( m_isBigEndian ) SwapEndian_32(m_blockSize); + // load number of blocks + int numBlocks; + LoadNumBlocks(numBlocks); - // store offset of beginning of data - m_dataBeginOffset = ftell64(m_indexStream); + // store block summary data for this reference + refSummary.NumBlocks = numBlocks; + refSummary.FirstBlockFilePosition = Tell(); - // return success/failure of load - return (elementsRead == 1); + // skip reference's blocks + SkipBlocks(numBlocks); } -// load a single index entry from file, return true if loaded OK -// @saveData - save data in memory if true, just read & discard if false -bool BamToolsIndex::LoadIndexEntry(const int& refId, bool saveData) { +void BamToolsIndex::OpenFile(const std::string& filename, const char* mode) { - // read in index entry data members - size_t elementsRead = 0; - BamToolsIndexEntry entry; - elementsRead += fread(&entry.MaxEndPosition, sizeof(entry.MaxEndPosition), 1, m_indexStream); - elementsRead += fread(&entry.StartOffset, sizeof(entry.StartOffset), 1, m_indexStream); - elementsRead += fread(&entry.StartPosition, sizeof(entry.StartPosition), 1, m_indexStream); - if ( elementsRead != 3 ) { - cerr << "Error reading index entry. Expected 3 elements, read in: " << elementsRead << endl; - return false; + // make sure any previous index file is closed + CloseFile(); + + // attempt to open file + Resources.IndexStream = fopen(filename.c_str(), mode); + if ( !IsFileOpen() ) { + const string message = string("could not open file: ") + filename; + throw BamException("BamToolsIndex::OpenFile", message); } +} + +void BamToolsIndex::ReadBlock(BtiBlock& block) { + + // read in block data members + size_t elementsRead = 0; + elementsRead += fread(&block.MaxEndPosition, sizeof(block.MaxEndPosition), 1, Resources.IndexStream); + elementsRead += fread(&block.StartOffset, sizeof(block.StartOffset), 1, Resources.IndexStream); + elementsRead += fread(&block.StartPosition, sizeof(block.StartPosition), 1, Resources.IndexStream); // swap endian-ness if necessary if ( m_isBigEndian ) { - SwapEndian_32(entry.MaxEndPosition); - SwapEndian_64(entry.StartOffset); - SwapEndian_32(entry.StartPosition); + SwapEndian_32(block.MaxEndPosition); + SwapEndian_64(block.StartOffset); + SwapEndian_32(block.StartPosition); } - // save data - if ( saveData ) - SaveOffsetEntry(refId, entry); - - // return success/failure of load - return true; -} - -// load a single reference from file, return true if loaded OK -// @saveData - save data in memory if true, just read & discard if false -bool BamToolsIndex::LoadFirstReference(bool saveData) { - BamToolsIndexData::const_iterator indexBegin = m_indexData.begin(); - return LoadReference( (*indexBegin).first, saveData ); + if ( elementsRead != 3 ) + throw BamException("BamToolsIndex::ReadBlock", "could not read block"); } -// load a single reference from file, return true if loaded OK -// @saveData - save data in memory if true, just read & discard if false -bool BamToolsIndex::LoadReference(const int& refId, bool saveData) { - - // read in number of offsets for this reference - uint32_t numOffsets; - size_t elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, m_indexStream); - if ( elementsRead != 1 ) return false; - if ( m_isBigEndian ) SwapEndian_32(numOffsets); +void BamToolsIndex::ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks) { - // initialize offsets container for this reference - SetOffsetCount(refId, (int)numOffsets); + // prep blocks container + blocks.clear(); + blocks.reserve(refSummary.NumBlocks); - // iterate over offset entries - for ( unsigned int j = 0; j < numOffsets; ++j ) - LoadIndexEntry(refId, saveData); + // skip to first block entry + Seek( refSummary.FirstBlockFilePosition, SEEK_SET ); - // return success/failure of load - return true; + // read & store block entries + BtiBlock block; + for ( int i = 0; i < refSummary.NumBlocks; ++i ) { + ReadBlock(block); + blocks.push_back(block); + } } -// loads number of references, return true if loaded OK -bool BamToolsIndex::LoadReferenceCount(int& numReferences) { - - size_t elementsRead = 0; +void BamToolsIndex::ReadReferenceEntry(BtiReferenceEntry& refEntry) { - // read reference count - elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_indexStream); - if ( m_isBigEndian ) SwapEndian_32(numReferences); + // return false if refId not valid index in file summary structure + if ( refEntry.ID < 0 || refEntry.ID >= (int)m_indexFileSummary.size() ) + throw BamException("BamToolsIndex::ReadReferenceEntry", "invalid reference requested"); - // return success/failure of load - return ( elementsRead == 1 ); + // use index summary to assist reading the reference's BTI blocks + const BtiReferenceSummary& refSummary = m_indexFileSummary.at(refEntry.ID); + ReadBlocks(refSummary, refEntry.Blocks); } -// saves an index offset entry in memory -void BamToolsIndex::SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry) { - BamToolsReferenceEntry& refEntry = m_indexData[refId]; - refEntry.HasAlignments = true; - refEntry.Offsets.push_back(entry); +void BamToolsIndex::Seek(const int64_t& position, const int& origin) { + if ( fseek64(Resources.IndexStream, position, origin) != 0 ) + throw BamException("BamToolsIndex::Seek", "could not seek in BAI file"); } -// pre-allocates size for offset vector -void BamToolsIndex::SetOffsetCount(const int& refId, const int& offsetCount) { - BamToolsReferenceEntry& refEntry = m_indexData[refId]; - refEntry.Offsets.reserve(offsetCount); - refEntry.HasAlignments = ( offsetCount > 0); +// change the index caching behavior +void BamToolsIndex::SetCacheMode(const BamIndex::IndexCacheMode& mode) { + m_cacheMode = mode; + // do nothing else here ? cache mode will be ignored from now on, most likely } -// initializes index data structure to hold @count references -void BamToolsIndex::SetReferenceCount(const int& count) { - for ( int i = 0; i < count; ++i ) - m_indexData[i].HasAlignments = false; +void BamToolsIndex::SkipBlocks(const int& numBlocks) { + Seek( numBlocks*BamToolsIndex::SIZEOF_BLOCK, SEEK_CUR ); } -// position file pointer to first reference begin, return true if skipped OK -bool BamToolsIndex::SkipToFirstReference(void) { - BamToolsIndexData::const_iterator indexBegin = m_indexData.begin(); - return SkipToReference( (*indexBegin).first ); +int64_t BamToolsIndex::Tell(void) const { + return ftell64(Resources.IndexStream); } -// position file pointer to desired reference begin, return true if skipped OK -bool BamToolsIndex::SkipToReference(const int& refId) { - - // attempt rewind - if ( !Rewind() ) return false; +void BamToolsIndex::WriteBlock(const BtiBlock& block) { - // read in number of references - int32_t numReferences; - size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_indexStream); - if ( elementsRead != 1 ) return false; - if ( m_isBigEndian ) SwapEndian_32(numReferences); + // copy entry data + int32_t maxEndPosition = block.MaxEndPosition; + int64_t startOffset = block.StartOffset; + int32_t startPosition = block.StartPosition; - // iterate over reference entries - bool skippedOk = true; - int currentRefId = 0; - while (currentRefId != refId) { - skippedOk &= LoadReference(currentRefId, false); - ++currentRefId; + // swap endian-ness if necessary + if ( m_isBigEndian ) { + SwapEndian_32(maxEndPosition); + SwapEndian_64(startOffset); + SwapEndian_32(startPosition); } - // return success/failure of skip - return skippedOk; + // write the reference index entry + size_t elementsWritten = 0; + elementsWritten += fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, Resources.IndexStream); + elementsWritten += fwrite(&startOffset, sizeof(startOffset), 1, Resources.IndexStream); + elementsWritten += fwrite(&startPosition, sizeof(startPosition), 1, Resources.IndexStream); + if ( elementsWritten != 3 ) + throw BamException("BamToolsIndex::WriteBlock", "could not write BTI block"); +} + +void BamToolsIndex::WriteBlocks(const BtiBlockVector& blocks) { + BtiBlockVector::const_iterator blockIter = blocks.begin(); + BtiBlockVector::const_iterator blockEnd = blocks.end(); + for ( ; blockIter != blockEnd; ++blockIter ) + WriteBlock(*blockIter); } -// write header to new index file -bool BamToolsIndex::WriteHeader(void) { +void BamToolsIndex::WriteHeader(void) { size_t elementsWritten = 0; // write BTI index format 'magic number' - elementsWritten += fwrite("BTI\1", 1, 4, m_indexStream); + elementsWritten += fwrite(BamToolsIndex::BTI_MAGIC, 1, 4, Resources.IndexStream); // write BTI index format version int32_t currentVersion = (int32_t)m_outputVersion; if ( m_isBigEndian ) SwapEndian_32(currentVersion); - elementsWritten += fwrite(¤tVersion, sizeof(currentVersion), 1, m_indexStream); + elementsWritten += fwrite(¤tVersion, sizeof(currentVersion), 1, Resources.IndexStream); // write block size - int32_t blockSize = m_blockSize; + uint32_t blockSize = m_blockSize; if ( m_isBigEndian ) SwapEndian_32(blockSize); - elementsWritten += fwrite(&blockSize, sizeof(blockSize), 1, m_indexStream); - - // store offset of beginning of data - m_dataBeginOffset = ftell64(m_indexStream); - - // return success/failure of write - return ( elementsWritten == 6 ); -} - -// write index data for all references to new index file -bool BamToolsIndex::WriteAllReferences(void) { - - size_t elementsWritten = 0; + elementsWritten += fwrite(&blockSize, sizeof(blockSize), 1, Resources.IndexStream); // write number of references - int32_t numReferences = (int32_t)m_indexData.size(); + int32_t numReferences = m_indexFileSummary.size(); if ( m_isBigEndian ) SwapEndian_32(numReferences); - elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, m_indexStream); + elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, Resources.IndexStream); - // iterate through references in index - bool refOk = true; - BamToolsIndexData::const_iterator refIter = m_indexData.begin(); - BamToolsIndexData::const_iterator refEnd = m_indexData.end(); - for ( ; refIter != refEnd; ++refIter ) - refOk &= WriteReferenceEntry( (*refIter).second ); - - return ( (elementsWritten == 1) && refOk ); + if ( elementsWritten != 7 ) + throw BamException("BamToolsIndex::WriteHeader", "could not write BTI header"); } -// write current reference index data to new index file -bool BamToolsIndex::WriteReferenceEntry(const BamToolsReferenceEntry& refEntry) { +void BamToolsIndex::WriteReferenceEntry(const BtiReferenceEntry& refEntry) { - size_t elementsWritten = 0; + // write number of blocks this reference + uint32_t numBlocks = refEntry.Blocks.size(); + if ( m_isBigEndian ) SwapEndian_32(numBlocks); + const size_t elementsWritten = fwrite(&numBlocks, sizeof(numBlocks), 1, Resources.IndexStream); + if ( elementsWritten != 1 ) + throw BamException("BamToolsIndex::WriteReferenceEntry", "could not write number of blocks"); - // write number of offsets listed for this reference - uint32_t numOffsets = refEntry.Offsets.size(); - if ( m_isBigEndian ) SwapEndian_32(numOffsets); - elementsWritten += fwrite(&numOffsets, sizeof(numOffsets), 1, m_indexStream); - - // iterate over offset entries - bool entriesOk = true; - vector::const_iterator offsetIter = refEntry.Offsets.begin(); - vector::const_iterator offsetEnd = refEntry.Offsets.end(); - for ( ; offsetIter != offsetEnd; ++offsetIter ) - entriesOk &= WriteIndexEntry( (*offsetIter) ); - - return ( (elementsWritten == 1) && entriesOk ); -} - -// write current index offset entry to new index file -bool BamToolsIndex::WriteIndexEntry(const BamToolsIndexEntry& entry) { - - // copy entry data - int32_t maxEndPosition = entry.MaxEndPosition; - int64_t startOffset = entry.StartOffset; - int32_t startPosition = entry.StartPosition; - - // swap endian-ness if necessary - if ( m_isBigEndian ) { - SwapEndian_32(maxEndPosition); - SwapEndian_64(startOffset); - SwapEndian_32(startPosition); - } - - // write the reference index entry - size_t elementsWritten = 0; - elementsWritten += fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, m_indexStream); - elementsWritten += fwrite(&startOffset, sizeof(startOffset), 1, m_indexStream); - elementsWritten += fwrite(&startPosition, sizeof(startPosition), 1, m_indexStream); - return ( elementsWritten == 3 ); + // write actual block entries + WriteBlocks(refEntry.Blocks); }