X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=src%2Fapi%2FBamIndex.cpp;h=3e5f86e701e68baa9751781f687e012dab5223ee;hb=8c80d760637f8df39262683cd2570f0589423d36;hp=fb85d07038893f95347e448688c3edf8edf8b506;hpb=0ef88b963e167f9d66fe9e55d9c22c3863498b29;p=bamtools.git diff --git a/src/api/BamIndex.cpp b/src/api/BamIndex.cpp index fb85d07..3e5f86e 100644 --- a/src/api/BamIndex.cpp +++ b/src/api/BamIndex.cpp @@ -3,1201 +3,188 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 10 September 2010 (DB) +// Last modified: 23 March 2011 (DB) // --------------------------------------------------------------------------- // Provides index functionality - both for the default (standardized) BAM // index format (.bai) as well as a BamTools-specific (nonstandard) index // format (.bti). // *************************************************************************** +#include +#include +#include +#include +using namespace BamTools; +using namespace BamTools::Internal; + #include #include #include #include #include -#include "BamIndex.h" -#include "BamReader.h" -#include "BGZF.h" using namespace std; -using namespace BamTools; -// ------------------------------- -// BamIndex implementation +/*! \class BamTools::BamIndex + \brief Provides methods for generating & loading BAM index files. -BamIndex::BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader, bool isBigEndian) - : m_BGZF(bgzf) - , m_reader(reader) - , m_isBigEndian(isBigEndian) -{ - if ( m_reader && m_reader->IsOpen() ) - m_references = m_reader->GetReferenceData(); -} + This class straddles the line between public API and internal + implementation detail. Most client code should never have to use this + class directly. -bool BamIndex::HasAlignments(const int& referenceID) { - - // return false if invalid ID - if ( (referenceID < 0) || (referenceID >= (int)m_references.size()) ) - return false; - - // else return status of reference (has alignments?) - else - return m_references.at(referenceID).RefHasAlignments; -} + It is exposed to the public API to allow advanced users to implement + their own custom indexing schemes. -// ######################################################################################### -// ######################################################################################### + N.B. - Please note that if you wish to derive your own subclass, you are + entering waters that are not well-documented at the moment and are + likely to be changing soon anyway. Implementing a custom index is + technically do-able at the moment, but the learning curve is (at the + moment) overly steep. Changes will be coming soon to alleviate some + of this headache. +*/ -// ------------------------------- -// BamStandardIndex structs & typedefs - -namespace BamTools { +// ctor +BamIndex::BamIndex(void) + : m_indexStream(0) + , m_indexFilename("") + , m_cacheMode(BamIndex::LimitedIndexCaching) +{ } -// -------------------------------------------------- -// BamStandardIndex data structures & typedefs -struct Chunk { - - // data members - uint64_t Start; - uint64_t Stop; - - // constructor - Chunk(const uint64_t& start = 0, - const uint64_t& stop = 0) - : Start(start) - , Stop(stop) - { } -}; - -bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) { - return lhs.Start < rhs.Start; +// dtor +BamIndex::~BamIndex(void) { + if ( IsOpen() ) fclose(m_indexStream); + m_indexFilename = ""; } -typedef vector ChunkVector; -typedef map BamBinMap; -typedef vector LinearOffsetVector; - -struct ReferenceIndex { - - // data members - BamBinMap Bins; - LinearOffsetVector Offsets; - - // constructor - ReferenceIndex(const BamBinMap& binMap = BamBinMap(), - const LinearOffsetVector& offsets = LinearOffsetVector()) - : Bins(binMap) - , Offsets(offsets) - { } -}; - -typedef vector BamStandardIndexData; - -} // namespace BamTools - -// ------------------------------- -// BamStandardIndexPrivate implementation - -struct BamStandardIndex::BamStandardIndexPrivate { - - // ------------------------- - // data members - - BamStandardIndexData m_indexData; - BamStandardIndex* m_parent; - - // ------------------------- - // ctor & dtor - - BamStandardIndexPrivate(BamStandardIndex* parent) : m_parent(parent) { } - ~BamStandardIndexPrivate(void) { m_indexData.clear(); } - - // ------------------------- - // 'public' methods - - // creates index data (in-memory) from current reader data - bool Build(void); - // attempts to use index to jump to region; returns success/fail - bool Jump(const BamTools::BamRegion& region); - // loads existing data from file into memory - bool Load(const std::string& filename); - // writes in-memory index data out to file - // N.B. - (this is the original BAM filename, method will modify it to use applicable extension) - bool Write(const std::string& bamFilename); - - // ------------------------- - // internal methods - - // calculate bins that overlap region - int BinsFromRegion(const BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[BamTools::MAX_BIN]); - // calculates offset(s) for a given region - bool GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector& offsets); - // saves BAM bin entry for index - void InsertBinEntry(BamBinMap& binMap, const uint32_t& saveBin, const uint64_t& saveOffset, const uint64_t& lastOffset); - // saves linear offset entry for index - void InsertLinearOffset(LinearOffsetVector& offsets, const BamAlignment& bAlignment, const uint64_t& lastOffset); - // simplifies index by merging 'chunks' - void MergeChunks(void); -}; - -// calculate bins that overlap region -int BamStandardIndex::BamStandardIndexPrivate::BinsFromRegion(const BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[MAX_BIN]) { - - // get region boundaries - uint32_t begin = (unsigned int)region.LeftPosition; - uint32_t end; - - // if right bound specified AND left&right bounds are on same reference - // OK to use right bound position - if ( isRightBoundSpecified && ( region.LeftRefID == region.RightRefID ) ) - end = (unsigned int)region.RightPosition; - - // otherwise, use end of left bound reference as cutoff - else - end = (unsigned int)m_parent->m_references.at(region.LeftRefID).RefLength - 1; - - // initialize list, bin '0' always a valid bin - int i = 0; - bins[i++] = 0; - - // get rest of bins that contain this region - unsigned int k; - for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { bins[i++] = k; } - for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { bins[i++] = k; } - for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { bins[i++] = k; } - for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { bins[i++] = k; } - for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { bins[i++] = k; } - - // return number of bins stored - return i; +// return true if FILE* is open +bool BamIndex::IsOpen(void) const { + return ( m_indexStream != 0 ); } -bool BamStandardIndex::BamStandardIndexPrivate::Build(void) { - - // localize parent data - if ( m_parent == 0 ) return false; - BamReader* reader = m_parent->m_reader; - BgzfData* mBGZF = m_parent->m_BGZF; - RefVector& references = m_parent->m_references; - - // be sure reader & BGZF file are valid & open for reading - if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen ) - return false; - - // move file pointer to beginning of alignments - reader->Rewind(); - - // get reference count, reserve index space - int numReferences = (int)references.size(); - for ( int i = 0; i < numReferences; ++i ) - m_indexData.push_back(ReferenceIndex()); - - // sets default constant for bin, ID, offset, coordinate variables - const uint32_t defaultValue = 0xffffffffu; - - // bin data - uint32_t saveBin(defaultValue); - uint32_t lastBin(defaultValue); - - // reference ID data - int32_t saveRefID(defaultValue); - int32_t lastRefID(defaultValue); - - // offset data - uint64_t saveOffset = mBGZF->Tell(); - uint64_t lastOffset = saveOffset; - - // coordinate data - int32_t lastCoordinate = defaultValue; - - BamAlignment bAlignment; - while ( reader->GetNextAlignmentCore(bAlignment) ) { - - // change of chromosome, save ID, reset bin - if ( lastRefID != bAlignment.RefID ) { - lastRefID = bAlignment.RefID; - lastBin = defaultValue; - } - - // if lastCoordinate greater than BAM position - file not sorted properly - else if ( lastCoordinate > bAlignment.Position ) { - fprintf(stderr, "BAM file not properly sorted:\n"); - fprintf(stderr, "Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(), lastCoordinate, bAlignment.Position, bAlignment.RefID); - exit(1); - } - - // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions) - if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) { - - // save linear offset entry (matched to BAM entry refID) - ReferenceIndex& refIndex = m_indexData.at(bAlignment.RefID); - LinearOffsetVector& offsets = refIndex.Offsets; - InsertLinearOffset(offsets, bAlignment, lastOffset); - } - - // if current BamAlignment bin != lastBin, "then possibly write the binning index" - if ( bAlignment.Bin != lastBin ) { - - // if not first time through - if ( saveBin != defaultValue ) { - - // save Bam bin entry - ReferenceIndex& refIndex = m_indexData.at(saveRefID); - BamBinMap& binMap = refIndex.Bins; - InsertBinEntry(binMap, saveBin, saveOffset, lastOffset); - } - - // update saveOffset - saveOffset = lastOffset; - - // update bin values - saveBin = bAlignment.Bin; - lastBin = bAlignment.Bin; - - // update saveRefID - saveRefID = bAlignment.RefID; - - // if invalid RefID, break out - if ( saveRefID < 0 ) break; - } - - // make sure that current file pointer is beyond lastOffset - if ( mBGZF->Tell() <= (int64_t)lastOffset ) { - fprintf(stderr, "Error in BGZF offsets.\n"); - exit(1); - } - - // update lastOffset - lastOffset = mBGZF->Tell(); - - // update lastCoordinate - lastCoordinate = bAlignment.Position; - } - - // save any leftover BAM data (as long as refID is valid) - if ( saveRefID >= 0 ) { - // save Bam bin entry - ReferenceIndex& refIndex = m_indexData.at(saveRefID); - BamBinMap& binMap = refIndex.Bins; - InsertBinEntry(binMap, saveBin, saveOffset, lastOffset); - } - - // simplify index by merging chunks - MergeChunks(); - - // iterate through references in index - // store whether reference has data & - // sort offsets in linear offset vector - BamStandardIndexData::iterator indexIter = m_indexData.begin(); - BamStandardIndexData::iterator indexEnd = m_indexData.end(); - for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) { - - // get reference index data - ReferenceIndex& refIndex = (*indexIter); - BamBinMap& binMap = refIndex.Bins; - LinearOffsetVector& offsets = refIndex.Offsets; - - // store whether reference has alignments or no - references[i].RefHasAlignments = ( binMap.size() > 0 ); - - // sort linear offsets - sort(offsets.begin(), offsets.end()); - } - - // rewind file pointer to beginning of alignments, return success/fail - return reader->Rewind(); -} - -bool BamStandardIndex::BamStandardIndexPrivate::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector& offsets) { - - // calculate which bins overlap this region - uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2); - int numBins = BinsFromRegion(region, isRightBoundSpecified, bins); - - // get bins for this reference - const ReferenceIndex& refIndex = m_indexData.at(region.LeftRefID); - const BamBinMap& binMap = refIndex.Bins; - - // get minimum offset to consider - const LinearOffsetVector& linearOffsets = refIndex.Offsets; - uint64_t minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() ) ? 0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT); - - // store all alignment 'chunk' starts (file offsets) for bins in this region - for ( int i = 0; i < numBins; ++i ) { - - const uint16_t binKey = bins[i]; - map::const_iterator binIter = binMap.find(binKey); - if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) { - - // iterate over chunks - const ChunkVector& chunks = (*binIter).second; - std::vector::const_iterator chunksIter = chunks.begin(); - std::vector::const_iterator chunksEnd = chunks.end(); - for ( ; chunksIter != chunksEnd; ++chunksIter) { - - // if valid chunk found, store its file offset - const Chunk& chunk = (*chunksIter); - if ( chunk.Stop > minOffset ) - offsets.push_back( chunk.Start ); - } - } - } - - // clean up memory - free(bins); - - // sort the offsets before returning - sort(offsets.begin(), offsets.end()); - - // return whether any offsets were found - return ( offsets.size() != 0 ); -} - -// saves BAM bin entry for index -void BamStandardIndex::BamStandardIndexPrivate::InsertBinEntry(BamBinMap& binMap, - const uint32_t& saveBin, - const uint64_t& saveOffset, - const uint64_t& lastOffset) -{ - // look up saveBin - BamBinMap::iterator binIter = binMap.find(saveBin); - - // create new chunk - Chunk newChunk(saveOffset, lastOffset); - - // if entry doesn't exist - if ( binIter == binMap.end() ) { - ChunkVector newChunks; - newChunks.push_back(newChunk); - binMap.insert( pair(saveBin, newChunks)); - } - - // otherwise - else { - ChunkVector& binChunks = (*binIter).second; - binChunks.push_back( newChunk ); - } -} - -// saves linear offset entry for index -void BamStandardIndex::BamStandardIndexPrivate::InsertLinearOffset(LinearOffsetVector& offsets, - const BamAlignment& bAlignment, - const uint64_t& lastOffset) -{ - // get converted offsets - int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT; - int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT; - - // resize vector if necessary - int oldSize = offsets.size(); - int newSize = endOffset + 1; - if ( oldSize < newSize ) - offsets.resize(newSize, 0); - - // store offset - for( int i = beginOffset + 1; i <= endOffset; ++i ) { - if ( offsets[i] == 0 ) - offsets[i] = lastOffset; - } -} +// loads existing data from file into memory +bool BamIndex::Load(const string& filename) { -bool BamStandardIndex::BamStandardIndexPrivate::Jump(const BamRegion& region) { - - // localize parent data - if ( m_parent == 0 ) return false; - BamReader* reader = m_parent->m_reader; - BgzfData* mBGZF = m_parent->m_BGZF; - RefVector& references = m_parent->m_references; - - // be sure reader & BGZF file are valid & open for reading - if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen ) - return false; - - // see if left-bound reference of region has alignments - if ( !m_parent->HasAlignments(region.LeftRefID) ) return false; - - // make sure left-bound position is valid - if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) return false; - - vector offsets; - if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets) ) { - fprintf(stderr, "ERROR: Could not jump: unable to calculate offset for specified region.\n"); + // open index file, abort on error + if ( !OpenIndexFile(filename, "rb") ) { + fprintf(stderr, "BamIndex ERROR: unable to open the BAM index file %s for reading.\n", filename.c_str()); return false; } - - // iterate through offsets - BamAlignment bAlignment; - bool result = true; - for ( vector::const_iterator o = offsets.begin(); o != offsets.end(); ++o) { - - // attempt seek & load first available alignment - result &= mBGZF->Seek(*o); - reader->GetNextAlignmentCore(bAlignment); - - // if this alignment corresponds to desired position - // return success of seeking back to 'current offset' - if ( (bAlignment.RefID == region.LeftRefID && bAlignment.Position + bAlignment.Length > region.LeftPosition) || (bAlignment.RefID > region.LeftRefID) ) { - if ( o != offsets.begin() ) --o; - return mBGZF->Seek(*o); - } - } - - if ( !result ) fprintf(stderr, "ERROR: Could not jump: unable to calculate offset for specified region.\n"); - return result; -} -bool BamStandardIndex::BamStandardIndexPrivate::Load(const string& filename) { - - // localize parent data - if ( m_parent == 0 ) return false; - const bool isBigEndian = m_parent->m_isBigEndian; - RefVector& references = m_parent->m_references; - - // open index file, abort on error - FILE* indexStream = fopen(filename.c_str(), "rb"); - if( !indexStream ) { - fprintf(stderr, "ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str()); + // check magic number + if ( !LoadHeader() ) { + fclose(m_indexStream); return false; } - // set placeholder to receive input byte count (suppresses compiler warnings) - size_t elementsRead = 0; - - // see if index is valid BAM index - char magic[4]; - elementsRead = fread(magic, 1, 4, indexStream); - if ( strncmp(magic, "BAI\1", 4) ) { - fprintf(stderr, "Problem with index file - invalid format.\n"); - fclose(indexStream); + // load reference data (but only keep in memory if full caching requested) + bool saveInitialLoad = ( m_cacheMode == BamIndex::FullIndexCaching ); + if ( !LoadAllReferences(saveInitialLoad) ) { + fclose(m_indexStream); return false; } - // get number of reference sequences - uint32_t numRefSeqs; - elementsRead = fread(&numRefSeqs, 4, 1, indexStream); - if ( isBigEndian ) SwapEndian_32(numRefSeqs); - - // intialize space for BamStandardIndexData data structure - m_indexData.reserve(numRefSeqs); - - // iterate over reference sequences - for ( unsigned int i = 0; i < numRefSeqs; ++i ) { - - // get number of bins for this reference sequence - int32_t numBins; - elementsRead = fread(&numBins, 4, 1, indexStream); - if ( isBigEndian ) SwapEndian_32(numBins); - - if ( numBins > 0 ) { - RefData& refEntry = references[i]; - refEntry.RefHasAlignments = true; - } - - // intialize BinVector - BamBinMap binMap; - - // iterate over bins for that reference sequence - for ( int j = 0; j < numBins; ++j ) { - - // get binID - uint32_t binID; - elementsRead = fread(&binID, 4, 1, indexStream); - - // get number of regionChunks in this bin - uint32_t numChunks; - elementsRead = fread(&numChunks, 4, 1, indexStream); - - if ( isBigEndian ) { - SwapEndian_32(binID); - SwapEndian_32(numChunks); - } - - // intialize ChunkVector - ChunkVector regionChunks; - regionChunks.reserve(numChunks); - - // iterate over regionChunks in this bin - for ( unsigned int k = 0; k < numChunks; ++k ) { - - // get chunk boundaries (left, right) - uint64_t left; - uint64_t right; - elementsRead = fread(&left, 8, 1, indexStream); - elementsRead = fread(&right, 8, 1, indexStream); - - if ( isBigEndian ) { - SwapEndian_64(left); - SwapEndian_64(right); - } - - // save ChunkPair - regionChunks.push_back( Chunk(left, right) ); - } - - // sort chunks for this bin - sort( regionChunks.begin(), regionChunks.end(), ChunkLessThan ); - - // save binID, chunkVector for this bin - binMap.insert( pair(binID, regionChunks) ); - } - - // ----------------------------------------------------- - // load linear index for this reference sequence - - // get number of linear offsets - int32_t numLinearOffsets; - elementsRead = fread(&numLinearOffsets, 4, 1, indexStream); - if ( isBigEndian ) SwapEndian_32(numLinearOffsets); - - // intialize LinearOffsetVector - LinearOffsetVector offsets; - offsets.reserve(numLinearOffsets); + // update index cache based on selected mode + UpdateCache(); - // iterate over linear offsets for this reference sequeence - uint64_t linearOffset; - for ( int j = 0; j < numLinearOffsets; ++j ) { - // read a linear offset & store - elementsRead = fread(&linearOffset, 8, 1, indexStream); - if ( isBigEndian ) SwapEndian_64(linearOffset); - offsets.push_back(linearOffset); - } - - // sort linear offsets - sort( offsets.begin(), offsets.end() ); - - // store index data for that reference sequence - m_indexData.push_back( ReferenceIndex(binMap, offsets) ); - } - - // close index file (.bai) and return - fclose(indexStream); + // return success return true; } -// merges 'alignment chunks' in BAM bin (used for index building) -void BamStandardIndex::BamStandardIndexPrivate::MergeChunks(void) { - - // iterate over reference enties - BamStandardIndexData::iterator indexIter = m_indexData.begin(); - BamStandardIndexData::iterator indexEnd = m_indexData.end(); - for ( ; indexIter != indexEnd; ++indexIter ) { - - // get BAM bin map for this reference - ReferenceIndex& refIndex = (*indexIter); - BamBinMap& bamBinMap = refIndex.Bins; - - // iterate over BAM bins - BamBinMap::iterator binIter = bamBinMap.begin(); - BamBinMap::iterator binEnd = bamBinMap.end(); - for ( ; binIter != binEnd; ++binIter ) { - - // get chunk vector for this bin - ChunkVector& binChunks = (*binIter).second; - if ( binChunks.size() == 0 ) continue; - - ChunkVector mergedChunks; - mergedChunks.push_back( binChunks[0] ); - - // iterate over chunks - int i = 0; - ChunkVector::iterator chunkIter = binChunks.begin(); - ChunkVector::iterator chunkEnd = binChunks.end(); - for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) { - - // get 'currentChunk' based on numeric index - Chunk& currentChunk = mergedChunks[i]; - - // get iteratorChunk based on vector iterator - Chunk& iteratorChunk = (*chunkIter); - - // if currentChunk.Stop(shifted) == iterator Chunk.Start(shifted) - if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 ) { - - // set currentChunk.Stop to iteratorChunk.Stop - currentChunk.Stop = iteratorChunk.Stop; - } - - // otherwise - else { - // set currentChunk + 1 to iteratorChunk - mergedChunks.push_back(iteratorChunk); - ++i; - } - } - - // saved merged chunk vector - (*binIter).second = mergedChunks; - } - } -} - -// writes in-memory index data out to file -// N.B. - (this is the original BAM filename, method will modify it to use applicable extension) -bool BamStandardIndex::BamStandardIndexPrivate::Write(const std::string& bamFilename) { - - // localize parent data - if ( m_parent == 0 ) return false; - const bool isBigEndian = m_parent->m_isBigEndian; - - string indexFilename = bamFilename + ".bai"; - FILE* indexStream = fopen(indexFilename.c_str(), "wb"); - if ( indexStream == 0 ) { - fprintf(stderr, "ERROR: Could not open file to save index.\n"); - return false; - } - - // write BAM index header - fwrite("BAI\1", 1, 4, indexStream); - - // write number of reference sequences - int32_t numReferenceSeqs = m_indexData.size(); - if ( isBigEndian ) SwapEndian_32(numReferenceSeqs); - fwrite(&numReferenceSeqs, 4, 1, indexStream); - - // iterate over reference sequences - BamStandardIndexData::const_iterator indexIter = m_indexData.begin(); - BamStandardIndexData::const_iterator indexEnd = m_indexData.end(); - for ( ; indexIter != indexEnd; ++ indexIter ) { - - // get reference index data - const ReferenceIndex& refIndex = (*indexIter); - const BamBinMap& binMap = refIndex.Bins; - const LinearOffsetVector& offsets = refIndex.Offsets; +// opens index file for reading/writing, return true if opened OK +bool BamIndex::OpenIndexFile(const string& filename, const string& mode) { - // write number of bins - int32_t binCount = binMap.size(); - if ( isBigEndian ) SwapEndian_32(binCount); - fwrite(&binCount, 4, 1, indexStream); + // attempt to open file, return false if error + m_indexStream = fopen(filename.c_str(), mode.c_str()); + if ( m_indexStream == 0 ) return false; - // iterate over bins - BamBinMap::const_iterator binIter = binMap.begin(); - BamBinMap::const_iterator binEnd = binMap.end(); - for ( ; binIter != binEnd; ++binIter ) { - - // get bin data (key and chunk vector) - uint32_t binKey = (*binIter).first; - const ChunkVector& binChunks = (*binIter).second; - - // save BAM bin key - if ( isBigEndian ) SwapEndian_32(binKey); - fwrite(&binKey, 4, 1, indexStream); - - // save chunk count - int32_t chunkCount = binChunks.size(); - if ( isBigEndian ) SwapEndian_32(chunkCount); - fwrite(&chunkCount, 4, 1, indexStream); - - // iterate over chunks - ChunkVector::const_iterator chunkIter = binChunks.begin(); - ChunkVector::const_iterator chunkEnd = binChunks.end(); - for ( ; chunkIter != chunkEnd; ++chunkIter ) { - - // get current chunk data - const Chunk& chunk = (*chunkIter); - uint64_t start = chunk.Start; - uint64_t stop = chunk.Stop; - - if ( isBigEndian ) { - SwapEndian_64(start); - SwapEndian_64(stop); - } - - // save chunk offsets - fwrite(&start, 8, 1, indexStream); - fwrite(&stop, 8, 1, indexStream); - } - } - - // write linear offsets size - int32_t offsetSize = offsets.size(); - if ( isBigEndian ) SwapEndian_32(offsetSize); - fwrite(&offsetSize, 4, 1, indexStream); - - // iterate over linear offsets - LinearOffsetVector::const_iterator offsetIter = offsets.begin(); - LinearOffsetVector::const_iterator offsetEnd = offsets.end(); - for ( ; offsetIter != offsetEnd; ++offsetIter ) { - - // write linear offset value - uint64_t linearOffset = (*offsetIter); - if ( isBigEndian ) SwapEndian_64(linearOffset); - fwrite(&linearOffset, 8, 1, indexStream); - } - } - - // flush buffer, close file, and return success - fflush(indexStream); - fclose(indexStream); + // otherwise save filename & return sucess + m_indexFilename = filename; return true; } - -// --------------------------------------------------------------- -// BamStandardIndex implementation - -BamStandardIndex::BamStandardIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian) - : BamIndex(bgzf, reader, isBigEndian) -{ - d = new BamStandardIndexPrivate(this); -} -BamStandardIndex::~BamStandardIndex(void) { - delete d; - d = 0; +// rewind index file to beginning of index data, return true if rewound OK +bool BamIndex::Rewind(void) { + return ( fseek64(m_indexStream, DataBeginOffset(), SEEK_SET) == 0 ); } -// creates index data (in-memory) from current reader data -bool BamStandardIndex::Build(void) { return d->Build(); } - -// attempts to use index to jump to region; returns success/fail -bool BamStandardIndex::Jump(const BamRegion& region) { return d->Jump(region); } - -// loads existing data from file into memory -bool BamStandardIndex::Load(const string& filename) { return d->Load(filename); } - -// writes in-memory index data out to file -// N.B. - (this is the original BAM filename, method will modify it to use applicable extension) -bool BamStandardIndex::Write(const string& bamFilename) { return d->Write(bamFilename); } - -// ######################################################################################### -// ######################################################################################### - -// --------------------------------------------------- -// BamToolsIndex structs & typedefs +// change the index caching behavior +void BamIndex::SetCacheMode(const BamIndex::IndexCacheMode& mode) { + if ( mode != m_cacheMode ) { + m_cacheMode = mode; + UpdateCache(); + } +} -namespace BamTools { - -// individual index entry -struct BamToolsIndexEntry { - - // data members - int32_t MaxEndPosition; - int64_t StartOffset; - int32_t StartPosition; - - // ctor - BamToolsIndexEntry(const int32_t& maxEndPosition = 0, - const int64_t& startOffset = 0, - const int32_t& startPosition = 0) - : MaxEndPosition(maxEndPosition) - , StartOffset(startOffset) - , StartPosition(startPosition) - { } -}; +// updates in-memory cache of index data, depending on current cache mode +void BamIndex::UpdateCache(void) { -// the actual index data structure -typedef map > BamToolsIndexData; - -} // namespace BamTools + // skip if file not open + if ( !IsOpen() ) return; -// --------------------------------------------------- -// BamToolsIndexPrivate implementation + // reflect requested cache mode behavior + switch ( m_cacheMode ) { -struct BamToolsIndex::BamToolsIndexPrivate { - - // keep a list of any supported versions here - // (might be useful later to handle any 'legacy' versions if the format changes) - // listed for example like: BTI_1_1 = 2, BTI_1_2 = 3, BTI_2_0 = 4, and so on - // - // so a change introduced in (hypothetical) BTI_1_2 would be handled from then on by: - // - // if ( indexVersion >= BTI_1_2 ) - // do something new - // else - // do the old thing - enum Version { BTI_1_0 = 1 }; - - // ------------------------- - // data members - BamToolsIndexData m_indexData; - BamToolsIndex* m_parent; - int32_t m_blockSize; - Version m_version; - - // ------------------------- - // ctor & dtor - - BamToolsIndexPrivate(BamToolsIndex* parent) - : m_parent(parent) - , m_blockSize(1000) - , m_version(BTI_1_0) // latest version - used for writing new index files - { } - - ~BamToolsIndexPrivate(void) { } - - // ------------------------- - // 'public' methods - - // creates index data (in-memory) from current reader data - bool Build(void); - // returns supported file extension - const std::string Extension(void) const { return std::string(".bti"); } - // attempts to use index to jump to region; returns success/fail - bool Jump(const BamTools::BamRegion& region); - // loads existing data from file into memory - bool Load(const std::string& filename); - // writes in-memory index data out to file - // N.B. - (this is the original BAM filename, method will modify it to use applicable extension) - bool Write(const std::string& bamFilename); - - // ------------------------- - // internal methods - - // calculates offset(s) for a given region - bool GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector& offsets); -}; + case (BamIndex::FullIndexCaching) : + Rewind(); + LoadAllReferences(true); + break; -bool BamToolsIndex::BamToolsIndexPrivate::Build(void) { - - // localize parent data - if ( m_parent == 0 ) return false; - BamReader* reader = m_parent->m_reader; - BgzfData* mBGZF = m_parent->m_BGZF; - RefVector& references = m_parent->m_references; - - // be sure reader & BGZF file are valid & open for reading - if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen ) - return false; + case (BamIndex::LimitedIndexCaching) : + if ( HasFullDataCache() ) + KeepOnlyFirstReferenceOffsets(); + else { + ClearAllData(); + SkipToFirstReference(); + LoadFirstReference(true); + } + break; - // move file pointer to beginning of alignments - // quit if failed - if ( !reader->Rewind() ) - return false; - - // set up counters and markers - int32_t currentBlockCount = 0; - int64_t currentAlignmentOffset = mBGZF->Tell(); - int32_t blockRefId = 0; - int32_t blockMaxEndPosition = 0; - int64_t blockStartOffset = currentAlignmentOffset; - int32_t blockStartPosition = -1; - - // plow through alignments, storing index entries - BamAlignment al; - while ( reader->GetNextAlignmentCore(al) ) { - - // if block contains data (not the first time through) AND alignment is on a new reference - if ( currentBlockCount > 0 && al.RefID != blockRefId ) { - - // make sure reference flag is set - references[blockRefId].RefHasAlignments = true; - - // store previous data - m_indexData[blockRefId].push_back( BamToolsIndexEntry(blockMaxEndPosition, blockStartOffset, blockStartPosition) ); - - // intialize new block - currentBlockCount = 0; - blockMaxEndPosition = al.GetEndPosition(); - blockStartOffset = currentAlignmentOffset; - } - - // if beginning of block, save first alignment's refID & position - if ( currentBlockCount == 0 ) { - blockRefId = al.RefID; - blockStartPosition = al.Position; - } - - // increment block counter - ++currentBlockCount; - - // check end position - int32_t alignmentEndPosition = al.GetEndPosition(); - if ( alignmentEndPosition > blockMaxEndPosition ) - blockMaxEndPosition = alignmentEndPosition; - - // if block is full, get offset for next block, reset currentBlockCount - if ( currentBlockCount == m_blockSize ) { - - // make sure reference flag is set - references[blockRefId].RefHasAlignments = true; - - m_indexData[blockRefId].push_back( BamToolsIndexEntry(blockMaxEndPosition, blockStartOffset, blockStartPosition) ); - blockStartOffset = mBGZF->Tell(); - currentBlockCount = 0; - } - - // not the best name, but for the next iteration, this value will be the offset of the *current* alignment - // necessary because we won't know if this next alignment is on a new reference until we actually read it - currentAlignmentOffset = mBGZF->Tell(); - } - - // attempt to rewind back to begnning of alignments - // return success/fail - return reader->Rewind(); -} + case(BamIndex::NoIndexCaching) : + ClearAllData(); + break; -// N.B. - ignores isRightBoundSpecified -bool BamToolsIndex::BamToolsIndexPrivate::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector& offsets) { - - // return false if leftBound refID is not found in index data - if ( m_indexData.find(region.LeftRefID) == m_indexData.end() ) return false; - - // clear any prior data - offsets.clear(); - - const vector referenceOffsets = m_indexData[region.LeftRefID]; - if ( referenceOffsets.empty() ) return false; - - // ------------------------------------------------------- - // calculate nearest index to jump to - - // save first offset - int64_t offset = (*referenceOffsets.begin()).StartOffset; - vector::const_iterator indexIter = referenceOffsets.begin(); - vector::const_iterator indexEnd = referenceOffsets.end(); - for ( ; indexIter != indexEnd; ++indexIter ) { - const BamToolsIndexEntry& entry = (*indexIter); - if ( entry.MaxEndPosition >= region.LeftPosition ) break; - offset = (*indexIter).StartOffset; + default : + // unreachable + ; } - - // no index found - if ( indexIter == indexEnd ) return false; - - //store offset & return success - offsets.push_back(offset); - return true; } -bool BamToolsIndex::BamToolsIndexPrivate::Jump(const BamRegion& region) { - - // localize parent data - if ( m_parent == 0 ) return false; - BamReader* reader = m_parent->m_reader; - BgzfData* mBGZF = m_parent->m_BGZF; - RefVector& references = m_parent->m_references; - - if ( reader == 0 || mBGZF == 0 || !reader->IsOpen() ) { - fprintf(stderr, "ERROR: Could not jump: invalid BamReader state.\n"); - return false; - } - - // see if left-bound reference of region has alignments - if ( !m_parent->HasAlignments(region.LeftRefID) ) return false; - - // make sure left-bound position is valid - if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) return false; - - vector offsets; - if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets) ) { - fprintf(stderr, "ERROR: Could not jump: unable to calculate offset for specified region.\n"); - return false; - } - - // iterate through offsets - BamAlignment bAlignment; - bool result = true; - for ( vector::const_iterator o = offsets.begin(); o != offsets.end(); ++o) { - - // attempt seek & load first available alignment - result &= mBGZF->Seek(*o); - reader->GetNextAlignmentCore(bAlignment); - - // if this alignment corresponds to desired position - // return success of seeking back to 'current offset' - if ( (bAlignment.RefID == region.LeftRefID && bAlignment.Position + bAlignment.Length > region.LeftPosition) || (bAlignment.RefID > region.LeftRefID) ) { - if ( o != offsets.begin() ) --o; - return mBGZF->Seek(*o); - } - } - - if ( !result ) fprintf(stderr, "ERROR: Could not jump: unable to seek to offset for specified region.\n"); - return result; -} +// writes in-memory index data out to file +bool BamIndex::Write(const string& bamFilename) { -bool BamToolsIndex::BamToolsIndexPrivate::Load(const string& filename) { - - // localize parent data - if ( m_parent == 0 ) return false; - const bool isBigEndian = m_parent->m_isBigEndian; - RefVector& references = m_parent->m_references; - - // open index file, abort on error - FILE* indexStream = fopen(filename.c_str(), "rb"); - if( !indexStream ) { - fprintf(stderr, "ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str()); - return false; - } - - // set placeholder to receive input byte count (suppresses compiler warnings) - size_t elementsRead = 0; - - // see if index is valid BAM index - char magic[4]; - elementsRead = fread(magic, 1, 4, indexStream); - if ( strncmp(magic, "BTI\1", 4) ) { - fprintf(stderr, "Problem with index file - invalid format.\n"); - fclose(indexStream); + // open index file for writing + string indexFilename = bamFilename + Extension(); + if ( !OpenIndexFile(indexFilename, "wb") ) { + fprintf(stderr, "BamIndex ERROR: could not open file to save index data.\n"); return false; } - // read in version - int32_t indexVersion; - elementsRead = fread(&indexVersion, sizeof(indexVersion), 1, indexStream); - if ( isBigEndian ) SwapEndian_32(indexVersion); - if ( indexVersion <= 0 || indexVersion > m_version ) { - fprintf(stderr, "Problem with index file - unsupported version.\n"); - fclose(indexStream); - return false; + // write index header data + if ( !WriteHeader() ) { + fprintf(stderr, "BamIndex ERROR: there was a problem writing index metadata to the new index file.\n"); + fflush(m_indexStream); + fclose(m_indexStream); + exit(1); } - // read in block size - elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, indexStream); - if ( isBigEndian ) SwapEndian_32(m_blockSize); - - // read in number of references - int32_t numReferences; - elementsRead = fread(&numReferences, sizeof(numReferences), 1, indexStream); - if ( isBigEndian ) SwapEndian_32(numReferences); - - // iterate over reference entries - for ( int i = 0; i < numReferences; ++i ) { - - // read in number of offsets for this reference - uint32_t numOffsets; - elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, indexStream); - if ( isBigEndian ) SwapEndian_32(numOffsets); - - // initialize offsets container for this reference - vector offsets; - offsets.reserve(numOffsets); - - // iterate over offset entries - for ( unsigned int j = 0; j < numOffsets; ++j ) { - - // copy entry data - int32_t maxEndPosition; - int64_t startOffset; - int32_t startPosition; - - // read in data - elementsRead = fread(&maxEndPosition, sizeof(maxEndPosition), 1, indexStream); - elementsRead = fread(&startOffset, sizeof(startOffset), 1, indexStream); - elementsRead = fread(&startPosition, sizeof(startPosition), 1, indexStream); - - // swap endian-ness if necessary - if ( isBigEndian ) { - SwapEndian_32(maxEndPosition); - SwapEndian_64(startOffset); - SwapEndian_32(startPosition); - } - - // save current index entry - offsets.push_back( BamToolsIndexEntry(maxEndPosition, startOffset, startPosition) ); - } - - // save refID, offsetVector entry into data structure - m_indexData.insert( make_pair(i, offsets) ); - - // set ref.HasAlignments flag - references[i].RefHasAlignments = (numOffsets != 0); + // write main index data + if ( !WriteAllReferences() ) { + fprintf(stderr, "BamIndex ERROR: there was a problem writing index data to the new index file.\n"); + fflush(m_indexStream); + fclose(m_indexStream); + exit(1); } - // close index file and return - fclose(indexStream); - return true; -} + // flush any remaining output + fflush(m_indexStream); + fclose(m_indexStream); -// writes in-memory index data out to file -// N.B. - (this is the original BAM filename, method will modify it to use applicable extension) -bool BamToolsIndex::BamToolsIndexPrivate::Write(const std::string& bamFilename) { - - // localize parent data - if ( m_parent == 0 ) return false; - const bool isBigEndian = m_parent->m_isBigEndian; - - // open index file for writing - string indexFilename = bamFilename + ".bti"; - FILE* indexStream = fopen(indexFilename.c_str(), "wb"); - if ( indexStream == 0 ) { - fprintf(stderr, "ERROR: Could not open file to save index.\n"); + // re-open index file for later reading + if ( !OpenIndexFile(indexFilename, "rb") ) { + fprintf(stderr, "BamIndex ERROR: could not open newly created index file for reading.\n"); return false; } - // write BTI index format 'magic number' - fwrite("BTI\1", 1, 4, indexStream); - - // write BTI index format version - int32_t currentVersion = (int32_t)m_version; - if ( isBigEndian ) SwapEndian_32(currentVersion); - fwrite(¤tVersion, sizeof(int32_t), 1, indexStream); - - // write block size - int32_t blockSize = m_blockSize; - if ( isBigEndian ) SwapEndian_32(blockSize); - fwrite(&blockSize, sizeof(blockSize), 1, indexStream); - - // write number of references - int32_t numReferences = (int32_t)m_indexData.size(); - if ( isBigEndian ) SwapEndian_32(numReferences); - fwrite(&numReferences, sizeof(numReferences), 1, indexStream); - - // iterate through references in index - BamToolsIndexData::const_iterator refIter = m_indexData.begin(); - BamToolsIndexData::const_iterator refEnd = m_indexData.end(); - for ( ; refIter != refEnd; ++refIter ) { - - const vector offsets = (*refIter).second; - - // write number of offsets listed for this reference - uint32_t numOffsets = offsets.size(); - if ( isBigEndian ) SwapEndian_32(numOffsets); - fwrite(&numOffsets, sizeof(numOffsets), 1, indexStream); - - // iterate over offset entries - vector::const_iterator offsetIter = offsets.begin(); - vector::const_iterator offsetEnd = offsets.end(); - for ( ; offsetIter != offsetEnd; ++offsetIter ) { - - // get reference index data - const BamToolsIndexEntry& entry = (*offsetIter); - - // copy entry data - int32_t maxEndPosition = entry.MaxEndPosition; - int64_t startOffset = entry.StartOffset; - int32_t startPosition = entry.StartPosition; - - // swap endian-ness if necessary - if ( isBigEndian ) { - SwapEndian_32(maxEndPosition); - SwapEndian_64(startOffset); - SwapEndian_32(startPosition); - } - - // write the reference index entry - fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, indexStream); - fwrite(&startOffset, sizeof(startOffset), 1, indexStream); - fwrite(&startPosition, sizeof(startPosition), 1, indexStream); - } - } - - // flush any remaining output, close file, and return success - fflush(indexStream); - fclose(indexStream); + // save index filename & return success + m_indexFilename = indexFilename; return true; } - -// --------------------------------------------------- -// BamToolsIndex implementation - -BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian) - : BamIndex(bgzf, reader, isBigEndian) -{ - d = new BamToolsIndexPrivate(this); -} - -BamToolsIndex::~BamToolsIndex(void) { - delete d; - d = 0; -} - -// creates index data (in-memory) from current reader data -bool BamToolsIndex::Build(void) { return d->Build(); } - -// attempts to use index to jump to region; returns success/fail -bool BamToolsIndex::Jump(const BamRegion& region) { return d->Jump(region); } - -// loads existing data from file into memory -bool BamToolsIndex::Load(const string& filename) { return d->Load(filename); } - -// writes in-memory index data out to file -// N.B. - (this is the original BAM filename, method will modify it to use applicable extension) -bool BamToolsIndex::Write(const string& bamFilename) { return d->Write(bamFilename); }