From f0bc238bde22f2fdda922dfee8fca9e37995f2e9 Mon Sep 17 00:00:00 2001 From: Derek Date: Fri, 3 Sep 2010 17:14:05 -0400 Subject: [PATCH] Large-scale API indexing re-organization: * Moved FileExists() to BamAux.h so that all API classes have access to its functionality. * Created 2 'factory methods' in BamIndex.h to return a BamIndex subclass, depending on client\'s specified PreferredIndexType & on what files actually exist on disk. * Renamed BamDefaultIndex as BamStandardIndex. Hopefully this name should be a clearer description going forward than BamDefaultIndex, since the standardized index may not always be the 'default' in every situation. --- src/api/BamAux.h | 9 ++- src/api/BamIndex.cpp | 98 +++++++++++------------ src/api/BamIndex.h | 121 ++++++++++++++++++++++++---- src/api/BamMultiReader.cpp | 29 ++----- src/api/BamMultiReader.h | 9 +-- src/api/BamReader.cpp | 158 ++++++++++++++++++------------------- src/api/BamReader.h | 15 +++- 7 files changed, 257 insertions(+), 182 deletions(-) diff --git a/src/api/BamAux.h b/src/api/BamAux.h index 73e8838..25f4538 100644 --- a/src/api/BamAux.h +++ b/src/api/BamAux.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 27 July 2010 (DB) +// Last modified: 3 September 2010 (DB) // --------------------------------------------------------------------------- // Provides the basic constants, data structures, etc. for using BAM files // *************************************************************************** @@ -19,6 +19,8 @@ // C++ includes #include +#include +#include #include #include #include @@ -350,6 +352,11 @@ inline void SwapEndian_64p(char* data) { SwapEndian_64(value); } +inline bool FileExists(const std::string& filename) { + std::ifstream f(filename.c_str(), std::ifstream::in); + return !f.fail(); +} + // ---------------------------------------------------------------- // BamAlignment member methods diff --git a/src/api/BamIndex.cpp b/src/api/BamIndex.cpp index d74e751..cad9d71 100644 --- a/src/api/BamIndex.cpp +++ b/src/api/BamIndex.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 17 August 2010 (DB) +// Last modified: 3 September 2010 (DB) // --------------------------------------------------------------------------- // Provides index functionality - both for the default (standardized) BAM // index format (.bai) as well as a BamTools-specific (nonstandard) index @@ -13,7 +13,6 @@ #include #include #include -// #include #include #include "BamIndex.h" #include "BamReader.h" @@ -48,12 +47,12 @@ bool BamIndex::HasAlignments(const int& referenceID) { // ######################################################################################### // ------------------------------- -// BamDefaultIndex structs & typedefs +// BamStandardIndex structs & typedefs namespace BamTools { // -------------------------------------------------- -// BamDefaultIndex data structures & typedefs +// BamStandardIndex data structures & typedefs struct Chunk { // data members @@ -90,26 +89,26 @@ struct ReferenceIndex { { } }; -typedef vector BamDefaultIndexData; +typedef vector BamStandardIndexData; } // namespace BamTools // ------------------------------- -// BamDefaultIndex implementation +// BamStandardIndex implementation -struct BamDefaultIndex::BamDefaultIndexPrivate { +struct BamStandardIndex::BamStandardIndexPrivate { // ------------------------- // data members - BamDefaultIndexData m_indexData; - BamDefaultIndex* m_parent; + BamStandardIndexData m_indexData; + BamStandardIndex* m_parent; // ------------------------- // ctor & dtor - BamDefaultIndexPrivate(BamDefaultIndex* parent) : m_parent(parent) { } - ~BamDefaultIndexPrivate(void) { } + BamStandardIndexPrivate(BamStandardIndex* parent) : m_parent(parent) { } + ~BamStandardIndexPrivate(void) { m_indexData.clear(); } // ------------------------- // internal methods @@ -125,20 +124,19 @@ struct BamDefaultIndex::BamDefaultIndexPrivate { }; -BamDefaultIndex::BamDefaultIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian) +BamStandardIndex::BamStandardIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian) : BamIndex(bgzf, reader, isBigEndian) { - d = new BamDefaultIndexPrivate(this); + d = new BamStandardIndexPrivate(this); } -BamDefaultIndex::~BamDefaultIndex(void) { - d->m_indexData.clear(); +BamStandardIndex::~BamStandardIndex(void) { delete d; d = 0; } // calculate bins that overlap region -int BamDefaultIndex::BamDefaultIndexPrivate::BinsFromRegion(const BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[MAX_BIN]) { +int BamStandardIndex::BamStandardIndexPrivate::BinsFromRegion(const BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[MAX_BIN]) { // get region boundaries uint32_t begin = (unsigned int)region.LeftPosition; @@ -169,7 +167,7 @@ int BamDefaultIndex::BamDefaultIndexPrivate::BinsFromRegion(const BamRegion& reg return i; } -bool BamDefaultIndex::Build(void) { +bool BamStandardIndex::Build(void) { // be sure reader & BGZF file are valid & open for reading if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) @@ -249,8 +247,8 @@ bool BamDefaultIndex::Build(void) { // update saveRefID saveRefID = bAlignment.RefID; - // if invalid RefID, break out (why?) - if ( saveRefID < 0 ) { break; } + // if invalid RefID, break out + if ( saveRefID < 0 ) break; } // make sure that current file pointer is beyond lastOffset @@ -280,8 +278,8 @@ bool BamDefaultIndex::Build(void) { // iterate through references in index // store whether reference has data & // sort offsets in linear offset vector - BamDefaultIndexData::iterator indexIter = d->m_indexData.begin(); - BamDefaultIndexData::iterator indexEnd = d->m_indexData.end(); + BamStandardIndexData::iterator indexIter = d->m_indexData.begin(); + BamStandardIndexData::iterator indexEnd = d->m_indexData.end(); for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) { // get reference index data @@ -300,7 +298,7 @@ bool BamDefaultIndex::Build(void) { return m_reader->Rewind(); } -bool BamDefaultIndex::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector& offsets) { +bool BamStandardIndex::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector& offsets) { // calculate which bins overlap this region uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2); @@ -321,6 +319,7 @@ bool BamDefaultIndex::GetOffsets(const BamRegion& region, const bool isRightBoun map::const_iterator binIter = binMap.find(binKey); if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) { + // iterate over chunks const ChunkVector& chunks = (*binIter).second; std::vector::const_iterator chunksIter = chunks.begin(); std::vector::const_iterator chunksEnd = chunks.end(); @@ -345,7 +344,7 @@ bool BamDefaultIndex::GetOffsets(const BamRegion& region, const bool isRightBoun } // saves BAM bin entry for index -void BamDefaultIndex::BamDefaultIndexPrivate::InsertBinEntry(BamBinMap& binMap, +void BamStandardIndex::BamStandardIndexPrivate::InsertBinEntry(BamBinMap& binMap, const uint32_t& saveBin, const uint64_t& saveOffset, const uint64_t& lastOffset) @@ -371,7 +370,7 @@ void BamDefaultIndex::BamDefaultIndexPrivate::InsertBinEntry(BamBinMap& bin } // saves linear offset entry for index -void BamDefaultIndex::BamDefaultIndexPrivate::InsertLinearOffset(LinearOffsetVector& offsets, +void BamStandardIndex::BamStandardIndexPrivate::InsertLinearOffset(LinearOffsetVector& offsets, const BamAlignment& bAlignment, const uint64_t& lastOffset) { @@ -392,7 +391,7 @@ void BamDefaultIndex::BamDefaultIndexPrivate::InsertLinearOffset(LinearOffsetVec } } -bool BamDefaultIndex::Load(const string& filename) { +bool BamStandardIndex::Load(const string& filename) { // open index file, abort on error FILE* indexStream = fopen(filename.c_str(), "rb"); @@ -416,9 +415,9 @@ bool BamDefaultIndex::Load(const string& filename) { // get number of reference sequences uint32_t numRefSeqs; elementsRead = fread(&numRefSeqs, 4, 1, indexStream); - if ( m_isBigEndian ) { SwapEndian_32(numRefSeqs); } + if ( m_isBigEndian ) SwapEndian_32(numRefSeqs); - // intialize space for BamDefaultIndexData data structure + // intialize space for BamStandardIndexData data structure d->m_indexData.reserve(numRefSeqs); // iterate over reference sequences @@ -427,7 +426,7 @@ bool BamDefaultIndex::Load(const string& filename) { // get number of bins for this reference sequence int32_t numBins; elementsRead = fread(&numBins, 4, 1, indexStream); - if ( m_isBigEndian ) { SwapEndian_32(numBins); } + if ( m_isBigEndian ) SwapEndian_32(numBins); if ( numBins > 0 ) { RefData& refEntry = m_references[i]; @@ -482,12 +481,13 @@ bool BamDefaultIndex::Load(const string& filename) { binMap.insert( pair(binID, regionChunks) ); } + // ----------------------------------------------------- // load linear index for this reference sequence // get number of linear offsets int32_t numLinearOffsets; elementsRead = fread(&numLinearOffsets, 4, 1, indexStream); - if ( m_isBigEndian ) { SwapEndian_32(numLinearOffsets); } + if ( m_isBigEndian ) SwapEndian_32(numLinearOffsets); // intialize LinearOffsetVector LinearOffsetVector offsets; @@ -498,7 +498,7 @@ bool BamDefaultIndex::Load(const string& filename) { for ( int j = 0; j < numLinearOffsets; ++j ) { // read a linear offset & store elementsRead = fread(&linearOffset, 8, 1, indexStream); - if ( m_isBigEndian ) { SwapEndian_64(linearOffset); } + if ( m_isBigEndian ) SwapEndian_64(linearOffset); offsets.push_back(linearOffset); } @@ -515,11 +515,11 @@ bool BamDefaultIndex::Load(const string& filename) { } // merges 'alignment chunks' in BAM bin (used for index building) -void BamDefaultIndex::BamDefaultIndexPrivate::MergeChunks(void) { +void BamStandardIndex::BamStandardIndexPrivate::MergeChunks(void) { // iterate over reference enties - BamDefaultIndexData::iterator indexIter = m_indexData.begin(); - BamDefaultIndexData::iterator indexEnd = m_indexData.end(); + BamStandardIndexData::iterator indexIter = m_indexData.begin(); + BamStandardIndexData::iterator indexEnd = m_indexData.end(); for ( ; indexIter != indexEnd; ++indexIter ) { // get BAM bin map for this reference @@ -533,7 +533,7 @@ void BamDefaultIndex::BamDefaultIndexPrivate::MergeChunks(void) { // get chunk vector for this bin ChunkVector& binChunks = (*binIter).second; - if ( binChunks.size() == 0 ) { continue; } + if ( binChunks.size() == 0 ) continue; ChunkVector mergedChunks; mergedChunks.push_back( binChunks[0] ); @@ -573,7 +573,7 @@ void BamDefaultIndex::BamDefaultIndexPrivate::MergeChunks(void) { // writes in-memory index data out to file // N.B. - (this is the original BAM filename, method will modify it to use applicable extension) -bool BamDefaultIndex::Write(const std::string& bamFilename) { +bool BamStandardIndex::Write(const std::string& bamFilename) { string indexFilename = bamFilename + ".bai"; FILE* indexStream = fopen(indexFilename.c_str(), "wb"); @@ -587,12 +587,12 @@ bool BamDefaultIndex::Write(const std::string& bamFilename) { // write number of reference sequences int32_t numReferenceSeqs = d->m_indexData.size(); - if ( m_isBigEndian ) { SwapEndian_32(numReferenceSeqs); } + if ( m_isBigEndian ) SwapEndian_32(numReferenceSeqs); fwrite(&numReferenceSeqs, 4, 1, indexStream); // iterate over reference sequences - BamDefaultIndexData::const_iterator indexIter = d->m_indexData.begin(); - BamDefaultIndexData::const_iterator indexEnd = d->m_indexData.end(); + BamStandardIndexData::const_iterator indexIter = d->m_indexData.begin(); + BamStandardIndexData::const_iterator indexEnd = d->m_indexData.end(); for ( ; indexIter != indexEnd; ++ indexIter ) { // get reference index data @@ -602,7 +602,7 @@ bool BamDefaultIndex::Write(const std::string& bamFilename) { // write number of bins int32_t binCount = binMap.size(); - if ( m_isBigEndian ) { SwapEndian_32(binCount); } + if ( m_isBigEndian ) SwapEndian_32(binCount); fwrite(&binCount, 4, 1, indexStream); // iterate over bins @@ -615,12 +615,12 @@ bool BamDefaultIndex::Write(const std::string& bamFilename) { const ChunkVector& binChunks = (*binIter).second; // save BAM bin key - if ( m_isBigEndian ) { SwapEndian_32(binKey); } + if ( m_isBigEndian ) SwapEndian_32(binKey); fwrite(&binKey, 4, 1, indexStream); // save chunk count int32_t chunkCount = binChunks.size(); - if ( m_isBigEndian ) { SwapEndian_32(chunkCount); } + if ( m_isBigEndian ) SwapEndian_32(chunkCount); fwrite(&chunkCount, 4, 1, indexStream); // iterate over chunks @@ -646,7 +646,7 @@ bool BamDefaultIndex::Write(const std::string& bamFilename) { // write linear offsets size int32_t offsetSize = offsets.size(); - if ( m_isBigEndian ) { SwapEndian_32(offsetSize); } + if ( m_isBigEndian ) SwapEndian_32(offsetSize); fwrite(&offsetSize, 4, 1, indexStream); // iterate over linear offsets @@ -656,7 +656,7 @@ bool BamDefaultIndex::Write(const std::string& bamFilename) { // write linear offset value uint64_t linearOffset = (*offsetIter); - if ( m_isBigEndian ) { SwapEndian_64(linearOffset); } + if ( m_isBigEndian ) SwapEndian_64(linearOffset); fwrite(&linearOffset, 8, 1, indexStream); } } @@ -760,7 +760,6 @@ bool BamToolsIndex::Build(void) { // if block is full, get offset for next block, reset currentBlockCount if ( currentBlockCount == d->m_blockSize ) { - d->m_indexData.push_back( BamToolsIndexEntry(blockStartOffset, blockStartId, blockStartPosition) ); blockStartOffset = m_BGZF->Tell(); currentBlockCount = 0; @@ -797,8 +796,7 @@ bool BamToolsIndex::GetOffsets(const BamRegion& region, const bool isRightBoundS } // no index was found - if ( previousOffset == -1 ) - return false; + if ( previousOffset == -1 ) return false; // store offset & return success offsets.push_back(previousOffset); @@ -828,12 +826,12 @@ bool BamToolsIndex::Load(const string& filename) { // read in block size elementsRead = fread(&d->m_blockSize, sizeof(d->m_blockSize), 1, indexStream); - if ( m_isBigEndian ) { SwapEndian_32(d->m_blockSize); } + if ( m_isBigEndian ) SwapEndian_32(d->m_blockSize); // read in number of offsets uint32_t numOffsets; elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, indexStream); - if ( m_isBigEndian ) { SwapEndian_32(numOffsets); } + if ( m_isBigEndian ) SwapEndian_32(numOffsets); // reserve space for index data d->m_indexData.reserve(numOffsets); @@ -885,12 +883,12 @@ bool BamToolsIndex::Write(const std::string& bamFilename) { // write block size int32_t blockSize = d->m_blockSize; - if ( m_isBigEndian ) { SwapEndian_32(blockSize); } + if ( m_isBigEndian ) SwapEndian_32(blockSize); fwrite(&blockSize, sizeof(blockSize), 1, indexStream); // write number of offset entries uint32_t numOffsets = d->m_indexData.size(); - if ( m_isBigEndian ) { SwapEndian_32(numOffsets); } + if ( m_isBigEndian ) SwapEndian_32(numOffsets); fwrite(&numOffsets, sizeof(numOffsets), 1, indexStream); // iterate over offset entries diff --git a/src/api/BamIndex.h b/src/api/BamIndex.h index b9ce7d0..d92fe65 100644 --- a/src/api/BamIndex.h +++ b/src/api/BamIndex.h @@ -3,16 +3,16 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 17 August 2010 (DB) +// Last modified: 3 September 2010 (DB) // --------------------------------------------------------------------------- -// Provides index functionality - both for the default (standardized) BAM -// index format (.bai) as well as a BamTools-specific (nonstandard) index -// format (.bti). +// Provides index functionality - both for the standardized BAM index format +// (".bai") as well as a BamTools-specific (nonstandard) index format (".bti"). // *************************************************************************** #ifndef BAM_INDEX_H #define BAM_INDEX_H +#include #include #include #include "BamAux.h" @@ -26,25 +26,49 @@ class BgzfData; // BamIndex base class class BamIndex { + // ctor & dtor public: - BamIndex(BamTools::BgzfData* bgzf, - BamTools::BamReader* reader, - bool isBigEndian); + BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader, bool isBigEndian); virtual ~BamIndex(void) { } + // index interface public: // creates index data (in-memory) from current reader data virtual bool Build(void) =0; + // returns supported file extension + virtual const std::string Extension(void) const =0; // calculates offset(s) for a given region virtual bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector& offsets) =0; - // loads existing data from file into memory - virtual bool Load(const std::string& filename) =0; // returns whether reference has alignments or no virtual bool HasAlignments(const int& referenceID); + // loads existing data from file into memory + virtual bool Load(const std::string& filename) =0; // writes in-memory index data out to file // N.B. - (this is the original BAM filename, method will modify it to use applicable extension) virtual bool Write(const std::string& bamFilename) =0; + // factory methods for returning proper BamIndex-derived type based on available index files + public: + + // returns index based on BAM filename 'stub' + // checks first for preferred type, returns that type if found + // (if not found, attmempts to load other type(s), returns 0 if NONE found) + // + // ** default preferred type is BamToolsIndex ** use this anytime it exists + enum PreferredIndexType { BAMTOOLS = 0, STANDARD }; + static BamIndex* FromBamFilename(const std::string& bamFilename, + BamTools::BgzfData* bgzf, + BamTools::BamReader* reader, + bool isBigEndian, + const BamIndex::PreferredIndexType& type = BamIndex::BAMTOOLS); + + // returns index based on explicitly named index file (or 0 if not found) + static BamIndex* FromIndexFilename(const std::string& indexFilename, + BamTools::BgzfData* bgzf, + BamTools::BamReader* reader, + bool isBigEndian); + + // data members protected: BamTools::BgzfData* m_BGZF; BamTools::BamReader* m_reader; @@ -53,23 +77,25 @@ class BamIndex { }; // -------------------------------------------------- -// BamDefaultIndex class +// BamStandardIndex class // -// implements default (per SAM/BAM spec) index file ops -class BamDefaultIndex : public BamIndex { +// implements standardized (per SAM/BAM spec) index file ops +class BamStandardIndex : public BamIndex { // ctor & dtor public: - BamDefaultIndex(BamTools::BgzfData* bgzf, + BamStandardIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader, bool isBigEndian); - ~BamDefaultIndex(void); + ~BamStandardIndex(void); // interface (implements BamIndex virtual methods) public: // creates index data (in-memory) from current reader data bool Build(void); + // returns supported file extension + const std::string Extension(void) const { return std::string(".bai"); } // calculates offset(s) for a given region bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector& offsets); // loads existing data from file into memory @@ -80,8 +106,8 @@ class BamDefaultIndex : public BamIndex { // internal implementation private: - struct BamDefaultIndexPrivate; - BamDefaultIndexPrivate* d; + struct BamStandardIndexPrivate; + BamStandardIndexPrivate* d; }; // -------------------------------------------------- @@ -101,6 +127,8 @@ class BamToolsIndex : public BamIndex { public: // creates index data (in-memory) from current reader data bool Build(void); + // returns supported file extension + const std::string Extension(void) const { return std::string(".bti"); } // calculates offset(s) for a given region bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector& offsets); // loads existing data from file into memory @@ -115,6 +143,65 @@ class BamToolsIndex : public BamIndex { BamToolsIndexPrivate* d; }; +// -------------------------------------------------- +// BamIndex factory methods +// +// return proper BamIndex-derived type based on available index files + +inline +BamIndex* BamIndex::FromBamFilename(const std::string& bamFilename, + BamTools::BgzfData* bgzf, + BamTools::BamReader* reader, + bool isBigEndian, + const BamIndex::PreferredIndexType& type) +{ + // --------------------------------------------------- + // attempt to load preferred type first + + const std::string bamtoolsIndexFilename = bamFilename + ".bti"; + const bool bamtoolsIndexExists = BamTools::FileExists(bamtoolsIndexFilename); + if ( (type == BamIndex::BAMTOOLS) && bamtoolsIndexExists ) + return new BamToolsIndex(bgzf, reader, isBigEndian); + + const std::string standardIndexFilename = bamFilename + ".bai"; + const bool standardIndexExists = BamTools::FileExists(standardIndexFilename); + if ( (type == BamIndex::STANDARD) && standardIndexExists ) + return new BamStandardIndex(bgzf, reader, isBigEndian); + + // ---------------------------------------------------- + // preferred type could not be found, try other (non-preferred) types + // if none found, return 0 + + if ( bamtoolsIndexExists ) return new BamToolsIndex(bgzf, reader, isBigEndian); + if ( standardIndexExists ) return new BamStandardIndex(bgzf, reader, isBigEndian); + return 0; +} + +inline +BamIndex* BamIndex::FromIndexFilename(const std::string& indexFilename, + BamTools::BgzfData* bgzf, + BamTools::BamReader* reader, + bool isBigEndian) +{ + // see if specified file exists + const bool indexExists = BamTools::FileExists(indexFilename); + if ( !indexExists ) return 0; + + const std::string bamtoolsIndexExtension(".bti"); + const std::string standardIndexExtension(".bai"); + + // if has bamtoolsIndexExtension + if ( indexFilename.find(bamtoolsIndexExtension) == (indexFilename.length() - bamtoolsIndexExtension.length()) ) + return new BamToolsIndex(bgzf, reader, isBigEndian); + + // if has standardIndexExtension + if ( indexFilename.find(standardIndexExtension) == (indexFilename.length() - standardIndexExtension.length()) ) + return new BamStandardIndex(bgzf, reader, isBigEndian); + + // otherwise, unsupported file type + return 0; +} + } // namespace BamTools -#endif // BAM_INDEX_H \ No newline at end of file +#endif // BAM_INDEX_H diff --git a/src/api/BamMultiReader.cpp b/src/api/BamMultiReader.cpp index 6de97dc..8ee4080 100644 --- a/src/api/BamMultiReader.cpp +++ b/src/api/BamMultiReader.cpp @@ -31,15 +31,6 @@ using namespace BamTools; using namespace std; -namespace BamTools { - -bool FileExists(const string& filename) { - ifstream f(filename.c_str(), ifstream::in); - return !f.fail(); -} - -} // namespace BamTools - // ----------------------------------------------------- // BamMultiReader implementation // ----------------------------------------------------- @@ -310,21 +301,11 @@ bool BamMultiReader::Open(const vector filenames, bool openIndexes, bool bool openedOK = true; if (openIndexes) { - - const string defaultIndexFilename = filename + ".bai"; - const string bamToolsIndexFilename = filename + ".bti"; - - // if default BAM index requested and one exists - if ( useDefaultIndex && FileExists(defaultIndexFilename) ) - openedOK = reader->Open(filename, defaultIndexFilename); - - // else see if BamTools index exists - else if ( FileExists(bamToolsIndexFilename) ) - openedOK = reader->Open(filename, bamToolsIndexFilename); - // else none exist... just open without - else - openedOK = reader->Open(filename); + // leave index filename empty + // this allows BamReader & BamIndex to search for any available + // useDefaultIndex gives hint to prefer BAI over BTI + openedOK = reader->Open(filename, "", true, useDefaultIndex); } // ignoring index file(s) @@ -348,7 +329,7 @@ bool BamMultiReader::Open(const vector filenames, bool openIndexes, bool } } - // TODO; any more error handling when openedOKvis false ?? + // TODO; any further error handling when openedOK is false ?? else return false; } diff --git a/src/api/BamMultiReader.h b/src/api/BamMultiReader.h index dce7042..cc30326 100644 --- a/src/api/BamMultiReader.h +++ b/src/api/BamMultiReader.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 2 September 2010 (DB) +// Last modified: 3 September 2010 (DB) // --------------------------------------------------------------------------- // Functionality for simultaneously reading multiple BAM files // *************************************************************************** @@ -59,10 +59,9 @@ class BamMultiReader { // indexes. // @coreMode - setup our first alignments using GetNextAlignmentCore(); // also useful for merging - // @useDefaultIndex - look for default BAM index ".bai" first. If false, - // or if ".bai" does not exist, will look for BamTools index ".bti". If - // neither exist, will open without an index - bool Open(const vector filenames, bool openIndexes = true, bool coreMode = false, bool useDefaultIndex = true); + // @preferStandardIndex - look for standard BAM index ".bai" first. If false, + // will look for BamTools index ".bti". + bool Open(const vector filenames, bool openIndexes = true, bool coreMode = false, bool preferStandardIndex = true); // returns whether underlying BAM readers ALL have an index loaded // this is useful to indicate whether Jump() or SetRegion() are possible diff --git a/src/api/BamReader.cpp b/src/api/BamReader.cpp index f32b4fb..874cd83 100644 --- a/src/api/BamReader.cpp +++ b/src/api/BamReader.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 30 August 2010 (DB) +// Last modified: 3 September 2010 (DB) // --------------------------------------------------------------------------- // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad // Institute. @@ -81,8 +81,11 @@ struct BamReader::BamReaderPrivate { // file operations void Close(void); - bool Jump(int refID, int position = 0); - bool Open(const string& filename, const string& indexFilename = ""); + bool Jump(int refID, int position); + bool Open(const std::string& filename, + const std::string& indexFilename, + const bool lookForIndex, + const bool preferStandardIndex); bool Rewind(void); bool SetRegion(const BamRegion& region); @@ -94,7 +97,7 @@ struct BamReader::BamReaderPrivate { int GetReferenceID(const string& refName) const; // index operations - bool CreateIndex(bool useDefaultIndex); + bool CreateIndex(bool useStandardIndex); // ------------------------------- // internal methods @@ -118,7 +121,7 @@ struct BamReader::BamReaderPrivate { // clear out inernal index data structure void ClearIndex(void); // loads index from BAM index file - bool LoadIndex(void); + bool LoadIndex(const bool lookForIndex, const bool preferStandardIndex); }; // ----------------------------------------------------- @@ -139,14 +142,21 @@ BamReader::~BamReader(void) { void BamReader::Close(void) { d->Close(); } bool BamReader::IsIndexLoaded(void) const { return d->IsIndexLoaded; } bool BamReader::IsOpen(void) const { return d->mBGZF.IsOpen; } -bool BamReader::Jump(int refID, int position) { +bool BamReader::Jump(int refID, int position) +{ d->Region.LeftRefID = refID; d->Region.LeftPosition = position; d->IsLeftBoundSpecified = true; d->IsRightBoundSpecified = false; return d->Jump(refID, position); } -bool BamReader::Open(const string& filename, const string& indexFilename) { return d->Open(filename, indexFilename); } +bool BamReader::Open(const std::string& filename, + const std::string& indexFilename, + const bool lookForIndex, + const bool preferStandardIndex) +{ + return d->Open(filename, indexFilename, lookForIndex, preferStandardIndex); +} bool BamReader::Rewind(void) { return d->Rewind(); } bool BamReader::SetRegion(const BamRegion& region) { return d->SetRegion(region); } bool BamReader::SetRegion(const int& leftRefID, const int& leftBound, const int& rightRefID, const int& rightBound) { @@ -165,7 +175,7 @@ int BamReader::GetReferenceID(const string& refName) const { return d->GetRefere const std::string BamReader::GetFilename(void) const { return d->Filename; } // index operations -bool BamReader::CreateIndex(bool useDefaultIndex) { return d->CreateIndex(useDefaultIndex); } +bool BamReader::CreateIndex(bool useStandardIndex) { return d->CreateIndex(useStandardIndex); } // ----------------------------------------------------- // BamReaderPrivate implementation @@ -197,7 +207,6 @@ bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) { // calculate character lengths/offsets const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE; -// const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength; const unsigned int seqDataOffset = bAlignment.SupportData.QueryNameLength + (bAlignment.SupportData.NumCigarOperations * 4); const unsigned int qualDataOffset = seqDataOffset + (bAlignment.SupportData.QuerySequenceLength+1)/2; const unsigned int tagDataOffset = qualDataOffset + bAlignment.SupportData.QuerySequenceLength; @@ -205,31 +214,13 @@ bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) { // set up char buffers const char* allCharData = bAlignment.SupportData.AllCharData.data(); -// uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset); const char* seqData = ((const char*)allCharData) + seqDataOffset; const char* qualData = ((const char*)allCharData) + qualDataOffset; char* tagData = ((char*)allCharData) + tagDataOffset; // store alignment name (relies on null char in name as terminator) bAlignment.Name.assign((const char*)(allCharData)); - -// // save CigarOps -// CigarOp op; -// bAlignment.CigarData.clear(); -// bAlignment.CigarData.reserve(bAlignment.SupportData.NumCigarOperations); -// for (unsigned int i = 0; i < bAlignment.SupportData.NumCigarOperations; ++i) { -// -// // swap if necessary -// if ( IsBigEndian ) { SwapEndian_32(cigarData[i]); } -// -// // build CigarOp structure -// op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT); -// op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ]; -// -// // save CigarOp -// bAlignment.CigarData.push_back(op); -// } - + // save query sequence bAlignment.QueryBases.clear(); bAlignment.QueryBases.reserve(bAlignment.SupportData.QuerySequenceLength); @@ -382,14 +373,16 @@ void BamReader::BamReaderPrivate::Close(void) { IsRegionSpecified = false; } -// create BAM index from BAM file (keep structure in memory) and write to default index output file -bool BamReader::BamReaderPrivate::CreateIndex(bool useDefaultIndex) { +// creates index for BAM file, saves to file +// default behavior is to create the BAM standard index (".bai") +// set flag to false to create the BamTools-specific index (".bti") +bool BamReader::BamReaderPrivate::CreateIndex(bool useStandardIndex) { // clear out prior index data ClearIndex(); - // create default index - if ( useDefaultIndex ) + // create index based on type requested + if ( useStandardIndex ) NewIndex = new BamDefaultIndex(&mBGZF, Parent, IsBigEndian); // create BamTools 'custom' index else @@ -438,16 +431,14 @@ bool BamReader::BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) BamReader::BamReaderPrivate::RegionState state = IsOverlap(bAlignment); // if alignment lies after region, return false - if ( state == AFTER_REGION ) - return false; + if ( state == AFTER_REGION ) return false; while ( state != WITHIN_REGION ) { // if no valid alignment available (likely EOF) return failure if ( !LoadNextAlignment(bAlignment) ) return false; // if alignment lies after region, return false (no available read within region) state = IsOverlap(bAlignment); - if ( state == AFTER_REGION) return false; - + if ( state == AFTER_REGION ) return false; } // return success (alignment found that overlaps region) @@ -466,9 +457,8 @@ int BamReader::BamReaderPrivate::GetReferenceID(const string& refName) const { vector refNames; RefVector::const_iterator refIter = References.begin(); RefVector::const_iterator refEnd = References.end(); - for ( ; refIter != refEnd; ++refIter) { + for ( ; refIter != refEnd; ++refIter) refNames.push_back( (*refIter).RefName ); - } // return 'index-of' refName ( if not found, returns refNames.size() ) return distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName)); @@ -572,7 +562,7 @@ void BamReader::BamReaderPrivate::LoadHeaderData(void) { // get BAM header text length mBGZF.Read(buffer, 4); unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer); - if ( IsBigEndian ) { SwapEndian_32(headerTextLength); } + if ( IsBigEndian ) SwapEndian_32(headerTextLength); // get BAM header text char* headerText = (char*)calloc(headerTextLength + 1, 1); @@ -583,35 +573,36 @@ void BamReader::BamReaderPrivate::LoadHeaderData(void) { free(headerText); } -// load existing index data from BAM index file (".bai"), return success/fail -bool BamReader::BamReaderPrivate::LoadIndex(void) { +// load existing index data from BAM index file (".bti" OR ".bai"), return success/fail +bool BamReader::BamReaderPrivate::LoadIndex(const bool lookForIndex, const bool preferStandardIndex) { // clear out any existing index data ClearIndex(); - // skip if index file empty - if ( IndexFilename.empty() ) - return false; - - // check supplied filename for index type - size_t defaultExtensionFound = IndexFilename.find(".bai"); - size_t customExtensionFound = IndexFilename.find(".bti"); - - // if SAM/BAM default (".bai") - if ( defaultExtensionFound != string::npos ) - NewIndex = new BamDefaultIndex(&mBGZF, Parent, IsBigEndian); - - // if BamTools custom index (".bti") - else if ( customExtensionFound != string::npos ) - NewIndex = new BamToolsIndex(&mBGZF, Parent, IsBigEndian); + // if no index filename provided, so we need to look for available index files + if ( IndexFilename.empty() ) { + + // attempt to load BamIndex based on current Filename provided & preferStandardIndex flag + const BamIndex::PreferredIndexType type = (preferStandardIndex ? BamIndex::STANDARD : BamIndex::BAMTOOLS); + NewIndex = BamIndex::FromBamFilename(Filename, &mBGZF, Parent, IsBigEndian, type); + + // if null, return failure + if ( NewIndex == 0 ) return false; + + // generate proper IndexFilename based on type of index created + IndexFilename = Filename + NewIndex->Extension(); + } - // else unknown else { - printf("ERROR: Unknown index file extension.\n"); - return false; + // attempt to load BamIndex based on IndexFilename provided by client + NewIndex = BamIndex::FromIndexFilename(IndexFilename, &mBGZF, Parent, IsBigEndian); + + // if null, return failure + if ( NewIndex == 0 ) return false; } - // return success of loading index data from file + // an index file was found + // return success of loading the index data from file IsIndexLoaded = NewIndex->Load(IndexFilename); return IsIndexLoaded; } @@ -624,16 +615,15 @@ bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) { mBGZF.Read(buffer, 4); bAlignment.SupportData.BlockLength = BgzfData::UnpackUnsignedInt(buffer); if ( IsBigEndian ) { SwapEndian_32(bAlignment.SupportData.BlockLength); } - if ( bAlignment.SupportData.BlockLength == 0 ) { return false; } + if ( bAlignment.SupportData.BlockLength == 0 ) return false; // read in core alignment data, make sure the right size of data was read char x[BAM_CORE_SIZE]; - if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; } + if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) return false; if ( IsBigEndian ) { - for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) ) { - SwapEndian_32p(&x[i]); - } + for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) ) + SwapEndian_32p(&x[i]); } // set BamAlignment 'core' and 'support' data @@ -703,8 +693,8 @@ void BamReader::BamReaderPrivate::LoadReferenceData(void) { char buffer[4]; mBGZF.Read(buffer, 4); unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer); - if ( IsBigEndian ) { SwapEndian_32(numberRefSeqs); } - if (numberRefSeqs == 0) { return; } + if ( IsBigEndian ) SwapEndian_32(numberRefSeqs); + if ( numberRefSeqs == 0 ) return; References.reserve((int)numberRefSeqs); // iterate over all references in header @@ -713,14 +703,14 @@ void BamReader::BamReaderPrivate::LoadReferenceData(void) { // get length of reference name mBGZF.Read(buffer, 4); unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer); - if ( IsBigEndian ) { SwapEndian_32(refNameLength); } + if ( IsBigEndian ) SwapEndian_32(refNameLength); char* refName = (char*)calloc(refNameLength, 1); // get reference name and reference sequence length mBGZF.Read(refName, refNameLength); mBGZF.Read(buffer, 4); int refLength = BgzfData::UnpackSignedInt(buffer); - if ( IsBigEndian ) { SwapEndian_32(refLength); } + if ( IsBigEndian ) SwapEndian_32(refLength); // store data for reference RefData aReference; @@ -734,14 +724,14 @@ void BamReader::BamReaderPrivate::LoadReferenceData(void) { } // opens BAM file (and index) -bool BamReader::BamReaderPrivate::Open(const string& filename, const string& indexFilename) { +bool BamReader::BamReaderPrivate::Open(const string& filename, const string& indexFilename, const bool lookForIndex, const bool preferStandardIndex) { + // store filenames Filename = filename; IndexFilename = indexFilename; // open the BGZF file for reading, return false on failure - if ( !mBGZF.Open(filename, "rb") ) - return false; + if ( !mBGZF.Open(filename, "rb") ) return false; // retrieve header text & reference data LoadHeaderData(); @@ -750,12 +740,20 @@ bool BamReader::BamReaderPrivate::Open(const string& filename, const string& ind // store file offset of first alignment AlignmentsBeginOffset = mBGZF.Tell(); - // open index file & load index data (if exists) - if ( !IndexFilename.empty() ) - LoadIndex(); + // if no index filename provided + if ( IndexFilename.empty() ) { + + // client did not specify that index SHOULD be found + // useful for cases where sequential access is all that is required + if ( !lookForIndex ) return true; + + // otherwise, look for index file, return success/fail + return LoadIndex(lookForIndex, preferStandardIndex) ; + } - // return success - return true; + // client supplied an index filename + // attempt to load index data, return success/fail + return LoadIndex(lookForIndex, preferStandardIndex); } // returns BAM file pointer to beginning of alignment data @@ -790,10 +788,8 @@ bool BamReader::BamReaderPrivate::SetRegion(const BamRegion& region) { Region = region; // set flags - if ( region.LeftRefID >= 0 && region.LeftPosition >= 0 ) - IsLeftBoundSpecified = true; - if ( region.RightRefID >= 0 && region.RightPosition >= 0 ) - IsRightBoundSpecified = true; + if ( region.LeftRefID >= 0 && region.LeftPosition >= 0 ) IsLeftBoundSpecified = true; + if ( region.RightRefID >= 0 && region.RightPosition >= 0 ) IsRightBoundSpecified = true; // attempt jump to beginning of region, return success/fail of Jump() return Jump( Region.LeftRefID, Region.LeftPosition ); diff --git a/src/api/BamReader.h b/src/api/BamReader.h index 5d4c83a..6e863b6 100644 --- a/src/api/BamReader.h +++ b/src/api/BamReader.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 2 September 2010 (DB) +// Last modified: 3 September 2010 (DB) // --------------------------------------------------------------------------- // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad // Institute. @@ -45,7 +45,12 @@ class BamReader { // performs random-access jump to reference, position bool Jump(int refID, int position = 0); // opens BAM file (and optional BAM index file, if provided) - bool Open(const std::string& filename, const std::string& indexFilename = ""); + // @lookForIndex - if no indexFilename provided, look for an existing index file + // @preferStandardIndex - if true, give priority in index file searching to standard BAM index + bool Open(const std::string& filename, + const std::string& indexFilename = "", + const bool lookForIndex = false, + const bool preferStandardIndex = false); // returns file pointer to beginning of alignments bool Rewind(void); // sets a region of interest (with left & right bound reference/position) @@ -86,8 +91,10 @@ class BamReader { // BAM index operations // ---------------------- - // creates index for BAM file, saves to file (default = bamFilename + ".bai") - bool CreateIndex(bool useDefaultIndex = true); + // creates index for BAM file, saves to file + // default behavior is to create the BAM standard index (".bai") + // set flag to false to create the BamTools-specific index (".bti") + bool CreateIndex(bool useStandardIndex = true); // private implementation private: -- 2.39.2