From: Derek Date: Fri, 10 Sep 2010 03:42:04 +0000 (-0400) Subject: Reorganizing index/jumping calls. Now all BamReader cares about is sending a Jump... X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=62b1c71c1d824b6d37b029ce67133f42f2563d78;p=bamtools.git Reorganizing index/jumping calls. Now all BamReader cares about is sending a Jump() request to a BamIndex with a desired region and receiving a success/fail flag. BamIndex-derived classes will now handle all index-format-specific offset calculation, overlap checking, etc and make sure its associated BGZF stream has seek-ed as close to the desired region as that index scheme allows. --- diff --git a/src/api/BamAux.h b/src/api/BamAux.h index 7a7fb1c..057568e 100644 --- a/src/api/BamAux.h +++ b/src/api/BamAux.h @@ -270,6 +270,12 @@ struct BamRegion { , RightRefID(rightID) , RightPosition(rightPos) { } + + // member functions + void clear(void) { LeftRefID = -1; LeftPosition = -1; RightRefID = -1; RightPosition = -1; } + bool isLeftBoundSpecified(void) const { return ( LeftRefID != -1 && LeftPosition != -1 ); } + bool isNull(void) const { return !( isLeftBoundSpecified() && isRightBoundSpecified() ); } + bool isRightBoundSpecified(void) const { return ( RightRefID != -1 && RightPosition != -1 ); } }; // ---------------------------------------------------------------- diff --git a/src/api/BamIndex.cpp b/src/api/BamIndex.cpp index 59a1c9c..71b8da4 100644 --- a/src/api/BamIndex.cpp +++ b/src/api/BamIndex.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 3 September 2010 (DB) +// Last modified: 9 September 2010 (DB) // --------------------------------------------------------------------------- // Provides index functionality - both for the default (standardized) BAM // index format (.bai) as well as a BamTools-specific (nonstandard) index @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "BamIndex.h" #include "BamReader.h" @@ -391,6 +392,48 @@ void BamStandardIndex::BamStandardIndexPrivate::InsertLinearOffset(LinearOffsetV } } +bool BamStandardIndex::Jump(const BamRegion& region) { + + cerr << "Jumping in BAI" << endl; + + if ( m_reader == 0 || m_BGZF == 0 || !m_reader->IsOpen() ) { + fprintf(stderr, "ERROR: Could not jump: invalid BamReader state.\n"); + return false; + } + + // see if left-bound reference of region has alignments + if ( !HasAlignments(region.LeftRefID) ) return false; + + // make sure left-bound position is valid + if ( region.LeftPosition > m_references.at(region.LeftRefID).RefLength ) return false; + + vector offsets; + if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets) ) { + fprintf(stderr, "ERROR: Could not jump: unable to calculate offset for specified region.\n"); + return false; + } + + // iterate through offsets + BamAlignment bAlignment; + bool result = true; + for ( vector::const_iterator o = offsets.begin(); o != offsets.end(); ++o) { + + // attempt seek & load first available alignment + result &= m_BGZF->Seek(*o); + m_reader->GetNextAlignmentCore(bAlignment); + + // if this alignment corresponds to desired position + // return success of seeking back to 'current offset' + if ( (bAlignment.RefID == region.LeftRefID && bAlignment.Position + bAlignment.Length > region.LeftPosition) || (bAlignment.RefID > region.LeftRefID) ) { + if ( o != offsets.begin() ) --o; + return m_BGZF->Seek(*o); + } + } + + if ( !result ) fprintf(stderr, "ERROR: Could not jump: unable to calculate offset for specified region.\n"); + return result; +} + bool BamStandardIndex::Load(const string& filename) { // open index file, abort on error @@ -679,16 +722,16 @@ struct BamToolsIndexEntry { // data members int64_t Offset; - int RefID; - int Position; + int32_t RefID; + int32_t StartPosition; // ctor - BamToolsIndexEntry(const uint64_t& offset = 0, - const int& id = -1, - const int& position = -1) + BamToolsIndexEntry(const int64_t& offset = 0, + const int32_t& id = -1, + const int32_t& position = -1) : Offset(offset) , RefID(id) - , Position(position) + , StartPosition(position) { } }; @@ -741,8 +784,8 @@ bool BamToolsIndex::Build(void) { // plow through alignments, store block offsets int32_t currentBlockCount = 0; int64_t blockStartOffset = m_BGZF->Tell(); - int blockStartId = -1; - int blockStartPosition = -1; + int32_t blockStartId = -1; + int32_t blockStartPosition = -1; BamAlignment al; while ( m_reader->GetNextAlignmentCore(al) ) { @@ -777,7 +820,46 @@ bool BamToolsIndex::GetOffsets(const BamRegion& region, const bool isRightBoundS // clear any prior data offsets.clear(); - + + +/* bool found = false; + int64_t previousOffset = -1; + BamToolsIndexData::const_iterator indexIter = d->m_indexData.end() - 1; + BamToolsIndexData::const_iterator indexBegin = d->m_indexData.begin(); + for ( ; indexIter >= indexBegin; --indexIter ) { + + const BamToolsIndexEntry& entry = (*indexIter); + + cerr << "Considering entry at " << entry.RefID << ":" << entry.Position << " => " << entry.Offset << endl; + + if ( entry.RefID < region.LeftRefID) { + found = true; + break; + } + + if ( (entry.RefID == region.LeftRefID) && (entry.Position < region.LeftPosition) ) { + found = true; + break; + } + + previousOffset = entry.Offset; + } + + if ( !found || previousOffset == -1 ) return false; + + // store offset & return success + cerr << endl; + cerr << "Using offset: " << previousOffset << endl; + cerr << endl; + + offsets.push_back(previousOffset); + return true;*/ + + +// cerr << "--------------------------------------------------" << endl; +// cerr << "Looking for " << region.LeftRefID << ":" << region.LeftPosition << endl; +// cerr << endl; + // calculate nearest index to jump to int64_t previousOffset = -1; BamToolsIndexData::const_iterator indexIter = d->m_indexData.begin(); @@ -786,23 +868,71 @@ bool BamToolsIndex::GetOffsets(const BamRegion& region, const bool isRightBoundS const BamToolsIndexEntry& entry = (*indexIter); +// cerr << "Considering entry at " << entry.RefID << ":" << entry.StartPosition << " => " << entry.Offset << endl; + // check if we are 'past' beginning of desired region // if so, we will break out & use previously stored offset if ( entry.RefID > region.LeftRefID ) break; - if ( (entry.RefID == region.LeftRefID) && (entry.Position > region.LeftPosition) ) break; + if ( (entry.RefID == region.LeftRefID) && (entry.StartPosition >= region.LeftPosition) ) break; // not past desired region, so store current entry offset in previousOffset previousOffset = entry.Offset; } - // no index was found - if ( previousOffset == -1 ) return false; + // no index found + if ( indexIter == indexEnd ) return false; + + // first offset matches (so previousOffset was never set) + if ( previousOffset == -1 ) previousOffset = (*indexIter).Offset; // store offset & return success +// cerr << endl; +// cerr << "Using offset: " << previousOffset << endl; +// cerr << endl; offsets.push_back(previousOffset); return true; } +bool BamToolsIndex::Jump(const BamRegion& region) { + + if ( m_reader == 0 || m_BGZF == 0 || !m_reader->IsOpen() ) { + fprintf(stderr, "ERROR: Could not jump: invalid BamReader state.\n"); + return false; + } + + // see if left-bound reference of region has alignments + if ( !HasAlignments(region.LeftRefID) ) return false; + + // make sure left-bound position is valid + if ( region.LeftPosition > m_references.at(region.LeftRefID).RefLength ) return false; + + vector offsets; + if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets) ) { + fprintf(stderr, "ERROR: Could not jump: unable to calculate offset for specified region.\n"); + return false; + } + + // iterate through offsets + BamAlignment bAlignment; + bool result = true; + for ( vector::const_iterator o = offsets.begin(); o != offsets.end(); ++o) { + + // attempt seek & load first available alignment + result &= m_BGZF->Seek(*o); + m_reader->GetNextAlignmentCore(bAlignment); + + // if this alignment corresponds to desired position + // return success of seeking back to 'current offset' + if ( (bAlignment.RefID == region.LeftRefID && bAlignment.Position + bAlignment.Length > region.LeftPosition) || (bAlignment.RefID > region.LeftRefID) ) { + if ( o != offsets.begin() ) --o; + return m_BGZF->Seek(*o); + } + } + + if ( !result ) fprintf(stderr, "ERROR: Could not jump: unable to calculate offset for specified region.\n"); + return result; +} + bool BamToolsIndex::Load(const string& filename) { // open index file, abort on error @@ -839,13 +969,13 @@ bool BamToolsIndex::Load(const string& filename) { // iterate over index entries for ( unsigned int i = 0; i < numOffsets; ++i ) { - uint64_t offset; - int id; - int position; + int64_t offset; + int32_t id; + int32_t position; // read in data - elementsRead = fread(&offset, sizeof(offset), 1, indexStream); - elementsRead = fread(&id, sizeof(id), 1, indexStream); + elementsRead = fread(&offset, sizeof(offset), 1, indexStream); + elementsRead = fread(&id, sizeof(id), 1, indexStream); elementsRead = fread(&position, sizeof(position), 1, indexStream); // swap endian-ness if necessary @@ -871,6 +1001,7 @@ bool BamToolsIndex::Load(const string& filename) { // N.B. - (this is the original BAM filename, method will modify it to use applicable extension) bool BamToolsIndex::Write(const std::string& bamFilename) { + // open index file for writing string indexFilename = bamFilename + ".bti"; FILE* indexStream = fopen(indexFilename.c_str(), "wb"); if ( indexStream == 0 ) { @@ -900,9 +1031,9 @@ bool BamToolsIndex::Write(const std::string& bamFilename) { const BamToolsIndexEntry& entry = (*indexIter); // copy entry data - uint64_t offset = entry.Offset; - int id = entry.RefID; - int position = entry.Position; + int64_t offset = entry.Offset; + int32_t id = entry.RefID; + int32_t position = entry.StartPosition; // swap endian-ness if necessary if ( m_isBigEndian ) { @@ -912,12 +1043,12 @@ bool BamToolsIndex::Write(const std::string& bamFilename) { } // write the reference index entry - fwrite(&offset, sizeof(offset), 1, indexStream); - fwrite(&id, sizeof(id), 1, indexStream); + fwrite(&offset, sizeof(offset), 1, indexStream); + fwrite(&id, sizeof(id), 1, indexStream); fwrite(&position, sizeof(position), 1, indexStream); } - // flush file buffer, close file, and return success + // flush any remaining output, close file, and return success fflush(indexStream); fclose(indexStream); return true; diff --git a/src/api/BamIndex.h b/src/api/BamIndex.h index d92fe65..dc95889 100644 --- a/src/api/BamIndex.h +++ b/src/api/BamIndex.h @@ -38,7 +38,8 @@ class BamIndex { // returns supported file extension virtual const std::string Extension(void) const =0; // calculates offset(s) for a given region - virtual bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector& offsets) =0; + //virtual bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector& offsets) =0; + virtual bool Jump(const BamTools::BamRegion& region) =0; // returns whether reference has alignments or no virtual bool HasAlignments(const int& referenceID); // loads existing data from file into memory @@ -98,6 +99,7 @@ class BamStandardIndex : public BamIndex { const std::string Extension(void) const { return std::string(".bai"); } // calculates offset(s) for a given region bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector& offsets); + bool Jump(const BamTools::BamRegion& region); // loads existing data from file into memory bool Load(const std::string& filename); // writes in-memory index data out to file @@ -131,6 +133,7 @@ class BamToolsIndex : public BamIndex { const std::string Extension(void) const { return std::string(".bti"); } // calculates offset(s) for a given region bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector& offsets); + bool Jump(const BamTools::BamRegion& region); // loads existing data from file into memory bool Load(const std::string& filename); // writes in-memory index data out to file diff --git a/src/api/BamReader.cpp b/src/api/BamReader.cpp index 93a991b..06232af 100644 --- a/src/api/BamReader.cpp +++ b/src/api/BamReader.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 7 September 2010 (DB) +// Last modified: 10 September 2010 (DB) // --------------------------------------------------------------------------- // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad // Institute. @@ -42,8 +42,7 @@ struct BamReader::BamReaderPrivate { // general file data BgzfData mBGZF; string HeaderText; - //BamIndex Index; - BamIndex* NewIndex; + BamIndex* Index; RefVector References; bool IsIndexLoaded; int64_t AlignmentsBeginOffset; @@ -54,11 +53,7 @@ struct BamReader::BamReaderPrivate { bool IsBigEndian; // user-specified region values - BamRegion Region; - bool IsLeftBoundSpecified; - bool IsRightBoundSpecified; - - bool IsRegionSpecified; + BamRegion Region; int CurrentRefID; int CurrentLeft; @@ -81,7 +76,6 @@ struct BamReader::BamReaderPrivate { // file operations void Close(void); - bool Jump(int refID, int position); bool Open(const std::string& filename, const std::string& indexFilename, const bool lookForIndex, @@ -142,14 +136,7 @@ BamReader::~BamReader(void) { void BamReader::Close(void) { d->Close(); } bool BamReader::IsIndexLoaded(void) const { return d->IsIndexLoaded; } bool BamReader::IsOpen(void) const { return d->mBGZF.IsOpen; } -bool BamReader::Jump(int refID, int position) -{ - d->Region.LeftRefID = refID; - d->Region.LeftPosition = position; - d->IsLeftBoundSpecified = true; - d->IsRightBoundSpecified = false; - return d->Jump(refID, position); -} +bool BamReader::Jump(int refID, int position) { return d->SetRegion( BamRegion(refID, position) ); } bool BamReader::Open(const std::string& filename, const std::string& indexFilename, const bool lookForIndex, @@ -183,12 +170,9 @@ bool BamReader::CreateIndex(bool useStandardIndex) { return d->CreateIndex(useSt // constructor BamReader::BamReaderPrivate::BamReaderPrivate(BamReader* parent) - : NewIndex(0) + : Index(0) , IsIndexLoaded(false) , AlignmentsBeginOffset(0) - , IsLeftBoundSpecified(false) - , IsRightBoundSpecified(false) - , IsRegionSpecified(false) , CurrentRefID(0) , CurrentLeft(0) , Parent(parent) @@ -350,8 +334,8 @@ bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) { // clear index data structure void BamReader::BamReaderPrivate::ClearIndex(void) { - delete NewIndex; - NewIndex = 0; + delete Index; + Index = 0; IsIndexLoaded = false; } @@ -368,9 +352,7 @@ void BamReader::BamReaderPrivate::Close(void) { HeaderText.clear(); // clear out region flags - IsLeftBoundSpecified = false; - IsRightBoundSpecified = false; - IsRegionSpecified = false; + Region.clear(); } // creates index for BAM file, saves to file @@ -383,18 +365,18 @@ bool BamReader::BamReaderPrivate::CreateIndex(bool useStandardIndex) { // create index based on type requested if ( useStandardIndex ) - NewIndex = new BamStandardIndex(&mBGZF, Parent, IsBigEndian); + Index = new BamStandardIndex(&mBGZF, Parent, IsBigEndian); // create BamTools 'custom' index else - NewIndex = new BamToolsIndex(&mBGZF, Parent, IsBigEndian); + Index = new BamToolsIndex(&mBGZF, Parent, IsBigEndian); // build new index bool ok = true; - ok &= NewIndex->Build(); + ok &= Index->Build(); IsIndexLoaded = ok; // attempt to save index data to file - ok &= NewIndex->Write(Filename); + ok &= Index->Write(Filename); // return success/fail of both building & writing index return ok; @@ -425,7 +407,7 @@ bool BamReader::BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) bAlignment.SupportData.HasCoreOnly = true; // if region not specified, return success - if ( !IsLeftBoundSpecified ) return true; + if ( !Region.isLeftBoundSpecified() ) return true; // determine region state (before, within, after) BamReader::BamReaderPrivate::RegionState state = IsOverlap(bAlignment); @@ -472,7 +454,7 @@ BamReader::BamReaderPrivate::RegionState BamReader::BamReaderPrivate::IsOverlap( // check alignment start against right bound cutoff // if full region of interest was given - if ( IsRightBoundSpecified ) { + if ( Region.isRightBoundSpecified() ) { // read starts on right bound reference, but AFTER right bound position if ( bAlignment.RefID == Region.RightRefID && bAlignment.Position > Region.RightPosition ) @@ -506,44 +488,6 @@ BamReader::BamReaderPrivate::RegionState BamReader::BamReaderPrivate::IsOverlap( return BEFORE_REGION; } -// jumps to specified region(refID, leftBound) in BAM file, returns success/fail -bool BamReader::BamReaderPrivate::Jump(int refID, int position) { - - // ----------------------------------------------------------------------- - // check for existing index - if ( !IsIndexLoaded || NewIndex == 0 ) return false; - // see if reference has alignments - if ( !NewIndex->HasAlignments(refID) ) return false; - // make sure position is valid - if ( position > References.at(refID).RefLength ) return false; - - // determine possible offsets - vector offsets; - if ( !NewIndex->GetOffsets(Region, IsRightBoundSpecified, offsets) ) { - fprintf(stderr, "ERROR: Could not jump: unable to calculate offset for specified region.\n"); - return false; - } - - // iterate through offsets - BamAlignment bAlignment; - bool result = true; - for ( vector::const_iterator o = offsets.begin(); o != offsets.end(); ++o) { - - // attempt seek & load first available alignment - result &= mBGZF.Seek(*o); - LoadNextAlignment(bAlignment); - - // if this alignment corresponds to desired position - // return success of seeking back to 'current offset' - if ( (bAlignment.RefID == refID && bAlignment.Position + bAlignment.Length > position) || (bAlignment.RefID > refID) ) { - if ( o != offsets.begin() ) --o; - return mBGZF.Seek(*o); - } - } - - return result; -} - // load BAM header data void BamReader::BamReaderPrivate::LoadHeaderData(void) { @@ -584,26 +528,26 @@ bool BamReader::BamReaderPrivate::LoadIndex(const bool lookForIndex, const bool // attempt to load BamIndex based on current Filename provided & preferStandardIndex flag const BamIndex::PreferredIndexType type = (preferStandardIndex ? BamIndex::STANDARD : BamIndex::BAMTOOLS); - NewIndex = BamIndex::FromBamFilename(Filename, &mBGZF, Parent, IsBigEndian, type); + Index = BamIndex::FromBamFilename(Filename, &mBGZF, Parent, IsBigEndian, type); // if null, return failure - if ( NewIndex == 0 ) return false; + if ( Index == 0 ) return false; // generate proper IndexFilename based on type of index created - IndexFilename = Filename + NewIndex->Extension(); + IndexFilename = Filename + Index->Extension(); } else { // attempt to load BamIndex based on IndexFilename provided by client - NewIndex = BamIndex::FromIndexFilename(IndexFilename, &mBGZF, Parent, IsBigEndian); + Index = BamIndex::FromIndexFilename(IndexFilename, &mBGZF, Parent, IsBigEndian); // if null, return failure - if ( NewIndex == 0 ) return false; + if ( Index == 0 ) return false; } // an index file was found // return success of loading the index data from file - IsIndexLoaded = NewIndex->Load(IndexFilename); + IsIndexLoaded = Index->Load(IndexFilename); return IsIndexLoaded; } @@ -759,38 +703,44 @@ bool BamReader::BamReaderPrivate::Open(const string& filename, const string& ind // returns BAM file pointer to beginning of alignment data bool BamReader::BamReaderPrivate::Rewind(void) { - // rewind to first alignment + // rewind to first alignment, return false if unable to seek if ( !mBGZF.Seek(AlignmentsBeginOffset) ) return false; - // retrieve first alignment data + // retrieve first alignment data, return false if unable to read BamAlignment al; if ( !LoadNextAlignment(al) ) return false; // reset default region info using first alignment in file - Region.LeftRefID = al.RefID; - Region.LeftPosition = al.Position; - Region.RightRefID = -1; - Region.RightPosition = -1; - IsLeftBoundSpecified = false; - IsRightBoundSpecified = false; - - // rewind back to before first alignment + Region.clear(); + Region.LeftRefID = al.RefID; + Region.LeftPosition = al.Position; + + // rewind back to beginning of first alignment // return success/fail of seek return mBGZF.Seek(AlignmentsBeginOffset); } -// sets a region of interest (with left & right bound reference/position) -// attempts a Jump() to left bound as well -// returns success/failure of Jump() +// asks Index to attempt a Jump() to specified region +// returns success/failure bool BamReader::BamReaderPrivate::SetRegion(const BamRegion& region) { + + // clear out any prior BamReader region data + // + // N.B. - this is cleared so that BamIndex now has free reign to call + // GetNextAlignmentCore() and do overlap checking without worrying about BamReader + // performing any overlap checking of its own and moving on to the next read... Calls + // to GetNextAlignmentCore() with no Region set, simply return the next alignment. + // This ensures that the Index is able to do just that. (All without exposing + // LoadNextAlignment() to the public API, and potentially confusing clients with the nomenclature) + Region.clear(); + + // check for existing index + if ( !IsIndexLoaded || Index == 0 ) return false; - // save region of interest + // attempt jump to user-specified region, return false if failed + if ( !Index->Jump(region) ) return false; + + // if successful, save region data locally, return success Region = region; - - // set flags - if ( region.LeftRefID >= 0 && region.LeftPosition >= 0 ) IsLeftBoundSpecified = true; - if ( region.RightRefID >= 0 && region.RightPosition >= 0 ) IsRightBoundSpecified = true; - - // attempt jump to beginning of region, return success/fail of Jump() - return Jump( Region.LeftRefID, Region.LeftPosition ); + return true; }