From: Derek Date: Sat, 18 Sep 2010 04:42:23 +0000 (-0400) Subject: Fixed: bug related to accessing data (or regions with no alignments) near the ends... X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=89dbe0f38482e43f5e49d9cc5c8670ec00f3f53f;p=bamtools.git Fixed: bug related to accessing data (or regions with no alignments) near the ends of references, when using .bti index files. Required modifying the BamToolsIndex build step. *NOTE: This update invalidates any existing .bti files, please re-generate any that you have currently.* Versioning system in BTI will not allow users to use the older, buggy version... so no chance of accidental usage. --- diff --git a/src/api/BamIndex.cpp b/src/api/BamIndex.cpp index a7f93de..f79b2aa 100644 --- a/src/api/BamIndex.cpp +++ b/src/api/BamIndex.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 16 September 2010 (DB) +// Last modified: 18 September 2010 (DB) // --------------------------------------------------------------------------- // Provides index functionality - both for the default (standardized) BAM // index format (.bai) as well as a BamTools-specific (nonstandard) index @@ -121,7 +121,7 @@ struct BamStandardIndex::BamStandardIndexPrivate { // creates index data (in-memory) from current reader data bool Build(void); // attempts to use index to jump to region; returns success/fail - bool Jump(const BamTools::BamRegion& region); + bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); // loads existing data from file into memory bool Load(const std::string& filename); // writes in-memory index data out to file @@ -404,7 +404,7 @@ void BamStandardIndex::BamStandardIndexPrivate::InsertLinearOffset(LinearOffsetV } } -bool BamStandardIndex::BamStandardIndexPrivate::Jump(const BamRegion& region) { +bool BamStandardIndex::BamStandardIndexPrivate::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { // localize parent data if ( m_parent == 0 ) return false; @@ -422,9 +422,12 @@ bool BamStandardIndex::BamStandardIndexPrivate::Jump(const BamRegion& region) { // make sure left-bound position is valid if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) return false; + // calculate offsets for this region + // if failed, print message, set flag, and return failure vector offsets; if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets) ) { fprintf(stderr, "ERROR: Could not jump: unable to calculate offset for specified region.\n"); + *hasAlignmentsInRegion = false; return false; } @@ -434,18 +437,25 @@ bool BamStandardIndex::BamStandardIndexPrivate::Jump(const BamRegion& region) { for ( vector::const_iterator o = offsets.begin(); o != offsets.end(); ++o) { // attempt seek & load first available alignment + // set flag to true if data exists result &= mBGZF->Seek(*o); - reader->GetNextAlignmentCore(bAlignment); + *hasAlignmentsInRegion = reader->GetNextAlignmentCore(bAlignment); // if this alignment corresponds to desired position - // return success of seeking back to 'current offset' + // return success of seeking back to the offset before the 'current offset' (to cover overlaps) if ( (bAlignment.RefID == region.LeftRefID && bAlignment.Position + bAlignment.Length > region.LeftPosition) || (bAlignment.RefID > region.LeftRefID) ) { if ( o != offsets.begin() ) --o; return mBGZF->Seek(*o); } } - if ( !result ) fprintf(stderr, "ERROR: Could not jump: unable to calculate offset for specified region.\n"); + // if error in jumping, print message & set flag + if ( !result ) { + fprintf(stderr, "ERROR: Could not jump: unable to calculate offset for specified region.\n"); + *hasAlignmentsInRegion = false; + } + + // return success/failure return result; } @@ -752,7 +762,7 @@ BamStandardIndex::~BamStandardIndex(void) { bool BamStandardIndex::Build(void) { return d->Build(); } // attempts to use index to jump to region; returns success/fail -bool BamStandardIndex::Jump(const BamRegion& region) { return d->Jump(region); } +bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { return d->Jump(region, hasAlignmentsInRegion); } // loads existing data from file into memory bool BamStandardIndex::Load(const string& filename) { return d->Load(filename); } @@ -807,7 +817,9 @@ struct BamToolsIndex::BamToolsIndexPrivate { // do something new // else // do the old thing - enum Version { BTI_1_0 = 1 }; + enum Version { BTI_1_0 = 1 + , BTI_1_1 + }; // ------------------------- // data members @@ -822,7 +834,7 @@ struct BamToolsIndex::BamToolsIndexPrivate { BamToolsIndexPrivate(BamToolsIndex* parent) : m_parent(parent) , m_blockSize(1000) - , m_version(BTI_1_0) // latest version - used for writing new index files + , m_version(BTI_1_1) // latest version - used for writing new index files { } ~BamToolsIndexPrivate(void) { } @@ -835,7 +847,7 @@ struct BamToolsIndex::BamToolsIndexPrivate { // returns supported file extension const std::string Extension(void) const { return std::string(".bti"); } // attempts to use index to jump to region; returns success/fail - bool Jump(const BamTools::BamRegion& region); + bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); // loads existing data from file into memory bool Load(const std::string& filename); // writes in-memory index data out to file @@ -846,7 +858,7 @@ struct BamToolsIndex::BamToolsIndexPrivate { // internal methods // calculates offset for a given region - bool GetOffset(const BamRegion& region, int64_t& offset); + bool GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion); }; bool BamToolsIndex::BamToolsIndexPrivate::Build(void) { @@ -923,13 +935,16 @@ bool BamToolsIndex::BamToolsIndexPrivate::Build(void) { currentAlignmentOffset = mBGZF->Tell(); } + // store final block + m_indexData[blockRefId].push_back( BamToolsIndexEntry(blockMaxEndPosition, blockStartOffset, blockStartPosition) ); + // attempt to rewind back to begnning of alignments // return success/fail return reader->Rewind(); } // N.B. - ignores isRightBoundSpecified -bool BamToolsIndex::BamToolsIndexPrivate::GetOffset(const BamRegion& region, int64_t& offset) { +bool BamToolsIndex::BamToolsIndexPrivate::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) { // return false if leftBound refID is not found in index data if ( m_indexData.find(region.LeftRefID) == m_indexData.end() ) return false; @@ -942,22 +957,26 @@ bool BamToolsIndex::BamToolsIndexPrivate::GetOffset(const BamRegion& region, int // save first offset offset = (*referenceOffsets.begin()).StartOffset; + + // iterate over offsets entries on this reference vector::const_iterator indexIter = referenceOffsets.begin(); vector::const_iterator indexEnd = referenceOffsets.end(); for ( ; indexIter != indexEnd; ++indexIter ) { const BamToolsIndexEntry& entry = (*indexIter); + // break if alignment 'entry' overlaps region if ( entry.MaxEndPosition >= region.LeftPosition ) break; offset = (*indexIter).StartOffset; } - // no index found - if ( indexIter == indexEnd ) return false; - - // return succes + // set flag based on whether an index entry was found for this region + *hasAlignmentsInRegion = ( indexIter != indexEnd ); return true; } -bool BamToolsIndex::BamToolsIndexPrivate::Jump(const BamRegion& region) { +bool BamToolsIndex::BamToolsIndexPrivate::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { + + // clear flag + *hasAlignmentsInRegion = false; // localize parent data if ( m_parent == 0 ) return false; @@ -965,6 +984,7 @@ bool BamToolsIndex::BamToolsIndexPrivate::Jump(const BamRegion& region) { BgzfData* mBGZF = m_parent->m_BGZF; RefVector& references = m_parent->m_references; + // check valid BamReader state if ( reader == 0 || mBGZF == 0 || !reader->IsOpen() ) { fprintf(stderr, "ERROR: Could not jump: invalid BamReader state.\n"); return false; @@ -978,7 +998,7 @@ bool BamToolsIndex::BamToolsIndexPrivate::Jump(const BamRegion& region) { // calculate nearest offset to jump to int64_t offset; - if ( !GetOffset(region, offset) ) { + if ( !GetOffset(region, offset, hasAlignmentsInRegion) ) { fprintf(stderr, "ERROR: Could not jump - unable to calculate offset for specified region.\n"); return false; } @@ -1023,6 +1043,13 @@ bool BamToolsIndex::BamToolsIndexPrivate::Load(const string& filename) { return false; } + if ( (Version)indexVersion < BTI_1_1 ) { + fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to accessing data near reference ends.\n"); + fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n"); + fclose(indexStream); + exit(1); + } + // read in block size elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, indexStream); if ( isBigEndian ) SwapEndian_32(m_blockSize); @@ -1177,7 +1204,10 @@ BamToolsIndex::~BamToolsIndex(void) { bool BamToolsIndex::Build(void) { return d->Build(); } // attempts to use index to jump to region; returns success/fail -bool BamToolsIndex::Jump(const BamRegion& region) { return d->Jump(region); } +// a "successful" jump indicates no error, but not whether this region has data +// * thus, the method sets a flag to indicate whether there are alignments +// available after the jump position +bool BamToolsIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { return d->Jump(region, hasAlignmentsInRegion); } // loads existing data from file into memory bool BamToolsIndex::Load(const string& filename) { return d->Load(filename); } diff --git a/src/api/BamIndex.h b/src/api/BamIndex.h index e45bc27..fdd0d13 100644 --- a/src/api/BamIndex.h +++ b/src/api/BamIndex.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 10 September 2010 (DB) +// Last modified: 17 September 2010 (DB) // --------------------------------------------------------------------------- // Provides index functionality - both for the standardized BAM index format // (".bai") as well as a BamTools-specific (nonstandard) index format (".bti"). @@ -38,7 +38,10 @@ class BamIndex { // returns supported file extension virtual const std::string Extension(void) const =0; // attempts to use index to jump to region; returns success/fail - virtual bool Jump(const BamTools::BamRegion& region) =0; + // a "successful" jump indicates no error, but not whether this region has data + // * thus, the method sets a flag to indicate whether there are alignments + // available after the jump position + virtual bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion) =0; // returns whether reference has alignments or no virtual bool HasAlignments(const int& referenceID); // loads existing data from file into memory @@ -97,7 +100,10 @@ class BamStandardIndex : public BamIndex { // returns supported file extension const std::string Extension(void) const { return std::string(".bai"); } // attempts to use index to jump to region; returns success/fail - bool Jump(const BamTools::BamRegion& region); + // a "successful" jump indicates no error, but not whether this region has data + // * thus, the method sets a flag to indicate whether there are alignments + // available after the jump position + bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); // loads existing data from file into memory bool Load(const std::string& filename); // writes in-memory index data out to file @@ -130,7 +136,10 @@ class BamToolsIndex : public BamIndex { // returns supported file extension const std::string Extension(void) const { return std::string(".bti"); } // attempts to use index to jump to region; returns success/fail - bool Jump(const BamTools::BamRegion& region); + // a "successful" jump indicates no error, but not whether this region has data + // * thus, the method sets a flag to indicate whether there are alignments + // available after the jump position + bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); // loads existing data from file into memory bool Load(const std::string& filename); // writes in-memory index data out to file diff --git a/src/api/BamReader.cpp b/src/api/BamReader.cpp index 06232af..f16d98d 100644 --- a/src/api/BamReader.cpp +++ b/src/api/BamReader.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 10 September 2010 (DB) +// Last modified: 17 September 2010 (DB) // --------------------------------------------------------------------------- // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad // Institute. @@ -17,8 +17,6 @@ #include #include #include - -// BamTools includes #include "BGZF.h" #include "BamReader.h" #include "BamIndex.h" @@ -53,9 +51,8 @@ struct BamReader::BamReaderPrivate { bool IsBigEndian; // user-specified region values - BamRegion Region; - int CurrentRefID; - int CurrentLeft; + BamRegion Region; + bool HasAlignmentsInRegion; // parent BamReader BamReader* Parent; @@ -173,8 +170,6 @@ BamReader::BamReaderPrivate::BamReaderPrivate(BamReader* parent) : Index(0) , IsIndexLoaded(false) , AlignmentsBeginOffset(0) - , CurrentRefID(0) - , CurrentLeft(0) , Parent(parent) , DNA_LOOKUP("=ACMGRSVTWYHKDBN") , CIGAR_LOOKUP("MIDNSHP") @@ -400,6 +395,10 @@ bool BamReader::BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) { // useful for operations requiring ONLY positional or other alignment-related information bool BamReader::BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) { + // if region is set but has no alignments + if ( !Region.isNull() && !HasAlignmentsInRegion ) + return false; + // if valid alignment available if ( LoadNextAlignment(bAlignment) ) { @@ -714,6 +713,7 @@ bool BamReader::BamReaderPrivate::Rewind(void) { Region.clear(); Region.LeftRefID = al.RefID; Region.LeftPosition = al.Position; + HasAlignmentsInRegion = true; // rewind back to beginning of first alignment // return success/fail of seek @@ -737,10 +737,17 @@ bool BamReader::BamReaderPrivate::SetRegion(const BamRegion& region) { // check for existing index if ( !IsIndexLoaded || Index == 0 ) return false; - // attempt jump to user-specified region, return false if failed - if ( !Index->Jump(region) ) return false; + // attempt jump to user-specified region return false if jump could not be performed at all + // (invalid index, unknown reference, etc) + // + // Index::Jump() is allowed to modify the HasAlignmentsInRegion flag + // * This covers case where a region is requested that lies beyond the last alignment on a reference + // If this occurs, any subsequent calls to GetNexAlignment[Core] simply return false + // BamMultiReader is then able to successfully pull alignments from a region from multiple files + // even if one or more have no data. + if ( !Index->Jump(region, &HasAlignmentsInRegion) ) return false; - // if successful, save region data locally, return success + // if jump successful, save region data & return success Region = region; return true; }