From 971c4168ee75fd806dbc33b6adaf067b749fe14f Mon Sep 17 00:00:00 2001 From: derek Date: Wed, 27 Jul 2011 11:34:41 -0400 Subject: [PATCH] Implemented binary search through bins in BAI --- src/api/internal/BamStandardIndex_p.cpp | 85 ++++++++++++++----------- src/api/internal/BamStandardIndex_p.h | 4 +- 2 files changed, 51 insertions(+), 38 deletions(-) diff --git a/src/api/internal/BamStandardIndex_p.cpp b/src/api/internal/BamStandardIndex_p.cpp index 6cb7b0a..8993d43 100644 --- a/src/api/internal/BamStandardIndex_p.cpp +++ b/src/api/internal/BamStandardIndex_p.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 16 June 2011 (DB) +// Last modified: 24 June 2011 (DB) // --------------------------------------------------------------------------- // Provides index operations for the standardized BAM index format (".bai") // *************************************************************************** @@ -402,7 +402,7 @@ const string BamStandardIndex::Extension(void) { return BamStandardIndex::BAI_EXTENSION; } -bool BamStandardIndex::GetOffsets(const BamRegion& region, vector& offsets) { +bool BamStandardIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) { // cannot calculate offsets if unknown/invalid reference ID requested if ( region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size() ) @@ -429,14 +429,51 @@ bool BamStandardIndex::GetOffsets(const BamRegion& region, vector& offs // attempt to use reference summary, minOffset, & candidateBins to calculate offsets // no data should not be error + vector offsets; if ( !CalculateCandidateOffsets(refSummary, minOffset, candidateBins, offsets) ) { cerr << "BamStandardIndex ERROR: could not calculate candidate offsets for requested region" << endl; return false; } - // ensure that offsets are sorted before returning + // ensure that offsets are sorted before processing sort( offsets.begin(), offsets.end() ); + // binary search for an overlapping block (may not be first one though) + BamAlignment al; + typedef vector::const_iterator OffsetConstIterator; + OffsetConstIterator offsetFirst = offsets.begin(); + OffsetConstIterator offsetIter = offsetFirst; + OffsetConstIterator offsetLast = offsets.end(); + iterator_traits::difference_type count = distance(offsetFirst, offsetLast); + iterator_traits::difference_type step; + while ( count > 0 ) { + offsetIter = offsetFirst; + step = count/2; + advance(offsetIter, step); + + // attempt seek to candidate offset + const int64_t& candidateOffset = (*offsetIter); + if ( !m_reader->Seek(candidateOffset) ) { + cerr << "BamStandardIndex ERROR: could not jump" + << ", there was a problem seeking in BAM file" << endl; + return false; + } + + // load first available alignment, setting flag to true if data exists + *hasAlignmentsInRegion = m_reader->LoadNextAlignment(al); + + // check alignment against region + if ( al.GetEndPosition() < region.LeftPosition ) { + offsetFirst = ++offsetIter; + count -= step+1; + } else count = step; + } + + // seek back to the offset before the 'current offset' (to cover overlaps) + if ( offsetIter != offsets.begin() ) + --offsetIter; + offset = (*offsetIter); + // return succes return true; } @@ -466,44 +503,20 @@ bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion if ( m_reader == 0 || !m_reader->IsOpen() ) return false; - // calculate offsets for this region - vector offsets; - if ( !GetOffsets(region, offsets) ) { + // calculate nearest offset to jump to + int64_t offset; + if ( !GetOffset(region, offset, hasAlignmentsInRegion) ) { cerr << "BamStandardIndex ERROR: could not jump" - << ", unable to retrieve offsets for region" << endl; + << ", unable to calculate offset for specified region" << endl; return false; } - // iterate through candidate offsets - BamAlignment al; - vector::const_iterator offsetIter = offsets.begin(); - vector::const_iterator offsetEnd = offsets.end(); - for ( ; offsetIter != offsetEnd; ++offsetIter) { - - // attempt seek - if ( !m_reader->Seek(*offsetIter) ) { - cerr << "BamStandardIndex ERROR: could not jump" - << ", there was a problem seeking in BAM file" << endl; - return false; - } - - // load first available alignment, setting flag to true if data exists - *hasAlignmentsInRegion = m_reader->LoadNextAlignment(al); - - // if this alignment corresponds to desired position - // return success of seeking back to the offset before the 'current offset' (to cover overlaps) - if ( ((al.RefID == region.LeftRefID) && - ((al.Position + al.Length) > region.LeftPosition)) || - (al.RefID > region.LeftRefID) ) - { - if ( offsetIter != offsets.begin() ) - --offsetIter; - return m_reader->Seek(*offsetIter); - } - } + // if region has alignments, return success/fail of seeking there + if ( *hasAlignmentsInRegion ) + return m_reader->Seek(offset); - // return success (no offset data is not an error, - // but hasAlignments flag will be marked accordingly) + // otherwise, simply return true (but hasAlignmentsInRegion flag has been set to false) + // (this is OK, BamReader will check this flag before trying to load data) return true; } diff --git a/src/api/internal/BamStandardIndex_p.h b/src/api/internal/BamStandardIndex_p.h index 7c61a62..7123ae7 100644 --- a/src/api/internal/BamStandardIndex_p.h +++ b/src/api/internal/BamStandardIndex_p.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 5 April 2011 (DB) +// Last modified: 24 June 2011 (DB) // --------------------------------------------------------------------------- // Provides index operations for the standardized BAM index format (".bai") // *************************************************************************** @@ -161,7 +161,7 @@ class BamStandardIndex : public BamIndex { std::set& candidateBins, std::vector& offsets); uint64_t CalculateMinOffset(const BaiReferenceSummary& refSummary, const uint32_t& begin); - bool GetOffsets(const BamRegion& region, std::vector& offsets); + bool GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion); uint64_t LookupLinearOffset(const BaiReferenceSummary& refSummary, const int& index); // internal BAI summary (create/load) methods -- 2.39.2