From 7f701a076169ceecca08b5d5d9bc8b5deebc8350 Mon Sep 17 00:00:00 2001 From: Derek Date: Thu, 17 Jun 2010 17:35:27 -0400 Subject: [PATCH] Modified Jump() scheme to take better account of specified region and drill down closer to region beginning. Introduced RegionState to BRP in order to allow LoadNextAlignment to quit once an alignment is found beyond region. --- BamReader.cpp | 84 +++++++++++++++++++++++++++++++++++----------- bamtools_count.cpp | 20 ++++++++--- 2 files changed, 80 insertions(+), 24 deletions(-) diff --git a/BamReader.cpp b/BamReader.cpp index 2754e76..41e3dc2 100644 --- a/BamReader.cpp +++ b/BamReader.cpp @@ -16,6 +16,7 @@ #include #include #include +#include // BamTools includes #include "BGZF.h" @@ -25,6 +26,14 @@ using namespace std; struct BamReader::BamReaderPrivate { + // ------------------------------- + // structs, enums, typedefs + // ------------------------------- + enum RegionState { BEFORE_REGION = 0 + , WITHIN_REGION + , AFTER_REGION + }; + // ------------------------------- // data members // ------------------------------- @@ -93,9 +102,9 @@ struct BamReader::BamReaderPrivate { // fills out character data for BamAlignment data bool BuildCharData(BamAlignment& bAlignment); // calculate file offset for first alignment chunk overlapping specified region - int64_t GetOffset(void); + int64_t GetOffset(std::vector& chunkStarts); // checks to see if alignment overlaps current region - bool IsOverlap(BamAlignment& bAlignment); + RegionState IsOverlap(BamAlignment& bAlignment); // retrieves header text from BAM file void LoadHeaderData(void); // retrieves BAM alignment under file pointer @@ -541,10 +550,20 @@ bool BamReader::BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) // if region not specified, return success if ( !IsLeftBoundSpecified ) return true; - // load next alignment until region overlap is found - while ( !IsOverlap(bAlignment) ) { + // determine region state (before, within, after) + BamReader::BamReaderPrivate::RegionState state = IsOverlap(bAlignment); + + // if alignment lies after region, return false + if ( state == AFTER_REGION ) + return false; + + while ( state != WITHIN_REGION ) { // if no valid alignment available (likely EOF) return failure if ( !LoadNextAlignment(bAlignment) ) return false; + // if alignment lies after region, return false (no available read within region) + state = IsOverlap(bAlignment); + if ( state == AFTER_REGION) return false; + } // return success (alignment found that overlaps region) @@ -557,7 +576,7 @@ bool BamReader::BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) } // calculate file offset for first alignment chunk overlapping specified region -int64_t BamReader::BamReaderPrivate::GetOffset(void) { +int64_t BamReader::BamReaderPrivate::GetOffset(std::vector& chunkStarts) { // calculate which bins overlap this region uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2); @@ -572,13 +591,12 @@ int64_t BamReader::BamReaderPrivate::GetOffset(void) { uint64_t minOffset = ( (unsigned int)(Region.LeftPosition>>BAM_LIDX_SHIFT) >= offsets.size() ) ? 0 : offsets.at(Region.LeftPosition>>BAM_LIDX_SHIFT); // store offsets to beginning of alignment 'chunks' - std::vector chunkStarts; + //std::vector chunkStarts; // store all alignment 'chunk' starts for bins in this region for (int i = 0; i < numBins; ++i ) { const uint16_t binKey = bins[i]; - map::const_iterator binIter = binMap.find(binKey); if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) { @@ -665,9 +683,9 @@ void BamReader::BamReaderPrivate::InsertLinearOffset(LinearOffsetVector& offsets } } -// returns whether alignment overlaps currently specified region +// returns region state - whether alignment ends before, overlaps, or starts after currently specified region // this *internal* method should ONLY called when (at least) IsLeftBoundSpecified == true -bool BamReader::BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) { +BamReader::BamReaderPrivate::RegionState BamReader::BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) { // -------------------------------------------------- // check alignment start against right bound cutoff @@ -677,11 +695,11 @@ bool BamReader::BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) { // read starts on right bound reference, but AFTER right bound position if ( bAlignment.RefID == Region.RightRefID && bAlignment.Position > Region.RightPosition ) - return false; + return AFTER_REGION; // if read starts on reference AFTER right bound, return false if ( bAlignment.RefID > Region.RightRefID ) - return false; + return AFTER_REGION; } // -------------------------------------------------------- @@ -690,15 +708,21 @@ bool BamReader::BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) { // if read starts on left bound reference AND after left boundary, return success if ( bAlignment.RefID == Region.LeftRefID && bAlignment.Position >= Region.LeftPosition) - return true; + return WITHIN_REGION; // if read is on any reference sequence before left bound, return false if ( bAlignment.RefID < Region.LeftRefID ) - return false; + return BEFORE_REGION; + // -------------------------------------------------------- // read is on left bound reference, but starts before left bound position - // calculate the alignment end to see if it overlaps - return ( bAlignment.GetEndPosition() >= Region.LeftPosition ); + + // if it overlaps, return WITHIN_REGION + if ( bAlignment.GetEndPosition() >= Region.LeftPosition ) + return WITHIN_REGION; + // else begins before left bound position + else + return BEFORE_REGION; } // jumps to specified region(refID, leftBound) in BAM file, returns success/fail @@ -708,14 +732,36 @@ bool BamReader::BamReaderPrivate::Jump(int refID, int position) { if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) { // calculate offset - int64_t offset = GetOffset(); + std::vector chunkStarts; + int64_t offset = GetOffset(chunkStarts); + sort(chunkStarts.begin(), chunkStarts.end()); // if in valid offset, return failure // otherwise return success of seek operation - if ( offset == -1 ) + if ( offset == -1 ) { return false; - else - return mBGZF.Seek(offset); + } else { + //return mBGZF.Seek(offset); + BamAlignment bAlignment; + bool result = true; + for (std::vector::const_iterator o = chunkStarts.begin(); o != chunkStarts.end(); ++o) { + // std::cerr << *o << endl; + // std::cerr << "Seeking to offset: " << *o << endl; + result &= mBGZF.Seek(*o); + LoadNextAlignment(bAlignment); + // std::cerr << "alignment: " << bAlignment.RefID + // << ":" << bAlignment.Position << ".." << bAlignment.Position + bAlignment.Length << endl; + if ((bAlignment.RefID == refID && bAlignment.Position + bAlignment.Length > position) || bAlignment.RefID > refID) { + // std::cerr << "here i am" << endl; + // std::cerr << "seeking to offset: " << (*(o-1)) << endl; + // std::cerr << "*** Finished jumping ***" << endl; + return mBGZF.Seek(*o); + } + } + + //std::cerr << "*** Finished jumping ***" << endl; + return result; + } } // invalid jump request parameters, return failure diff --git a/bamtools_count.cpp b/bamtools_count.cpp index 67d1e9b..a0dadf3 100644 --- a/bamtools_count.cpp +++ b/bamtools_count.cpp @@ -115,10 +115,16 @@ int CountTool::Run(int argc, char* argv[]) { reader.Open(m_settings->InputBamFilename, m_settings->BamIndexFilename); // attempt Jump(), catch error - if ( !reader.Jump(region.StartChromID, region.StartPosition) ) { - foundError = true; - errorStream << "Could not jump to desired REGION: " << m_settings->Region << endl; + // if ( !reader.Jump(region.StartChromID, region.StartPosition) ) { + // foundError = true; + // errorStream << "Could not jump to desired REGION: " << m_settings->Region << endl; + // } + // + if ( !reader.SetRegion(region.StartChromID, region.StartPosition, region.StopChromID, region.StopPosition) ) { + foundError = true; + errorStream << "Could not set BamReader region to REGION: " << m_settings->Region << endl; } + } else { @@ -145,8 +151,12 @@ int CountTool::Run(int argc, char* argv[]) { // ( either current ref is before stopRefID OR // current ref stopRefID but we're before stopPos ) BamAlignment al; - while ( reader.GetNextAlignment(al) && ((al.RefID < region.StopChromID ) || (al.RefID == region.StopChromID && al.Position <= region.StopPosition)) ) + //while ( reader.GetNextAlignment(al) && ((al.RefID < region.StopChromID ) || (al.RefID == region.StopChromID && al.Position <= region.StopPosition)) ) + // ++alignmentCount; + // + while ( reader.GetNextAlignment(al) ) ++alignmentCount; + } } else { @@ -165,4 +175,4 @@ int CountTool::Run(int argc, char* argv[]) { // clean & exit reader.Close(); return (int)foundError; -} \ No newline at end of file +} -- 2.39.5