// ***************************************************************************
// BamStandardIndex.cpp (c) 2010 Derek Barnett
// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 27 April 2011 (DB)
+// Last modified: 24 June 2011 (DB)
// ---------------------------------------------------------------------------
// Provides index operations for the standardized BAM index format (".bai")
// ***************************************************************************
// read chunk start & stop from buffer
memcpy((char*)&chunkStart, m_buffer+offset, sizeof(uint64_t));
offset += sizeof(uint64_t);
- memcpy((char*)&chunkStop, m_buffer, sizeof(uint64_t));
+ memcpy((char*)&chunkStop, m_buffer+offset, sizeof(uint64_t));
offset += sizeof(uint64_t);
// swap endian-ness if necessary
// store alignment chunk's start offset
// if its stop offset is larger than our 'minOffset'
- if ( chunkStop > minOffset )
+ if ( chunkStop >= minOffset )
offsets.push_back(chunkStart);
}
}
}
- // return success/failure on calculating at least 1 offset
- return ( !offsets.empty() );
+ // return success
+ return true;
}
uint64_t BamStandardIndex::CalculateMinOffset(const BaiReferenceSummary& refSummary,
// builds index from associated BAM file & writes out to index file
bool BamStandardIndex::Create(void) {
- cerr << "Creating BAI..." << endl;
-
// return false if BamReader is invalid or not open
if ( m_reader == 0 || !m_reader->IsOpen() ) {
cerr << "BamStandardIndex ERROR: BamReader is not open"
return false;
}
- cerr << "BAM file is open" << endl;
-
// rewind BamReader
if ( !m_reader->Rewind() ) {
cerr << "BamStandardIndex ERROR: could not rewind BamReader to create index"
return false;
}
- cerr << "BAM file is rewound" << endl;
-
// open new index file (read & write)
string indexFilename = m_reader->Filename() + Extension();
if ( !OpenFile(indexFilename, "w+b") ) {
return BamStandardIndex::BAI_EXTENSION;
}
-bool BamStandardIndex::GetOffsets(const BamRegion& region, vector<int64_t>& offsets) {
+bool BamStandardIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) {
// cannot calculate offsets if unknown/invalid reference ID requested
if ( region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size() )
// set up region boundaries based on actual BamReader data
uint32_t begin;
uint32_t end;
- if ( !AdjustRegion(region, begin, end) )
+ if ( !AdjustRegion(region, begin, end) ) {
+ cerr << "BamStandardIndex ERROR: cannot calculate offsets on invalid region" << endl;
return false;
+ }
// retrieve all candidate bin IDs for region
set<uint16_t> candidateBins;
const uint64_t& minOffset = CalculateMinOffset(refSummary, begin);
// attempt to use reference summary, minOffset, & candidateBins to calculate offsets
- if ( !CalculateCandidateOffsets(refSummary, minOffset, candidateBins, offsets) )
+ // no data should not be error
+ vector<int64_t> offsets;
+ if ( !CalculateCandidateOffsets(refSummary, minOffset, candidateBins, offsets) ) {
+ cerr << "BamStandardIndex ERROR: could not calculate candidate offsets for requested region" << endl;
return false;
+ }
- // ensure that offsets are sorted before returning
+ // ensure that offsets are sorted before processing
sort( offsets.begin(), offsets.end() );
+ // binary search for an overlapping block (may not be first one though)
+ BamAlignment al;
+ typedef vector<int64_t>::const_iterator OffsetConstIterator;
+ OffsetConstIterator offsetFirst = offsets.begin();
+ OffsetConstIterator offsetIter = offsetFirst;
+ OffsetConstIterator offsetLast = offsets.end();
+ iterator_traits<OffsetConstIterator>::difference_type count = distance(offsetFirst, offsetLast);
+ iterator_traits<OffsetConstIterator>::difference_type step;
+ while ( count > 0 ) {
+ offsetIter = offsetFirst;
+ step = count/2;
+ advance(offsetIter, step);
+
+ // attempt seek to candidate offset
+ const int64_t& candidateOffset = (*offsetIter);
+ if ( !m_reader->Seek(candidateOffset) ) {
+ cerr << "BamStandardIndex ERROR: could not jump"
+ << ", there was a problem seeking in BAM file" << endl;
+ return false;
+ }
+
+ // load first available alignment, setting flag to true if data exists
+ *hasAlignmentsInRegion = m_reader->LoadNextAlignment(al);
+
+ // check alignment against region
+ if ( al.GetEndPosition() < region.LeftPosition ) {
+ offsetFirst = ++offsetIter;
+ count -= step+1;
+ } else count = step;
+ }
+
+ // seek back to the offset before the 'current offset' (to cover overlaps)
+ if ( offsetIter != offsets.begin() )
+ --offsetIter;
+ offset = (*offsetIter);
+
// return succes
return true;
}
// available after the jump position
bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) {
+ // clear out flag
+ *hasAlignmentsInRegion = false;
+
// skip if reader is not valid or is not open
if ( m_reader == 0 || !m_reader->IsOpen() )
return false;
- // calculate offsets for this region
- // if failed, print message, set flag, and return failure
- vector<int64_t> offsets;
- if ( !GetOffsets(region, offsets) ) {
+ // calculate nearest offset to jump to
+ int64_t offset;
+ if ( !GetOffset(region, offset, hasAlignmentsInRegion) ) {
cerr << "BamStandardIndex ERROR: could not jump"
- << ", unable to retrieve offsets for region" << endl;
- *hasAlignmentsInRegion = false;
+ << ", unable to calculate offset for specified region" << endl;
return false;
}
- // if no offsets retrieved, set flag
- if ( offsets.empty() )
- *hasAlignmentsInRegion = false;
-
- // iterate through candidate offsets
- BamAlignment al;
- bool result = true;
- vector<int64_t>::const_iterator offsetIter = offsets.begin();
- vector<int64_t>::const_iterator offsetEnd = offsets.end();
- for ( ; offsetIter != offsetEnd; ++offsetIter) {
-
- // attempt seek & load first available alignment
- // set flag to true if data exists
- result &= m_reader->Seek(*offsetIter);
- *hasAlignmentsInRegion = m_reader->LoadNextAlignment(al);
-
- // if this alignment corresponds to desired position
- // return success of seeking back to the offset before the 'current offset' (to cover overlaps)
- if ( ((al.RefID == region.LeftRefID) &&
- ((al.Position + al.Length) > region.LeftPosition)) ||
- (al.RefID > region.LeftRefID) )
- {
- if ( offsetIter != offsets.begin() )
- --offsetIter;
- return m_reader->Seek(*offsetIter);
- }
- }
-
- // if error in jumping, print message & set flag
- if ( !result ) {
- cerr << "BamStandardIndex ERROR: could not jump"
- << ", there was a problem seeking in BAM file" << endl;
- *hasAlignmentsInRegion = false;
- }
+ // if region has alignments, return success/fail of seeking there
+ if ( *hasAlignmentsInRegion )
+ return m_reader->Seek(offset);
- // return success/failure
- return result;
+ // otherwise, simply return true (but hasAlignmentsInRegion flag has been set to false)
+ // (this is OK, BamReader will check this flag before trying to load data)
+ return true;
}
// loads existing data from file into memory
// create new alignment chunk
BaiAlignmentChunk newChunk(currentOffset, lastOffset);
+
+
// if no entry exists yet for this bin, create one and store alignment chunk
BaiBinMap::iterator binIter = binMap.find(currentBin);
if ( binIter == binMap.end() ) {
}
bool BamStandardIndex::SummarizeReference(BaiReferenceSummary& refSummary) {
+
bool loadedOk = true;
loadedOk &= SummarizeBins(refSummary);
loadedOk &= SummarizeLinearOffsets(refSummary);