// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 5 April 2011 (DB)
+// Last modified: 27 April 2011 (DB)
// ---------------------------------------------------------------------------
// Provides index operations for the standardized BAM index format (".bai")
// ***************************************************************************
// builds index from associated BAM file & writes out to index file
bool BamStandardIndex::Create(void) {
+ cerr << "Creating BAI..." << endl;
+
// return false if BamReader is invalid or not open
if ( m_reader == 0 || !m_reader->IsOpen() ) {
cerr << "BamStandardIndex ERROR: BamReader is not open"
return false;
}
+ cerr << "BAM file is open" << endl;
+
// rewind BamReader
if ( !m_reader->Rewind() ) {
cerr << "BamStandardIndex ERROR: could not rewind BamReader to create index"
return false;
}
+ cerr << "BAM file is rewound" << endl;
+
// open new index file (read & write)
string indexFilename = m_reader->Filename() + Extension();
if ( !OpenFile(indexFilename, "w+b") ) {
createdOk &= WriteReferenceEntry(refEntry);
ClearReferenceEntry(refEntry);
+ // write any empty references between (but *NOT* including) lastRefID & al.RefID
+ for ( int i = lastRefID+1; i < al.RefID; ++i ) {
+ BaiReferenceEntry emptyEntry(i);
+ createdOk &= WriteReferenceEntry(emptyEntry);
+ }
+
// update bin markers
currentOffset = lastOffset;
currentBin = al.Bin;
currentRefID = al.RefID;
}
+ // first pass
+ // write any empty references up to (but *NOT* including) al.RefID
+ else {
+ for ( int i = 0; i < al.RefID; ++i ) {
+ BaiReferenceEntry emptyEntry(i);
+ createdOk &= WriteReferenceEntry(emptyEntry);
+ }
+ }
+
// update reference markers
refEntry.ID = al.RefID;
lastRefID = al.RefID;
lastPosition = al.Position;
}
- // store last alignment chunk to its bin, then write last reference entry
+ // after finishing alignments, if any data was read, check:
if ( currentRefID >= 0 ) {
+
+ // store last alignment chunk to its bin, then write last reference entry with data
SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset);
createdOk &= WriteReferenceEntry(refEntry);
+
+ // then write any empty references remaining at end of file
+ for ( int i = currentRefID+1; i < numReferences; ++i ) {
+ BaiReferenceEntry emptyEntry(i);
+ createdOk &= WriteReferenceEntry(emptyEntry);
+ }
}
// rewind reader now that we're done building
// if invalid format 'magic number', close & return failure
if ( !CheckMagicNumber() ) {
+ cerr << "BamStandardIndex ERROR: unexpected format for index file: " << filename
+ << ", aborting index load" << endl;
CloseFile();
return false;
}
// attempt to load index file summary, return success/failure
- return SummarizeIndexFile();
+ if ( !SummarizeIndexFile() ) {
+ cerr << "BamStandardIndex ERROR: could not generate a summary of index file " << filename
+ << ", aborting index load" << endl;
+ CloseFile();
+ return false;
+ }
+
+ // if we get here, index summary is loaded OK
+ return true;
}
uint64_t BamStandardIndex::LookupLinearOffset(const BaiReferenceSummary& refSummary, const int& index) {
refSummary.FirstBinFilePosition = Tell();
// attempt skip reference bins, return success/failure
- return SkipBins(numBins);
+ if ( !SkipBins(numBins) )
+ return false;
+
+ // if we get here, bin summarized OK
+ return true;
}
bool BamStandardIndex::SummarizeIndexFile(void) {
refSummary.FirstLinearOffsetFilePosition = Tell();
// skip linear offsets in index file
- return SkipLinearOffsets(numLinearOffsets);
+ if ( !SkipLinearOffsets(numLinearOffsets) )
+ return false;
+
+ // if get here, linear offsets summarized OK
+ return true;
}
bool BamStandardIndex::SummarizeReference(BaiReferenceSummary& refSummary) {
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 5 April 2011 (DB)
+// Last modified: 27 April 2011 (DB)
// ---------------------------------------------------------------------------
// Provides index operations for the BamTools index format (".bti")
// ***************************************************************************
#include <cstring>
#include <algorithm>
#include <iostream>
+#include <iterator>
#include <map>
using namespace std;
// open new index file (read & write)
string indexFilename = m_reader->Filename() + Extension();
if ( !OpenFile(indexFilename, "w+b") ) {
- cerr << "BamStandardIndex ERROR: could not open ouput index file " << indexFilename
+ cerr << "BamToolsIndex ERROR: could not open ouput index file " << indexFilename
<< ", aborting index creation" << endl;
return false;
}
// index building markers
int32_t currentBlockCount = 0;
int64_t currentAlignmentOffset = m_reader->Tell();
- int32_t blockRefId = 0;
- int32_t blockMaxEndPosition = 0;
+ int32_t blockRefId = -1;
+ int32_t blockMaxEndPosition = -1;
int64_t blockStartOffset = currentAlignmentOffset;
int32_t blockStartPosition = -1;
BtiReferenceEntry refEntry;
while ( m_reader->LoadNextAlignment(al) ) {
- // if block contains data (not the first time through) AND alignment is on a new reference
- if ( currentBlockCount > 0 && al.RefID != blockRefId ) {
+ // if moved to new reference
+ if ( al.RefID != blockRefId ) {
- // store previous BTI block data in reference entry
- BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
- refEntry.Blocks.push_back(block);
+ // if first pass, check:
+ if ( currentBlockCount == 0 ) {
- // write reference entry, then clear
- createdOk &= WriteReferenceEntry(refEntry);
- ClearReferenceEntry(refEntry);
+ // write any empty references up to (but not including) al.RefID
+ for ( int i = 0; i < al.RefID; ++i ) {
+ BtiReferenceEntry emptyEntry(i);
+ createdOk &= WriteReferenceEntry(emptyEntry);
+ }
+ }
- // update markers
- currentBlockCount = 0;
- blockMaxEndPosition = al.GetEndPosition();
- blockStartOffset = currentAlignmentOffset;
+ // not first pass:
+ else {
+
+ // store previous BTI block data in reference entry
+ BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
+ refEntry.Blocks.push_back(block);
+
+ // write reference entry, then clear
+ createdOk &= WriteReferenceEntry(refEntry);
+ ClearReferenceEntry(refEntry);
+
+ // write any empty references between (but not including) the last blockRefID and current al.RefID
+ for ( int i = blockRefId+1; i < al.RefID; ++i ) {
+ BtiReferenceEntry emptyEntry(i);
+ createdOk &= WriteReferenceEntry(emptyEntry);
+ }
+
+ // reset block count
+ currentBlockCount = 0;
+ }
+
+ // set ID for new reference entry
+ refEntry.ID = al.RefID;
}
- // if beginning of block, save first alignment's refID & position
+ // if beginning of block, update counters
if ( currentBlockCount == 0 ) {
- blockRefId = al.RefID;
- blockStartPosition = al.Position;
+ blockRefId = al.RefID;
+ blockStartOffset = currentAlignmentOffset;
+ blockStartPosition = al.Position;
+ blockMaxEndPosition = al.GetEndPosition();
}
// increment block counter
currentAlignmentOffset = m_reader->Tell();
}
- // store last BTI block data in reference entry
- BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
- refEntry.Blocks.push_back(block);
+ // after finishing alignments, if any data was read, check:
+ if ( blockRefId >= 0 ) {
- // write last reference entry, then clear
- createdOk &= WriteReferenceEntry(refEntry);
- ClearReferenceEntry(refEntry);
+ // store last BTI block data in reference entry
+ BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
+ refEntry.Blocks.push_back(block);
+
+ // write last reference entry, then clear
+ createdOk &= WriteReferenceEntry(refEntry);
+ ClearReferenceEntry(refEntry);
+
+ // then write any empty references remaining at end of file
+ for ( int i = blockRefId+1; i < numReferences; ++i ) {
+ BtiReferenceEntry emptyEntry(i);
+ createdOk &= WriteReferenceEntry(emptyEntry);
+ }
+ }
// rewind reader & return result
createdOk &= m_reader->Rewind();
return false;
}
- // TODO: can this next loop be sped up?
- // Binary search for overlap instead of linear search perhaps.
+ // binary search for an overlapping block (may not be first one though)
+ bool found = false;
+ typedef BtiBlockVector::const_iterator BtiBlockConstIterator;
+ BtiBlockConstIterator blockFirst = refEntry.Blocks.begin();
+ BtiBlockConstIterator blockIter = blockFirst;
+ BtiBlockConstIterator blockLast = refEntry.Blocks.end();
+ iterator_traits<BtiBlockConstIterator>::difference_type count = distance(blockFirst, blockLast);
+ iterator_traits<BtiBlockConstIterator>::difference_type step;
+ while ( count > 0 ) {
+ blockIter = blockFirst;
+ step = count/2;
+ advance(blockIter, step);
- // iterate over blocks on this reference
- BtiBlockVector::const_iterator blockIter = refEntry.Blocks.begin();
- BtiBlockVector::const_iterator blockEnd = refEntry.Blocks.end();
- for ( ; blockIter != blockEnd; ++blockIter ) {
const BtiBlock& block = (*blockIter);
+ if ( block.StartPosition <= region.RightPosition ) {
+ if ( block.MaxEndPosition >= region.LeftPosition ) {
+ offset = block.StartOffset;
+ break;
+ }
+ blockFirst = ++blockIter;
+ count -= step+1;
+ }
+ else count = step;
+ }
+
+ // if we didn't search "off the end" of the blocks
+ if ( blockIter != blockLast ) {
+
+ // "walk back" until we've gone too far
+ while ( blockIter != blockFirst ) {
+ const BtiBlock& currentBlock = (*blockIter);
- // store offset & break if block end overlaps region start
- if ( block.MaxEndPosition >= region.LeftPosition ) {
+ --blockIter;
+ const BtiBlock& previousBlock = (*blockIter);
+ if ( previousBlock.MaxEndPosition < region.LeftPosition ) {
+ offset = currentBlock.StartOffset;
+ found = true;
+ break;
+ }
+ }
+
+ // if we walked all the way to first block, just return that and let the reader's
+ // region overlap parsing do the rest
+ if ( blockIter == blockFirst ) {
+ const BtiBlock& block = (*blockIter);
offset = block.StartOffset;
- break;
+ found = true;
}
}
+
// sets to false if blocks container is empty, or if no matching block could be found
- *hasAlignmentsInRegion = ( blockIter != blockEnd );
+ *hasAlignmentsInRegion = found;
// return success
return true;
// attempt open index file (read-only)
if ( !OpenFile(filename, "rb") ) {
- cerr << "BamStandardIndex ERROR: could not open input index file " << filename
+ cerr << "BamToolsIndex ERROR: could not open input index file " << filename
<< ", aborting index load" << endl;
return false;
}
// attempt to load & validate BTI header data
if ( !LoadHeader() ) {
+ cerr << "BamToolsIndex ERROR: could load header from index file " << filename
+ << ", aborting index load" << endl;
CloseFile();
return false;
}
- // attempt to load index file summary, return success/failure
- return LoadFileSummary();
+ // attempt to load index file summary
+ if ( !LoadFileSummary() ) {
+ cerr << "BamToolsIndex ERROR: could not generate a summary of index file " << filename
+ << ", aborting index load" << endl;
+ CloseFile();
+ return false;
+ }
+
+ // if we get here, index summary is loaded OK
+ return true;
}
bool BamToolsIndex::LoadFileSummary(void) {
bool loadedOk = true;
BtiFileSummary::iterator summaryIter = m_indexFileSummary.begin();
BtiFileSummary::iterator summaryEnd = m_indexFileSummary.end();
- for ( ; summaryIter != summaryEnd; ++summaryIter )
+ for ( ; summaryIter != summaryEnd; ++summaryIter ) {
loadedOk &= LoadReferenceSummary(*summaryIter);
+ }
// return result
return loadedOk;
// write number of references
int32_t numReferences = m_indexFileSummary.size();
- cerr << "Writing numReferences = " << numReferences << endl;
if ( m_isBigEndian ) SwapEndian_32(numReferences);
elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, m_indexStream);