--- /dev/null
+// ***************************************************************************
+// BamToolsIndex.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// ---------------------------------------------------------------------------
+// Last modified: 25 October 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides index operations for the BamTools index format (".bti")
+// ***************************************************************************
+
+#include "api/BamAlignment.h"
+#include "api/internal/bam/BamReader_p.h"
+#include "api/internal/index/BamToolsIndex_p.h"
+#include "api/internal/io/BgzfStream_p.h"
+#include "api/internal/utils/BamException_p.h"
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <algorithm>
+#include <iostream>
+#include <iterator>
+#include <map>
+using namespace std;
+
+// --------------------------------
+// static BamToolsIndex constants
+// --------------------------------
+
+const uint32_t BamToolsIndex::DEFAULT_BLOCK_LENGTH = 1000;
+const string BamToolsIndex::BTI_EXTENSION = ".bti";
+const char* const BamToolsIndex::BTI_MAGIC = "BTI\1";
+const int BamToolsIndex::SIZEOF_BLOCK = sizeof(int32_t)*2 + sizeof(int64_t);
+
+// ----------------------------
+// RaiiWrapper implementation
+// ----------------------------
+
+BamToolsIndex::RaiiWrapper::RaiiWrapper(void)
+ : IndexStream(0)
+{ }
+
+BamToolsIndex::RaiiWrapper::~RaiiWrapper(void) {
+ if ( IndexStream )
+ fclose(IndexStream);
+}
+
+// ------------------------------
+// BamToolsIndex implementation
+// ------------------------------
+
+// ctor
+BamToolsIndex::BamToolsIndex(Internal::BamReaderPrivate* reader)
+ : BamIndex(reader)
+ , m_blockSize(BamToolsIndex::DEFAULT_BLOCK_LENGTH)
+ , m_inputVersion(0)
+ , m_outputVersion(BTI_2_0) // latest version - used for writing new index files
+{
+ m_isBigEndian = BamTools::SystemIsBigEndian();
+}
+
+// dtor
+BamToolsIndex::~BamToolsIndex(void) {
+ CloseFile();
+}
+
+void BamToolsIndex::CheckMagicNumber(void) {
+
+ // read magic number
+ char magic[4];
+ size_t elementsRead = fread(magic, sizeof(char), 4, Resources.IndexStream);
+ if ( elementsRead != 4 )
+ throw BamException("BamToolsIndex::CheckMagicNumber", "could not read BTI magic number");
+
+ // validate expected magic number
+ if ( strncmp(magic, BamToolsIndex::BTI_MAGIC, 4) != 0 )
+ throw BamException("BamToolsIndex::CheckMagicNumber", "invalid BTI magic number");
+}
+
+// check index file version, return true if OK
+void BamToolsIndex::CheckVersion(void) {
+
+ // read version from file
+ size_t elementsRead = fread(&m_inputVersion, sizeof(m_inputVersion), 1, Resources.IndexStream);
+ if ( elementsRead != 1 )
+ throw BamException("BamToolsIndex::CheckVersion", "could not read format version");
+ if ( m_isBigEndian ) SwapEndian_32(m_inputVersion);
+
+ // if version is negative, or zero
+ if ( m_inputVersion <= 0 )
+ throw BamException("BamToolsIndex::CheckVersion", "invalid format version");
+
+ // if version is newer than can be supported by this version of bamtools
+ else if ( m_inputVersion > m_outputVersion ) {
+ const string message = "unsupported format: this index was created by a newer version of BamTools. "
+ "Update your local version of BamTools to use the index file.";
+ throw BamException("BamToolsIndex::CheckVersion", message);
+ }
+
+ // ------------------------------------------------------------------
+ // check for deprecated, unsupported versions
+ // (the format had to be modified to accomodate a particular bug fix)
+
+ // Version 2.0: introduced support for half-open intervals, instead of the old closed intervals
+ // respondBy: throwing exception - we're not going to try to handle the old BTI files.
+ else if ( (Version)m_inputVersion < BamToolsIndex::BTI_2_0 ) {
+ const string message = "unsupported format: this version of the index may not properly handle "
+ "coordinate intervals. Please run 'bamtools index -bti -in yourData.bam' "
+ "to generate an up-to-date, fixed BTI file.";
+ throw BamException("BamToolsIndex::CheckVersion", message);
+ }
+}
+
+void BamToolsIndex::ClearReferenceEntry(BtiReferenceEntry& refEntry) {
+ refEntry.ID = -1;
+ refEntry.Blocks.clear();
+}
+
+void BamToolsIndex::CloseFile(void) {
+ if ( IsFileOpen() ) {
+ fclose(Resources.IndexStream);
+ Resources.IndexStream = 0;
+ }
+ m_indexFileSummary.clear();
+}
+
+// builds index from associated BAM file & writes out to index file
+bool BamToolsIndex::Create(void) {
+
+ // skip if BamReader is invalid or not open
+ if ( m_reader == 0 || !m_reader->IsOpen() ) {
+ SetErrorString("BamToolsIndex::Create", "could not create index: reader is not open");
+ return false;
+ }
+
+ // rewind BamReader
+ if ( !m_reader->Rewind() ) {
+ const string readerError = m_reader->GetErrorString();
+ const string message = "could not create index: \n\t" + readerError;
+ SetErrorString("BamToolsIndex::Create", message);
+ return false;
+ }
+
+ try {
+ // open new index file (read & write)
+ const string indexFilename = m_reader->Filename() + Extension();
+ OpenFile(indexFilename, "w+b");
+
+ // initialize BtiFileSummary with number of references
+ const int& numReferences = m_reader->GetReferenceCount();
+ InitializeFileSummary(numReferences);
+
+ // intialize output file header
+ WriteHeader();
+
+ // index building markers
+ uint32_t currentBlockCount = 0;
+ int64_t currentAlignmentOffset = m_reader->Tell();
+ int32_t blockRefId = -1;
+ int32_t blockMaxEndPosition = -1;
+ int64_t blockStartOffset = currentAlignmentOffset;
+ int32_t blockStartPosition = -1;
+
+ // plow through alignments, storing index entries
+ BamAlignment al;
+ BtiReferenceEntry refEntry;
+ while ( m_reader->LoadNextAlignment(al) ) {
+
+ // if moved to new reference
+ if ( al.RefID != blockRefId ) {
+
+ // if first pass, check:
+ if ( currentBlockCount == 0 ) {
+
+ // write any empty references up to (but not including) al.RefID
+ for ( int i = 0; i < al.RefID; ++i )
+ WriteReferenceEntry( BtiReferenceEntry(i) );
+ }
+
+ // not first pass:
+ else {
+
+ // store previous BTI block data in reference entry
+ const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
+ refEntry.Blocks.push_back(block);
+
+ // write reference entry, then clear
+ WriteReferenceEntry(refEntry);
+ ClearReferenceEntry(refEntry);
+
+ // write any empty references between (but not including)
+ // the last blockRefID and current al.RefID
+ for ( int i = blockRefId+1; i < al.RefID; ++i )
+ WriteReferenceEntry( BtiReferenceEntry(i) );
+
+ // reset block count
+ currentBlockCount = 0;
+ }
+
+ // set ID for new reference entry
+ refEntry.ID = al.RefID;
+ }
+
+ // if beginning of block, update counters
+ if ( currentBlockCount == 0 ) {
+ blockRefId = al.RefID;
+ blockStartOffset = currentAlignmentOffset;
+ blockStartPosition = al.Position;
+ blockMaxEndPosition = al.GetEndPosition();
+ }
+
+ // increment block counter
+ ++currentBlockCount;
+
+ // check end position
+ const int32_t alignmentEndPosition = al.GetEndPosition();
+ if ( alignmentEndPosition > blockMaxEndPosition )
+ blockMaxEndPosition = alignmentEndPosition;
+
+ // if block is full, get offset for next block, reset currentBlockCount
+ if ( currentBlockCount == m_blockSize ) {
+
+ // store previous block data in reference entry
+ const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
+ refEntry.Blocks.push_back(block);
+
+ // update markers
+ blockStartOffset = m_reader->Tell();
+ currentBlockCount = 0;
+ }
+
+ // not the best name, but for the next iteration, this value will be the offset of the
+ // *current* alignment. this is necessary because we won't know if this next alignment
+ // is on a new reference until we actually read it
+ currentAlignmentOffset = m_reader->Tell();
+ }
+
+ // after finishing alignments, if any data was read, check:
+ if ( blockRefId >= 0 ) {
+
+ // store last BTI block data in reference entry
+ const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
+ refEntry.Blocks.push_back(block);
+
+ // write last reference entry, then clear
+ WriteReferenceEntry(refEntry);
+ ClearReferenceEntry(refEntry);
+
+ // then write any empty references remaining at end of file
+ for ( int i = blockRefId+1; i < numReferences; ++i )
+ WriteReferenceEntry( BtiReferenceEntry(i) );
+ }
+
+ } catch ( BamException& e ) {
+ m_errorString = e.what();
+ return false;
+ }
+
+ // rewind BamReader
+ if ( !m_reader->Rewind() ) {
+ const string readerError = m_reader->GetErrorString();
+ const string message = "could not create index: \n\t" + readerError;
+ SetErrorString("BamToolsIndex::Create", message);
+ return false;
+ }
+
+ // return success
+ return true;
+}
+
+// returns format's file extension
+const std::string BamToolsIndex::Extension(void) {
+ return BamToolsIndex::BTI_EXTENSION;
+}
+
+void BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) {
+
+ // return false ref ID is not a valid index in file summary data
+ if ( region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size() )
+ throw BamException("BamToolsIndex::GetOffset", "invalid region requested");
+
+ // retrieve reference index data for left bound reference
+ BtiReferenceEntry refEntry(region.LeftRefID);
+ ReadReferenceEntry(refEntry);
+
+ // binary search for an overlapping block (may not be first one though)
+ bool found = false;
+ typedef BtiBlockVector::const_iterator BtiBlockConstIterator;
+ BtiBlockConstIterator blockFirst = refEntry.Blocks.begin();
+ BtiBlockConstIterator blockIter = blockFirst;
+ BtiBlockConstIterator blockLast = refEntry.Blocks.end();
+ iterator_traits<BtiBlockConstIterator>::difference_type count = distance(blockFirst, blockLast);
+ iterator_traits<BtiBlockConstIterator>::difference_type step;
+ while ( count > 0 ) {
+ blockIter = blockFirst;
+ step = count/2;
+ advance(blockIter, step);
+
+ const BtiBlock& block = (*blockIter);
+ if ( block.StartPosition <= region.RightPosition ) {
+ if ( block.MaxEndPosition > region.LeftPosition ) {
+ offset = block.StartOffset;
+ break;
+ }
+ blockFirst = ++blockIter;
+ count -= step+1;
+ }
+ else count = step;
+ }
+
+ // if we didn't search "off the end" of the blocks
+ if ( blockIter != blockLast ) {
+
+ // "walk back" until we've gone too far
+ while ( blockIter != blockFirst ) {
+ const BtiBlock& currentBlock = (*blockIter);
+
+ --blockIter;
+ const BtiBlock& previousBlock = (*blockIter);
+ if ( previousBlock.MaxEndPosition <= region.LeftPosition ) {
+ offset = currentBlock.StartOffset;
+ found = true;
+ break;
+ }
+ }
+
+ // if we walked all the way to first block, just return that and let the reader's
+ // region overlap parsing do the rest
+ if ( blockIter == blockFirst ) {
+ const BtiBlock& block = (*blockIter);
+ offset = block.StartOffset;
+ found = true;
+ }
+ }
+
+
+ // sets to false if blocks container is empty, or if no matching block could be found
+ *hasAlignmentsInRegion = found;
+}
+
+// returns whether reference has alignments or no
+bool BamToolsIndex::HasAlignments(const int& referenceID) const {
+ if ( referenceID < 0 || referenceID >= (int)m_indexFileSummary.size() )
+ return false;
+ const BtiReferenceSummary& refSummary = m_indexFileSummary.at(referenceID);
+ return ( refSummary.NumBlocks > 0 );
+}
+
+// pre-allocates space for each reference's summary data
+void BamToolsIndex::InitializeFileSummary(const int& numReferences) {
+ m_indexFileSummary.clear();
+ for ( int i = 0; i < numReferences; ++i )
+ m_indexFileSummary.push_back( BtiReferenceSummary() );
+}
+
+// returns true if the index stream is open
+bool BamToolsIndex::IsFileOpen(void) const {
+ return ( Resources.IndexStream != 0 );
+}
+
+// attempts to use index data to jump to @region, returns success/fail
+// a "successful" jump indicates no error, but not whether this region has data
+// * thus, the method sets a flag to indicate whether there are alignments
+// available after the jump position
+bool BamToolsIndex::Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion) {
+
+ // clear flag
+ *hasAlignmentsInRegion = false;
+
+ // skip if invalid reader or not open
+ if ( m_reader == 0 || !m_reader->IsOpen() ) {
+ SetErrorString("BamToolsIndex::Jump", "could not jump: reader is not open");
+ return false;
+ }
+
+ // make sure left-bound position is valid
+ const RefVector& references = m_reader->GetReferenceData();
+ if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) {
+ SetErrorString("BamToolsIndex::Jump", "could not create index: invalid region requested");
+ return false;
+ }
+
+ // calculate nearest offset to jump to
+ int64_t offset;
+ try {
+ GetOffset(region, offset, hasAlignmentsInRegion);
+ } catch ( BamException& e ) {
+ m_errorString = e.what();
+ return false;
+ }
+
+ // return success/failure of seek
+ return m_reader->Seek(offset);
+}
+
+// loads existing data from file into memory
+bool BamToolsIndex::Load(const std::string& filename) {
+
+ try {
+
+ // attempt to open file (read-only)
+ OpenFile(filename, "rb");
+
+ // load metadata & generate in-memory summary
+ LoadHeader();
+ LoadFileSummary();
+
+ // return success
+ return true;
+
+ } catch ( BamException& e ) {
+ m_errorString = e.what();
+ return false;
+ }
+}
+
+void BamToolsIndex::LoadFileSummary(void) {
+
+ // load number of reference sequences
+ int numReferences;
+ LoadNumReferences(numReferences);
+
+ // initialize file summary data
+ InitializeFileSummary(numReferences);
+
+ // load summary for each reference
+ BtiFileSummary::iterator summaryIter = m_indexFileSummary.begin();
+ BtiFileSummary::iterator summaryEnd = m_indexFileSummary.end();
+ for ( ; summaryIter != summaryEnd; ++summaryIter )
+ LoadReferenceSummary(*summaryIter);
+}
+
+void BamToolsIndex::LoadHeader(void) {
+
+ // check BTI file metadata
+ CheckMagicNumber();
+ CheckVersion();
+
+ // use file's BTI block size to set member variable
+ const size_t elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, Resources.IndexStream);
+ if ( m_isBigEndian ) SwapEndian_32(m_blockSize);
+ if ( elementsRead != 1 )
+ throw BamException("BamToolsIndex::LoadHeader", "could not read BTI block size");
+}
+
+void BamToolsIndex::LoadNumBlocks(int& numBlocks) {
+ const size_t elementsRead = fread(&numBlocks, sizeof(numBlocks), 1, Resources.IndexStream);
+ if ( m_isBigEndian ) SwapEndian_32(numBlocks);
+ if ( elementsRead != 1 )
+ throw BamException("BamToolsIndex::LoadNumBlocks", "could not read number of BTI blocks");
+}
+
+void BamToolsIndex::LoadNumReferences(int& numReferences) {
+ const size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, Resources.IndexStream);
+ if ( m_isBigEndian ) SwapEndian_32(numReferences);
+ if ( elementsRead != 1 )
+ throw BamException("BamToolsIndex::LoadNumReferences", "could not read number of references");
+}
+
+void BamToolsIndex::LoadReferenceSummary(BtiReferenceSummary& refSummary) {
+
+ // load number of blocks
+ int numBlocks;
+ LoadNumBlocks(numBlocks);
+
+ // store block summary data for this reference
+ refSummary.NumBlocks = numBlocks;
+ refSummary.FirstBlockFilePosition = Tell();
+
+ // skip reference's blocks
+ SkipBlocks(numBlocks);
+}
+
+void BamToolsIndex::OpenFile(const std::string& filename, const char* mode) {
+
+ // make sure any previous index file is closed
+ CloseFile();
+
+ // attempt to open file
+ Resources.IndexStream = fopen(filename.c_str(), mode);
+ if ( !IsFileOpen() ) {
+ const string message = string("could not open file: ") + filename;
+ throw BamException("BamToolsIndex::OpenFile", message);
+ }
+}
+
+void BamToolsIndex::ReadBlock(BtiBlock& block) {
+
+ // read in block data members
+ size_t elementsRead = 0;
+ elementsRead += fread(&block.MaxEndPosition, sizeof(block.MaxEndPosition), 1, Resources.IndexStream);
+ elementsRead += fread(&block.StartOffset, sizeof(block.StartOffset), 1, Resources.IndexStream);
+ elementsRead += fread(&block.StartPosition, sizeof(block.StartPosition), 1, Resources.IndexStream);
+
+ // swap endian-ness if necessary
+ if ( m_isBigEndian ) {
+ SwapEndian_32(block.MaxEndPosition);
+ SwapEndian_64(block.StartOffset);
+ SwapEndian_32(block.StartPosition);
+ }
+
+ if ( elementsRead != 3 )
+ throw BamException("BamToolsIndex::ReadBlock", "could not read block");
+}
+
+void BamToolsIndex::ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks) {
+
+ // prep blocks container
+ blocks.clear();
+ blocks.reserve(refSummary.NumBlocks);
+
+ // skip to first block entry
+ Seek( refSummary.FirstBlockFilePosition, SEEK_SET );
+
+ // read & store block entries
+ BtiBlock block;
+ for ( int i = 0; i < refSummary.NumBlocks; ++i ) {
+ ReadBlock(block);
+ blocks.push_back(block);
+ }
+}
+
+void BamToolsIndex::ReadReferenceEntry(BtiReferenceEntry& refEntry) {
+
+ // return false if refId not valid index in file summary structure
+ if ( refEntry.ID < 0 || refEntry.ID >= (int)m_indexFileSummary.size() )
+ throw BamException("BamToolsIndex::ReadReferenceEntry", "invalid reference requested");
+
+ // use index summary to assist reading the reference's BTI blocks
+ const BtiReferenceSummary& refSummary = m_indexFileSummary.at(refEntry.ID);
+ ReadBlocks(refSummary, refEntry.Blocks);
+}
+
+void BamToolsIndex::Seek(const int64_t& position, const int& origin) {
+ if ( fseek64(Resources.IndexStream, position, origin) != 0 )
+ throw BamException("BamToolsIndex::Seek", "could not seek in BAI file");
+}
+
+void BamToolsIndex::SkipBlocks(const int& numBlocks) {
+ Seek( numBlocks*BamToolsIndex::SIZEOF_BLOCK, SEEK_CUR );
+}
+
+int64_t BamToolsIndex::Tell(void) const {
+ return ftell64(Resources.IndexStream);
+}
+
+void BamToolsIndex::WriteBlock(const BtiBlock& block) {
+
+ // copy entry data
+ int32_t maxEndPosition = block.MaxEndPosition;
+ int64_t startOffset = block.StartOffset;
+ int32_t startPosition = block.StartPosition;
+
+ // swap endian-ness if necessary
+ if ( m_isBigEndian ) {
+ SwapEndian_32(maxEndPosition);
+ SwapEndian_64(startOffset);
+ SwapEndian_32(startPosition);
+ }
+
+ // write the reference index entry
+ size_t elementsWritten = 0;
+ elementsWritten += fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, Resources.IndexStream);
+ elementsWritten += fwrite(&startOffset, sizeof(startOffset), 1, Resources.IndexStream);
+ elementsWritten += fwrite(&startPosition, sizeof(startPosition), 1, Resources.IndexStream);
+ if ( elementsWritten != 3 )
+ throw BamException("BamToolsIndex::WriteBlock", "could not write BTI block");
+}
+
+void BamToolsIndex::WriteBlocks(const BtiBlockVector& blocks) {
+ BtiBlockVector::const_iterator blockIter = blocks.begin();
+ BtiBlockVector::const_iterator blockEnd = blocks.end();
+ for ( ; blockIter != blockEnd; ++blockIter )
+ WriteBlock(*blockIter);
+}
+
+void BamToolsIndex::WriteHeader(void) {
+
+ size_t elementsWritten = 0;
+
+ // write BTI index format 'magic number'
+ elementsWritten += fwrite(BamToolsIndex::BTI_MAGIC, 1, 4, Resources.IndexStream);
+
+ // write BTI index format version
+ int32_t currentVersion = (int32_t)m_outputVersion;
+ if ( m_isBigEndian ) SwapEndian_32(currentVersion);
+ elementsWritten += fwrite(¤tVersion, sizeof(currentVersion), 1, Resources.IndexStream);
+
+ // write block size
+ uint32_t blockSize = m_blockSize;
+ if ( m_isBigEndian ) SwapEndian_32(blockSize);
+ elementsWritten += fwrite(&blockSize, sizeof(blockSize), 1, Resources.IndexStream);
+
+ // write number of references
+ int32_t numReferences = m_indexFileSummary.size();
+ if ( m_isBigEndian ) SwapEndian_32(numReferences);
+ elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, Resources.IndexStream);
+
+ if ( elementsWritten != 7 )
+ throw BamException("BamToolsIndex::WriteHeader", "could not write BTI header");
+}
+
+void BamToolsIndex::WriteReferenceEntry(const BtiReferenceEntry& refEntry) {
+
+ // write number of blocks this reference
+ uint32_t numBlocks = refEntry.Blocks.size();
+ if ( m_isBigEndian ) SwapEndian_32(numBlocks);
+ const size_t elementsWritten = fwrite(&numBlocks, sizeof(numBlocks), 1, Resources.IndexStream);
+ if ( elementsWritten != 1 )
+ throw BamException("BamToolsIndex::WriteReferenceEntry", "could not write number of blocks");
+
+ // write actual block entries
+ WriteBlocks(refEntry.Blocks);
+}