1 // ***************************************************************************
2 // BamToolsIndex.cpp (c) 2010 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 10 November 2011 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides index operations for the BamTools index format (".bti")
8 // ***************************************************************************
10 #include "api/BamAlignment.h"
11 #include "api/internal/bam/BamReader_p.h"
12 #include "api/internal/index/BamToolsIndex_p.h"
13 #include "api/internal/io/BamDeviceFactory_p.h"
14 #include "api/internal/io/BgzfStream_p.h"
15 #include "api/internal/utils/BamException_p.h"
16 using namespace BamTools;
17 using namespace BamTools::Internal;
28 // --------------------------------
29 // static BamToolsIndex constants
30 // --------------------------------
32 const uint32_t BamToolsIndex::DEFAULT_BLOCK_LENGTH = 1000;
33 const string BamToolsIndex::BTI_EXTENSION = ".bti";
34 const char* const BamToolsIndex::BTI_MAGIC = "BTI\1";
35 const int BamToolsIndex::SIZEOF_BLOCK = sizeof(int32_t)*2 + sizeof(int64_t);
37 // ----------------------------
38 // RaiiWrapper implementation
39 // ----------------------------
41 BamToolsIndex::RaiiWrapper::RaiiWrapper(void)
45 BamToolsIndex::RaiiWrapper::~RaiiWrapper(void) {
53 // ------------------------------
54 // BamToolsIndex implementation
55 // ------------------------------
58 BamToolsIndex::BamToolsIndex(Internal::BamReaderPrivate* reader)
60 , m_blockSize(BamToolsIndex::DEFAULT_BLOCK_LENGTH)
62 , m_outputVersion(BTI_2_0) // latest version - used for writing new index files
64 m_isBigEndian = BamTools::SystemIsBigEndian();
68 BamToolsIndex::~BamToolsIndex(void) {
72 void BamToolsIndex::CheckMagicNumber(void) {
76 const int64_t numBytesRead = m_resources.Device->Read(magic, 4);
77 if ( numBytesRead != 4 )
78 throw BamException("BamToolsIndex::CheckMagicNumber", "could not read BTI magic number");
80 // validate expected magic number
81 if ( strncmp(magic, BamToolsIndex::BTI_MAGIC, 4) != 0 )
82 throw BamException("BamToolsIndex::CheckMagicNumber", "invalid BTI magic number");
85 // check index file version, return true if OK
86 void BamToolsIndex::CheckVersion(void) {
88 // read version from file
89 const int64_t numBytesRead = m_resources.Device->Read((char*)&m_inputVersion, sizeof(m_inputVersion));
90 if ( numBytesRead != sizeof(m_inputVersion) )
91 throw BamException("BamToolsIndex::CheckVersion", "could not read format version");
92 if ( m_isBigEndian ) SwapEndian_32(m_inputVersion);
94 // if version is negative, or zero
95 if ( m_inputVersion <= 0 )
96 throw BamException("BamToolsIndex::CheckVersion", "invalid format version");
98 // if version is newer than can be supported by this version of bamtools
99 else if ( m_inputVersion > m_outputVersion ) {
100 const string message = "unsupported format: this index was created by a newer version of BamTools. "
101 "Update your local version of BamTools to use the index file.";
102 throw BamException("BamToolsIndex::CheckVersion", message);
105 // ------------------------------------------------------------------
106 // check for deprecated, unsupported versions
107 // (the format had to be modified to accomodate a particular bug fix)
109 // Version 2.0: introduced support for half-open intervals, instead of the old closed intervals
110 // respondBy: throwing exception - we're not going to try to handle the old BTI files.
111 else if ( (Version)m_inputVersion < BamToolsIndex::BTI_2_0 ) {
112 const string message = "unsupported format: this version of the index may not properly handle "
113 "coordinate intervals. Please run 'bamtools index -bti -in yourData.bam' "
114 "to generate an up-to-date, fixed BTI file.";
115 throw BamException("BamToolsIndex::CheckVersion", message);
119 void BamToolsIndex::ClearReferenceEntry(BtiReferenceEntry& refEntry) {
121 refEntry.Blocks.clear();
124 void BamToolsIndex::CloseFile(void) {
125 if ( IsDeviceOpen() ) {
126 m_resources.Device->Close();
127 delete m_resources.Device;
128 m_resources.Device = 0;
130 m_indexFileSummary.clear();
133 // builds index from associated BAM file & writes out to index file
134 bool BamToolsIndex::Create(void) {
136 // skip if BamReader is invalid or not open
137 if ( m_reader == 0 || !m_reader->IsOpen() ) {
138 SetErrorString("BamToolsIndex::Create", "could not create index: reader is not open");
143 if ( !m_reader->Rewind() ) {
144 const string readerError = m_reader->GetErrorString();
145 const string message = "could not create index: \n\t" + readerError;
146 SetErrorString("BamToolsIndex::Create", message);
151 // open new index file (read & write)
152 const string indexFilename = m_reader->Filename() + Extension();
153 OpenFile(indexFilename, IBamIODevice::ReadWrite);
155 // initialize BtiFileSummary with number of references
156 const int& numReferences = m_reader->GetReferenceCount();
157 InitializeFileSummary(numReferences);
159 // intialize output file header
162 // index building markers
163 uint32_t currentBlockCount = 0;
164 int64_t currentAlignmentOffset = m_reader->Tell();
165 int32_t blockRefId = -1;
166 int32_t blockMaxEndPosition = -1;
167 int64_t blockStartOffset = currentAlignmentOffset;
168 int32_t blockStartPosition = -1;
170 // plow through alignments, storing index entries
172 BtiReferenceEntry refEntry;
173 while ( m_reader->LoadNextAlignment(al) ) {
175 // if moved to new reference
176 if ( al.RefID != blockRefId ) {
178 // if first pass, check:
179 if ( currentBlockCount == 0 ) {
181 // write any empty references up to (but not including) al.RefID
182 for ( int i = 0; i < al.RefID; ++i )
183 WriteReferenceEntry( BtiReferenceEntry(i) );
189 // store previous BTI block data in reference entry
190 const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
191 refEntry.Blocks.push_back(block);
193 // write reference entry, then clear
194 WriteReferenceEntry(refEntry);
195 ClearReferenceEntry(refEntry);
197 // write any empty references between (but not including)
198 // the last blockRefID and current al.RefID
199 for ( int i = blockRefId+1; i < al.RefID; ++i )
200 WriteReferenceEntry( BtiReferenceEntry(i) );
203 currentBlockCount = 0;
206 // set ID for new reference entry
207 refEntry.ID = al.RefID;
210 // if beginning of block, update counters
211 if ( currentBlockCount == 0 ) {
212 blockRefId = al.RefID;
213 blockStartOffset = currentAlignmentOffset;
214 blockStartPosition = al.Position;
215 blockMaxEndPosition = al.GetEndPosition();
218 // increment block counter
221 // check end position
222 const int32_t alignmentEndPosition = al.GetEndPosition();
223 if ( alignmentEndPosition > blockMaxEndPosition )
224 blockMaxEndPosition = alignmentEndPosition;
226 // if block is full, get offset for next block, reset currentBlockCount
227 if ( currentBlockCount == m_blockSize ) {
229 // store previous block data in reference entry
230 const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
231 refEntry.Blocks.push_back(block);
234 blockStartOffset = m_reader->Tell();
235 currentBlockCount = 0;
238 // not the best name, but for the next iteration, this value will be the offset of the
239 // *current* alignment. this is necessary because we won't know if this next alignment
240 // is on a new reference until we actually read it
241 currentAlignmentOffset = m_reader->Tell();
244 // after finishing alignments, if any data was read, check:
245 if ( blockRefId >= 0 ) {
247 // store last BTI block data in reference entry
248 const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
249 refEntry.Blocks.push_back(block);
251 // write last reference entry, then clear
252 WriteReferenceEntry(refEntry);
253 ClearReferenceEntry(refEntry);
255 // then write any empty references remaining at end of file
256 for ( int i = blockRefId+1; i < numReferences; ++i )
257 WriteReferenceEntry( BtiReferenceEntry(i) );
260 } catch ( BamException& e ) {
261 m_errorString = e.what();
266 if ( !m_reader->Rewind() ) {
267 const string readerError = m_reader->GetErrorString();
268 const string message = "could not create index: \n\t" + readerError;
269 SetErrorString("BamToolsIndex::Create", message);
277 // returns format's file extension
278 const std::string BamToolsIndex::Extension(void) {
279 return BamToolsIndex::BTI_EXTENSION;
282 void BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) {
284 // return false ref ID is not a valid index in file summary data
285 if ( region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size() )
286 throw BamException("BamToolsIndex::GetOffset", "invalid region requested");
288 // retrieve reference index data for left bound reference
289 BtiReferenceEntry refEntry(region.LeftRefID);
290 ReadReferenceEntry(refEntry);
292 // binary search for an overlapping block (may not be first one though)
294 typedef BtiBlockVector::const_iterator BtiBlockConstIterator;
295 BtiBlockConstIterator blockFirst = refEntry.Blocks.begin();
296 BtiBlockConstIterator blockIter = blockFirst;
297 BtiBlockConstIterator blockLast = refEntry.Blocks.end();
298 iterator_traits<BtiBlockConstIterator>::difference_type count = distance(blockFirst, blockLast);
299 iterator_traits<BtiBlockConstIterator>::difference_type step;
300 while ( count > 0 ) {
301 blockIter = blockFirst;
303 advance(blockIter, step);
305 const BtiBlock& block = (*blockIter);
306 if ( block.StartPosition <= region.RightPosition ) {
307 if ( block.MaxEndPosition > region.LeftPosition ) {
308 offset = block.StartOffset;
311 blockFirst = ++blockIter;
317 // if we didn't search "off the end" of the blocks
318 if ( blockIter != blockLast ) {
320 // "walk back" until we've gone too far
321 while ( blockIter != blockFirst ) {
322 const BtiBlock& currentBlock = (*blockIter);
325 const BtiBlock& previousBlock = (*blockIter);
326 if ( previousBlock.MaxEndPosition <= region.LeftPosition ) {
327 offset = currentBlock.StartOffset;
333 // if we walked all the way to first block, just return that and let the reader's
334 // region overlap parsing do the rest
335 if ( blockIter == blockFirst ) {
336 const BtiBlock& block = (*blockIter);
337 offset = block.StartOffset;
343 // sets to false if blocks container is empty, or if no matching block could be found
344 *hasAlignmentsInRegion = found;
347 // returns whether reference has alignments or no
348 bool BamToolsIndex::HasAlignments(const int& referenceID) const {
349 if ( referenceID < 0 || referenceID >= (int)m_indexFileSummary.size() )
351 const BtiReferenceSummary& refSummary = m_indexFileSummary.at(referenceID);
352 return ( refSummary.NumBlocks > 0 );
355 // pre-allocates space for each reference's summary data
356 void BamToolsIndex::InitializeFileSummary(const int& numReferences) {
357 m_indexFileSummary.clear();
358 for ( int i = 0; i < numReferences; ++i )
359 m_indexFileSummary.push_back( BtiReferenceSummary() );
362 // returns true if the index stream is open
363 bool BamToolsIndex::IsDeviceOpen(void) const {
364 if ( m_resources.Device == 0 )
366 return m_resources.Device->IsOpen();
369 // attempts to use index data to jump to @region, returns success/fail
370 // a "successful" jump indicates no error, but not whether this region has data
371 // * thus, the method sets a flag to indicate whether there are alignments
372 // available after the jump position
373 bool BamToolsIndex::Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion) {
376 *hasAlignmentsInRegion = false;
378 // skip if invalid reader or not open
379 if ( m_reader == 0 || !m_reader->IsOpen() ) {
380 SetErrorString("BamToolsIndex::Jump", "could not jump: reader is not open");
384 // make sure left-bound position is valid
385 const RefVector& references = m_reader->GetReferenceData();
386 if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) {
387 SetErrorString("BamToolsIndex::Jump", "could not create index: invalid region requested");
391 // calculate nearest offset to jump to
394 GetOffset(region, offset, hasAlignmentsInRegion);
395 } catch ( BamException& e ) {
396 m_errorString = e.what();
400 // return success/failure of seek
401 return m_reader->Seek(offset);
404 // loads existing data from file into memory
405 bool BamToolsIndex::Load(const std::string& filename) {
409 // attempt to open file (read-only)
410 OpenFile(filename, IBamIODevice::ReadOnly);
412 // load metadata & generate in-memory summary
419 } catch ( BamException& e ) {
420 m_errorString = e.what();
425 void BamToolsIndex::LoadFileSummary(void) {
427 // load number of reference sequences
429 LoadNumReferences(numReferences);
431 // initialize file summary data
432 InitializeFileSummary(numReferences);
434 // load summary for each reference
435 BtiFileSummary::iterator summaryIter = m_indexFileSummary.begin();
436 BtiFileSummary::iterator summaryEnd = m_indexFileSummary.end();
437 for ( ; summaryIter != summaryEnd; ++summaryIter )
438 LoadReferenceSummary(*summaryIter);
441 void BamToolsIndex::LoadHeader(void) {
443 // check BTI file metadata
447 // use file's BTI block size to set member variable
448 const int64_t numBytesRead = m_resources.Device->Read((char*)&m_blockSize, sizeof(m_blockSize));
449 if ( m_isBigEndian ) SwapEndian_32(m_blockSize);
450 if ( numBytesRead != sizeof(m_blockSize) )
451 throw BamException("BamToolsIndex::LoadHeader", "could not read BTI block size");
454 void BamToolsIndex::LoadNumBlocks(int& numBlocks) {
455 const int64_t numBytesRead = m_resources.Device->Read((char*)&numBlocks, sizeof(numBlocks));
456 if ( m_isBigEndian ) SwapEndian_32(numBlocks);
457 if ( numBytesRead != sizeof(numBlocks) )
458 throw BamException("BamToolsIndex::LoadNumBlocks", "could not read number of BTI blocks");
461 void BamToolsIndex::LoadNumReferences(int& numReferences) {
462 const int64_t numBytesRead = m_resources.Device->Read((char*)&numReferences, sizeof(numReferences));
463 if ( m_isBigEndian ) SwapEndian_32(numReferences);
464 if ( numBytesRead != sizeof(numReferences) )
465 throw BamException("BamToolsIndex::LoadNumReferences", "could not read number of references");
468 void BamToolsIndex::LoadReferenceSummary(BtiReferenceSummary& refSummary) {
470 // load number of blocks
472 LoadNumBlocks(numBlocks);
474 // store block summary data for this reference
475 refSummary.NumBlocks = numBlocks;
476 refSummary.FirstBlockFilePosition = Tell();
478 // skip reference's blocks
479 SkipBlocks(numBlocks);
482 void BamToolsIndex::OpenFile(const std::string& filename, IBamIODevice::OpenMode mode) {
484 // make sure any previous index file is closed
487 m_resources.Device = BamDeviceFactory::CreateDevice(filename);
488 if ( m_resources.Device == 0 ) {
489 const string message = string("could not open file: ") + filename;
490 throw BamException("BamStandardIndex::OpenFile", message);
493 // attempt to open file
494 m_resources.Device->Open(mode);
495 if ( !IsDeviceOpen() ) {
496 const string message = string("could not open file: ") + filename;
497 throw BamException("BamToolsIndex::OpenFile", message);
501 void BamToolsIndex::ReadBlock(BtiBlock& block) {
503 // read in block data members
504 int64_t numBytesRead = 0;
505 numBytesRead += m_resources.Device->Read((char*)&block.MaxEndPosition, sizeof(block.MaxEndPosition));
506 numBytesRead += m_resources.Device->Read((char*)&block.StartOffset, sizeof(block.StartOffset));
507 numBytesRead += m_resources.Device->Read((char*)&block.StartPosition, sizeof(block.StartPosition));
509 // swap endian-ness if necessary
510 if ( m_isBigEndian ) {
511 SwapEndian_32(block.MaxEndPosition);
512 SwapEndian_64(block.StartOffset);
513 SwapEndian_32(block.StartPosition);
516 // check block read ok
517 const int expectedBytes = sizeof(block.MaxEndPosition) +
518 sizeof(block.StartOffset) +
519 sizeof(block.StartPosition);
520 if ( numBytesRead != expectedBytes )
521 throw BamException("BamToolsIndex::ReadBlock", "could not read block");
524 void BamToolsIndex::ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks) {
526 // prep blocks container
528 blocks.reserve(refSummary.NumBlocks);
530 // skip to first block entry
531 Seek( refSummary.FirstBlockFilePosition, SEEK_SET );
533 // read & store block entries
535 for ( int i = 0; i < refSummary.NumBlocks; ++i ) {
537 blocks.push_back(block);
541 void BamToolsIndex::ReadReferenceEntry(BtiReferenceEntry& refEntry) {
543 // return false if refId not valid index in file summary structure
544 if ( refEntry.ID < 0 || refEntry.ID >= (int)m_indexFileSummary.size() )
545 throw BamException("BamToolsIndex::ReadReferenceEntry", "invalid reference requested");
547 // use index summary to assist reading the reference's BTI blocks
548 const BtiReferenceSummary& refSummary = m_indexFileSummary.at(refEntry.ID);
549 ReadBlocks(refSummary, refEntry.Blocks);
552 void BamToolsIndex::Seek(const int64_t& position, const int origin) {
553 if ( !m_resources.Device->Seek(position, origin) )
554 throw BamException("BamToolsIndex::Seek", "could not seek in BAI file");
557 void BamToolsIndex::SkipBlocks(const int& numBlocks) {
558 Seek( numBlocks*BamToolsIndex::SIZEOF_BLOCK, SEEK_CUR );
561 int64_t BamToolsIndex::Tell(void) const {
562 return m_resources.Device->Tell();
565 void BamToolsIndex::WriteBlock(const BtiBlock& block) {
568 int32_t maxEndPosition = block.MaxEndPosition;
569 int64_t startOffset = block.StartOffset;
570 int32_t startPosition = block.StartPosition;
572 // swap endian-ness if necessary
573 if ( m_isBigEndian ) {
574 SwapEndian_32(maxEndPosition);
575 SwapEndian_64(startOffset);
576 SwapEndian_32(startPosition);
579 // write the reference index entry
580 int64_t numBytesWritten = 0;
581 numBytesWritten += m_resources.Device->Write((const char*)&maxEndPosition, sizeof(maxEndPosition));
582 numBytesWritten += m_resources.Device->Write((const char*)&startOffset, sizeof(startOffset));
583 numBytesWritten += m_resources.Device->Write((const char*)&startPosition, sizeof(startPosition));
585 // check block written ok
586 const int expectedBytes = sizeof(maxEndPosition) +
587 sizeof(startOffset) +
588 sizeof(startPosition);
589 if ( numBytesWritten != expectedBytes )
590 throw BamException("BamToolsIndex::WriteBlock", "could not write BTI block");
593 void BamToolsIndex::WriteBlocks(const BtiBlockVector& blocks) {
594 BtiBlockVector::const_iterator blockIter = blocks.begin();
595 BtiBlockVector::const_iterator blockEnd = blocks.end();
596 for ( ; blockIter != blockEnd; ++blockIter )
597 WriteBlock(*blockIter);
600 void BamToolsIndex::WriteHeader(void) {
602 int64_t numBytesWritten = 0 ;
604 // write BTI index format 'magic number'
605 numBytesWritten += m_resources.Device->Write(BamToolsIndex::BTI_MAGIC, 4);
607 // write BTI index format version
608 int32_t currentVersion = (int32_t)m_outputVersion;
609 if ( m_isBigEndian ) SwapEndian_32(currentVersion);
610 numBytesWritten += m_resources.Device->Write((const char*)¤tVersion, sizeof(currentVersion));
613 uint32_t blockSize = m_blockSize;
614 if ( m_isBigEndian ) SwapEndian_32(blockSize);
615 numBytesWritten += m_resources.Device->Write((const char*)&blockSize, sizeof(blockSize));
617 // write number of references
618 int32_t numReferences = m_indexFileSummary.size();
619 if ( m_isBigEndian ) SwapEndian_32(numReferences);
620 numBytesWritten += m_resources.Device->Write((const char*)&numReferences, sizeof(numReferences));
622 // check header written ok
623 const int expectedBytes = 4 +
624 sizeof(currentVersion) +
626 sizeof(numReferences);
627 if ( numBytesWritten != expectedBytes )
628 throw BamException("BamToolsIndex::WriteHeader", "could not write BTI header");
631 void BamToolsIndex::WriteReferenceEntry(const BtiReferenceEntry& refEntry) {
633 // write number of blocks this reference
634 uint32_t numBlocks = refEntry.Blocks.size();
635 if ( m_isBigEndian ) SwapEndian_32(numBlocks);
636 const int64_t numBytesWritten = m_resources.Device->Write((const char*)&numBlocks, sizeof(numBlocks));
637 if ( numBytesWritten != sizeof(numBlocks) )
638 throw BamException("BamToolsIndex::WriteReferenceEntry", "could not write number of blocks");
640 // write actual block entries
641 WriteBlocks(refEntry.Blocks);