1 // ***************************************************************************
2 // BamToolsIndex.cpp (c) 2010 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 25 October 2011 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides index operations for the BamTools index format (".bti")
8 // ***************************************************************************
10 #include "api/BamAlignment.h"
11 #include "api/internal/bam/BamReader_p.h"
12 #include "api/internal/index/BamToolsIndex_p.h"
13 #include "api/internal/io/BamDeviceFactory_p.h"
14 #include "api/internal/io/BgzfStream_p.h"
15 #include "api/internal/utils/BamException_p.h"
16 using namespace BamTools;
17 using namespace BamTools::Internal;
28 // --------------------------------
29 // static BamToolsIndex constants
30 // --------------------------------
32 const uint32_t BamToolsIndex::DEFAULT_BLOCK_LENGTH = 1000;
33 const string BamToolsIndex::BTI_EXTENSION = ".bti";
34 const char* const BamToolsIndex::BTI_MAGIC = "BTI\1";
35 const int BamToolsIndex::SIZEOF_BLOCK = sizeof(int32_t)*2 + sizeof(int64_t);
37 // ----------------------------
38 // RaiiWrapper implementation
39 // ----------------------------
41 BamToolsIndex::RaiiWrapper::RaiiWrapper(void)
46 BamToolsIndex::RaiiWrapper::~RaiiWrapper(void) {
59 // ------------------------------
60 // BamToolsIndex implementation
61 // ------------------------------
64 BamToolsIndex::BamToolsIndex(Internal::BamReaderPrivate* reader)
66 , m_blockSize(BamToolsIndex::DEFAULT_BLOCK_LENGTH)
68 , m_outputVersion(BTI_2_0) // latest version - used for writing new index files
70 m_isBigEndian = BamTools::SystemIsBigEndian();
74 BamToolsIndex::~BamToolsIndex(void) {
78 void BamToolsIndex::CheckMagicNumber(void) {
82 const int64_t numBytesRead = m_resources.Device->Read(magic, 4);
83 if ( numBytesRead != 4 )
84 throw BamException("BamToolsIndex::CheckMagicNumber", "could not read BTI magic number");
86 // validate expected magic number
87 if ( strncmp(magic, BamToolsIndex::BTI_MAGIC, 4) != 0 )
88 throw BamException("BamToolsIndex::CheckMagicNumber", "invalid BTI magic number");
91 // check index file version, return true if OK
92 void BamToolsIndex::CheckVersion(void) {
94 // read version from file
95 const int64_t numBytesRead = m_resources.Device->Read((char*)&m_inputVersion, sizeof(m_inputVersion));
96 // size_t elementsRead = fread(&m_inputVersion, sizeof(m_inputVersion), 1, m_resources.IndexStream);
97 // if ( elementsRead != 1 )
98 if ( numBytesRead != sizeof(m_inputVersion) )
99 throw BamException("BamToolsIndex::CheckVersion", "could not read format version");
100 if ( m_isBigEndian ) SwapEndian_32(m_inputVersion);
102 // if version is negative, or zero
103 if ( m_inputVersion <= 0 )
104 throw BamException("BamToolsIndex::CheckVersion", "invalid format version");
106 // if version is newer than can be supported by this version of bamtools
107 else if ( m_inputVersion > m_outputVersion ) {
108 const string message = "unsupported format: this index was created by a newer version of BamTools. "
109 "Update your local version of BamTools to use the index file.";
110 throw BamException("BamToolsIndex::CheckVersion", message);
113 // ------------------------------------------------------------------
114 // check for deprecated, unsupported versions
115 // (the format had to be modified to accomodate a particular bug fix)
117 // Version 2.0: introduced support for half-open intervals, instead of the old closed intervals
118 // respondBy: throwing exception - we're not going to try to handle the old BTI files.
119 else if ( (Version)m_inputVersion < BamToolsIndex::BTI_2_0 ) {
120 const string message = "unsupported format: this version of the index may not properly handle "
121 "coordinate intervals. Please run 'bamtools index -bti -in yourData.bam' "
122 "to generate an up-to-date, fixed BTI file.";
123 throw BamException("BamToolsIndex::CheckVersion", message);
127 void BamToolsIndex::ClearReferenceEntry(BtiReferenceEntry& refEntry) {
129 refEntry.Blocks.clear();
132 void BamToolsIndex::CloseFile(void) {
133 if ( IsDeviceOpen() ) {
134 fclose(m_resources.IndexStream);
135 m_resources.IndexStream = 0;
137 m_resources.Device->Close();
138 delete m_resources.Device;
139 m_resources.Device = 0;
141 m_indexFileSummary.clear();
144 // builds index from associated BAM file & writes out to index file
145 bool BamToolsIndex::Create(void) {
147 // skip if BamReader is invalid or not open
148 if ( m_reader == 0 || !m_reader->IsOpen() ) {
149 SetErrorString("BamToolsIndex::Create", "could not create index: reader is not open");
154 if ( !m_reader->Rewind() ) {
155 const string readerError = m_reader->GetErrorString();
156 const string message = "could not create index: \n\t" + readerError;
157 SetErrorString("BamToolsIndex::Create", message);
162 // open new index file (read & write)
163 const string indexFilename = m_reader->Filename() + Extension();
164 OpenFile(indexFilename, IBamIODevice::ReadWrite);
166 // initialize BtiFileSummary with number of references
167 const int& numReferences = m_reader->GetReferenceCount();
168 InitializeFileSummary(numReferences);
170 // intialize output file header
173 // index building markers
174 uint32_t currentBlockCount = 0;
175 int64_t currentAlignmentOffset = m_reader->Tell();
176 int32_t blockRefId = -1;
177 int32_t blockMaxEndPosition = -1;
178 int64_t blockStartOffset = currentAlignmentOffset;
179 int32_t blockStartPosition = -1;
181 // plow through alignments, storing index entries
183 BtiReferenceEntry refEntry;
184 while ( m_reader->LoadNextAlignment(al) ) {
186 // if moved to new reference
187 if ( al.RefID != blockRefId ) {
189 // if first pass, check:
190 if ( currentBlockCount == 0 ) {
192 // write any empty references up to (but not including) al.RefID
193 for ( int i = 0; i < al.RefID; ++i )
194 WriteReferenceEntry( BtiReferenceEntry(i) );
200 // store previous BTI block data in reference entry
201 const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
202 refEntry.Blocks.push_back(block);
204 // write reference entry, then clear
205 WriteReferenceEntry(refEntry);
206 ClearReferenceEntry(refEntry);
208 // write any empty references between (but not including)
209 // the last blockRefID and current al.RefID
210 for ( int i = blockRefId+1; i < al.RefID; ++i )
211 WriteReferenceEntry( BtiReferenceEntry(i) );
214 currentBlockCount = 0;
217 // set ID for new reference entry
218 refEntry.ID = al.RefID;
221 // if beginning of block, update counters
222 if ( currentBlockCount == 0 ) {
223 blockRefId = al.RefID;
224 blockStartOffset = currentAlignmentOffset;
225 blockStartPosition = al.Position;
226 blockMaxEndPosition = al.GetEndPosition();
229 // increment block counter
232 // check end position
233 const int32_t alignmentEndPosition = al.GetEndPosition();
234 if ( alignmentEndPosition > blockMaxEndPosition )
235 blockMaxEndPosition = alignmentEndPosition;
237 // if block is full, get offset for next block, reset currentBlockCount
238 if ( currentBlockCount == m_blockSize ) {
240 // store previous block data in reference entry
241 const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
242 refEntry.Blocks.push_back(block);
245 blockStartOffset = m_reader->Tell();
246 currentBlockCount = 0;
249 // not the best name, but for the next iteration, this value will be the offset of the
250 // *current* alignment. this is necessary because we won't know if this next alignment
251 // is on a new reference until we actually read it
252 currentAlignmentOffset = m_reader->Tell();
255 // after finishing alignments, if any data was read, check:
256 if ( blockRefId >= 0 ) {
258 // store last BTI block data in reference entry
259 const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
260 refEntry.Blocks.push_back(block);
262 // write last reference entry, then clear
263 WriteReferenceEntry(refEntry);
264 ClearReferenceEntry(refEntry);
266 // then write any empty references remaining at end of file
267 for ( int i = blockRefId+1; i < numReferences; ++i )
268 WriteReferenceEntry( BtiReferenceEntry(i) );
271 } catch ( BamException& e ) {
272 m_errorString = e.what();
277 if ( !m_reader->Rewind() ) {
278 const string readerError = m_reader->GetErrorString();
279 const string message = "could not create index: \n\t" + readerError;
280 SetErrorString("BamToolsIndex::Create", message);
288 // returns format's file extension
289 const std::string BamToolsIndex::Extension(void) {
290 return BamToolsIndex::BTI_EXTENSION;
293 void BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) {
295 // return false ref ID is not a valid index in file summary data
296 if ( region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size() )
297 throw BamException("BamToolsIndex::GetOffset", "invalid region requested");
299 // retrieve reference index data for left bound reference
300 BtiReferenceEntry refEntry(region.LeftRefID);
301 ReadReferenceEntry(refEntry);
303 // binary search for an overlapping block (may not be first one though)
305 typedef BtiBlockVector::const_iterator BtiBlockConstIterator;
306 BtiBlockConstIterator blockFirst = refEntry.Blocks.begin();
307 BtiBlockConstIterator blockIter = blockFirst;
308 BtiBlockConstIterator blockLast = refEntry.Blocks.end();
309 iterator_traits<BtiBlockConstIterator>::difference_type count = distance(blockFirst, blockLast);
310 iterator_traits<BtiBlockConstIterator>::difference_type step;
311 while ( count > 0 ) {
312 blockIter = blockFirst;
314 advance(blockIter, step);
316 const BtiBlock& block = (*blockIter);
317 if ( block.StartPosition <= region.RightPosition ) {
318 if ( block.MaxEndPosition > region.LeftPosition ) {
319 offset = block.StartOffset;
322 blockFirst = ++blockIter;
328 // if we didn't search "off the end" of the blocks
329 if ( blockIter != blockLast ) {
331 // "walk back" until we've gone too far
332 while ( blockIter != blockFirst ) {
333 const BtiBlock& currentBlock = (*blockIter);
336 const BtiBlock& previousBlock = (*blockIter);
337 if ( previousBlock.MaxEndPosition <= region.LeftPosition ) {
338 offset = currentBlock.StartOffset;
344 // if we walked all the way to first block, just return that and let the reader's
345 // region overlap parsing do the rest
346 if ( blockIter == blockFirst ) {
347 const BtiBlock& block = (*blockIter);
348 offset = block.StartOffset;
354 // sets to false if blocks container is empty, or if no matching block could be found
355 *hasAlignmentsInRegion = found;
358 // returns whether reference has alignments or no
359 bool BamToolsIndex::HasAlignments(const int& referenceID) const {
360 if ( referenceID < 0 || referenceID >= (int)m_indexFileSummary.size() )
362 const BtiReferenceSummary& refSummary = m_indexFileSummary.at(referenceID);
363 return ( refSummary.NumBlocks > 0 );
366 // pre-allocates space for each reference's summary data
367 void BamToolsIndex::InitializeFileSummary(const int& numReferences) {
368 m_indexFileSummary.clear();
369 for ( int i = 0; i < numReferences; ++i )
370 m_indexFileSummary.push_back( BtiReferenceSummary() );
373 // returns true if the index stream is open
374 bool BamToolsIndex::IsDeviceOpen(void) const {
375 if ( m_resources.Device == 0 )
377 return m_resources.Device->IsOpen();
378 // return ( m_resources.IndexStream != 0 );
381 // attempts to use index data to jump to @region, returns success/fail
382 // a "successful" jump indicates no error, but not whether this region has data
383 // * thus, the method sets a flag to indicate whether there are alignments
384 // available after the jump position
385 bool BamToolsIndex::Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion) {
388 *hasAlignmentsInRegion = false;
390 // skip if invalid reader or not open
391 if ( m_reader == 0 || !m_reader->IsOpen() ) {
392 SetErrorString("BamToolsIndex::Jump", "could not jump: reader is not open");
396 // make sure left-bound position is valid
397 const RefVector& references = m_reader->GetReferenceData();
398 if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) {
399 SetErrorString("BamToolsIndex::Jump", "could not create index: invalid region requested");
403 // calculate nearest offset to jump to
406 GetOffset(region, offset, hasAlignmentsInRegion);
407 } catch ( BamException& e ) {
408 m_errorString = e.what();
412 // return success/failure of seek
413 return m_reader->Seek(offset);
416 // loads existing data from file into memory
417 bool BamToolsIndex::Load(const std::string& filename) {
421 // attempt to open file (read-only)
422 OpenFile(filename, IBamIODevice::ReadOnly);
424 // load metadata & generate in-memory summary
431 } catch ( BamException& e ) {
432 m_errorString = e.what();
437 void BamToolsIndex::LoadFileSummary(void) {
439 // load number of reference sequences
441 LoadNumReferences(numReferences);
443 // initialize file summary data
444 InitializeFileSummary(numReferences);
446 // load summary for each reference
447 BtiFileSummary::iterator summaryIter = m_indexFileSummary.begin();
448 BtiFileSummary::iterator summaryEnd = m_indexFileSummary.end();
449 for ( ; summaryIter != summaryEnd; ++summaryIter )
450 LoadReferenceSummary(*summaryIter);
453 void BamToolsIndex::LoadHeader(void) {
455 // check BTI file metadata
459 // use file's BTI block size to set member variable
460 const int64_t numBytesRead = m_resources.Device->Read((char*)&m_blockSize, sizeof(m_blockSize));
461 // const size_t elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, m_resources.IndexStream);
462 if ( m_isBigEndian ) SwapEndian_32(m_blockSize);
463 // if ( elementsRead != 1 )
464 if ( numBytesRead != sizeof(m_blockSize) )
465 throw BamException("BamToolsIndex::LoadHeader", "could not read BTI block size");
468 void BamToolsIndex::LoadNumBlocks(int& numBlocks) {
469 const int64_t numBytesRead = m_resources.Device->Read((char*)&numBlocks, sizeof(numBlocks));
470 // const size_t elementsRead = fread(&numBlocks, sizeof(numBlocks), 1, m_resources.IndexStream);
471 if ( m_isBigEndian ) SwapEndian_32(numBlocks);
472 // if ( elementsRead != 1 )
473 if ( numBytesRead != sizeof(numBlocks) )
474 throw BamException("BamToolsIndex::LoadNumBlocks", "could not read number of BTI blocks");
477 void BamToolsIndex::LoadNumReferences(int& numReferences) {
478 const int64_t numBytesRead = m_resources.Device->Read((char*)&numReferences, sizeof(numReferences));
479 // const size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_resources.IndexStream);
480 if ( m_isBigEndian ) SwapEndian_32(numReferences);
481 // if ( elementsRead != 1 )
482 if ( numBytesRead != sizeof(numReferences) )
483 throw BamException("BamToolsIndex::LoadNumReferences", "could not read number of references");
486 void BamToolsIndex::LoadReferenceSummary(BtiReferenceSummary& refSummary) {
488 // load number of blocks
490 LoadNumBlocks(numBlocks);
492 // store block summary data for this reference
493 refSummary.NumBlocks = numBlocks;
494 refSummary.FirstBlockFilePosition = Tell();
496 // skip reference's blocks
497 SkipBlocks(numBlocks);
500 void BamToolsIndex::OpenFile(const std::string& filename, IBamIODevice::OpenMode mode) {
502 // make sure any previous index file is closed
505 m_resources.Device = BamDeviceFactory::CreateDevice(filename);
506 if ( m_resources.Device == 0 ) {
507 const string message = string("could not open file: ") + filename;
508 throw BamException("BamStandardIndex::OpenFile", message);
511 // attempt to open file
512 // m_resources.IndexStream = fopen(filename.c_str(), mode);
513 m_resources.Device->Open(mode);
514 if ( !IsDeviceOpen() ) {
515 const string message = string("could not open file: ") + filename;
516 throw BamException("BamToolsIndex::OpenFile", message);
520 void BamToolsIndex::ReadBlock(BtiBlock& block) {
522 // read in block data members
523 // size_t elementsRead = 0;
524 // elementsRead += fread(&block.MaxEndPosition, sizeof(block.MaxEndPosition), 1, m_resources.IndexStream);
525 // elementsRead += fread(&block.StartOffset, sizeof(block.StartOffset), 1, m_resources.IndexStream);
526 // elementsRead += fread(&block.StartPosition, sizeof(block.StartPosition), 1, m_resources.IndexStream);
528 int64_t numBytesRead = 0;
529 numBytesRead += m_resources.Device->Read((char*)&block.MaxEndPosition, sizeof(block.MaxEndPosition));
530 numBytesRead += m_resources.Device->Read((char*)&block.StartOffset, sizeof(block.StartOffset));
531 numBytesRead += m_resources.Device->Read((char*)&block.StartPosition, sizeof(block.StartPosition));
533 // swap endian-ness if necessary
534 if ( m_isBigEndian ) {
535 SwapEndian_32(block.MaxEndPosition);
536 SwapEndian_64(block.StartOffset);
537 SwapEndian_32(block.StartPosition);
540 // if ( elementsRead != 3 )
541 const int expectedBytes = sizeof(block.MaxEndPosition) +
542 sizeof(block.StartOffset) +
543 sizeof(block.StartPosition);
544 if ( numBytesRead != expectedBytes )
545 throw BamException("BamToolsIndex::ReadBlock", "could not read block");
548 void BamToolsIndex::ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks) {
550 // prep blocks container
552 blocks.reserve(refSummary.NumBlocks);
554 // skip to first block entry
555 Seek( refSummary.FirstBlockFilePosition, SEEK_SET );
557 // read & store block entries
559 for ( int i = 0; i < refSummary.NumBlocks; ++i ) {
561 blocks.push_back(block);
565 void BamToolsIndex::ReadReferenceEntry(BtiReferenceEntry& refEntry) {
567 // return false if refId not valid index in file summary structure
568 if ( refEntry.ID < 0 || refEntry.ID >= (int)m_indexFileSummary.size() )
569 throw BamException("BamToolsIndex::ReadReferenceEntry", "invalid reference requested");
571 // use index summary to assist reading the reference's BTI blocks
572 const BtiReferenceSummary& refSummary = m_indexFileSummary.at(refEntry.ID);
573 ReadBlocks(refSummary, refEntry.Blocks);
576 void BamToolsIndex::Seek(const int64_t& position, const int origin) {
577 if ( !m_resources.Device->Seek(position, origin) )
578 // if ( fseek64(m_resources.IndexStream, position, origin) != 0 )
579 throw BamException("BamToolsIndex::Seek", "could not seek in BAI file");
582 void BamToolsIndex::SkipBlocks(const int& numBlocks) {
583 Seek( numBlocks*BamToolsIndex::SIZEOF_BLOCK, SEEK_CUR );
586 int64_t BamToolsIndex::Tell(void) const {
587 return m_resources.Device->Tell();
588 // return ftell64(m_resources.IndexStream);
591 void BamToolsIndex::WriteBlock(const BtiBlock& block) {
594 int32_t maxEndPosition = block.MaxEndPosition;
595 int64_t startOffset = block.StartOffset;
596 int32_t startPosition = block.StartPosition;
598 // swap endian-ness if necessary
599 if ( m_isBigEndian ) {
600 SwapEndian_32(maxEndPosition);
601 SwapEndian_64(startOffset);
602 SwapEndian_32(startPosition);
605 // write the reference index entry
606 int64_t numBytesWritten = 0;
607 numBytesWritten += m_resources.Device->Write((const char*)&maxEndPosition, sizeof(maxEndPosition));
608 numBytesWritten += m_resources.Device->Write((const char*)&startOffset, sizeof(startOffset));
609 numBytesWritten += m_resources.Device->Write((const char*)&startPosition, sizeof(startPosition));
610 const int expectedBytes = sizeof(maxEndPosition) +
611 sizeof(startOffset) +
612 sizeof(startPosition);
614 // size_t elementsWritten = 0;
615 // elementsWritten += fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, m_resources.IndexStream);
616 // elementsWritten += fwrite(&startOffset, sizeof(startOffset), 1, m_resources.IndexStream);
617 // elementsWritten += fwrite(&startPosition, sizeof(startPosition), 1, m_resources.IndexStream);
618 // if ( elementsWritten != 3 )
619 if ( numBytesWritten != expectedBytes )
620 throw BamException("BamToolsIndex::WriteBlock", "could not write BTI block");
623 void BamToolsIndex::WriteBlocks(const BtiBlockVector& blocks) {
624 BtiBlockVector::const_iterator blockIter = blocks.begin();
625 BtiBlockVector::const_iterator blockEnd = blocks.end();
626 for ( ; blockIter != blockEnd; ++blockIter )
627 WriteBlock(*blockIter);
630 void BamToolsIndex::WriteHeader(void) {
632 int64_t numBytesWritten = 0 ;
633 // size_t elementsWritten = 0;
635 // write BTI index format 'magic number'
636 numBytesWritten += m_resources.Device->Write(BamToolsIndex::BTI_MAGIC, 4);
637 // elementsWritten += fwrite(BamToolsIndex::BTI_MAGIC, 1, 4, m_resources.IndexStream);
639 // write BTI index format version
640 int32_t currentVersion = (int32_t)m_outputVersion;
641 if ( m_isBigEndian ) SwapEndian_32(currentVersion);
642 numBytesWritten += m_resources.Device->Write((const char*)¤tVersion, sizeof(currentVersion));
643 // elementsWritten += fwrite(¤tVersion, sizeof(currentVersion), 1, m_resources.IndexStream);
646 uint32_t blockSize = m_blockSize;
647 if ( m_isBigEndian ) SwapEndian_32(blockSize);
648 numBytesWritten += m_resources.Device->Write((const char*)&blockSize, sizeof(blockSize));
649 // elementsWritten += fwrite(&blockSize, sizeof(blockSize), 1, m_resources.IndexStream);
651 // write number of references
652 int32_t numReferences = m_indexFileSummary.size();
653 if ( m_isBigEndian ) SwapEndian_32(numReferences);
654 numBytesWritten += m_resources.Device->Write((const char*)&numReferences, sizeof(numReferences));
655 // elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, m_resources.IndexStream);
657 const int expectedBytes = 4 +
658 sizeof(currentVersion) +
660 sizeof(numReferences);
662 // if ( elementsWritten != 7 )
663 if ( numBytesWritten != expectedBytes )
664 throw BamException("BamToolsIndex::WriteHeader", "could not write BTI header");
667 void BamToolsIndex::WriteReferenceEntry(const BtiReferenceEntry& refEntry) {
669 // write number of blocks this reference
670 uint32_t numBlocks = refEntry.Blocks.size();
671 if ( m_isBigEndian ) SwapEndian_32(numBlocks);
672 const int64_t numBytesWritten = m_resources.Device->Write((const char*)&numBlocks, sizeof(numBlocks));
673 // const size_t elementsWritten = fwrite(&numBlocks, sizeof(numBlocks), 1, m_resources.IndexStream);
674 // if ( elementsWritten != 1 )
675 if ( numBytesWritten != sizeof(numBlocks) )
676 throw BamException("BamToolsIndex::WriteReferenceEntry", "could not write number of blocks");
678 // write actual block entries
679 WriteBlocks(refEntry.Blocks);