1 // ***************************************************************************
2 // BamToolsIndex.cpp (c) 2010 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 25 October 2011 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides index operations for the BamTools index format (".bti")
8 // ***************************************************************************
10 #include "api/BamAlignment.h"
11 #include "api/internal/bam/BamReader_p.h"
12 #include "api/internal/index/BamToolsIndex_p.h"
13 #include "api/internal/io/BgzfStream_p.h"
14 #include "api/internal/utils/BamException_p.h"
15 using namespace BamTools;
16 using namespace BamTools::Internal;
27 // --------------------------------
28 // static BamToolsIndex constants
29 // --------------------------------
31 const uint32_t BamToolsIndex::DEFAULT_BLOCK_LENGTH = 1000;
32 const string BamToolsIndex::BTI_EXTENSION = ".bti";
33 const char* const BamToolsIndex::BTI_MAGIC = "BTI\1";
34 const int BamToolsIndex::SIZEOF_BLOCK = sizeof(int32_t)*2 + sizeof(int64_t);
36 // ----------------------------
37 // RaiiWrapper implementation
38 // ----------------------------
40 BamToolsIndex::RaiiWrapper::RaiiWrapper(void)
44 BamToolsIndex::RaiiWrapper::~RaiiWrapper(void) {
49 // ------------------------------
50 // BamToolsIndex implementation
51 // ------------------------------
54 BamToolsIndex::BamToolsIndex(Internal::BamReaderPrivate* reader)
56 , m_blockSize(BamToolsIndex::DEFAULT_BLOCK_LENGTH)
58 , m_outputVersion(BTI_2_0) // latest version - used for writing new index files
60 m_isBigEndian = BamTools::SystemIsBigEndian();
64 BamToolsIndex::~BamToolsIndex(void) {
68 void BamToolsIndex::CheckMagicNumber(void) {
72 size_t elementsRead = fread(magic, sizeof(char), 4, Resources.IndexStream);
73 if ( elementsRead != 4 )
74 throw BamException("BamToolsIndex::CheckMagicNumber", "could not read BTI magic number");
76 // validate expected magic number
77 if ( strncmp(magic, BamToolsIndex::BTI_MAGIC, 4) != 0 )
78 throw BamException("BamToolsIndex::CheckMagicNumber", "invalid BTI magic number");
81 // check index file version, return true if OK
82 void BamToolsIndex::CheckVersion(void) {
84 // read version from file
85 size_t elementsRead = fread(&m_inputVersion, sizeof(m_inputVersion), 1, Resources.IndexStream);
86 if ( elementsRead != 1 )
87 throw BamException("BamToolsIndex::CheckVersion", "could not read format version");
88 if ( m_isBigEndian ) SwapEndian_32(m_inputVersion);
90 // if version is negative, or zero
91 if ( m_inputVersion <= 0 )
92 throw BamException("BamToolsIndex::CheckVersion", "invalid format version");
94 // if version is newer than can be supported by this version of bamtools
95 else if ( m_inputVersion > m_outputVersion ) {
96 const string message = "unsupported format: this index was created by a newer version of BamTools. "
97 "Update your local version of BamTools to use the index file.";
98 throw BamException("BamToolsIndex::CheckVersion", message);
101 // ------------------------------------------------------------------
102 // check for deprecated, unsupported versions
103 // (the format had to be modified to accomodate a particular bug fix)
105 // Version 2.0: introduced support for half-open intervals, instead of the old closed intervals
106 // respondBy: throwing exception - we're not going to try to handle the old BTI files.
107 else if ( (Version)m_inputVersion < BamToolsIndex::BTI_2_0 ) {
108 const string message = "unsupported format: this version of the index may not properly handle "
109 "coordinate intervals. Please run 'bamtools index -bti -in yourData.bam' "
110 "to generate an up-to-date, fixed BTI file.";
111 throw BamException("BamToolsIndex::CheckVersion", message);
115 void BamToolsIndex::ClearReferenceEntry(BtiReferenceEntry& refEntry) {
117 refEntry.Blocks.clear();
120 void BamToolsIndex::CloseFile(void) {
121 if ( IsFileOpen() ) {
122 fclose(Resources.IndexStream);
123 Resources.IndexStream = 0;
125 m_indexFileSummary.clear();
128 // builds index from associated BAM file & writes out to index file
129 bool BamToolsIndex::Create(void) {
131 // skip if BamReader is invalid or not open
132 if ( m_reader == 0 || !m_reader->IsOpen() ) {
133 SetErrorString("BamToolsIndex::Create", "could not create index: reader is not open");
138 if ( !m_reader->Rewind() ) {
139 const string readerError = m_reader->GetErrorString();
140 const string message = "could not create index: \n\t" + readerError;
141 SetErrorString("BamToolsIndex::Create", message);
146 // open new index file (read & write)
147 const string indexFilename = m_reader->Filename() + Extension();
148 OpenFile(indexFilename, "w+b");
150 // initialize BtiFileSummary with number of references
151 const int& numReferences = m_reader->GetReferenceCount();
152 InitializeFileSummary(numReferences);
154 // intialize output file header
157 // index building markers
158 uint32_t currentBlockCount = 0;
159 int64_t currentAlignmentOffset = m_reader->Tell();
160 int32_t blockRefId = -1;
161 int32_t blockMaxEndPosition = -1;
162 int64_t blockStartOffset = currentAlignmentOffset;
163 int32_t blockStartPosition = -1;
165 // plow through alignments, storing index entries
167 BtiReferenceEntry refEntry;
168 while ( m_reader->LoadNextAlignment(al) ) {
170 // if moved to new reference
171 if ( al.RefID != blockRefId ) {
173 // if first pass, check:
174 if ( currentBlockCount == 0 ) {
176 // write any empty references up to (but not including) al.RefID
177 for ( int i = 0; i < al.RefID; ++i )
178 WriteReferenceEntry( BtiReferenceEntry(i) );
184 // store previous BTI block data in reference entry
185 const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
186 refEntry.Blocks.push_back(block);
188 // write reference entry, then clear
189 WriteReferenceEntry(refEntry);
190 ClearReferenceEntry(refEntry);
192 // write any empty references between (but not including)
193 // the last blockRefID and current al.RefID
194 for ( int i = blockRefId+1; i < al.RefID; ++i )
195 WriteReferenceEntry( BtiReferenceEntry(i) );
198 currentBlockCount = 0;
201 // set ID for new reference entry
202 refEntry.ID = al.RefID;
205 // if beginning of block, update counters
206 if ( currentBlockCount == 0 ) {
207 blockRefId = al.RefID;
208 blockStartOffset = currentAlignmentOffset;
209 blockStartPosition = al.Position;
210 blockMaxEndPosition = al.GetEndPosition();
213 // increment block counter
216 // check end position
217 const int32_t alignmentEndPosition = al.GetEndPosition();
218 if ( alignmentEndPosition > blockMaxEndPosition )
219 blockMaxEndPosition = alignmentEndPosition;
221 // if block is full, get offset for next block, reset currentBlockCount
222 if ( currentBlockCount == m_blockSize ) {
224 // store previous block data in reference entry
225 const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
226 refEntry.Blocks.push_back(block);
229 blockStartOffset = m_reader->Tell();
230 currentBlockCount = 0;
233 // not the best name, but for the next iteration, this value will be the offset of the
234 // *current* alignment. this is necessary because we won't know if this next alignment
235 // is on a new reference until we actually read it
236 currentAlignmentOffset = m_reader->Tell();
239 // after finishing alignments, if any data was read, check:
240 if ( blockRefId >= 0 ) {
242 // store last BTI block data in reference entry
243 const BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
244 refEntry.Blocks.push_back(block);
246 // write last reference entry, then clear
247 WriteReferenceEntry(refEntry);
248 ClearReferenceEntry(refEntry);
250 // then write any empty references remaining at end of file
251 for ( int i = blockRefId+1; i < numReferences; ++i )
252 WriteReferenceEntry( BtiReferenceEntry(i) );
255 } catch ( BamException& e ) {
256 m_errorString = e.what();
261 if ( !m_reader->Rewind() ) {
262 const string readerError = m_reader->GetErrorString();
263 const string message = "could not create index: \n\t" + readerError;
264 SetErrorString("BamToolsIndex::Create", message);
272 // returns format's file extension
273 const std::string BamToolsIndex::Extension(void) {
274 return BamToolsIndex::BTI_EXTENSION;
277 void BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) {
279 // return false ref ID is not a valid index in file summary data
280 if ( region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size() )
281 throw BamException("BamToolsIndex::GetOffset", "invalid region requested");
283 // retrieve reference index data for left bound reference
284 BtiReferenceEntry refEntry(region.LeftRefID);
285 ReadReferenceEntry(refEntry);
287 // binary search for an overlapping block (may not be first one though)
289 typedef BtiBlockVector::const_iterator BtiBlockConstIterator;
290 BtiBlockConstIterator blockFirst = refEntry.Blocks.begin();
291 BtiBlockConstIterator blockIter = blockFirst;
292 BtiBlockConstIterator blockLast = refEntry.Blocks.end();
293 iterator_traits<BtiBlockConstIterator>::difference_type count = distance(blockFirst, blockLast);
294 iterator_traits<BtiBlockConstIterator>::difference_type step;
295 while ( count > 0 ) {
296 blockIter = blockFirst;
298 advance(blockIter, step);
300 const BtiBlock& block = (*blockIter);
301 if ( block.StartPosition <= region.RightPosition ) {
302 if ( block.MaxEndPosition > region.LeftPosition ) {
303 offset = block.StartOffset;
306 blockFirst = ++blockIter;
312 // if we didn't search "off the end" of the blocks
313 if ( blockIter != blockLast ) {
315 // "walk back" until we've gone too far
316 while ( blockIter != blockFirst ) {
317 const BtiBlock& currentBlock = (*blockIter);
320 const BtiBlock& previousBlock = (*blockIter);
321 if ( previousBlock.MaxEndPosition <= region.LeftPosition ) {
322 offset = currentBlock.StartOffset;
328 // if we walked all the way to first block, just return that and let the reader's
329 // region overlap parsing do the rest
330 if ( blockIter == blockFirst ) {
331 const BtiBlock& block = (*blockIter);
332 offset = block.StartOffset;
338 // sets to false if blocks container is empty, or if no matching block could be found
339 *hasAlignmentsInRegion = found;
342 // returns whether reference has alignments or no
343 bool BamToolsIndex::HasAlignments(const int& referenceID) const {
344 if ( referenceID < 0 || referenceID >= (int)m_indexFileSummary.size() )
346 const BtiReferenceSummary& refSummary = m_indexFileSummary.at(referenceID);
347 return ( refSummary.NumBlocks > 0 );
350 // pre-allocates space for each reference's summary data
351 void BamToolsIndex::InitializeFileSummary(const int& numReferences) {
352 m_indexFileSummary.clear();
353 for ( int i = 0; i < numReferences; ++i )
354 m_indexFileSummary.push_back( BtiReferenceSummary() );
357 // returns true if the index stream is open
358 bool BamToolsIndex::IsFileOpen(void) const {
359 return ( Resources.IndexStream != 0 );
362 // attempts to use index data to jump to @region, returns success/fail
363 // a "successful" jump indicates no error, but not whether this region has data
364 // * thus, the method sets a flag to indicate whether there are alignments
365 // available after the jump position
366 bool BamToolsIndex::Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion) {
369 *hasAlignmentsInRegion = false;
371 // skip if invalid reader or not open
372 if ( m_reader == 0 || !m_reader->IsOpen() ) {
373 SetErrorString("BamToolsIndex::Jump", "could not jump: reader is not open");
377 // make sure left-bound position is valid
378 const RefVector& references = m_reader->GetReferenceData();
379 if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) {
380 SetErrorString("BamToolsIndex::Jump", "could not create index: invalid region requested");
384 // calculate nearest offset to jump to
387 GetOffset(region, offset, hasAlignmentsInRegion);
388 } catch ( BamException& e ) {
389 m_errorString = e.what();
393 // return success/failure of seek
394 return m_reader->Seek(offset);
397 // loads existing data from file into memory
398 bool BamToolsIndex::Load(const std::string& filename) {
402 // attempt to open file (read-only)
403 OpenFile(filename, "rb");
405 // load metadata & generate in-memory summary
412 } catch ( BamException& e ) {
413 m_errorString = e.what();
418 void BamToolsIndex::LoadFileSummary(void) {
420 // load number of reference sequences
422 LoadNumReferences(numReferences);
424 // initialize file summary data
425 InitializeFileSummary(numReferences);
427 // load summary for each reference
428 BtiFileSummary::iterator summaryIter = m_indexFileSummary.begin();
429 BtiFileSummary::iterator summaryEnd = m_indexFileSummary.end();
430 for ( ; summaryIter != summaryEnd; ++summaryIter )
431 LoadReferenceSummary(*summaryIter);
434 void BamToolsIndex::LoadHeader(void) {
436 // check BTI file metadata
440 // use file's BTI block size to set member variable
441 const size_t elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, Resources.IndexStream);
442 if ( m_isBigEndian ) SwapEndian_32(m_blockSize);
443 if ( elementsRead != 1 )
444 throw BamException("BamToolsIndex::LoadHeader", "could not read BTI block size");
447 void BamToolsIndex::LoadNumBlocks(int& numBlocks) {
448 const size_t elementsRead = fread(&numBlocks, sizeof(numBlocks), 1, Resources.IndexStream);
449 if ( m_isBigEndian ) SwapEndian_32(numBlocks);
450 if ( elementsRead != 1 )
451 throw BamException("BamToolsIndex::LoadNumBlocks", "could not read number of BTI blocks");
454 void BamToolsIndex::LoadNumReferences(int& numReferences) {
455 const size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, Resources.IndexStream);
456 if ( m_isBigEndian ) SwapEndian_32(numReferences);
457 if ( elementsRead != 1 )
458 throw BamException("BamToolsIndex::LoadNumReferences", "could not read number of references");
461 void BamToolsIndex::LoadReferenceSummary(BtiReferenceSummary& refSummary) {
463 // load number of blocks
465 LoadNumBlocks(numBlocks);
467 // store block summary data for this reference
468 refSummary.NumBlocks = numBlocks;
469 refSummary.FirstBlockFilePosition = Tell();
471 // skip reference's blocks
472 SkipBlocks(numBlocks);
475 void BamToolsIndex::OpenFile(const std::string& filename, const char* mode) {
477 // make sure any previous index file is closed
480 // attempt to open file
481 Resources.IndexStream = fopen(filename.c_str(), mode);
482 if ( !IsFileOpen() ) {
483 const string message = string("could not open file: ") + filename;
484 throw BamException("BamToolsIndex::OpenFile", message);
488 void BamToolsIndex::ReadBlock(BtiBlock& block) {
490 // read in block data members
491 size_t elementsRead = 0;
492 elementsRead += fread(&block.MaxEndPosition, sizeof(block.MaxEndPosition), 1, Resources.IndexStream);
493 elementsRead += fread(&block.StartOffset, sizeof(block.StartOffset), 1, Resources.IndexStream);
494 elementsRead += fread(&block.StartPosition, sizeof(block.StartPosition), 1, Resources.IndexStream);
496 // swap endian-ness if necessary
497 if ( m_isBigEndian ) {
498 SwapEndian_32(block.MaxEndPosition);
499 SwapEndian_64(block.StartOffset);
500 SwapEndian_32(block.StartPosition);
503 if ( elementsRead != 3 )
504 throw BamException("BamToolsIndex::ReadBlock", "could not read block");
507 void BamToolsIndex::ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks) {
509 // prep blocks container
511 blocks.reserve(refSummary.NumBlocks);
513 // skip to first block entry
514 Seek( refSummary.FirstBlockFilePosition, SEEK_SET );
516 // read & store block entries
518 for ( int i = 0; i < refSummary.NumBlocks; ++i ) {
520 blocks.push_back(block);
524 void BamToolsIndex::ReadReferenceEntry(BtiReferenceEntry& refEntry) {
526 // return false if refId not valid index in file summary structure
527 if ( refEntry.ID < 0 || refEntry.ID >= (int)m_indexFileSummary.size() )
528 throw BamException("BamToolsIndex::ReadReferenceEntry", "invalid reference requested");
530 // use index summary to assist reading the reference's BTI blocks
531 const BtiReferenceSummary& refSummary = m_indexFileSummary.at(refEntry.ID);
532 ReadBlocks(refSummary, refEntry.Blocks);
535 void BamToolsIndex::Seek(const int64_t& position, const int& origin) {
536 if ( fseek64(Resources.IndexStream, position, origin) != 0 )
537 throw BamException("BamToolsIndex::Seek", "could not seek in BAI file");
540 void BamToolsIndex::SkipBlocks(const int& numBlocks) {
541 Seek( numBlocks*BamToolsIndex::SIZEOF_BLOCK, SEEK_CUR );
544 int64_t BamToolsIndex::Tell(void) const {
545 return ftell64(Resources.IndexStream);
548 void BamToolsIndex::WriteBlock(const BtiBlock& block) {
551 int32_t maxEndPosition = block.MaxEndPosition;
552 int64_t startOffset = block.StartOffset;
553 int32_t startPosition = block.StartPosition;
555 // swap endian-ness if necessary
556 if ( m_isBigEndian ) {
557 SwapEndian_32(maxEndPosition);
558 SwapEndian_64(startOffset);
559 SwapEndian_32(startPosition);
562 // write the reference index entry
563 size_t elementsWritten = 0;
564 elementsWritten += fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, Resources.IndexStream);
565 elementsWritten += fwrite(&startOffset, sizeof(startOffset), 1, Resources.IndexStream);
566 elementsWritten += fwrite(&startPosition, sizeof(startPosition), 1, Resources.IndexStream);
567 if ( elementsWritten != 3 )
568 throw BamException("BamToolsIndex::WriteBlock", "could not write BTI block");
571 void BamToolsIndex::WriteBlocks(const BtiBlockVector& blocks) {
572 BtiBlockVector::const_iterator blockIter = blocks.begin();
573 BtiBlockVector::const_iterator blockEnd = blocks.end();
574 for ( ; blockIter != blockEnd; ++blockIter )
575 WriteBlock(*blockIter);
578 void BamToolsIndex::WriteHeader(void) {
580 size_t elementsWritten = 0;
582 // write BTI index format 'magic number'
583 elementsWritten += fwrite(BamToolsIndex::BTI_MAGIC, 1, 4, Resources.IndexStream);
585 // write BTI index format version
586 int32_t currentVersion = (int32_t)m_outputVersion;
587 if ( m_isBigEndian ) SwapEndian_32(currentVersion);
588 elementsWritten += fwrite(¤tVersion, sizeof(currentVersion), 1, Resources.IndexStream);
591 uint32_t blockSize = m_blockSize;
592 if ( m_isBigEndian ) SwapEndian_32(blockSize);
593 elementsWritten += fwrite(&blockSize, sizeof(blockSize), 1, Resources.IndexStream);
595 // write number of references
596 int32_t numReferences = m_indexFileSummary.size();
597 if ( m_isBigEndian ) SwapEndian_32(numReferences);
598 elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, Resources.IndexStream);
600 if ( elementsWritten != 7 )
601 throw BamException("BamToolsIndex::WriteHeader", "could not write BTI header");
604 void BamToolsIndex::WriteReferenceEntry(const BtiReferenceEntry& refEntry) {
606 // write number of blocks this reference
607 uint32_t numBlocks = refEntry.Blocks.size();
608 if ( m_isBigEndian ) SwapEndian_32(numBlocks);
609 const size_t elementsWritten = fwrite(&numBlocks, sizeof(numBlocks), 1, Resources.IndexStream);
610 if ( elementsWritten != 1 )
611 throw BamException("BamToolsIndex::WriteReferenceEntry", "could not write number of blocks");
613 // write actual block entries
614 WriteBlocks(refEntry.Blocks);