1 // ***************************************************************************
2 // BamToolsIndex.cpp (c) 2010 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 6 October 2011 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides index operations for the BamTools index format (".bti")
8 // ***************************************************************************
10 #include <api/BamAlignment.h>
11 #include <api/internal/BamException_p.h>
12 #include <api/internal/BamReader_p.h>
13 #include <api/internal/BamToolsIndex_p.h>
14 #include <api/internal/BgzfStream_p.h>
15 using namespace BamTools;
16 using namespace BamTools::Internal;
27 // --------------------------------
28 // static BamToolsIndex constants
29 // --------------------------------
31 const uint32_t BamToolsIndex::DEFAULT_BLOCK_LENGTH = 1000;
32 const string BamToolsIndex::BTI_EXTENSION = ".bti";
33 const char* const BamToolsIndex::BTI_MAGIC = "BTI\1";
34 const int BamToolsIndex::SIZEOF_BLOCK = sizeof(int32_t)*2 + sizeof(int64_t);
36 // ----------------------------
37 // RaiiWrapper implementation
38 // ----------------------------
40 BamToolsIndex::RaiiWrapper::RaiiWrapper(void)
44 BamToolsIndex::RaiiWrapper::~RaiiWrapper(void) {
49 // ------------------------------
50 // BamToolsIndex implementation
51 // ------------------------------
54 BamToolsIndex::BamToolsIndex(Internal::BamReaderPrivate* reader)
56 , m_cacheMode(BamIndex::LimitedIndexCaching)
57 , m_blockSize(BamToolsIndex::DEFAULT_BLOCK_LENGTH)
59 , m_outputVersion(BTI_1_2) // latest version - used for writing new index files
61 m_isBigEndian = BamTools::SystemIsBigEndian();
65 BamToolsIndex::~BamToolsIndex(void) {
69 void BamToolsIndex::CheckMagicNumber(void) {
73 size_t elementsRead = fread(magic, sizeof(char), 4, Resources.IndexStream);
74 if ( elementsRead != 4 )
75 throw BamException("BamToolsIndex::CheckMagicNumber", "could not read BTI magic number");
77 // validate expected magic number
78 if ( strncmp(magic, BamToolsIndex::BTI_MAGIC, 4) != 0 )
79 throw BamException("BamToolsIndex::CheckMagicNumber", "invalid BTI magic number");
82 // check index file version, return true if OK
83 void BamToolsIndex::CheckVersion(void) {
85 // read version from file
86 size_t elementsRead = fread(&m_inputVersion, sizeof(m_inputVersion), 1, Resources.IndexStream);
87 if ( elementsRead != 1 )
88 throw BamException("BamToolsIndex::CheckVersion", "could not read format version");
89 if ( m_isBigEndian ) SwapEndian_32(m_inputVersion);
91 // if version is negative, or zero
92 if ( m_inputVersion <= 0 )
93 throw BamException("BamToolsIndex::CheckVersion", "invalid format version");
95 // if version is newer than can be supported by this version of bamtools
96 else if ( m_inputVersion > m_outputVersion ) {
97 const string message = "unsupported format: this index was created by a newer version of BamTools. "
98 "Update your local version of BamTools to use the index file.";
99 throw BamException("BamToolsIndex::CheckVersion", message);
102 // ------------------------------------------------------------------
103 // check for deprecated, unsupported versions
104 // (the format had to be modified to accomodate a particular bug fix)
106 else if ( (Version)m_inputVersion == BamToolsIndex::BTI_1_0 ) {
107 const string message = "unsupported format: this version of the index may not properly handle "
108 "reference ends. Please run 'bamtools index -bti -in yourData.bam' to "
109 "generate an up-to-date, fixed BTI file.";
110 throw BamException("BamToolsIndex::CheckVersion", message);
113 else if ( (Version)m_inputVersion == BamToolsIndex::BTI_1_1 ) {
114 const string message = "unsupported format: this version of the index may not properly handle "
115 "empty references. Please run 'bamtools index -bti -in yourData.bam to "
116 "generate an up-to-date, fixed BTI file.";
117 throw BamException("BamToolsIndex::CheckVersion", message);
121 void BamToolsIndex::ClearReferenceEntry(BtiReferenceEntry& refEntry) {
123 refEntry.Blocks.clear();
126 void BamToolsIndex::CloseFile(void) {
127 if ( IsFileOpen() ) {
128 fclose(Resources.IndexStream);
129 Resources.IndexStream = 0;
131 m_indexFileSummary.clear();
134 // builds index from associated BAM file & writes out to index file
135 bool BamToolsIndex::Create(void) {
137 // skip if BamReader is invalid or not open
138 if ( m_reader == 0 || !m_reader->IsOpen() ) {
139 SetErrorString("BamToolsIndex::Create", "could not create index: reader is not open");
144 if ( !m_reader->Rewind() ) {
145 const string readerError = m_reader->GetErrorString();
146 const string message = "could not create index: \n\t" + readerError;
147 SetErrorString("BamToolsIndex::Create", message);
152 // open new index file (read & write)
153 string indexFilename = m_reader->Filename() + Extension();
154 OpenFile(indexFilename, "w+b");
156 // initialize BtiFileSummary with number of references
157 const int& numReferences = m_reader->GetReferenceCount();
158 InitializeFileSummary(numReferences);
160 // intialize output file header
163 // index building markers
164 uint32_t currentBlockCount = 0;
165 int64_t currentAlignmentOffset = m_reader->Tell();
166 int32_t blockRefId = -1;
167 int32_t blockMaxEndPosition = -1;
168 int64_t blockStartOffset = currentAlignmentOffset;
169 int32_t blockStartPosition = -1;
171 // plow through alignments, storing index entries
173 BtiReferenceEntry refEntry;
174 while ( m_reader->LoadNextAlignment(al) ) {
176 // if moved to new reference
177 if ( al.RefID != blockRefId ) {
179 // if first pass, check:
180 if ( currentBlockCount == 0 ) {
182 // write any empty references up to (but not including) al.RefID
183 for ( int i = 0; i < al.RefID; ++i )
184 WriteReferenceEntry( BtiReferenceEntry(i) );
190 // store previous BTI block data in reference entry
191 BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
192 refEntry.Blocks.push_back(block);
194 // write reference entry, then clear
195 WriteReferenceEntry(refEntry);
196 ClearReferenceEntry(refEntry);
198 // write any empty references between (but not including)
199 // the last blockRefID and current al.RefID
200 for ( int i = blockRefId+1; i < al.RefID; ++i )
201 WriteReferenceEntry( BtiReferenceEntry(i) );
204 currentBlockCount = 0;
207 // set ID for new reference entry
208 refEntry.ID = al.RefID;
211 // if beginning of block, update counters
212 if ( currentBlockCount == 0 ) {
213 blockRefId = al.RefID;
214 blockStartOffset = currentAlignmentOffset;
215 blockStartPosition = al.Position;
216 blockMaxEndPosition = al.GetEndPosition();
219 // increment block counter
222 // check end position
223 int32_t alignmentEndPosition = al.GetEndPosition();
224 if ( alignmentEndPosition > blockMaxEndPosition )
225 blockMaxEndPosition = alignmentEndPosition;
227 // if block is full, get offset for next block, reset currentBlockCount
228 if ( currentBlockCount == m_blockSize ) {
230 // store previous block data in reference entry
231 BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
232 refEntry.Blocks.push_back(block);
235 blockStartOffset = m_reader->Tell();
236 currentBlockCount = 0;
239 // not the best name, but for the next iteration, this value will be the offset of the
240 // *current* alignment. this is necessary because we won't know if this next alignment
241 // is on a new reference until we actually read it
242 currentAlignmentOffset = m_reader->Tell();
245 // after finishing alignments, if any data was read, check:
246 if ( blockRefId >= 0 ) {
248 // store last BTI block data in reference entry
249 BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
250 refEntry.Blocks.push_back(block);
252 // write last reference entry, then clear
253 WriteReferenceEntry(refEntry);
254 ClearReferenceEntry(refEntry);
256 // then write any empty references remaining at end of file
257 for ( int i = blockRefId+1; i < numReferences; ++i )
258 WriteReferenceEntry( BtiReferenceEntry(i) );
261 } catch ( BamException& e ) {
262 m_errorString = e.what();
267 if ( !m_reader->Rewind() ) {
268 const string readerError = m_reader->GetErrorString();
269 const string message = "could not create index: \n\t" + readerError;
270 SetErrorString("BamToolsIndex::Create", message);
278 // returns format's file extension
279 const std::string BamToolsIndex::Extension(void) {
280 return BamToolsIndex::BTI_EXTENSION;
283 void BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) {
285 // return false ref ID is not a valid index in file summary data
286 if ( region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size() )
287 throw BamException("BamToolsIndex::GetOffset", "invalid region requested");
289 // retrieve reference index data for left bound reference
290 BtiReferenceEntry refEntry(region.LeftRefID);
291 ReadReferenceEntry(refEntry);
293 // binary search for an overlapping block (may not be first one though)
295 typedef BtiBlockVector::const_iterator BtiBlockConstIterator;
296 BtiBlockConstIterator blockFirst = refEntry.Blocks.begin();
297 BtiBlockConstIterator blockIter = blockFirst;
298 BtiBlockConstIterator blockLast = refEntry.Blocks.end();
299 iterator_traits<BtiBlockConstIterator>::difference_type count = distance(blockFirst, blockLast);
300 iterator_traits<BtiBlockConstIterator>::difference_type step;
301 while ( count > 0 ) {
302 blockIter = blockFirst;
304 advance(blockIter, step);
306 const BtiBlock& block = (*blockIter);
307 if ( block.StartPosition <= region.RightPosition ) {
308 if ( block.MaxEndPosition >= region.LeftPosition ) {
309 offset = block.StartOffset;
312 blockFirst = ++blockIter;
318 // if we didn't search "off the end" of the blocks
319 if ( blockIter != blockLast ) {
321 // "walk back" until we've gone too far
322 while ( blockIter != blockFirst ) {
323 const BtiBlock& currentBlock = (*blockIter);
326 const BtiBlock& previousBlock = (*blockIter);
327 if ( previousBlock.MaxEndPosition < region.LeftPosition ) {
328 offset = currentBlock.StartOffset;
334 // if we walked all the way to first block, just return that and let the reader's
335 // region overlap parsing do the rest
336 if ( blockIter == blockFirst ) {
337 const BtiBlock& block = (*blockIter);
338 offset = block.StartOffset;
344 // sets to false if blocks container is empty, or if no matching block could be found
345 *hasAlignmentsInRegion = found;
348 // returns whether reference has alignments or no
349 bool BamToolsIndex::HasAlignments(const int& referenceID) const {
350 if ( referenceID < 0 || referenceID >= (int)m_indexFileSummary.size() )
352 const BtiReferenceSummary& refSummary = m_indexFileSummary.at(referenceID);
353 return ( refSummary.NumBlocks > 0 );
356 // pre-allocates space for each reference's summary data
357 void BamToolsIndex::InitializeFileSummary(const int& numReferences) {
358 m_indexFileSummary.clear();
359 for ( int i = 0; i < numReferences; ++i )
360 m_indexFileSummary.push_back( BtiReferenceSummary() );
363 // returns true if the index stream is open
364 bool BamToolsIndex::IsFileOpen(void) const {
365 return ( Resources.IndexStream != 0 );
368 // attempts to use index data to jump to @region, returns success/fail
369 // a "successful" jump indicates no error, but not whether this region has data
370 // * thus, the method sets a flag to indicate whether there are alignments
371 // available after the jump position
372 bool BamToolsIndex::Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion) {
375 *hasAlignmentsInRegion = false;
377 // skip if invalid reader or not open
378 if ( m_reader == 0 || !m_reader->IsOpen() ) {
379 SetErrorString("BamToolsIndex::Jump", "could not jump: reader is not open");
383 // make sure left-bound position is valid
384 const RefVector& references = m_reader->GetReferenceData();
385 if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) {
386 SetErrorString("BamToolsIndex::Jump", "could not create index: invalid region requested");
390 // calculate nearest offset to jump to
393 GetOffset(region, offset, hasAlignmentsInRegion);
394 } catch ( BamException& e ) {
395 m_errorString = e.what();
399 // return success/failure of seek
400 return m_reader->Seek(offset);
403 // loads existing data from file into memory
404 bool BamToolsIndex::Load(const std::string& filename) {
408 // attempt to open file (read-only)
409 OpenFile(filename, "rb");
411 // load metadata & generate in-memory summary
418 } catch ( BamException& e ) {
419 m_errorString = e.what();
424 void BamToolsIndex::LoadFileSummary(void) {
426 // load number of reference sequences
428 LoadNumReferences(numReferences);
430 // initialize file summary data
431 InitializeFileSummary(numReferences);
433 // load summary for each reference
434 BtiFileSummary::iterator summaryIter = m_indexFileSummary.begin();
435 BtiFileSummary::iterator summaryEnd = m_indexFileSummary.end();
436 for ( ; summaryIter != summaryEnd; ++summaryIter )
437 LoadReferenceSummary(*summaryIter);
440 void BamToolsIndex::LoadHeader(void) {
442 // check BTI file metadata
446 // use file's BTI block size to set member variable
447 const size_t elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, Resources.IndexStream);
448 if ( m_isBigEndian ) SwapEndian_32(m_blockSize);
449 if ( elementsRead != 1 )
450 throw BamException("BamToolsIndex::LoadHeader", "could not read BTI block size");
453 void BamToolsIndex::LoadNumBlocks(int& numBlocks) {
454 const size_t elementsRead = fread(&numBlocks, sizeof(numBlocks), 1, Resources.IndexStream);
455 if ( m_isBigEndian ) SwapEndian_32(numBlocks);
456 if ( elementsRead != 1 )
457 throw BamException("BamToolsIndex::LoadNumBlocks", "could not read number of BTI blocks");
460 void BamToolsIndex::LoadNumReferences(int& numReferences) {
461 const size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, Resources.IndexStream);
462 if ( m_isBigEndian ) SwapEndian_32(numReferences);
463 if ( elementsRead != 1 )
464 throw BamException("BamToolsIndex::LoadNumReferences", "could not read number of references");
467 void BamToolsIndex::LoadReferenceSummary(BtiReferenceSummary& refSummary) {
469 // load number of blocks
471 LoadNumBlocks(numBlocks);
473 // store block summary data for this reference
474 refSummary.NumBlocks = numBlocks;
475 refSummary.FirstBlockFilePosition = Tell();
477 // skip reference's blocks
478 SkipBlocks(numBlocks);
481 void BamToolsIndex::OpenFile(const std::string& filename, const char* mode) {
483 // make sure any previous index file is closed
486 // attempt to open file
487 Resources.IndexStream = fopen(filename.c_str(), mode);
488 if ( !IsFileOpen() ) {
489 const string message = string("could not open file: ") + filename;
490 throw BamException("BamToolsIndex::OpenFile", message);
494 void BamToolsIndex::ReadBlock(BtiBlock& block) {
496 // read in block data members
497 size_t elementsRead = 0;
498 elementsRead += fread(&block.MaxEndPosition, sizeof(block.MaxEndPosition), 1, Resources.IndexStream);
499 elementsRead += fread(&block.StartOffset, sizeof(block.StartOffset), 1, Resources.IndexStream);
500 elementsRead += fread(&block.StartPosition, sizeof(block.StartPosition), 1, Resources.IndexStream);
502 // swap endian-ness if necessary
503 if ( m_isBigEndian ) {
504 SwapEndian_32(block.MaxEndPosition);
505 SwapEndian_64(block.StartOffset);
506 SwapEndian_32(block.StartPosition);
509 if ( elementsRead != 3 )
510 throw BamException("BamToolsIndex::ReadBlock", "could not read block");
513 void BamToolsIndex::ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks) {
515 // prep blocks container
517 blocks.reserve(refSummary.NumBlocks);
519 // skip to first block entry
520 Seek( refSummary.FirstBlockFilePosition, SEEK_SET );
522 // read & store block entries
524 for ( int i = 0; i < refSummary.NumBlocks; ++i ) {
526 blocks.push_back(block);
530 void BamToolsIndex::ReadReferenceEntry(BtiReferenceEntry& refEntry) {
532 // return false if refId not valid index in file summary structure
533 if ( refEntry.ID < 0 || refEntry.ID >= (int)m_indexFileSummary.size() )
534 throw BamException("BamToolsIndex::ReadReferenceEntry", "invalid reference requested");
536 // use index summary to assist reading the reference's BTI blocks
537 const BtiReferenceSummary& refSummary = m_indexFileSummary.at(refEntry.ID);
538 ReadBlocks(refSummary, refEntry.Blocks);
541 void BamToolsIndex::Seek(const int64_t& position, const int& origin) {
542 if ( fseek64(Resources.IndexStream, position, origin) != 0 )
543 throw BamException("BamToolsIndex::Seek", "could not seek in BAI file");
546 // change the index caching behavior
547 void BamToolsIndex::SetCacheMode(const BamIndex::IndexCacheMode& mode) {
549 // do nothing else here ? cache mode will be ignored from now on, most likely
552 void BamToolsIndex::SkipBlocks(const int& numBlocks) {
553 Seek( numBlocks*BamToolsIndex::SIZEOF_BLOCK, SEEK_CUR );
556 int64_t BamToolsIndex::Tell(void) const {
557 return ftell64(Resources.IndexStream);
560 void BamToolsIndex::WriteBlock(const BtiBlock& block) {
563 int32_t maxEndPosition = block.MaxEndPosition;
564 int64_t startOffset = block.StartOffset;
565 int32_t startPosition = block.StartPosition;
567 // swap endian-ness if necessary
568 if ( m_isBigEndian ) {
569 SwapEndian_32(maxEndPosition);
570 SwapEndian_64(startOffset);
571 SwapEndian_32(startPosition);
574 // write the reference index entry
575 size_t elementsWritten = 0;
576 elementsWritten += fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, Resources.IndexStream);
577 elementsWritten += fwrite(&startOffset, sizeof(startOffset), 1, Resources.IndexStream);
578 elementsWritten += fwrite(&startPosition, sizeof(startPosition), 1, Resources.IndexStream);
579 if ( elementsWritten != 3 )
580 throw BamException("BamToolsIndex::WriteBlock", "could not write BTI block");
583 void BamToolsIndex::WriteBlocks(const BtiBlockVector& blocks) {
584 BtiBlockVector::const_iterator blockIter = blocks.begin();
585 BtiBlockVector::const_iterator blockEnd = blocks.end();
586 for ( ; blockIter != blockEnd; ++blockIter )
587 WriteBlock(*blockIter);
590 void BamToolsIndex::WriteHeader(void) {
592 size_t elementsWritten = 0;
594 // write BTI index format 'magic number'
595 elementsWritten += fwrite(BamToolsIndex::BTI_MAGIC, 1, 4, Resources.IndexStream);
597 // write BTI index format version
598 int32_t currentVersion = (int32_t)m_outputVersion;
599 if ( m_isBigEndian ) SwapEndian_32(currentVersion);
600 elementsWritten += fwrite(¤tVersion, sizeof(currentVersion), 1, Resources.IndexStream);
603 uint32_t blockSize = m_blockSize;
604 if ( m_isBigEndian ) SwapEndian_32(blockSize);
605 elementsWritten += fwrite(&blockSize, sizeof(blockSize), 1, Resources.IndexStream);
607 // write number of references
608 int32_t numReferences = m_indexFileSummary.size();
609 if ( m_isBigEndian ) SwapEndian_32(numReferences);
610 elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, Resources.IndexStream);
612 if ( elementsWritten != 7 )
613 throw BamException("BamToolsIndex::WriteHeader", "could not write BTI header");
616 void BamToolsIndex::WriteReferenceEntry(const BtiReferenceEntry& refEntry) {
618 // write number of blocks this reference
619 uint32_t numBlocks = refEntry.Blocks.size();
620 if ( m_isBigEndian ) SwapEndian_32(numBlocks);
621 const size_t elementsWritten = fwrite(&numBlocks, sizeof(numBlocks), 1, Resources.IndexStream);
622 if ( elementsWritten != 1 )
623 throw BamException("BamToolsIndex::WriteReferenceEntry", "could not write number of blocks");
625 // write actual block entries
626 WriteBlocks(refEntry.Blocks);