1 // ***************************************************************************
2 // BamToolsIndex.cpp (c) 2010 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 27 April 2011 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides index operations for the BamTools index format (".bti")
8 // ***************************************************************************
10 #include <api/BamAlignment.h>
11 #include <api/internal/BamReader_p.h>
12 #include <api/internal/BamToolsIndex_p.h>
13 #include <api/internal/BgzfStream_p.h>
14 using namespace BamTools;
15 using namespace BamTools::Internal;
26 // static BamToolsIndex constants
27 const int BamToolsIndex::DEFAULT_BLOCK_LENGTH = 1000;
28 const string BamToolsIndex::BTI_EXTENSION = ".bti";
29 const char* const BamToolsIndex::BTI_MAGIC = "BTI\1";
30 const int BamToolsIndex::SIZEOF_BLOCK = sizeof(int32_t)*2 + sizeof(int64_t);
33 BamToolsIndex::BamToolsIndex(Internal::BamReaderPrivate* reader)
36 , m_cacheMode(BamIndex::LimitedIndexCaching)
37 , m_blockSize(BamToolsIndex::DEFAULT_BLOCK_LENGTH)
39 , m_outputVersion(BTI_1_2) // latest version - used for writing new index files
41 m_isBigEndian = BamTools::SystemIsBigEndian();
45 BamToolsIndex::~BamToolsIndex(void) {
49 bool BamToolsIndex::CheckMagicNumber(void) {
51 // check 'magic number' to see if file is BTI index
53 size_t elementsRead = fread(magic, sizeof(char), 4, m_indexStream);
54 if ( elementsRead != 4 ) {
55 cerr << "BamToolsIndex ERROR: could not read format 'magic' number" << endl;
59 if ( strncmp(magic, BamToolsIndex::BTI_MAGIC, 4) != 0 ) {
60 cerr << "BamToolsIndex ERROR: invalid format" << endl;
68 // check index file version, return true if OK
69 bool BamToolsIndex::CheckVersion(void) {
71 // read version from file
72 size_t elementsRead = fread(&m_inputVersion, sizeof(m_inputVersion), 1, m_indexStream);
73 if ( elementsRead != 1 ) return false;
74 if ( m_isBigEndian ) SwapEndian_32(m_inputVersion);
76 // if version is negative, or zero
77 if ( m_inputVersion <= 0 ) {
78 cerr << "BamToolsIndex ERROR: could not load index file: invalid version."
83 // if version is newer than can be supported by this version of bamtools
84 else if ( m_inputVersion > m_outputVersion ) {
85 cerr << "BamToolsIndex ERROR: could not load index file. This version of BamTools does not recognize new index file version"
87 << "Please update BamTools to a more recent version to support this index file."
92 // ------------------------------------------------------------------
93 // check for deprecated, unsupported versions
94 // (typically whose format did not accomodate a particular bug fix)
96 else if ( (Version)m_inputVersion == BamToolsIndex::BTI_1_0 ) {
97 cerr << "BamToolsIndex ERROR: could not load index file. This version of the index contains a bug related to accessing data near reference ends."
99 << "Please run 'bamtools index -bti -in yourData.bam' to generate an up-to-date, fixed BTI file."
104 else if ( (Version)m_inputVersion == BamToolsIndex::BTI_1_1 ) {
105 cerr << "BamToolsIndex ERROR: could not load index file. This version of the index contains a bug related to handling empty references."
107 << "Please run 'bamtools index -bti -in yourData.bam' to generate an up-to-date, fixed BTI file."
116 void BamToolsIndex::ClearReferenceEntry(BtiReferenceEntry& refEntry) {
118 refEntry.Blocks.clear();
121 void BamToolsIndex::CloseFile(void) {
123 fclose(m_indexStream);
124 m_indexFileSummary.clear();
127 // builds index from associated BAM file & writes out to index file
128 bool BamToolsIndex::Create(void) {
130 // return false if BamReader is invalid or not open
131 if ( m_reader == 0 || !m_reader->IsOpen() ) {
132 cerr << "BamToolsIndex ERROR: BamReader is not open"
133 << ", aborting index creation" << endl;
138 if ( !m_reader->Rewind() ) {
139 cerr << "BamToolsIndex ERROR: could not rewind BamReader to create index"
140 << ", aborting index creation" << endl;
144 // open new index file (read & write)
145 string indexFilename = m_reader->Filename() + Extension();
146 if ( !OpenFile(indexFilename, "w+b") ) {
147 cerr << "BamToolsIndex ERROR: could not open output index file " << indexFilename
148 << ", aborting index creation" << endl;
152 // initialize BtiFileSummary with number of references
153 const int& numReferences = m_reader->GetReferenceCount();
154 InitializeFileSummary(numReferences);
156 // initialize output file
157 bool createdOk = true;
158 createdOk &= WriteHeader();
160 // index building markers
161 int32_t currentBlockCount = 0;
162 int64_t currentAlignmentOffset = m_reader->Tell();
163 int32_t blockRefId = -1;
164 int32_t blockMaxEndPosition = -1;
165 int64_t blockStartOffset = currentAlignmentOffset;
166 int32_t blockStartPosition = -1;
168 // plow through alignments, storing index entries
170 BtiReferenceEntry refEntry;
171 while ( m_reader->LoadNextAlignment(al) ) {
173 // if moved to new reference
174 if ( al.RefID != blockRefId ) {
176 // if first pass, check:
177 if ( currentBlockCount == 0 ) {
179 // write any empty references up to (but not including) al.RefID
180 for ( int i = 0; i < al.RefID; ++i ) {
181 BtiReferenceEntry emptyEntry(i);
182 createdOk &= WriteReferenceEntry(emptyEntry);
189 // store previous BTI block data in reference entry
190 BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
191 refEntry.Blocks.push_back(block);
193 // write reference entry, then clear
194 createdOk &= WriteReferenceEntry(refEntry);
195 ClearReferenceEntry(refEntry);
197 // write any empty references between (but not including) the last blockRefID and current al.RefID
198 for ( int i = blockRefId+1; i < al.RefID; ++i ) {
199 BtiReferenceEntry emptyEntry(i);
200 createdOk &= WriteReferenceEntry(emptyEntry);
204 currentBlockCount = 0;
207 // set ID for new reference entry
208 refEntry.ID = al.RefID;
211 // if beginning of block, update counters
212 if ( currentBlockCount == 0 ) {
213 blockRefId = al.RefID;
214 blockStartOffset = currentAlignmentOffset;
215 blockStartPosition = al.Position;
216 blockMaxEndPosition = al.GetEndPosition();
219 // increment block counter
222 // check end position
223 int32_t alignmentEndPosition = al.GetEndPosition();
224 if ( alignmentEndPosition > blockMaxEndPosition )
225 blockMaxEndPosition = alignmentEndPosition;
227 // if block is full, get offset for next block, reset currentBlockCount
228 if ( currentBlockCount == m_blockSize ) {
230 // store previous block data in reference entry
231 BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
232 refEntry.Blocks.push_back(block);
235 blockStartOffset = m_reader->Tell();
236 currentBlockCount = 0;
239 // not the best name, but for the next iteration, this value will be the offset of the *current* alignment
240 // necessary because we won't know if this next alignment is on a new reference until we actually read it
241 currentAlignmentOffset = m_reader->Tell();
244 // after finishing alignments, if any data was read, check:
245 if ( blockRefId >= 0 ) {
247 // store last BTI block data in reference entry
248 BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
249 refEntry.Blocks.push_back(block);
251 // write last reference entry, then clear
252 createdOk &= WriteReferenceEntry(refEntry);
253 ClearReferenceEntry(refEntry);
255 // then write any empty references remaining at end of file
256 for ( int i = blockRefId+1; i < numReferences; ++i ) {
257 BtiReferenceEntry emptyEntry(i);
258 createdOk &= WriteReferenceEntry(emptyEntry);
262 // rewind reader & return result
263 createdOk &= m_reader->Rewind();
269 // returns format's file extension
270 const std::string BamToolsIndex::Extension(void) {
271 return BamToolsIndex::BTI_EXTENSION;
274 bool BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) {
276 // return false ref ID is not a valid index in file summary data
277 if ( region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size() )
280 // retrieve reference index data for left bound reference
281 BtiReferenceEntry refEntry(region.LeftRefID);
282 if ( !ReadReferenceEntry(refEntry) ) {
283 cerr << "BamToolsIndex ERROR: could not retrieve index data from BTI file" << endl;
287 // binary search for an overlapping block (may not be first one though)
289 typedef BtiBlockVector::const_iterator BtiBlockConstIterator;
290 BtiBlockConstIterator blockFirst = refEntry.Blocks.begin();
291 BtiBlockConstIterator blockIter = blockFirst;
292 BtiBlockConstIterator blockLast = refEntry.Blocks.end();
293 iterator_traits<BtiBlockConstIterator>::difference_type count = distance(blockFirst, blockLast);
294 iterator_traits<BtiBlockConstIterator>::difference_type step;
295 while ( count > 0 ) {
296 blockIter = blockFirst;
298 advance(blockIter, step);
300 const BtiBlock& block = (*blockIter);
301 if ( block.StartPosition <= region.RightPosition ) {
302 if ( block.MaxEndPosition >= region.LeftPosition ) {
303 offset = block.StartOffset;
306 blockFirst = ++blockIter;
312 // if we didn't search "off the end" of the blocks
313 if ( blockIter != blockLast ) {
315 // "walk back" until we've gone too far
316 while ( blockIter != blockFirst ) {
317 const BtiBlock& currentBlock = (*blockIter);
320 const BtiBlock& previousBlock = (*blockIter);
321 if ( previousBlock.MaxEndPosition < region.LeftPosition ) {
322 offset = currentBlock.StartOffset;
328 // if we walked all the way to first block, just return that and let the reader's
329 // region overlap parsing do the rest
330 if ( blockIter == blockFirst ) {
331 const BtiBlock& block = (*blockIter);
332 offset = block.StartOffset;
338 // sets to false if blocks container is empty, or if no matching block could be found
339 *hasAlignmentsInRegion = found;
345 // returns whether reference has alignments or no
346 bool BamToolsIndex::HasAlignments(const int& referenceID) const {
347 if ( referenceID < 0 || referenceID >= (int)m_indexFileSummary.size() )
349 const BtiReferenceSummary& refSummary = m_indexFileSummary.at(referenceID);
350 return ( refSummary.NumBlocks > 0 );
353 void BamToolsIndex::InitializeFileSummary(const int& numReferences) {
354 m_indexFileSummary.clear();
355 for ( int i = 0; i < numReferences; ++i )
356 m_indexFileSummary.push_back( BtiReferenceSummary() );
359 bool BamToolsIndex::IsFileOpen(void) const {
360 return ( m_indexStream != 0 );
363 // attempts to use index data to jump to @region, returns success/fail
364 // a "successful" jump indicates no error, but not whether this region has data
365 // * thus, the method sets a flag to indicate whether there are alignments
366 // available after the jump position
367 bool BamToolsIndex::Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion) {
370 *hasAlignmentsInRegion = false;
372 // skip if invalid reader or not open
373 if ( m_reader == 0 || !m_reader->IsOpen() )
376 // make sure left-bound position is valid
377 const RefVector& references = m_reader->GetReferenceData();
378 if ( region.LeftPosition > references.at(region.LeftRefID).RefLength )
381 // calculate nearest offset to jump to
383 if ( !GetOffset(region, offset, hasAlignmentsInRegion) ) {
384 cerr << "BamToolsIndex ERROR: could not jump"
385 << ", unable to calculate offset for specified region" << endl;
389 // return success/failure of seek
390 return m_reader->Seek(offset);
393 // loads existing data from file into memory
394 bool BamToolsIndex::Load(const std::string& filename) {
396 // attempt open index file (read-only)
397 if ( !OpenFile(filename, "rb") ) {
398 cerr << "BamToolsIndex ERROR: could not open input index file " << filename
399 << ", aborting index load" << endl;
403 // attempt to load & validate BTI header data
404 if ( !LoadHeader() ) {
405 cerr << "BamToolsIndex ERROR: could load header from index file " << filename
406 << ", aborting index load" << endl;
411 // attempt to load index file summary
412 if ( !LoadFileSummary() ) {
413 cerr << "BamToolsIndex ERROR: could not generate a summary of index file " << filename
414 << ", aborting index load" << endl;
419 // if we get here, index summary is loaded OK
423 bool BamToolsIndex::LoadFileSummary(void) {
425 // load number of reference sequences
427 if ( !LoadNumReferences(numReferences) )
430 // initialize file summary data
431 InitializeFileSummary(numReferences);
433 // iterate over reference entries
434 bool loadedOk = true;
435 BtiFileSummary::iterator summaryIter = m_indexFileSummary.begin();
436 BtiFileSummary::iterator summaryEnd = m_indexFileSummary.end();
437 for ( ; summaryIter != summaryEnd; ++summaryIter )
438 loadedOk &= LoadReferenceSummary(*summaryIter);
444 bool BamToolsIndex::LoadHeader(void) {
446 // if invalid format 'magic number'
447 if ( !CheckMagicNumber() )
450 // if invalid BTI version
451 if ( !CheckVersion() )
454 // use file's BTI block size to set member variable
455 size_t elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, m_indexStream);
456 if ( m_isBigEndian ) SwapEndian_32(m_blockSize);
457 return ( elementsRead == 1 );
460 bool BamToolsIndex::LoadNumBlocks(int& numBlocks) {
461 size_t elementsRead = 0;
462 elementsRead += fread(&numBlocks, sizeof(numBlocks), 1, m_indexStream);
463 if ( m_isBigEndian ) SwapEndian_32(numBlocks);
464 return ( elementsRead == 1 );
467 bool BamToolsIndex::LoadNumReferences(int& numReferences) {
468 size_t elementsRead = 0;
469 elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_indexStream);
470 if ( m_isBigEndian ) SwapEndian_32(numReferences);
471 return ( elementsRead == 1 );
474 bool BamToolsIndex::LoadReferenceSummary(BtiReferenceSummary& refSummary) {
476 // load number of blocks
478 if ( !LoadNumBlocks(numBlocks) )
481 // store block summary data for this reference
482 refSummary.NumBlocks = numBlocks;
483 refSummary.FirstBlockFilePosition = Tell();
485 // skip blocks in index file (and return status)
486 return SkipBlocks(numBlocks);
489 bool BamToolsIndex::OpenFile(const std::string& filename, const char* mode) {
491 // make sure any previous index file is closed
494 // attempt to open file
495 m_indexStream = fopen(filename.c_str(), mode);
499 bool BamToolsIndex::ReadBlock(BtiBlock& block) {
501 // read in block data members
502 size_t elementsRead = 0;
503 elementsRead += fread(&block.MaxEndPosition, sizeof(block.MaxEndPosition), 1, m_indexStream);
504 elementsRead += fread(&block.StartOffset, sizeof(block.StartOffset), 1, m_indexStream);
505 elementsRead += fread(&block.StartPosition, sizeof(block.StartPosition), 1, m_indexStream);
507 // swap endian-ness if necessary
508 if ( m_isBigEndian ) {
509 SwapEndian_32(block.MaxEndPosition);
510 SwapEndian_64(block.StartOffset);
511 SwapEndian_32(block.StartPosition);
514 // return success/failure
515 return ( elementsRead == 3 );
518 bool BamToolsIndex::ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks) {
520 // prep blocks container
522 blocks.reserve(refSummary.NumBlocks);
524 // skip to first block entry
525 if ( !Seek( refSummary.FirstBlockFilePosition, SEEK_SET ) ) {
526 cerr << "BamToolsIndex ERROR: could not seek to position "
527 << refSummary.FirstBlockFilePosition << endl;
531 // read & store block entries
534 for ( int i = 0; i < refSummary.NumBlocks; ++i ) {
535 readOk &= ReadBlock(block);
536 blocks.push_back(block);
541 bool BamToolsIndex::ReadReferenceEntry(BtiReferenceEntry& refEntry) {
543 // return false if refId not valid index in file summary structure
544 if ( refEntry.ID < 0 || refEntry.ID >= (int)m_indexFileSummary.size() )
547 // use index summary to assist reading the reference's BTI blocks
548 const BtiReferenceSummary& refSummary = m_indexFileSummary.at(refEntry.ID);
549 return ReadBlocks(refSummary, refEntry.Blocks);
552 bool BamToolsIndex::Seek(const int64_t& position, const int& origin) {
553 return ( fseek64(m_indexStream, position, origin) == 0 );
556 // change the index caching behavior
557 void BamToolsIndex::SetCacheMode(const BamIndex::IndexCacheMode& mode) {
559 // do nothing else here ? cache mode will be ignored from now on, most likely
562 bool BamToolsIndex::SkipBlocks(const int& numBlocks) {
563 return Seek( numBlocks*BamToolsIndex::SIZEOF_BLOCK, SEEK_CUR );
566 int64_t BamToolsIndex::Tell(void) const {
567 return ftell64(m_indexStream);
570 bool BamToolsIndex::WriteBlock(const BtiBlock& block) {
573 int32_t maxEndPosition = block.MaxEndPosition;
574 int64_t startOffset = block.StartOffset;
575 int32_t startPosition = block.StartPosition;
577 // swap endian-ness if necessary
578 if ( m_isBigEndian ) {
579 SwapEndian_32(maxEndPosition);
580 SwapEndian_64(startOffset);
581 SwapEndian_32(startPosition);
584 // write the reference index entry
585 size_t elementsWritten = 0;
586 elementsWritten += fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, m_indexStream);
587 elementsWritten += fwrite(&startOffset, sizeof(startOffset), 1, m_indexStream);
588 elementsWritten += fwrite(&startPosition, sizeof(startPosition), 1, m_indexStream);
589 return ( elementsWritten == 3 );
592 bool BamToolsIndex::WriteBlocks(const BtiBlockVector& blocks) {
593 bool writtenOk = true;
594 BtiBlockVector::const_iterator blockIter = blocks.begin();
595 BtiBlockVector::const_iterator blockEnd = blocks.end();
596 for ( ; blockIter != blockEnd; ++blockIter )
597 writtenOk &= WriteBlock(*blockIter);
601 bool BamToolsIndex::WriteHeader(void) {
603 size_t elementsWritten = 0;
605 // write BTI index format 'magic number'
606 elementsWritten += fwrite(BamToolsIndex::BTI_MAGIC, 1, 4, m_indexStream);
608 // write BTI index format version
609 int32_t currentVersion = (int32_t)m_outputVersion;
610 if ( m_isBigEndian ) SwapEndian_32(currentVersion);
611 elementsWritten += fwrite(¤tVersion, sizeof(currentVersion), 1, m_indexStream);
614 int32_t blockSize = m_blockSize;
615 if ( m_isBigEndian ) SwapEndian_32(blockSize);
616 elementsWritten += fwrite(&blockSize, sizeof(blockSize), 1, m_indexStream);
618 // write number of references
619 int32_t numReferences = m_indexFileSummary.size();
620 if ( m_isBigEndian ) SwapEndian_32(numReferences);
621 elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, m_indexStream);
623 // return success/failure of write
624 return ( elementsWritten == 7 );
627 bool BamToolsIndex::WriteReferenceEntry(const BtiReferenceEntry& refEntry) {
629 size_t elementsWritten = 0;
631 // write number of blocks this reference
632 uint32_t numBlocks = refEntry.Blocks.size();
633 if ( m_isBigEndian ) SwapEndian_32(numBlocks);
634 elementsWritten += fwrite(&numBlocks, sizeof(numBlocks), 1, m_indexStream);
636 // write actual block entries
637 const bool blocksOk = WriteBlocks(refEntry.Blocks);
639 // return success/fail
640 return ( elementsWritten == 1) && blocksOk;