1 // ***************************************************************************
2 // BamToolsIndex.cpp (c) 2010 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 27 April 2011 (DB)
7 // ---------------------------------------------------------------------------
8 // Provides index operations for the BamTools index format (".bti")
9 // ***************************************************************************
11 #include <api/BamAlignment.h>
12 #include <api/internal/BamReader_p.h>
13 #include <api/internal/BamToolsIndex_p.h>
14 #include <api/internal/BgzfStream_p.h>
15 using namespace BamTools;
16 using namespace BamTools::Internal;
27 // static BamToolsIndex constants
28 const int BamToolsIndex::DEFAULT_BLOCK_LENGTH = 1000;
29 const string BamToolsIndex::BTI_EXTENSION = ".bti";
30 const char* const BamToolsIndex::BTI_MAGIC = "BTI\1";
31 const int BamToolsIndex::SIZEOF_BLOCK = sizeof(int32_t)*2 + sizeof(int64_t);
34 BamToolsIndex::BamToolsIndex(Internal::BamReaderPrivate* reader)
37 , m_cacheMode(BamIndex::LimitedIndexCaching)
38 , m_blockSize(BamToolsIndex::DEFAULT_BLOCK_LENGTH)
40 , m_outputVersion(BTI_1_2) // latest version - used for writing new index files
42 m_isBigEndian = BamTools::SystemIsBigEndian();
46 BamToolsIndex::~BamToolsIndex(void) {
50 bool BamToolsIndex::CheckMagicNumber(void) {
52 // check 'magic number' to see if file is BTI index
54 size_t elementsRead = fread(magic, sizeof(char), 4, m_indexStream);
55 if ( elementsRead != 4 ) {
56 cerr << "BamToolsIndex ERROR: could not read format 'magic' number" << endl;
60 if ( strncmp(magic, BamToolsIndex::BTI_MAGIC, 4) != 0 ) {
61 cerr << "BamToolsIndex ERROR: invalid format" << endl;
69 // check index file version, return true if OK
70 bool BamToolsIndex::CheckVersion(void) {
72 // read version from file
73 size_t elementsRead = fread(&m_inputVersion, sizeof(m_inputVersion), 1, m_indexStream);
74 if ( elementsRead != 1 ) return false;
75 if ( m_isBigEndian ) SwapEndian_32(m_inputVersion);
77 // if version is negative, or zero
78 if ( m_inputVersion <= 0 ) {
79 cerr << "BamToolsIndex ERROR: could not load index file: invalid version."
84 // if version is newer than can be supported by this version of bamtools
85 else if ( m_inputVersion > m_outputVersion ) {
86 cerr << "BamToolsIndex ERROR: could not load index file. This version of BamTools does not recognize new index file version"
88 << "Please update BamTools to a more recent version to support this index file."
93 // ------------------------------------------------------------------
94 // check for deprecated, unsupported versions
95 // (typically whose format did not accomodate a particular bug fix)
97 else if ( (Version)m_inputVersion == BamToolsIndex::BTI_1_0 ) {
98 cerr << "BamToolsIndex ERROR: could not load index file. This version of the index contains a bug related to accessing data near reference ends."
100 << "Please run 'bamtools index -bti -in yourData.bam' to generate an up-to-date, fixed BTI file."
105 else if ( (Version)m_inputVersion == BamToolsIndex::BTI_1_1 ) {
106 cerr << "BamToolsIndex ERROR: could not load index file. This version of the index contains a bug related to handling empty references."
108 << "Please run 'bamtools index -bti -in yourData.bam' to generate an up-to-date, fixed BTI file."
117 void BamToolsIndex::ClearReferenceEntry(BtiReferenceEntry& refEntry) {
119 refEntry.Blocks.clear();
122 void BamToolsIndex::CloseFile(void) {
124 fclose(m_indexStream);
125 m_indexFileSummary.clear();
128 // builds index from associated BAM file & writes out to index file
129 bool BamToolsIndex::Create(void) {
131 // return false if BamReader is invalid or not open
132 if ( m_reader == 0 || !m_reader->IsOpen() ) {
133 cerr << "BamToolsIndex ERROR: BamReader is not open"
134 << ", aborting index creation" << endl;
139 if ( !m_reader->Rewind() ) {
140 cerr << "BamToolsIndex ERROR: could not rewind BamReader to create index"
141 << ", aborting index creation" << endl;
145 // open new index file (read & write)
146 string indexFilename = m_reader->Filename() + Extension();
147 if ( !OpenFile(indexFilename, "w+b") ) {
148 cerr << "BamToolsIndex ERROR: could not open ouput index file " << indexFilename
149 << ", aborting index creation" << endl;
153 // initialize BtiFileSummary with number of references
154 const int& numReferences = m_reader->GetReferenceCount();
155 InitializeFileSummary(numReferences);
157 // initialize output file
158 bool createdOk = true;
159 createdOk &= WriteHeader();
161 // index building markers
162 int32_t currentBlockCount = 0;
163 int64_t currentAlignmentOffset = m_reader->Tell();
164 int32_t blockRefId = -1;
165 int32_t blockMaxEndPosition = -1;
166 int64_t blockStartOffset = currentAlignmentOffset;
167 int32_t blockStartPosition = -1;
169 // plow through alignments, storing index entries
171 BtiReferenceEntry refEntry;
172 while ( m_reader->LoadNextAlignment(al) ) {
174 // if moved to new reference
175 if ( al.RefID != blockRefId ) {
177 // if first pass, check:
178 if ( currentBlockCount == 0 ) {
180 // write any empty references up to (but not including) al.RefID
181 for ( int i = 0; i < al.RefID; ++i ) {
182 BtiReferenceEntry emptyEntry(i);
183 createdOk &= WriteReferenceEntry(emptyEntry);
190 // store previous BTI block data in reference entry
191 BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
192 refEntry.Blocks.push_back(block);
194 // write reference entry, then clear
195 createdOk &= WriteReferenceEntry(refEntry);
196 ClearReferenceEntry(refEntry);
198 // write any empty references between (but not including) the last blockRefID and current al.RefID
199 for ( int i = blockRefId+1; i < al.RefID; ++i ) {
200 BtiReferenceEntry emptyEntry(i);
201 createdOk &= WriteReferenceEntry(emptyEntry);
205 currentBlockCount = 0;
208 // set ID for new reference entry
209 refEntry.ID = al.RefID;
212 // if beginning of block, update counters
213 if ( currentBlockCount == 0 ) {
214 blockRefId = al.RefID;
215 blockStartOffset = currentAlignmentOffset;
216 blockStartPosition = al.Position;
217 blockMaxEndPosition = al.GetEndPosition();
220 // increment block counter
223 // check end position
224 int32_t alignmentEndPosition = al.GetEndPosition();
225 if ( alignmentEndPosition > blockMaxEndPosition )
226 blockMaxEndPosition = alignmentEndPosition;
228 // if block is full, get offset for next block, reset currentBlockCount
229 if ( currentBlockCount == m_blockSize ) {
231 // store previous block data in reference entry
232 BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
233 refEntry.Blocks.push_back(block);
236 blockStartOffset = m_reader->Tell();
237 currentBlockCount = 0;
240 // not the best name, but for the next iteration, this value will be the offset of the *current* alignment
241 // necessary because we won't know if this next alignment is on a new reference until we actually read it
242 currentAlignmentOffset = m_reader->Tell();
245 // after finishing alignments, if any data was read, check:
246 if ( blockRefId >= 0 ) {
248 // store last BTI block data in reference entry
249 BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition);
250 refEntry.Blocks.push_back(block);
252 // write last reference entry, then clear
253 createdOk &= WriteReferenceEntry(refEntry);
254 ClearReferenceEntry(refEntry);
256 // then write any empty references remaining at end of file
257 for ( int i = blockRefId+1; i < numReferences; ++i ) {
258 BtiReferenceEntry emptyEntry(i);
259 createdOk &= WriteReferenceEntry(emptyEntry);
263 // rewind reader & return result
264 createdOk &= m_reader->Rewind();
270 // returns format's file extension
271 const std::string BamToolsIndex::Extension(void) {
272 return BamToolsIndex::BTI_EXTENSION;
275 bool BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) {
277 // return false ref ID is not a valid index in file summary data
278 if ( region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size() )
281 // retrieve reference index data for left bound reference
282 BtiReferenceEntry refEntry(region.LeftRefID);
283 if ( !ReadReferenceEntry(refEntry) ) {
284 cerr << "BamToolsIndex ERROR: could not retrieve index data from BTI file" << endl;
288 // binary search for an overlapping block (may not be first one though)
290 typedef BtiBlockVector::const_iterator BtiBlockConstIterator;
291 BtiBlockConstIterator blockFirst = refEntry.Blocks.begin();
292 BtiBlockConstIterator blockIter = blockFirst;
293 BtiBlockConstIterator blockLast = refEntry.Blocks.end();
294 iterator_traits<BtiBlockConstIterator>::difference_type count = distance(blockFirst, blockLast);
295 iterator_traits<BtiBlockConstIterator>::difference_type step;
296 while ( count > 0 ) {
297 blockIter = blockFirst;
299 advance(blockIter, step);
301 const BtiBlock& block = (*blockIter);
302 if ( block.StartPosition <= region.RightPosition ) {
303 if ( block.MaxEndPosition >= region.LeftPosition ) {
304 offset = block.StartOffset;
307 blockFirst = ++blockIter;
313 // if we didn't search "off the end" of the blocks
314 if ( blockIter != blockLast ) {
316 // "walk back" until we've gone too far
317 while ( blockIter != blockFirst ) {
318 const BtiBlock& currentBlock = (*blockIter);
321 const BtiBlock& previousBlock = (*blockIter);
322 if ( previousBlock.MaxEndPosition < region.LeftPosition ) {
323 offset = currentBlock.StartOffset;
329 // if we walked all the way to first block, just return that and let the reader's
330 // region overlap parsing do the rest
331 if ( blockIter == blockFirst ) {
332 const BtiBlock& block = (*blockIter);
333 offset = block.StartOffset;
339 // sets to false if blocks container is empty, or if no matching block could be found
340 *hasAlignmentsInRegion = found;
346 // returns whether reference has alignments or no
347 bool BamToolsIndex::HasAlignments(const int& referenceID) const {
348 if ( referenceID < 0 || referenceID >= (int)m_indexFileSummary.size() )
350 const BtiReferenceSummary& refSummary = m_indexFileSummary.at(referenceID);
351 return ( refSummary.NumBlocks > 0 );
354 void BamToolsIndex::InitializeFileSummary(const int& numReferences) {
355 m_indexFileSummary.clear();
356 for ( int i = 0; i < numReferences; ++i )
357 m_indexFileSummary.push_back( BtiReferenceSummary() );
360 bool BamToolsIndex::IsFileOpen(void) const {
361 return ( m_indexStream != 0 );
364 // attempts to use index data to jump to @region, returns success/fail
365 // a "successful" jump indicates no error, but not whether this region has data
366 // * thus, the method sets a flag to indicate whether there are alignments
367 // available after the jump position
368 bool BamToolsIndex::Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion) {
371 *hasAlignmentsInRegion = false;
373 // skip if invalid reader or not open
374 if ( m_reader == 0 || !m_reader->IsOpen() )
377 // make sure left-bound position is valid
378 const RefVector& references = m_reader->GetReferenceData();
379 if ( region.LeftPosition > references.at(region.LeftRefID).RefLength )
382 // calculate nearest offset to jump to
384 if ( !GetOffset(region, offset, hasAlignmentsInRegion) ) {
385 cerr << "BamToolsIndex ERROR: could not jump"
386 << ", unable to calculate offset for specified region" << endl;
390 // return success/failure of seek
391 return m_reader->Seek(offset);
394 // loads existing data from file into memory
395 bool BamToolsIndex::Load(const std::string& filename) {
397 // attempt open index file (read-only)
398 if ( !OpenFile(filename, "rb") ) {
399 cerr << "BamToolsIndex ERROR: could not open input index file " << filename
400 << ", aborting index load" << endl;
404 // attempt to load & validate BTI header data
405 if ( !LoadHeader() ) {
406 cerr << "BamToolsIndex ERROR: could load header from index file " << filename
407 << ", aborting index load" << endl;
412 // attempt to load index file summary
413 if ( !LoadFileSummary() ) {
414 cerr << "BamToolsIndex ERROR: could not generate a summary of index file " << filename
415 << ", aborting index load" << endl;
420 // if we get here, index summary is loaded OK
424 bool BamToolsIndex::LoadFileSummary(void) {
426 // load number of reference sequences
428 if ( !LoadNumReferences(numReferences) )
431 // initialize file summary data
432 InitializeFileSummary(numReferences);
434 // iterate over reference entries
435 bool loadedOk = true;
436 BtiFileSummary::iterator summaryIter = m_indexFileSummary.begin();
437 BtiFileSummary::iterator summaryEnd = m_indexFileSummary.end();
438 for ( ; summaryIter != summaryEnd; ++summaryIter )
439 loadedOk &= LoadReferenceSummary(*summaryIter);
445 bool BamToolsIndex::LoadHeader(void) {
447 // if invalid format 'magic number'
448 if ( !CheckMagicNumber() )
451 // if invalid BTI version
452 if ( !CheckVersion() )
455 // use file's BTI block size to set member variable
456 size_t elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, m_indexStream);
457 if ( m_isBigEndian ) SwapEndian_32(m_blockSize);
458 return ( elementsRead == 1 );
461 bool BamToolsIndex::LoadNumBlocks(int& numBlocks) {
462 size_t elementsRead = 0;
463 elementsRead += fread(&numBlocks, sizeof(numBlocks), 1, m_indexStream);
464 if ( m_isBigEndian ) SwapEndian_32(numBlocks);
465 return ( elementsRead == 1 );
468 bool BamToolsIndex::LoadNumReferences(int& numReferences) {
469 size_t elementsRead = 0;
470 elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_indexStream);
471 if ( m_isBigEndian ) SwapEndian_32(numReferences);
472 return ( elementsRead == 1 );
475 bool BamToolsIndex::LoadReferenceSummary(BtiReferenceSummary& refSummary) {
477 // load number of blocks
479 if ( !LoadNumBlocks(numBlocks) )
482 // store block summary data for this reference
483 refSummary.NumBlocks = numBlocks;
484 refSummary.FirstBlockFilePosition = Tell();
486 // skip blocks in index file (and return status)
487 return SkipBlocks(numBlocks);
490 bool BamToolsIndex::OpenFile(const std::string& filename, const char* mode) {
492 // make sure any previous index file is closed
495 // attempt to open file
496 m_indexStream = fopen(filename.c_str(), mode);
500 bool BamToolsIndex::ReadBlock(BtiBlock& block) {
502 // read in block data members
503 size_t elementsRead = 0;
504 elementsRead += fread(&block.MaxEndPosition, sizeof(block.MaxEndPosition), 1, m_indexStream);
505 elementsRead += fread(&block.StartOffset, sizeof(block.StartOffset), 1, m_indexStream);
506 elementsRead += fread(&block.StartPosition, sizeof(block.StartPosition), 1, m_indexStream);
508 // swap endian-ness if necessary
509 if ( m_isBigEndian ) {
510 SwapEndian_32(block.MaxEndPosition);
511 SwapEndian_64(block.StartOffset);
512 SwapEndian_32(block.StartPosition);
515 // return success/failure
516 return ( elementsRead == 3 );
519 bool BamToolsIndex::ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks) {
521 // prep blocks container
523 blocks.reserve(refSummary.NumBlocks);
525 // skip to first block entry
526 if ( !Seek( refSummary.FirstBlockFilePosition, SEEK_SET ) ) {
527 cerr << "BamToolsIndex ERROR: could not seek to position "
528 << refSummary.FirstBlockFilePosition << endl;
532 // read & store block entries
535 for ( int i = 0; i < refSummary.NumBlocks; ++i ) {
536 readOk &= ReadBlock(block);
537 blocks.push_back(block);
542 bool BamToolsIndex::ReadReferenceEntry(BtiReferenceEntry& refEntry) {
544 // return false if refId not valid index in file summary structure
545 if ( refEntry.ID < 0 || refEntry.ID >= (int)m_indexFileSummary.size() )
548 // use index summary to assist reading the reference's BTI blocks
549 const BtiReferenceSummary& refSummary = m_indexFileSummary.at(refEntry.ID);
550 return ReadBlocks(refSummary, refEntry.Blocks);
553 bool BamToolsIndex::Seek(const int64_t& position, const int& origin) {
554 return ( fseek64(m_indexStream, position, origin) == 0 );
557 // change the index caching behavior
558 void BamToolsIndex::SetCacheMode(const BamIndex::IndexCacheMode& mode) {
560 // do nothing else here ? cache mode will be ignored from now on, most likely
563 bool BamToolsIndex::SkipBlocks(const int& numBlocks) {
564 return Seek( numBlocks*BamToolsIndex::SIZEOF_BLOCK, SEEK_CUR );
567 int64_t BamToolsIndex::Tell(void) const {
568 return ftell64(m_indexStream);
571 bool BamToolsIndex::WriteBlock(const BtiBlock& block) {
574 int32_t maxEndPosition = block.MaxEndPosition;
575 int64_t startOffset = block.StartOffset;
576 int32_t startPosition = block.StartPosition;
578 // swap endian-ness if necessary
579 if ( m_isBigEndian ) {
580 SwapEndian_32(maxEndPosition);
581 SwapEndian_64(startOffset);
582 SwapEndian_32(startPosition);
585 // write the reference index entry
586 size_t elementsWritten = 0;
587 elementsWritten += fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, m_indexStream);
588 elementsWritten += fwrite(&startOffset, sizeof(startOffset), 1, m_indexStream);
589 elementsWritten += fwrite(&startPosition, sizeof(startPosition), 1, m_indexStream);
590 return ( elementsWritten == 3 );
593 bool BamToolsIndex::WriteBlocks(const BtiBlockVector& blocks) {
594 bool writtenOk = true;
595 BtiBlockVector::const_iterator blockIter = blocks.begin();
596 BtiBlockVector::const_iterator blockEnd = blocks.end();
597 for ( ; blockIter != blockEnd; ++blockIter )
598 writtenOk &= WriteBlock(*blockIter);
602 bool BamToolsIndex::WriteHeader(void) {
604 size_t elementsWritten = 0;
606 // write BTI index format 'magic number'
607 elementsWritten += fwrite(BamToolsIndex::BTI_MAGIC, 1, 4, m_indexStream);
609 // write BTI index format version
610 int32_t currentVersion = (int32_t)m_outputVersion;
611 if ( m_isBigEndian ) SwapEndian_32(currentVersion);
612 elementsWritten += fwrite(¤tVersion, sizeof(currentVersion), 1, m_indexStream);
615 int32_t blockSize = m_blockSize;
616 if ( m_isBigEndian ) SwapEndian_32(blockSize);
617 elementsWritten += fwrite(&blockSize, sizeof(blockSize), 1, m_indexStream);
619 // write number of references
620 int32_t numReferences = m_indexFileSummary.size();
621 if ( m_isBigEndian ) SwapEndian_32(numReferences);
622 elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, m_indexStream);
624 // return success/failure of write
625 return ( elementsWritten == 7 );
628 bool BamToolsIndex::WriteReferenceEntry(const BtiReferenceEntry& refEntry) {
630 size_t elementsWritten = 0;
632 // write number of blocks this reference
633 uint32_t numBlocks = refEntry.Blocks.size();
634 if ( m_isBigEndian ) SwapEndian_32(numBlocks);
635 elementsWritten += fwrite(&numBlocks, sizeof(numBlocks), 1, m_indexStream);
637 // write actual block entries
638 const bool blocksOk = WriteBlocks(refEntry.Blocks);
640 // return success/fail
641 return ( elementsWritten == 1) && blocksOk;