1 // ***************************************************************************
2 // BamToolsIndex.cpp (c) 2010 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 19 January 2011 (DB)
7 // ---------------------------------------------------------------------------
8 // Provides index operations for the BamTools index format (".bti")
9 // ***************************************************************************
11 #include <api/BamAlignment.h>
12 #include <api/BamReader.h>
13 #include <api/internal/BamReader_p.h>
14 #include <api/internal/BamToolsIndex_p.h>
15 #include <api/internal/BgzfStream_p.h>
16 using namespace BamTools;
17 using namespace BamTools::Internal;
27 BamToolsIndex::BamToolsIndex(void)
30 , m_dataBeginOffset(0)
31 , m_hasFullDataCache(false)
33 , m_outputVersion(BTI_1_2) // latest version - used for writing new index files
35 m_isBigEndian = BamTools::SystemIsBigEndian();
39 BamToolsIndex::~BamToolsIndex(void) {
43 // creates index data (in-memory) from @reader data
44 bool BamToolsIndex::Build(Internal::BamReaderPrivate* reader) {
46 // skip if invalid reader
50 // skip if reader's BgzfStream is invalid or not open
51 BgzfStream* bgzfStream = reader->Stream();
52 if ( bgzfStream == 0 || !bgzfStream->IsOpen )
55 // move reader's file pointer to beginning of alignments
56 if ( !reader->Rewind() ) return false;
58 // initialize index data structure with space for all references
59 const int numReferences = reader->GetReferenceCount();
61 m_hasFullDataCache = false;
62 SetReferenceCount(numReferences);
64 // set up counters and markers
65 int32_t currentBlockCount = 0;
66 int64_t currentAlignmentOffset = bgzfStream->Tell();
67 int32_t blockRefId = 0;
68 int32_t blockMaxEndPosition = 0;
69 int64_t blockStartOffset = currentAlignmentOffset;
70 int32_t blockStartPosition = -1;
72 // plow through alignments, storing index entries
74 while ( reader->LoadNextAlignment(al) ) {
76 // if block contains data (not the first time through) AND alignment is on a new reference
77 if ( currentBlockCount > 0 && al.RefID != blockRefId ) {
79 // store previous data
80 BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
81 SaveOffsetEntry(blockRefId, entry);
83 // intialize new block for current alignment's reference
84 currentBlockCount = 0;
85 blockMaxEndPosition = al.GetEndPosition();
86 blockStartOffset = currentAlignmentOffset;
89 // if beginning of block, save first alignment's refID & position
90 if ( currentBlockCount == 0 ) {
91 blockRefId = al.RefID;
92 blockStartPosition = al.Position;
95 // increment block counter
99 int32_t alignmentEndPosition = al.GetEndPosition();
100 if ( alignmentEndPosition > blockMaxEndPosition )
101 blockMaxEndPosition = alignmentEndPosition;
103 // if block is full, get offset for next block, reset currentBlockCount
104 if ( currentBlockCount == m_blockSize ) {
105 BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
106 SaveOffsetEntry(blockRefId, entry);
107 blockStartOffset = bgzfStream->Tell();
108 currentBlockCount = 0;
111 // not the best name, but for the next iteration, this value will be the offset of the *current* alignment
112 // necessary because we won't know if this next alignment is on a new reference until we actually read it
113 currentAlignmentOffset = bgzfStream->Tell();
116 // store final block with data
117 BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
118 SaveOffsetEntry(blockRefId, entry);
121 m_hasFullDataCache = true;
123 // return success/failure of rewind
124 return reader->Rewind();
127 // check index file magic number, return true if OK
128 bool BamToolsIndex::CheckMagicNumber(void) {
130 // see if index is valid BAM index
132 size_t elementsRead = fread(magic, 1, 4, m_indexStream);
133 if ( elementsRead != 4 ) return false;
134 if ( strncmp(magic, "BTI\1", 4) != 0 ) {
135 fprintf(stderr, "BamToolsIndex ERROR: could not load index file - invalid magic number.\n");
143 // check index file version, return true if OK
144 bool BamToolsIndex::CheckVersion(void) {
146 // read version from file
147 size_t elementsRead = fread(&m_inputVersion, sizeof(m_inputVersion), 1, m_indexStream);
148 if ( elementsRead != 1 ) return false;
149 if ( m_isBigEndian ) SwapEndian_32(m_inputVersion);
151 // if version is negative, or zero
152 if ( m_inputVersion <= 0 ) {
153 fprintf(stderr, "BamToolsIndex ERROR: could not load index file - invalid version.\n");
157 // if version is newer than can be supported by this version of bamtools
158 else if ( m_inputVersion > m_outputVersion ) {
159 fprintf(stderr, "BamToolsIndex ERROR: could not load index file - this version of BamTools does not recognize new index file version.\n");
160 fprintf(stderr, "Please update BamTools to a more recent version to support this index file.\n");
164 // ------------------------------------------------------------------
165 // check for deprecated, unsupported versions
166 // (typically whose format did not accomodate a particular bug fix)
168 else if ( (Version)m_inputVersion == BTI_1_0 ) {
169 fprintf(stderr, "BamToolsIndex ERROR: could not load index file - this version of the index contains a bug related to accessing data near reference ends.\n");
170 fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date, fixed BTI file.\n\n");
174 else if ( (Version)m_inputVersion == BTI_1_1 ) {
175 fprintf(stderr, "BamToolsIndex ERROR: could not load index file - this version of the index contains a bug related to handling empty references.\n");
176 fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date, fixed BTI file.\n\n");
184 // clear all current index offset data in memory
185 void BamToolsIndex::ClearAllData(void) {
186 BamToolsIndexData::const_iterator indexIter = m_indexData.begin();
187 BamToolsIndexData::const_iterator indexEnd = m_indexData.end();
188 for ( ; indexIter != indexEnd; ++indexIter ) {
189 const int& refId = (*indexIter).first;
190 ClearReferenceOffsets(refId);
194 // clear all index offset data for desired reference
195 void BamToolsIndex::ClearReferenceOffsets(const int& refId) {
196 if ( m_indexData.find(refId) == m_indexData.end() ) return;
197 vector<BamToolsIndexEntry>& offsets = m_indexData[refId].Offsets;
199 m_hasFullDataCache = false;
202 // return file position after header metadata
203 off_t BamToolsIndex::DataBeginOffset(void) const {
204 return m_dataBeginOffset;
207 // calculate BAM file offset for desired region
208 // return true if no error (*NOT* equivalent to "has alignments or valid offset")
209 // check @hasAlignmentsInRegion to determine this status
210 // @region - target region
211 // @offset - resulting seek target
212 // @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status
213 // N.B. - ignores isRightBoundSpecified
214 bool BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) {
216 // return false if leftBound refID is not found in index data
217 BamToolsIndexData::const_iterator indexIter = m_indexData.find(region.LeftRefID);
218 if ( indexIter == m_indexData.end() )
221 // load index data for region if not already cached
222 if ( !IsDataLoaded(region.LeftRefID) ) {
224 bool loadedOk = true;
225 loadedOk &= SkipToReference(region.LeftRefID);
226 loadedOk &= LoadReference(region.LeftRefID);
227 if ( !loadedOk ) return false;
230 // localize index data for this reference (& sanity check that data actually exists)
231 indexIter = m_indexData.find(region.LeftRefID);
232 if ( indexIter == m_indexData.end() )
235 const vector<BamToolsIndexEntry>& referenceOffsets = (*indexIter).second.Offsets;
236 if ( referenceOffsets.empty() )
239 // -------------------------------------------------------
240 // calculate nearest index to jump to
243 offset = (*referenceOffsets.begin()).StartOffset;
245 // iterate over offsets entries on this reference
246 vector<BamToolsIndexEntry>::const_iterator offsetIter = referenceOffsets.begin();
247 vector<BamToolsIndexEntry>::const_iterator offsetEnd = referenceOffsets.end();
248 for ( ; offsetIter != offsetEnd; ++offsetIter ) {
249 const BamToolsIndexEntry& entry = (*offsetIter);
251 // break if alignment 'entry' overlaps region
252 if ( entry.MaxEndPosition >= region.LeftPosition ) break;
253 offset = (*offsetIter).StartOffset;
256 // set flag based on whether an index entry was found for this region
257 *hasAlignmentsInRegion = ( offsetIter != offsetEnd );
259 // if cache mode set to none, dump the data we just loaded
260 if ( m_cacheMode == BamIndex::NoIndexCaching )
261 ClearReferenceOffsets(region.LeftRefID);
267 // returns whether reference has alignments or no
268 bool BamToolsIndex::HasAlignments(const int& refId) const {
269 BamToolsIndexData::const_iterator indexIter = m_indexData.find(refId);
270 if ( indexIter == m_indexData.end()) return false;
271 const BamToolsReferenceEntry& refEntry = (*indexIter).second;
272 return refEntry.HasAlignments;
275 // return true if all index data is cached
276 bool BamToolsIndex::HasFullDataCache(void) const {
277 return m_hasFullDataCache;
280 // returns true if index cache has data for desired reference
281 bool BamToolsIndex::IsDataLoaded(const int& refId) const {
283 BamToolsIndexData::const_iterator indexIter = m_indexData.find(refId);
284 if ( indexIter == m_indexData.end()) return false;
285 const BamToolsReferenceEntry& refEntry = (*indexIter).second;
287 if ( !refEntry.HasAlignments ) return true; // no data period
289 // return whether offsets list contains data
290 return !refEntry.Offsets.empty();
293 // attempts to use index to jump to @region in @reader; returns success/fail
294 bool BamToolsIndex::Jump(Internal::BamReaderPrivate* reader,
295 const BamTools::BamRegion& region,
296 bool* hasAlignmentsInRegion)
299 *hasAlignmentsInRegion = false;
301 // skip if invalid reader
302 if ( reader == 0 ) return false;
304 // skip if reader's BgzfStream is invalid or not open
305 BgzfStream* bgzfStream = reader->Stream();
306 if ( bgzfStream == 0 || !bgzfStream->IsOpen )
309 // make sure left-bound position is valid
310 const RefVector& references = reader->GetReferenceData();
311 if ( region.LeftPosition > references.at(region.LeftRefID).RefLength )
314 // calculate nearest offset to jump to
316 if ( !GetOffset(region, offset, hasAlignmentsInRegion) ) {
317 fprintf(stderr, "BamToolsIndex ERROR: could not jump - unable to calculate offset for specified region.\n");
321 // return success/failure of seek
322 return bgzfStream->Seek(offset);
325 // clears index data from all references except the first
326 void BamToolsIndex::KeepOnlyFirstReferenceOffsets(void) {
327 BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
328 KeepOnlyReferenceOffsets( (*indexBegin).first );
331 // clears index data from all references except the one specified
332 void BamToolsIndex::KeepOnlyReferenceOffsets(const int& refId) {
333 BamToolsIndexData::iterator mapIter = m_indexData.begin();
334 BamToolsIndexData::iterator mapEnd = m_indexData.end();
335 for ( ; mapIter != mapEnd; ++mapIter ) {
336 const int entryRefId = (*mapIter).first;
337 if ( entryRefId != refId )
338 ClearReferenceOffsets(entryRefId);
342 // load index data for all references, return true if loaded OK
343 bool BamToolsIndex::LoadAllReferences(bool saveData) {
345 // skip if data already loaded
346 if ( m_hasFullDataCache ) return true;
348 // read in number of references
349 int32_t numReferences;
350 if ( !LoadReferenceCount(numReferences) ) return false;
351 //SetReferenceCount(numReferences);
353 // iterate over reference entries
354 bool loadedOk = true;
355 for ( int i = 0; i < numReferences; ++i )
356 loadedOk &= LoadReference(i, saveData);
359 if ( loadedOk && saveData )
360 m_hasFullDataCache = true;
362 // return success/failure of load
366 // load header data from index file, return true if loaded OK
367 bool BamToolsIndex::LoadHeader(void) {
369 // check magic number
370 if ( !CheckMagicNumber() ) return false;
373 if ( !CheckVersion() ) return false;
375 // read in block size
376 size_t elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, m_indexStream);
377 if ( elementsRead != 1 ) return false;
379 // swap endian-ness if necessary
380 if ( m_isBigEndian ) SwapEndian_32(m_blockSize);
382 // store offset of beginning of data
383 m_dataBeginOffset = ftell64(m_indexStream);
389 // load a single index entry from file, return true if loaded OK
390 // @saveData - save data in memory if true, just read & discard if false
391 bool BamToolsIndex::LoadIndexEntry(const int& refId, bool saveData) {
393 // read in index entry data members
394 size_t elementsRead = 0;
395 BamToolsIndexEntry entry;
396 elementsRead += fread(&entry.MaxEndPosition, sizeof(entry.MaxEndPosition), 1, m_indexStream);
397 elementsRead += fread(&entry.StartOffset, sizeof(entry.StartOffset), 1, m_indexStream);
398 elementsRead += fread(&entry.StartPosition, sizeof(entry.StartPosition), 1, m_indexStream);
399 if ( elementsRead != 3 ) return false;
401 // swap endian-ness if necessary
402 if ( m_isBigEndian ) {
403 SwapEndian_32(entry.MaxEndPosition);
404 SwapEndian_64(entry.StartOffset);
405 SwapEndian_32(entry.StartPosition);
410 SaveOffsetEntry(refId, entry);
412 // return success/failure of load
416 // load a single reference from file, return true if loaded OK
417 // @saveData - save data in memory if true, just read & discard if false
418 bool BamToolsIndex::LoadFirstReference(bool saveData) {
419 BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
420 return LoadReference( (*indexBegin).first, saveData );
423 // load a single reference from file, return true if loaded OK
424 // @saveData - save data in memory if true, just read & discard if false
425 bool BamToolsIndex::LoadReference(const int& refId, bool saveData) {
427 // read in number of offsets for this reference
429 size_t elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, m_indexStream);
430 if ( elementsRead != 1 ) return false;
432 // swap endian-ness if necessary
433 if ( m_isBigEndian ) SwapEndian_32(numOffsets);
435 // initialize offsets container for this reference
436 SetOffsetCount(refId, (int)numOffsets);
438 // iterate over offset entries
439 for ( unsigned int j = 0; j < numOffsets; ++j )
440 LoadIndexEntry(refId, saveData);
442 // return success/failure of load
446 // loads number of references, return true if loaded OK
447 bool BamToolsIndex::LoadReferenceCount(int& numReferences) {
449 size_t elementsRead = 0;
451 // read reference count
452 elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_indexStream);
453 if ( elementsRead != 1 ) return false;
455 // swap endian-ness if necessary
456 if ( m_isBigEndian ) SwapEndian_32(numReferences);
462 // saves an index offset entry in memory
463 void BamToolsIndex::SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry) {
464 BamToolsReferenceEntry& refEntry = m_indexData[refId];
465 refEntry.HasAlignments = true;
466 refEntry.Offsets.push_back(entry);
469 // pre-allocates size for offset vector
470 void BamToolsIndex::SetOffsetCount(const int& refId, const int& offsetCount) {
471 BamToolsReferenceEntry& refEntry = m_indexData[refId];
472 refEntry.Offsets.reserve(offsetCount);
473 refEntry.HasAlignments = ( offsetCount > 0);
476 // initializes index data structure to hold @count references
477 void BamToolsIndex::SetReferenceCount(const int& count) {
478 for ( int i = 0; i < count; ++i )
479 m_indexData[i].HasAlignments = false;
482 // position file pointer to first reference begin, return true if skipped OK
483 bool BamToolsIndex::SkipToFirstReference(void) {
484 BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
485 return SkipToReference( (*indexBegin).first );
488 // position file pointer to desired reference begin, return true if skipped OK
489 bool BamToolsIndex::SkipToReference(const int& refId) {
492 if ( !Rewind() ) return false;
494 // read in number of references
495 int32_t numReferences;
496 size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_indexStream);
497 if ( elementsRead != 1 ) return false;
498 if ( m_isBigEndian ) SwapEndian_32(numReferences);
500 // iterate over reference entries
501 bool skippedOk = true;
502 int currentRefId = 0;
503 while (currentRefId != refId) {
504 skippedOk &= LoadReference(currentRefId, false);
508 // return success/failure of skip
512 // write header to new index file
513 bool BamToolsIndex::WriteHeader(void) {
515 size_t elementsWritten = 0;
517 // write BTI index format 'magic number'
518 elementsWritten += fwrite("BTI\1", 1, 4, m_indexStream);
520 // write BTI index format version
521 int32_t currentVersion = (int32_t)m_outputVersion;
522 if ( m_isBigEndian ) SwapEndian_32(currentVersion);
523 elementsWritten += fwrite(¤tVersion, sizeof(currentVersion), 1, m_indexStream);
526 int32_t blockSize = m_blockSize;
527 if ( m_isBigEndian ) SwapEndian_32(blockSize);
528 elementsWritten += fwrite(&blockSize, sizeof(blockSize), 1, m_indexStream);
530 // store offset of beginning of data
531 m_dataBeginOffset = ftell64(m_indexStream);
533 // return success/failure of write
534 return ( elementsWritten == 6 );
537 // write index data for all references to new index file
538 bool BamToolsIndex::WriteAllReferences(void) {
540 size_t elementsWritten = 0;
542 // write number of references
543 int32_t numReferences = (int32_t)m_indexData.size();
544 if ( m_isBigEndian ) SwapEndian_32(numReferences);
545 elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, m_indexStream);
546 if ( elementsWritten != 1 ) return false;
548 // iterate through references in index
550 BamToolsIndexData::const_iterator refIter = m_indexData.begin();
551 BamToolsIndexData::const_iterator refEnd = m_indexData.end();
552 for ( ; refIter != refEnd; ++refIter )
553 refOk &= WriteReferenceEntry( (*refIter).second );
555 // return success/fail
559 // write current reference index data to new index file
560 bool BamToolsIndex::WriteReferenceEntry(const BamToolsReferenceEntry& refEntry) {
562 size_t elementsWritten = 0;
564 // write number of offsets listed for this reference
565 uint32_t numOffsets = refEntry.Offsets.size();
566 if ( m_isBigEndian ) SwapEndian_32(numOffsets);
567 elementsWritten += fwrite(&numOffsets, sizeof(numOffsets), 1, m_indexStream);
568 if ( elementsWritten != 1 ) return false;
570 // iterate over offset entries
571 bool entriesOk = true;
572 vector<BamToolsIndexEntry>::const_iterator offsetIter = refEntry.Offsets.begin();
573 vector<BamToolsIndexEntry>::const_iterator offsetEnd = refEntry.Offsets.end();
574 for ( ; offsetIter != offsetEnd; ++offsetIter )
575 entriesOk &= WriteIndexEntry( (*offsetIter) );
577 // return success/fail
581 // write current index offset entry to new index file
582 bool BamToolsIndex::WriteIndexEntry(const BamToolsIndexEntry& entry) {
585 int32_t maxEndPosition = entry.MaxEndPosition;
586 int64_t startOffset = entry.StartOffset;
587 int32_t startPosition = entry.StartPosition;
589 // swap endian-ness if necessary
590 if ( m_isBigEndian ) {
591 SwapEndian_32(maxEndPosition);
592 SwapEndian_64(startOffset);
593 SwapEndian_32(startPosition);
596 // write the reference index entry
597 size_t elementsWritten = 0;
598 elementsWritten += fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, m_indexStream);
599 elementsWritten += fwrite(&startOffset, sizeof(startOffset), 1, m_indexStream);
600 elementsWritten += fwrite(&startPosition, sizeof(startPosition), 1, m_indexStream);
601 return ( elementsWritten == 3 );