1 // ***************************************************************************
2 // BamToolsIndex.cpp (c) 2010 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 13 January 2011 (DB)
7 // ---------------------------------------------------------------------------
8 // Provides index operations for the BamTools index format (".bti")
9 // ***************************************************************************
11 #include <api/BamAlignment.h>
12 #include <api/BamReader.h>
14 #include <api/internal/BamToolsIndex_p.h>
15 using namespace BamTools;
16 using namespace BamTools::Internal;
25 BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader)
26 : BamIndex(bgzf, reader)
28 , m_dataBeginOffset(0)
29 , m_hasFullDataCache(false)
31 , m_outputVersion(BTI_1_2) // latest version - used for writing new index files
33 m_isBigEndian = BamTools::SystemIsBigEndian();
37 BamToolsIndex::~BamToolsIndex(void) {
41 // creates index data (in-memory) from current reader data
42 bool BamToolsIndex::Build(void) {
44 // be sure reader & BGZF file are valid & open for reading
45 if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen )
48 // move file pointer to beginning of alignments
49 if ( !m_reader->Rewind() ) return false;
51 // initialize index data structure with space for all references
52 const int numReferences = (int)m_references.size();
54 m_hasFullDataCache = false;
55 SetReferenceCount(numReferences);
57 // set up counters and markers
58 int32_t currentBlockCount = 0;
59 int64_t currentAlignmentOffset = m_BGZF->Tell();
60 int32_t blockRefId = 0;
61 int32_t blockMaxEndPosition = 0;
62 int64_t blockStartOffset = currentAlignmentOffset;
63 int32_t blockStartPosition = -1;
65 // plow through alignments, storing index entries
67 while ( m_reader->GetNextAlignmentCore(al) ) {
69 // if block contains data (not the first time through) AND alignment is on a new reference
70 if ( currentBlockCount > 0 && al.RefID != blockRefId ) {
72 // store previous data
73 BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
74 SaveOffsetEntry(blockRefId, entry);
76 // intialize new block for current alignment's reference
77 currentBlockCount = 0;
78 blockMaxEndPosition = al.GetEndPosition();
79 blockStartOffset = currentAlignmentOffset;
82 // if beginning of block, save first alignment's refID & position
83 if ( currentBlockCount == 0 ) {
84 blockRefId = al.RefID;
85 blockStartPosition = al.Position;
88 // increment block counter
92 int32_t alignmentEndPosition = al.GetEndPosition();
93 if ( alignmentEndPosition > blockMaxEndPosition )
94 blockMaxEndPosition = alignmentEndPosition;
96 // if block is full, get offset for next block, reset currentBlockCount
97 if ( currentBlockCount == m_blockSize ) {
98 BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
99 SaveOffsetEntry(blockRefId, entry);
100 blockStartOffset = m_BGZF->Tell();
101 currentBlockCount = 0;
104 // not the best name, but for the next iteration, this value will be the offset of the *current* alignment
105 // necessary because we won't know if this next alignment is on a new reference until we actually read it
106 currentAlignmentOffset = m_BGZF->Tell();
109 // store final block with data
110 BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
111 SaveOffsetEntry(blockRefId, entry);
114 m_hasFullDataCache = true;
116 // return success/failure of rewind
117 return m_reader->Rewind();
120 // check index file magic number, return true if OK
121 bool BamToolsIndex::CheckMagicNumber(void) {
123 // see if index is valid BAM index
125 size_t elementsRead = fread(magic, 1, 4, m_indexStream);
126 if ( elementsRead != 4 ) return false;
127 if ( strncmp(magic, "BTI\1", 4) != 0 ) {
128 fprintf(stderr, "Problem with index file - invalid format.\n");
136 // check index file version, return true if OK
137 bool BamToolsIndex::CheckVersion(void) {
139 // read version from file
140 size_t elementsRead = fread(&m_inputVersion, sizeof(m_inputVersion), 1, m_indexStream);
141 if ( elementsRead != 1 ) return false;
142 if ( m_isBigEndian ) SwapEndian_32(m_inputVersion);
144 // if version is negative, or zero
145 if ( m_inputVersion <= 0 ) {
146 fprintf(stderr, "Problem with index file - invalid version.\n");
150 // if version is newer than can be supported by this version of bamtools
151 else if ( m_inputVersion > m_outputVersion ) {
152 fprintf(stderr, "Problem with index file - attempting to use an outdated version of BamTools with a newer index file.\n");
153 fprintf(stderr, "Please update BamTools to a more recent version to support this index file.\n");
157 // ------------------------------------------------------------------
158 // check for deprecated, unsupported versions
159 // (typically whose format did not accomodate a particular bug fix)
161 else if ( (Version)m_inputVersion == BTI_1_0 ) {
162 fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to accessing data near reference ends.\n");
163 fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n");
167 else if ( (Version)m_inputVersion == BTI_1_1 ) {
168 fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to handling empty references.\n");
169 fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n");
177 // clear all current index offset data in memory
178 void BamToolsIndex::ClearAllData(void) {
179 BamToolsIndexData::const_iterator indexIter = m_indexData.begin();
180 BamToolsIndexData::const_iterator indexEnd = m_indexData.end();
181 for ( ; indexIter != indexEnd; ++indexIter ) {
182 const int& refId = (*indexIter).first;
183 ClearReferenceOffsets(refId);
187 // clear all index offset data for desired reference
188 void BamToolsIndex::ClearReferenceOffsets(const int& refId) {
189 if ( m_indexData.find(refId) == m_indexData.end() ) return;
190 vector<BamToolsIndexEntry>& offsets = m_indexData[refId].Offsets;
192 m_hasFullDataCache = false;
195 // return file position after header metadata
196 const off_t BamToolsIndex::DataBeginOffset(void) const {
197 return m_dataBeginOffset;
200 // calculate BAM file offset for desired region
201 // return true if no error (*NOT* equivalent to "has alignments or valid offset")
202 // check @hasAlignmentsInRegion to determine this status
203 // @region - target region
204 // @offset - resulting seek target
205 // @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status
206 // N.B. - ignores isRightBoundSpecified
207 bool BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) {
209 // return false if leftBound refID is not found in index data
210 BamToolsIndexData::const_iterator indexIter = m_indexData.find(region.LeftRefID);
211 if ( indexIter == m_indexData.end()) return false;
213 // load index data for region if not already cached
214 if ( !IsDataLoaded(region.LeftRefID) ) {
215 bool loadedOk = true;
216 loadedOk &= SkipToReference(region.LeftRefID);
217 loadedOk &= LoadReference(region.LeftRefID);
218 if ( !loadedOk ) return false;
221 // localize index data for this reference (& sanity check that data actually exists)
222 indexIter = m_indexData.find(region.LeftRefID);
223 if ( indexIter == m_indexData.end()) return false;
224 const vector<BamToolsIndexEntry>& referenceOffsets = (*indexIter).second.Offsets;
225 if ( referenceOffsets.empty() ) return false;
227 // -------------------------------------------------------
228 // calculate nearest index to jump to
231 offset = (*referenceOffsets.begin()).StartOffset;
233 // iterate over offsets entries on this reference
234 vector<BamToolsIndexEntry>::const_iterator offsetIter = referenceOffsets.begin();
235 vector<BamToolsIndexEntry>::const_iterator offsetEnd = referenceOffsets.end();
236 for ( ; offsetIter != offsetEnd; ++offsetIter ) {
237 const BamToolsIndexEntry& entry = (*offsetIter);
238 // break if alignment 'entry' overlaps region
239 if ( entry.MaxEndPosition >= region.LeftPosition ) break;
240 offset = (*offsetIter).StartOffset;
243 // set flag based on whether an index entry was found for this region
244 *hasAlignmentsInRegion = ( offsetIter != offsetEnd );
246 // if cache mode set to none, dump the data we just loaded
247 if (m_cacheMode == BamIndex::NoIndexCaching )
248 ClearReferenceOffsets(region.LeftRefID);
254 // returns whether reference has alignments or no
255 bool BamToolsIndex::HasAlignments(const int& refId) const {
256 BamToolsIndexData::const_iterator indexIter = m_indexData.find(refId);
257 if ( indexIter == m_indexData.end()) return false;
258 const BamToolsReferenceEntry& refEntry = (*indexIter).second;
259 return refEntry.HasAlignments;
262 // return true if all index data is cached
263 bool BamToolsIndex::HasFullDataCache(void) const {
264 return m_hasFullDataCache;
267 // returns true if index cache has data for desired reference
268 bool BamToolsIndex::IsDataLoaded(const int& refId) const {
270 BamToolsIndexData::const_iterator indexIter = m_indexData.find(refId);
271 if ( indexIter == m_indexData.end()) return false;
272 const BamToolsReferenceEntry& refEntry = (*indexIter).second;
274 if ( !refEntry.HasAlignments ) return true; // no data period
276 // return whether offsets list contains data
277 return !refEntry.Offsets.empty();
280 // attempts to use index to jump to region; returns success/fail
281 bool BamToolsIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) {
284 *hasAlignmentsInRegion = false;
286 // check valid BamReader state
287 if ( m_reader == 0 || m_BGZF == 0 || !m_reader->IsOpen() ) {
288 fprintf(stderr, "ERROR: Could not jump: invalid BamReader state.\n");
292 // make sure left-bound position is valid
293 if ( region.LeftPosition > m_references.at(region.LeftRefID).RefLength )
296 // calculate nearest offset to jump to
298 if ( !GetOffset(region, offset, hasAlignmentsInRegion) ) {
299 fprintf(stderr, "ERROR: Could not jump - unable to calculate offset for specified region.\n");
303 // return success/failure of seek
304 return m_BGZF->Seek(offset);
307 // clears index data from all references except the first
308 void BamToolsIndex::KeepOnlyFirstReferenceOffsets(void) {
309 BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
310 KeepOnlyReferenceOffsets( (*indexBegin).first );
313 // clears index data from all references except the one specified
314 void BamToolsIndex::KeepOnlyReferenceOffsets(const int& refId) {
315 BamToolsIndexData::iterator mapIter = m_indexData.begin();
316 BamToolsIndexData::iterator mapEnd = m_indexData.end();
317 for ( ; mapIter != mapEnd; ++mapIter ) {
318 const int entryRefId = (*mapIter).first;
319 if ( entryRefId != refId )
320 ClearReferenceOffsets(entryRefId);
324 // load index data for all references, return true if loaded OK
325 bool BamToolsIndex::LoadAllReferences(bool saveData) {
327 // skip if data already loaded
328 if ( m_hasFullDataCache ) return true;
330 // read in number of references
331 int32_t numReferences;
332 if ( !LoadReferenceCount(numReferences) ) return false;
333 //SetReferenceCount(numReferences);
335 // iterate over reference entries
336 bool loadedOk = true;
337 for ( int i = 0; i < numReferences; ++i )
338 loadedOk &= LoadReference(i, saveData);
341 if ( loadedOk && saveData )
342 m_hasFullDataCache = true;
344 // return success/failure of load
348 // load header data from index file, return true if loaded OK
349 bool BamToolsIndex::LoadHeader(void) {
351 // check magic number
352 if ( !CheckMagicNumber() ) return false;
355 if ( !CheckVersion() ) return false;
357 // read in block size
358 size_t elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, m_indexStream);
359 if ( elementsRead != 1 ) return false;
360 if ( m_isBigEndian ) SwapEndian_32(m_blockSize);
362 // store offset of beginning of data
363 m_dataBeginOffset = ftell64(m_indexStream);
365 // return success/failure of load
366 return (elementsRead == 1);
369 // load a single index entry from file, return true if loaded OK
370 // @saveData - save data in memory if true, just read & discard if false
371 bool BamToolsIndex::LoadIndexEntry(const int& refId, bool saveData) {
373 // read in index entry data members
374 size_t elementsRead = 0;
375 BamToolsIndexEntry entry;
376 elementsRead += fread(&entry.MaxEndPosition, sizeof(entry.MaxEndPosition), 1, m_indexStream);
377 elementsRead += fread(&entry.StartOffset, sizeof(entry.StartOffset), 1, m_indexStream);
378 elementsRead += fread(&entry.StartPosition, sizeof(entry.StartPosition), 1, m_indexStream);
379 if ( elementsRead != 3 ) {
380 cerr << "Error reading index entry. Expected 3 elements, read in: " << elementsRead << endl;
384 // swap endian-ness if necessary
385 if ( m_isBigEndian ) {
386 SwapEndian_32(entry.MaxEndPosition);
387 SwapEndian_64(entry.StartOffset);
388 SwapEndian_32(entry.StartPosition);
393 SaveOffsetEntry(refId, entry);
395 // return success/failure of load
399 // load a single reference from file, return true if loaded OK
400 // @saveData - save data in memory if true, just read & discard if false
401 bool BamToolsIndex::LoadFirstReference(bool saveData) {
402 BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
403 return LoadReference( (*indexBegin).first, saveData );
406 // load a single reference from file, return true if loaded OK
407 // @saveData - save data in memory if true, just read & discard if false
408 bool BamToolsIndex::LoadReference(const int& refId, bool saveData) {
410 // read in number of offsets for this reference
412 size_t elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, m_indexStream);
413 if ( elementsRead != 1 ) return false;
414 if ( m_isBigEndian ) SwapEndian_32(numOffsets);
416 // initialize offsets container for this reference
417 SetOffsetCount(refId, (int)numOffsets);
419 // iterate over offset entries
420 for ( unsigned int j = 0; j < numOffsets; ++j )
421 LoadIndexEntry(refId, saveData);
423 // return success/failure of load
427 // loads number of references, return true if loaded OK
428 bool BamToolsIndex::LoadReferenceCount(int& numReferences) {
430 size_t elementsRead = 0;
432 // read reference count
433 elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_indexStream);
434 if ( m_isBigEndian ) SwapEndian_32(numReferences);
436 // return success/failure of load
437 return ( elementsRead == 1 );
440 // saves an index offset entry in memory
441 void BamToolsIndex::SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry) {
442 BamToolsReferenceEntry& refEntry = m_indexData[refId];
443 refEntry.HasAlignments = true;
444 refEntry.Offsets.push_back(entry);
447 // pre-allocates size for offset vector
448 void BamToolsIndex::SetOffsetCount(const int& refId, const int& offsetCount) {
449 BamToolsReferenceEntry& refEntry = m_indexData[refId];
450 refEntry.Offsets.reserve(offsetCount);
451 refEntry.HasAlignments = ( offsetCount > 0);
454 // initializes index data structure to hold @count references
455 void BamToolsIndex::SetReferenceCount(const int& count) {
456 for ( int i = 0; i < count; ++i )
457 m_indexData[i].HasAlignments = false;
460 // position file pointer to first reference begin, return true if skipped OK
461 bool BamToolsIndex::SkipToFirstReference(void) {
462 BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
463 return SkipToReference( (*indexBegin).first );
466 // position file pointer to desired reference begin, return true if skipped OK
467 bool BamToolsIndex::SkipToReference(const int& refId) {
470 if ( !Rewind() ) return false;
472 // read in number of references
473 int32_t numReferences;
474 size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_indexStream);
475 if ( elementsRead != 1 ) return false;
476 if ( m_isBigEndian ) SwapEndian_32(numReferences);
478 // iterate over reference entries
479 bool skippedOk = true;
480 int currentRefId = 0;
481 while (currentRefId != refId) {
482 skippedOk &= LoadReference(currentRefId, false);
486 // return success/failure of skip
490 // write header to new index file
491 bool BamToolsIndex::WriteHeader(void) {
493 size_t elementsWritten = 0;
495 // write BTI index format 'magic number'
496 elementsWritten += fwrite("BTI\1", 1, 4, m_indexStream);
498 // write BTI index format version
499 int32_t currentVersion = (int32_t)m_outputVersion;
500 if ( m_isBigEndian ) SwapEndian_32(currentVersion);
501 elementsWritten += fwrite(¤tVersion, sizeof(currentVersion), 1, m_indexStream);
504 int32_t blockSize = m_blockSize;
505 if ( m_isBigEndian ) SwapEndian_32(blockSize);
506 elementsWritten += fwrite(&blockSize, sizeof(blockSize), 1, m_indexStream);
508 // store offset of beginning of data
509 m_dataBeginOffset = ftell64(m_indexStream);
511 // return success/failure of write
512 return ( elementsWritten == 6 );
515 // write index data for all references to new index file
516 bool BamToolsIndex::WriteAllReferences(void) {
518 size_t elementsWritten = 0;
520 // write number of references
521 int32_t numReferences = (int32_t)m_indexData.size();
522 if ( m_isBigEndian ) SwapEndian_32(numReferences);
523 elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, m_indexStream);
525 // iterate through references in index
527 BamToolsIndexData::const_iterator refIter = m_indexData.begin();
528 BamToolsIndexData::const_iterator refEnd = m_indexData.end();
529 for ( ; refIter != refEnd; ++refIter )
530 refOk &= WriteReferenceEntry( (*refIter).second );
532 return ( (elementsWritten == 1) && refOk );
535 // write current reference index data to new index file
536 bool BamToolsIndex::WriteReferenceEntry(const BamToolsReferenceEntry& refEntry) {
538 size_t elementsWritten = 0;
540 // write number of offsets listed for this reference
541 uint32_t numOffsets = refEntry.Offsets.size();
542 if ( m_isBigEndian ) SwapEndian_32(numOffsets);
543 elementsWritten += fwrite(&numOffsets, sizeof(numOffsets), 1, m_indexStream);
545 // iterate over offset entries
546 bool entriesOk = true;
547 vector<BamToolsIndexEntry>::const_iterator offsetIter = refEntry.Offsets.begin();
548 vector<BamToolsIndexEntry>::const_iterator offsetEnd = refEntry.Offsets.end();
549 for ( ; offsetIter != offsetEnd; ++offsetIter )
550 entriesOk &= WriteIndexEntry( (*offsetIter) );
552 return ( (elementsWritten == 1) && entriesOk );
555 // write current index offset entry to new index file
556 bool BamToolsIndex::WriteIndexEntry(const BamToolsIndexEntry& entry) {
559 int32_t maxEndPosition = entry.MaxEndPosition;
560 int64_t startOffset = entry.StartOffset;
561 int32_t startPosition = entry.StartPosition;
563 // swap endian-ness if necessary
564 if ( m_isBigEndian ) {
565 SwapEndian_32(maxEndPosition);
566 SwapEndian_64(startOffset);
567 SwapEndian_32(startPosition);
570 // write the reference index entry
571 size_t elementsWritten = 0;
572 elementsWritten += fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, m_indexStream);
573 elementsWritten += fwrite(&startOffset, sizeof(startOffset), 1, m_indexStream);
574 elementsWritten += fwrite(&startPosition, sizeof(startPosition), 1, m_indexStream);
575 return ( elementsWritten == 3 );