1 // ***************************************************************************
2 // BamIndex.cpp (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 21 October 2010 (DB)
7 // ---------------------------------------------------------------------------
8 // Provides index functionality - both for the default (standardized) BAM
9 // index format (.bai) as well as a BamTools-specific (nonstandard) index
11 // ***************************************************************************
19 #include "BamReader.h"
22 using namespace BamTools;
24 // -------------------------------
25 // BamIndex implementation
28 BamIndex::BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader)
31 , m_cacheMode(BamIndex::LimitedIndexCaching)
34 if ( m_reader && m_reader->IsOpen() )
35 m_references = m_reader->GetReferenceData();
39 BamIndex::~BamIndex(void) {
41 fclose(m_indexStream);
44 // return true if FILE* is open
45 bool BamIndex::IsOpen(void) const {
46 return ( m_indexStream != 0 );
49 // loads existing data from file into memory
50 bool BamIndex::Load(const string& filename) {
52 // open index file, abort on error
53 if ( !OpenIndexFile(filename, "rb") ) {
54 fprintf(stderr, "ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str());
59 if ( !LoadHeader() ) {
60 fclose(m_indexStream);
64 // load reference data (but only keep in memory if full caching requested)
65 bool saveInitialLoad = ( m_cacheMode == BamIndex::FullIndexCaching );
66 if ( !LoadAllReferences(saveInitialLoad) ) {
67 fclose(m_indexStream);
71 // update index cache based on selected mode
78 // opens index file for reading/writing, return true if opened OK
79 bool BamIndex::OpenIndexFile(const string& filename, const string& mode) {
80 m_indexStream = fopen(filename.c_str(), mode.c_str());
81 return ( m_indexStream != 0 );
84 // rewind index file to beginning of index data, return true if rewound OK
85 bool BamIndex::Rewind(void) {
86 return ( fseek64(m_indexStream, DataBeginOffset(), SEEK_SET) == 0 );
89 // change the index caching behavior
90 void BamIndex::SetCacheMode(const BamIndexCacheMode mode) {
91 if ( mode != m_cacheMode ) {
97 // updates in-memory cache of index data, depending on current cache mode
98 void BamIndex::UpdateCache(void) {
100 // skip if file not open
101 if ( !IsOpen() ) return;
103 // reflect requested cache mode behavior
104 switch ( m_cacheMode ) {
106 case (BamIndex::FullIndexCaching) :
108 LoadAllReferences(true);
111 case (BamIndex::LimitedIndexCaching) :
112 if ( HasFullDataCache() )
113 KeepOnlyFirstReferenceOffsets();
116 SkipToFirstReference();
117 LoadFirstReference(true);
120 case(BamIndex::NoIndexCaching) :
129 // writes in-memory index data out to file
130 bool BamIndex::Write(const string& bamFilename) {
132 // open index file for writing
133 string indexFilename = bamFilename + Extension();
134 if ( !OpenIndexFile(indexFilename, "wb") ) {
135 fprintf(stderr, "ERROR: Could not open file to save index.\n");
139 // write index header data
140 if ( !WriteHeader() ) {
141 fprintf(stderr, "ERROR: There was a problem writing index metadata to new index file.\n");
142 fflush(m_indexStream);
143 fclose(m_indexStream);
147 // write main index data
148 if ( !WriteAllReferences() ) {
149 fprintf(stderr, "ERROR: There was a problem writing index data to new index file.\n");
150 fflush(m_indexStream);
151 fclose(m_indexStream);
155 // flush any remaining output, rewind file, and return success
156 fflush(m_indexStream);
157 fclose(m_indexStream);
159 // re-open index file for later reading
160 if ( !OpenIndexFile(indexFilename, "rb") ) {
161 fprintf(stderr, "ERROR: Could not open newly created index file for reading.\n");
165 // return success/failure of write
169 // #########################################################################################
170 // #########################################################################################
172 // -------------------------------
173 // BamStandardIndex structs & typedefs
177 // BAM index constants
178 const int MAX_BIN = 37450; // =(8^6-1)/7+1
179 const int BAM_LIDX_SHIFT = 14;
181 // --------------------------------------------------
182 // BamStandardIndex data structures & typedefs
190 Chunk(const uint64_t& start = 0,
191 const uint64_t& stop = 0)
197 bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) {
198 return lhs.Start < rhs.Start;
201 typedef vector<Chunk> ChunkVector;
202 typedef map<uint32_t, ChunkVector> BamBinMap;
203 typedef vector<uint64_t> LinearOffsetVector;
205 struct ReferenceIndex {
209 LinearOffsetVector Offsets;
213 ReferenceIndex(const BamBinMap& binMap = BamBinMap(),
214 const LinearOffsetVector& offsets = LinearOffsetVector(),
215 const bool hasAlignments = false)
218 , HasAlignments(hasAlignments)
222 typedef map<int32_t, ReferenceIndex> BamStandardIndexData;
224 } // namespace BamTools
226 // -------------------------------
227 // BamStandardIndexPrivate implementation
229 struct BamStandardIndex::BamStandardIndexPrivate {
232 BamStandardIndex* m_parent;
235 BamStandardIndexData m_indexData;
236 off_t m_dataBeginOffset;
237 bool m_hasFullDataCache;
241 BamStandardIndexPrivate(BamStandardIndex* parent);
242 ~BamStandardIndexPrivate(void);
244 // parent interface methods
247 // creates index data (in-memory) from current reader data
249 // clear all current index offset data in memory
250 void ClearAllData(void);
251 // return file position after header metadata
252 const off_t DataBeginOffset(void) const;
253 // returns whether reference has alignments or no
254 bool HasAlignments(const int& referenceID) const;
255 // return true if all index data is cached
256 bool HasFullDataCache(void) const;
257 // attempts to use index to jump to region; returns success/fail
258 bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
259 // clears index data from all references except the first reference
260 void KeepOnlyFirstReferenceOffsets(void);
261 // load index data for all references, return true if loaded OK
262 // @saveData - save data in memory if true, just read & discard if false
263 bool LoadAllReferences(bool saveData = true);
264 // load first reference from file, return true if loaded OK
265 // @saveData - save data in memory if true, just read & discard if false
266 bool LoadFirstReference(bool saveData = true);
267 // load header data from index file, return true if loaded OK
268 bool LoadHeader(void);
269 // position file pointer to first reference begin, return true if skipped OK
270 bool SkipToFirstReference(void);
271 // write header to new index file
272 bool WriteHeader(void);
273 // write index data for all references to new index file
274 bool WriteAllReferences(void);
279 // -----------------------
280 // index file operations
282 // check index file magic number, return true if OK
283 bool CheckMagicNumber(void);
284 // check index file version, return true if OK
285 bool CheckVersion(void);
286 // load a single index bin entry from file, return true if loaded OK
287 // @saveData - save data in memory if true, just read & discard if false
288 bool LoadBin(ReferenceIndex& refEntry, bool saveData = true);
289 bool LoadBins(ReferenceIndex& refEntry, bool saveData = true);
290 // load a single index bin entry from file, return true if loaded OK
291 // @saveData - save data in memory if true, just read & discard if false
292 bool LoadChunk(ChunkVector& chunks, bool saveData = true);
293 bool LoadChunks(ChunkVector& chunks, bool saveData = true);
294 // load a single index linear offset entry from file, return true if loaded OK
295 // @saveData - save data in memory if true, just read & discard if false
296 bool LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData = true);
297 // load a single reference from file, return true if loaded OK
298 // @saveData - save data in memory if true, just read & discard if false
299 bool LoadReference(const int& refId, bool saveData = true);
300 // loads number of references, return true if loaded OK
301 bool LoadReferenceCount(int& numReferences);
302 // position file pointer to desired reference begin, return true if skipped OK
303 bool SkipToReference(const int& refId);
304 // write index data for bin to new index file
305 bool WriteBin(const uint32_t& binId, const ChunkVector& chunks);
306 // write index data for bins to new index file
307 bool WriteBins(const BamBinMap& bins);
308 // write index data for chunk entry to new index file
309 bool WriteChunk(const Chunk& chunk);
310 // write index data for chunk entry to new index file
311 bool WriteChunks(const ChunkVector& chunks);
312 // write index data for linear offsets entry to new index file
313 bool WriteLinearOffsets(const LinearOffsetVector& offsets);
314 // write index data single reference to new index file
315 bool WriteReference(const ReferenceIndex& refEntry);
317 // -----------------------
318 // index data operations
320 // calculate bins that overlap region
321 int BinsFromRegion(const BamRegion& region,
322 const bool isRightBoundSpecified,
323 uint16_t bins[BamTools::MAX_BIN]);
324 // clear all index offset data for desired reference
325 void ClearReferenceOffsets(const int& refId);
326 // calculates offset(s) for a given region
327 bool GetOffsets(const BamRegion& region,
328 const bool isRightBoundSpecified,
329 vector<int64_t>& offsets,
330 bool* hasAlignmentsInRegion);
331 // returns true if index cache has data for desired reference
332 bool IsDataLoaded(const int& refId) const;
333 // clears index data from all references except the one specified
334 void KeepOnlyReferenceOffsets(const int& refId);
335 // simplifies index by merging 'chunks'
336 void MergeChunks(void);
337 // saves BAM bin entry for index
338 void SaveBinEntry(BamBinMap& binMap,
339 const uint32_t& saveBin,
340 const uint64_t& saveOffset,
341 const uint64_t& lastOffset);
342 // saves linear offset entry for index
343 void SaveLinearOffset(LinearOffsetVector& offsets,
344 const BamAlignment& bAlignment,
345 const uint64_t& lastOffset);
346 // initializes index data structure to hold @count references
347 void SetReferenceCount(const int& count);
351 BamStandardIndex::BamStandardIndexPrivate::BamStandardIndexPrivate(BamStandardIndex* parent)
353 , m_dataBeginOffset(0)
354 , m_hasFullDataCache(false)
356 m_isBigEndian = BamTools::SystemIsBigEndian();
359 BamStandardIndex::BamStandardIndexPrivate::~BamStandardIndexPrivate(void) {
363 // calculate bins that overlap region
364 int BamStandardIndex::BamStandardIndexPrivate::BinsFromRegion(const BamRegion& region,
365 const bool isRightBoundSpecified,
366 uint16_t bins[MAX_BIN])
368 // get region boundaries
369 uint32_t begin = (unsigned int)region.LeftPosition;
372 // if right bound specified AND left&right bounds are on same reference
373 // OK to use right bound position
374 if ( isRightBoundSpecified && ( region.LeftRefID == region.RightRefID ) )
375 end = (unsigned int)region.RightPosition;
377 // otherwise, use end of left bound reference as cutoff
379 end = (unsigned int)m_parent->m_references.at(region.LeftRefID).RefLength - 1;
381 // initialize list, bin '0' always a valid bin
385 // get rest of bins that contain this region
387 for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { bins[i++] = k; }
388 for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { bins[i++] = k; }
389 for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { bins[i++] = k; }
390 for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { bins[i++] = k; }
391 for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { bins[i++] = k; }
393 // return number of bins stored
397 // creates index data (in-memory) from current reader data
398 bool BamStandardIndex::BamStandardIndexPrivate::Build(void) {
400 // localize parent data
401 if ( m_parent == 0 ) return false;
402 BamReader* reader = m_parent->m_reader;
403 BgzfData* mBGZF = m_parent->m_BGZF;
405 // be sure reader & BGZF file are valid & open for reading
406 if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
409 // move file pointer to beginning of alignments
412 // get reference count, reserve index space
413 const int numReferences = (int)m_parent->m_references.size();
415 m_hasFullDataCache = false;
416 SetReferenceCount(numReferences);
418 // sets default constant for bin, ID, offset, coordinate variables
419 const uint32_t defaultValue = 0xffffffffu;
422 uint32_t saveBin(defaultValue);
423 uint32_t lastBin(defaultValue);
426 int32_t saveRefID(defaultValue);
427 int32_t lastRefID(defaultValue);
430 uint64_t saveOffset = mBGZF->Tell();
431 uint64_t lastOffset = saveOffset;
434 int32_t lastCoordinate = defaultValue;
436 BamAlignment bAlignment;
437 while ( reader->GetNextAlignmentCore(bAlignment) ) {
439 // change of chromosome, save ID, reset bin
440 if ( lastRefID != bAlignment.RefID ) {
441 lastRefID = bAlignment.RefID;
442 lastBin = defaultValue;
445 // if lastCoordinate greater than BAM position - file not sorted properly
446 else if ( lastCoordinate > bAlignment.Position ) {
447 fprintf(stderr, "BAM file not properly sorted:\n");
448 fprintf(stderr, "Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(),
449 lastCoordinate, bAlignment.Position, bAlignment.RefID);
453 // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions)
454 if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) {
456 // save linear offset entry (matched to BAM entry refID)
457 ReferenceIndex& refIndex = m_indexData.at(bAlignment.RefID);
458 LinearOffsetVector& offsets = refIndex.Offsets;
459 SaveLinearOffset(offsets, bAlignment, lastOffset);
462 // if current BamAlignment bin != lastBin, "then possibly write the binning index"
463 if ( bAlignment.Bin != lastBin ) {
465 // if not first time through
466 if ( saveBin != defaultValue ) {
468 // save Bam bin entry
469 ReferenceIndex& refIndex = m_indexData.at(saveRefID);
470 BamBinMap& binMap = refIndex.Bins;
471 SaveBinEntry(binMap, saveBin, saveOffset, lastOffset);
475 saveOffset = lastOffset;
478 saveBin = bAlignment.Bin;
479 lastBin = bAlignment.Bin;
482 saveRefID = bAlignment.RefID;
484 // if invalid RefID, break out
485 if ( saveRefID < 0 ) break;
488 // make sure that current file pointer is beyond lastOffset
489 if ( mBGZF->Tell() <= (int64_t)lastOffset ) {
490 fprintf(stderr, "Error in BGZF offsets.\n");
495 lastOffset = mBGZF->Tell();
497 // update lastCoordinate
498 lastCoordinate = bAlignment.Position;
501 // save any leftover BAM data (as long as refID is valid)
502 if ( saveRefID >= 0 ) {
503 // save Bam bin entry
504 ReferenceIndex& refIndex = m_indexData.at(saveRefID);
505 BamBinMap& binMap = refIndex.Bins;
506 SaveBinEntry(binMap, saveBin, saveOffset, lastOffset);
509 // simplify index by merging chunks
512 // iterate through references in index
513 // sort offsets in linear offset vector
514 BamStandardIndexData::iterator indexIter = m_indexData.begin();
515 BamStandardIndexData::iterator indexEnd = m_indexData.end();
516 for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) {
518 // get reference index data
519 ReferenceIndex& refIndex = (*indexIter).second;
520 LinearOffsetVector& offsets = refIndex.Offsets;
522 // sort linear offsets
523 sort(offsets.begin(), offsets.end());
526 // rewind file pointer to beginning of alignments, return success/fail
527 return reader->Rewind();
530 // check index file magic number, return true if OK
531 bool BamStandardIndex::BamStandardIndexPrivate::CheckMagicNumber(void) {
533 // read in magic number
535 size_t elementsRead = fread(magic, sizeof(char), 4, m_parent->m_indexStream);
537 // compare to expected value
538 if ( strncmp(magic, "BAI\1", 4) != 0 ) {
539 fprintf(stderr, "Problem with index file - invalid format.\n");
540 fclose(m_parent->m_indexStream);
544 // return success/failure of load
545 return (elementsRead == 4);
548 // clear all current index offset data in memory
549 void BamStandardIndex::BamStandardIndexPrivate::ClearAllData(void) {
550 BamStandardIndexData::const_iterator indexIter = m_indexData.begin();
551 BamStandardIndexData::const_iterator indexEnd = m_indexData.end();
552 for ( ; indexIter != indexEnd; ++indexIter ) {
553 const int& refId = (*indexIter).first;
554 ClearReferenceOffsets(refId);
558 // clear all index offset data for desired reference
559 void BamStandardIndex::BamStandardIndexPrivate::ClearReferenceOffsets(const int& refId) {
561 // look up desired reference, skip if not found
562 if ( m_indexData.find(refId) == m_indexData.end() ) return;
564 // clear reference data
565 ReferenceIndex& refEntry = m_indexData.at(refId);
566 refEntry.Bins.clear();
567 refEntry.Offsets.clear();
570 m_hasFullDataCache = false;
573 // return file position after header metadata
574 const off_t BamStandardIndex::BamStandardIndexPrivate::DataBeginOffset(void) const {
575 return m_dataBeginOffset;
578 // calculates offset(s) for a given region
579 bool BamStandardIndex::BamStandardIndexPrivate::GetOffsets(const BamRegion& region,
580 const bool isRightBoundSpecified,
581 vector<int64_t>& offsets,
582 bool* hasAlignmentsInRegion)
584 // return false if leftBound refID is not found in index data
585 if ( m_indexData.find(region.LeftRefID) == m_indexData.end() )
588 // load index data for region if not already cached
589 if ( !IsDataLoaded(region.LeftRefID) ) {
590 bool loadedOk = true;
591 loadedOk &= SkipToReference(region.LeftRefID);
592 loadedOk &= LoadReference(region.LeftRefID);
593 if ( !loadedOk ) return false;
596 // calculate which bins overlap this region
597 uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
598 int numBins = BinsFromRegion(region, isRightBoundSpecified, bins);
600 // get bins for this reference
601 const ReferenceIndex& refIndex = m_indexData.at(region.LeftRefID);
602 const BamBinMap& binMap = refIndex.Bins;
604 // get minimum offset to consider
605 const LinearOffsetVector& linearOffsets = refIndex.Offsets;
606 const uint64_t minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() )
607 ? 0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT);
609 // store all alignment 'chunk' starts (file offsets) for bins in this region
610 for ( int i = 0; i < numBins; ++i ) {
612 const uint16_t binKey = bins[i];
613 map<uint32_t, ChunkVector>::const_iterator binIter = binMap.find(binKey);
614 if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) {
616 // iterate over chunks
617 const ChunkVector& chunks = (*binIter).second;
618 std::vector<Chunk>::const_iterator chunksIter = chunks.begin();
619 std::vector<Chunk>::const_iterator chunksEnd = chunks.end();
620 for ( ; chunksIter != chunksEnd; ++chunksIter) {
622 // if valid chunk found, store its file offset
623 const Chunk& chunk = (*chunksIter);
624 if ( chunk.Stop > minOffset )
625 offsets.push_back( chunk.Start );
633 // sort the offsets before returning
634 sort(offsets.begin(), offsets.end());
636 // set flag & return success
637 *hasAlignmentsInRegion = (offsets.size() != 0 );
639 // if cache mode set to none, dump the data we just loaded
640 if (m_parent->m_cacheMode == BamIndex::NoIndexCaching )
641 ClearReferenceOffsets(region.LeftRefID);
647 // returns whether reference has alignments or no
648 bool BamStandardIndex::BamStandardIndexPrivate::HasAlignments(const int& refId) const {
649 if ( m_indexData.find(refId) == m_indexData.end()) return false;
650 const ReferenceIndex& refEntry = m_indexData.at(refId);
651 return refEntry.HasAlignments;
654 // return true if all index data is cached
655 bool BamStandardIndex::BamStandardIndexPrivate::HasFullDataCache(void) const {
656 return m_hasFullDataCache;
659 // returns true if index cache has data for desired reference
660 bool BamStandardIndex::BamStandardIndexPrivate::IsDataLoaded(const int& refId) const {
661 if ( m_indexData.find(refId) == m_indexData.end() ) return false; // unknown reference
662 const ReferenceIndex& refEntry = m_indexData.at(refId);
663 if ( !refEntry.HasAlignments ) return true; // no data period
664 // return whether bin map contains data
665 return ( !refEntry.Bins.empty() );
668 // attempts to use index to jump to region; returns success/fail
669 bool BamStandardIndex::BamStandardIndexPrivate::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) {
671 // localize parent data
672 if ( m_parent == 0 ) return false;
673 BamReader* reader = m_parent->m_reader;
674 BgzfData* mBGZF = m_parent->m_BGZF;
675 RefVector& references = m_parent->m_references;
677 // be sure reader & BGZF file are valid & open for reading
678 if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
681 // make sure left-bound position is valid
682 if ( region.LeftPosition > references.at(region.LeftRefID).RefLength )
685 // calculate offsets for this region
686 // if failed, print message, set flag, and return failure
687 vector<int64_t> offsets;
688 if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets, hasAlignmentsInRegion) ) {
689 fprintf(stderr, "ERROR: Could not jump: unable to calculate offset(s) for specified region.\n");
690 *hasAlignmentsInRegion = false;
694 // iterate through offsets
695 BamAlignment bAlignment;
697 for ( vector<int64_t>::const_iterator o = offsets.begin(); o != offsets.end(); ++o) {
699 // attempt seek & load first available alignment
700 // set flag to true if data exists
701 result &= mBGZF->Seek(*o);
702 *hasAlignmentsInRegion = reader->GetNextAlignmentCore(bAlignment);
704 // if this alignment corresponds to desired position
705 // return success of seeking back to the offset before the 'current offset' (to cover overlaps)
706 if ( ((bAlignment.RefID == region.LeftRefID) &&
707 ((bAlignment.Position + bAlignment.Length) > region.LeftPosition)) ||
708 (bAlignment.RefID > region.LeftRefID) )
710 if ( o != offsets.begin() ) --o;
711 return mBGZF->Seek(*o);
715 // if error in jumping, print message & set flag
717 fprintf(stderr, "ERROR: Could not jump: unable to determine correct offset for specified region.\n");
718 *hasAlignmentsInRegion = false;
721 // return success/failure
725 // clears index data from all references except the first
726 void BamStandardIndex::BamStandardIndexPrivate::KeepOnlyFirstReferenceOffsets(void) {
727 BamStandardIndexData::const_iterator indexBegin = m_indexData.begin();
728 KeepOnlyReferenceOffsets((*indexBegin).first);
731 // clears index data from all references except the one specified
732 void BamStandardIndex::BamStandardIndexPrivate::KeepOnlyReferenceOffsets(const int& refId) {
733 BamStandardIndexData::iterator mapIter = m_indexData.begin();
734 BamStandardIndexData::iterator mapEnd = m_indexData.end();
735 for ( ; mapIter != mapEnd; ++mapIter ) {
736 const int entryRefId = (*mapIter).first;
737 if ( entryRefId != refId )
738 ClearReferenceOffsets(entryRefId);
742 bool BamStandardIndex::BamStandardIndexPrivate::LoadAllReferences(bool saveData) {
744 // skip if data already loaded
745 if ( m_hasFullDataCache ) return true;
747 // get number of reference sequences
748 uint32_t numReferences;
749 if ( !LoadReferenceCount((int&)numReferences) )
752 // iterate over reference entries
753 bool loadedOk = true;
754 for ( int i = 0; i < (int)numReferences; ++i )
755 loadedOk &= LoadReference(i, saveData);
758 if ( loadedOk && saveData )
759 m_hasFullDataCache = true;
761 // return success/failure of loading references
765 // load header data from index file, return true if loaded OK
766 bool BamStandardIndex::BamStandardIndexPrivate::LoadHeader(void) {
768 bool loadedOk = CheckMagicNumber();
770 // store offset of beginning of data
771 m_dataBeginOffset = ftell64(m_parent->m_indexStream);
773 // return success/failure of load
777 // load a single index bin entry from file, return true if loaded OK
778 // @saveData - save data in memory if true, just read & discard if false
779 bool BamStandardIndex::BamStandardIndexPrivate::LoadBin(ReferenceIndex& refEntry, bool saveData) {
781 size_t elementsRead = 0;
785 elementsRead += fread(&binId, sizeof(binId), 1, m_parent->m_indexStream);
786 if ( m_isBigEndian ) SwapEndian_32(binId);
788 // load alignment chunks for this bin
790 bool chunksOk = LoadChunks(chunks, saveData);
793 if ( chunksOk && saveData )
794 refEntry.Bins.insert(pair<uint32_t, ChunkVector>(binId, chunks));
796 // return success/failure of load
797 return ( (elementsRead == 1) && chunksOk );
800 bool BamStandardIndex::BamStandardIndexPrivate::LoadBins(ReferenceIndex& refEntry, bool saveData) {
802 size_t elementsRead = 0;
804 // get number of bins
806 elementsRead += fread(&numBins, sizeof(numBins), 1, m_parent->m_indexStream);
807 if ( m_isBigEndian ) SwapEndian_32(numBins);
810 refEntry.HasAlignments = ( numBins != 0 );
814 for ( int i = 0; i < numBins; ++i )
815 binsOk &= LoadBin(refEntry, saveData);
817 // return success/failure of load
818 return ( (elementsRead == 1) && binsOk );
821 // load a single index bin entry from file, return true if loaded OK
822 // @saveData - save data in memory if true, just read & discard if false
823 bool BamStandardIndex::BamStandardIndexPrivate::LoadChunk(ChunkVector& chunks, bool saveData) {
825 size_t elementsRead = 0;
827 // read in chunk data
830 elementsRead += fread(&start, sizeof(start), 1, m_parent->m_indexStream);
831 elementsRead += fread(&stop, sizeof(stop), 1, m_parent->m_indexStream);
833 // swap endian-ness if necessary
834 if ( m_isBigEndian ) {
835 SwapEndian_64(start);
839 // save data if requested
840 if ( saveData ) chunks.push_back( Chunk(start, stop) );
842 // return success/failure of load
843 return ( elementsRead == 2 );
846 bool BamStandardIndex::BamStandardIndexPrivate::LoadChunks(ChunkVector& chunks, bool saveData) {
848 size_t elementsRead = 0;
850 // read in number of chunks
852 elementsRead += fread(&numChunks, sizeof(numChunks), 1, m_parent->m_indexStream);
853 if ( m_isBigEndian ) SwapEndian_32(numChunks);
855 // initialize space for chunks if we're storing this data
856 if ( saveData ) chunks.reserve(numChunks);
858 // iterate over chunks
859 bool chunksOk = true;
860 for ( int i = 0; i < (int)numChunks; ++i )
861 chunksOk &= LoadChunk(chunks, saveData);
864 sort( chunks.begin(), chunks.end(), ChunkLessThan );
866 // return success/failure of load
867 return ( (elementsRead == 1) && chunksOk );
870 // load a single index linear offset entry from file, return true if loaded OK
871 // @saveData - save data in memory if true, just read & discard if false
872 bool BamStandardIndex::BamStandardIndexPrivate::LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData) {
874 size_t elementsRead = 0;
876 // read in number of linear offsets
877 int32_t numLinearOffsets;
878 elementsRead += fread(&numLinearOffsets, sizeof(numLinearOffsets), 1, m_parent->m_indexStream);
879 if ( m_isBigEndian ) SwapEndian_32(numLinearOffsets);
881 // set up destination vector (if we're saving the data)
882 LinearOffsetVector linearOffsets;
883 if ( saveData ) linearOffsets.reserve(numLinearOffsets);
885 // iterate over linear offsets
886 uint64_t linearOffset;
887 for ( int i = 0; i < numLinearOffsets; ++i ) {
888 elementsRead += fread(&linearOffset, sizeof(linearOffset), 1, m_parent->m_indexStream);
889 if ( m_isBigEndian ) SwapEndian_64(linearOffset);
890 if ( saveData ) linearOffsets.push_back(linearOffset);
893 // sort linear offsets
894 sort ( linearOffsets.begin(), linearOffsets.end() );
896 // save in reference index entry if desired
897 if ( saveData ) refEntry.Offsets = linearOffsets;
899 // return success/failure of load
900 return ( elementsRead == (size_t)(numLinearOffsets + 1) );
903 bool BamStandardIndex::BamStandardIndexPrivate::LoadFirstReference(bool saveData) {
904 BamStandardIndexData::const_iterator indexBegin = m_indexData.begin();
905 return LoadReference((*indexBegin).first, saveData);
908 // load a single reference from file, return true if loaded OK
909 // @saveData - save data in memory if true, just read & discard if false
910 bool BamStandardIndex::BamStandardIndexPrivate::LoadReference(const int& refId, bool saveData) {
912 // if reference not previously loaded, create new entry
913 if ( m_indexData.find(refId) == m_indexData.end() ) {
914 ReferenceIndex newEntry;
915 newEntry.HasAlignments = false;
916 m_indexData.insert( pair<int32_t, ReferenceIndex>(refId, newEntry) );
919 // load reference data
920 ReferenceIndex& entry = m_indexData.at(refId);
921 bool loadedOk = true;
922 loadedOk &= LoadBins(entry, saveData);
923 loadedOk &= LoadLinearOffsets(entry, saveData);
927 // loads number of references, return true if loaded OK
928 bool BamStandardIndex::BamStandardIndexPrivate::LoadReferenceCount(int& numReferences) {
930 size_t elementsRead = 0;
932 // read reference count
933 elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_parent->m_indexStream);
934 if ( m_isBigEndian ) SwapEndian_32(numReferences);
936 // return success/failure of load
937 return ( elementsRead == 1 );
940 // merges 'alignment chunks' in BAM bin (used for index building)
941 void BamStandardIndex::BamStandardIndexPrivate::MergeChunks(void) {
943 // iterate over reference enties
944 BamStandardIndexData::iterator indexIter = m_indexData.begin();
945 BamStandardIndexData::iterator indexEnd = m_indexData.end();
946 for ( ; indexIter != indexEnd; ++indexIter ) {
948 // get BAM bin map for this reference
949 ReferenceIndex& refIndex = (*indexIter).second;
950 BamBinMap& bamBinMap = refIndex.Bins;
952 // iterate over BAM bins
953 BamBinMap::iterator binIter = bamBinMap.begin();
954 BamBinMap::iterator binEnd = bamBinMap.end();
955 for ( ; binIter != binEnd; ++binIter ) {
957 // get chunk vector for this bin
958 ChunkVector& binChunks = (*binIter).second;
959 if ( binChunks.size() == 0 ) continue;
961 ChunkVector mergedChunks;
962 mergedChunks.push_back( binChunks[0] );
964 // iterate over chunks
966 ChunkVector::iterator chunkIter = binChunks.begin();
967 ChunkVector::iterator chunkEnd = binChunks.end();
968 for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) {
970 // get 'currentChunk' based on numeric index
971 Chunk& currentChunk = mergedChunks[i];
973 // get iteratorChunk based on vector iterator
974 Chunk& iteratorChunk = (*chunkIter);
976 // if chunk ends where (iterator) chunk starts, then merge
977 if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 )
978 currentChunk.Stop = iteratorChunk.Stop;
982 // set currentChunk + 1 to iteratorChunk
983 mergedChunks.push_back(iteratorChunk);
988 // saved merged chunk vector
989 (*binIter).second = mergedChunks;
994 // saves BAM bin entry for index
995 void BamStandardIndex::BamStandardIndexPrivate::SaveBinEntry(BamBinMap& binMap,
996 const uint32_t& saveBin,
997 const uint64_t& saveOffset,
998 const uint64_t& lastOffset)
1001 BamBinMap::iterator binIter = binMap.find(saveBin);
1004 Chunk newChunk(saveOffset, lastOffset);
1006 // if entry doesn't exist
1007 if ( binIter == binMap.end() ) {
1008 ChunkVector newChunks;
1009 newChunks.push_back(newChunk);
1010 binMap.insert( pair<uint32_t, ChunkVector>(saveBin, newChunks));
1015 ChunkVector& binChunks = (*binIter).second;
1016 binChunks.push_back( newChunk );
1020 // saves linear offset entry for index
1021 void BamStandardIndex::BamStandardIndexPrivate::SaveLinearOffset(LinearOffsetVector& offsets,
1022 const BamAlignment& bAlignment,
1023 const uint64_t& lastOffset)
1025 // get converted offsets
1026 int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT;
1027 int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT;
1029 // resize vector if necessary
1030 int oldSize = offsets.size();
1031 int newSize = endOffset + 1;
1032 if ( oldSize < newSize )
1033 offsets.resize(newSize, 0);
1036 for( int i = beginOffset + 1; i <= endOffset; ++i ) {
1037 if ( offsets[i] == 0 )
1038 offsets[i] = lastOffset;
1042 // initializes index data structure to hold @count references
1043 void BamStandardIndex::BamStandardIndexPrivate::SetReferenceCount(const int& count) {
1044 for ( int i = 0; i < count; ++i )
1045 m_indexData[i].HasAlignments = false;
1048 bool BamStandardIndex::BamStandardIndexPrivate::SkipToFirstReference(void) {
1049 BamStandardIndexData::const_iterator indexBegin = m_indexData.begin();
1050 return SkipToReference( (*indexBegin).first );
1053 // position file pointer to desired reference begin, return true if skipped OK
1054 bool BamStandardIndex::BamStandardIndexPrivate::SkipToReference(const int& refId) {
1057 if ( !m_parent->Rewind() ) return false;
1059 // read in number of references
1060 uint32_t numReferences;
1061 size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_parent->m_indexStream);
1062 if ( elementsRead != 1 ) return false;
1063 if ( m_isBigEndian ) SwapEndian_32(numReferences);
1065 // iterate over reference entries
1066 bool skippedOk = true;
1067 int currentRefId = 0;
1068 while (currentRefId != refId) {
1069 skippedOk &= LoadReference(currentRefId, false);
1077 // write header to new index file
1078 bool BamStandardIndex::BamStandardIndexPrivate::WriteHeader(void) {
1080 size_t elementsWritten = 0;
1082 // write magic number
1083 elementsWritten += fwrite("BAI\1", sizeof(char), 4, m_parent->m_indexStream);
1085 // store offset of beginning of data
1086 m_dataBeginOffset = ftell64(m_parent->m_indexStream);
1088 // return success/failure of write
1089 return (elementsWritten == 4);
1092 // write index data for all references to new index file
1093 bool BamStandardIndex::BamStandardIndexPrivate::WriteAllReferences(void) {
1095 size_t elementsWritten = 0;
1097 // write number of reference sequences
1098 int32_t numReferenceSeqs = m_indexData.size();
1099 if ( m_isBigEndian ) SwapEndian_32(numReferenceSeqs);
1100 elementsWritten += fwrite(&numReferenceSeqs, sizeof(numReferenceSeqs), 1, m_parent->m_indexStream);
1102 // iterate over reference sequences
1104 BamStandardIndexData::const_iterator indexIter = m_indexData.begin();
1105 BamStandardIndexData::const_iterator indexEnd = m_indexData.end();
1106 for ( ; indexIter != indexEnd; ++ indexIter )
1107 refsOk &= WriteReference( (*indexIter).second );
1109 // return success/failure of write
1110 return ( (elementsWritten == 1) && refsOk );
1113 // write index data for bin to new index file
1114 bool BamStandardIndex::BamStandardIndexPrivate::WriteBin(const uint32_t& binId, const ChunkVector& chunks) {
1116 size_t elementsWritten = 0;
1119 uint32_t binKey = binId;
1120 if ( m_isBigEndian ) SwapEndian_32(binKey);
1121 elementsWritten += fwrite(&binKey, sizeof(binKey), 1, m_parent->m_indexStream);
1124 bool chunksOk = WriteChunks(chunks);
1126 // return success/failure of write
1127 return ( (elementsWritten == 1) && chunksOk );
1130 // write index data for bins to new index file
1131 bool BamStandardIndex::BamStandardIndexPrivate::WriteBins(const BamBinMap& bins) {
1133 size_t elementsWritten = 0;
1135 // write number of bins
1136 int32_t binCount = bins.size();
1137 if ( m_isBigEndian ) SwapEndian_32(binCount);
1138 elementsWritten += fwrite(&binCount, sizeof(binCount), 1, m_parent->m_indexStream);
1140 // iterate over bins
1142 BamBinMap::const_iterator binIter = bins.begin();
1143 BamBinMap::const_iterator binEnd = bins.end();
1144 for ( ; binIter != binEnd; ++binIter )
1145 binsOk &= WriteBin( (*binIter).first, (*binIter).second );
1147 // return success/failure of write
1148 return ( (elementsWritten == 1) && binsOk );
1151 // write index data for chunk entry to new index file
1152 bool BamStandardIndex::BamStandardIndexPrivate::WriteChunk(const Chunk& chunk) {
1154 size_t elementsWritten = 0;
1156 // localize alignment chunk offsets
1157 uint64_t start = chunk.Start;
1158 uint64_t stop = chunk.Stop;
1160 // swap endian-ness if necessary
1161 if ( m_isBigEndian ) {
1162 SwapEndian_64(start);
1163 SwapEndian_64(stop);
1166 // write to index file
1167 elementsWritten += fwrite(&start, sizeof(start), 1, m_parent->m_indexStream);
1168 elementsWritten += fwrite(&stop, sizeof(stop), 1, m_parent->m_indexStream);
1170 // return success/failure of write
1171 return ( elementsWritten == 2 );
1174 // write index data for chunk entry to new index file
1175 bool BamStandardIndex::BamStandardIndexPrivate::WriteChunks(const ChunkVector& chunks) {
1177 size_t elementsWritten = 0;
1180 int32_t chunkCount = chunks.size();
1181 if ( m_isBigEndian ) SwapEndian_32(chunkCount);
1182 elementsWritten += fwrite(&chunkCount, sizeof(chunkCount), 1, m_parent->m_indexStream);
1184 // iterate over chunks
1185 bool chunksOk = true;
1186 ChunkVector::const_iterator chunkIter = chunks.begin();
1187 ChunkVector::const_iterator chunkEnd = chunks.end();
1188 for ( ; chunkIter != chunkEnd; ++chunkIter )
1189 chunksOk &= WriteChunk( (*chunkIter) );
1191 // return success/failure of write
1192 return ( (elementsWritten == 1) && chunksOk );
1195 // write index data for linear offsets entry to new index file
1196 bool BamStandardIndex::BamStandardIndexPrivate::WriteLinearOffsets(const LinearOffsetVector& offsets) {
1198 size_t elementsWritten = 0;
1200 // write number of linear offsets
1201 int32_t offsetCount = offsets.size();
1202 if ( m_isBigEndian ) SwapEndian_32(offsetCount);
1203 elementsWritten += fwrite(&offsetCount, sizeof(offsetCount), 1, m_parent->m_indexStream);
1205 // iterate over linear offsets
1206 LinearOffsetVector::const_iterator offsetIter = offsets.begin();
1207 LinearOffsetVector::const_iterator offsetEnd = offsets.end();
1208 for ( ; offsetIter != offsetEnd; ++offsetIter ) {
1210 // write linear offset
1211 uint64_t linearOffset = (*offsetIter);
1212 if ( m_isBigEndian ) SwapEndian_64(linearOffset);
1213 elementsWritten += fwrite(&linearOffset, sizeof(linearOffset), 1, m_parent->m_indexStream);
1216 // return success/failure of write
1217 return ( elementsWritten == (size_t)(offsetCount + 1) );
1220 // write index data for a single reference to new index file
1221 bool BamStandardIndex::BamStandardIndexPrivate::WriteReference(const ReferenceIndex& refEntry) {
1223 refOk &= WriteBins(refEntry.Bins);
1224 refOk &= WriteLinearOffsets(refEntry.Offsets);
1228 // ---------------------------------------------------------------
1229 // BamStandardIndex implementation
1231 BamStandardIndex::BamStandardIndex(BgzfData* bgzf, BamReader* reader)
1232 : BamIndex(bgzf, reader)
1234 d = new BamStandardIndexPrivate(this);
1237 BamStandardIndex::~BamStandardIndex(void) {
1242 // BamIndex interface implementation
1243 bool BamStandardIndex::Build(void) { return d->Build(); }
1244 void BamStandardIndex::ClearAllData(void) { d->ClearAllData(); }
1245 const off_t BamStandardIndex::DataBeginOffset(void) const { return d->DataBeginOffset(); }
1246 bool BamStandardIndex::HasAlignments(const int& referenceID) const { return d->HasAlignments(referenceID); }
1247 bool BamStandardIndex::HasFullDataCache(void) const { return d->HasFullDataCache(); }
1248 bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { return d->Jump(region, hasAlignmentsInRegion); }
1249 void BamStandardIndex::KeepOnlyFirstReferenceOffsets(void) { d->KeepOnlyFirstReferenceOffsets(); }
1250 bool BamStandardIndex::LoadAllReferences(bool saveData) { return d->LoadAllReferences(saveData); }
1251 bool BamStandardIndex::LoadFirstReference(bool saveData) { return d->LoadFirstReference(saveData); }
1252 bool BamStandardIndex::LoadHeader(void) { return d->LoadHeader(); }
1253 bool BamStandardIndex::SkipToFirstReference(void) { return d->SkipToFirstReference(); }
1254 bool BamStandardIndex::WriteAllReferences(void) { return d->WriteAllReferences(); }
1255 bool BamStandardIndex::WriteHeader(void) { return d->WriteHeader(); }
1257 // #########################################################################################
1258 // #########################################################################################
1260 // ---------------------------------------------------
1261 // BamToolsIndex structs & typedefs
1263 namespace BamTools {
1265 // individual index offset entry
1266 struct BamToolsIndexEntry {
1269 int32_t MaxEndPosition;
1270 int64_t StartOffset;
1271 int32_t StartPosition;
1274 BamToolsIndexEntry(const int32_t& maxEndPosition = 0,
1275 const int64_t& startOffset = 0,
1276 const int32_t& startPosition = 0)
1277 : MaxEndPosition(maxEndPosition)
1278 , StartOffset(startOffset)
1279 , StartPosition(startPosition)
1283 // reference index entry
1284 struct BamToolsReferenceEntry {
1288 vector<BamToolsIndexEntry> Offsets;
1291 BamToolsReferenceEntry(void)
1292 : HasAlignments(false)
1296 // the actual index data structure
1297 typedef map<int, BamToolsReferenceEntry> BamToolsIndexData;
1299 } // namespace BamTools
1301 // ---------------------------------------------------
1302 // BamToolsIndexPrivate implementation
1304 struct BamToolsIndex::BamToolsIndexPrivate {
1306 // keep a list of any supported versions here
1307 // (might be useful later to handle any 'legacy' versions if the format changes)
1308 // listed for example like: BTI_1_0 = 1, BTI_1_1 = 2, BTI_1_2 = 3, BTI_2_0 = 4, and so on
1310 // so a change introduced in (hypothetical) BTI_1_2 would be handled from then on by:
1312 // if ( indexVersion >= BTI_1_2 )
1316 enum Version { BTI_1_0 = 1
1322 BamToolsIndex* m_parent;
1325 int32_t m_blockSize;
1326 BamToolsIndexData m_indexData;
1327 off_t m_dataBeginOffset;
1328 bool m_hasFullDataCache;
1330 int32_t m_inputVersion; // Version is serialized as int
1331 Version m_outputVersion;
1334 BamToolsIndexPrivate(BamToolsIndex* parent);
1335 ~BamToolsIndexPrivate(void);
1337 // parent interface methods
1340 // creates index data (in-memory) from current reader data
1342 // clear all current index offset data in memory
1343 void ClearAllData(void);
1344 // return file position after header metadata
1345 const off_t DataBeginOffset(void) const;
1346 // returns whether reference has alignments or no
1347 bool HasAlignments(const int& referenceID) const;
1348 // return true if all index data is cached
1349 bool HasFullDataCache(void) const;
1350 // attempts to use index to jump to region; returns success/fail
1351 bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
1352 // clears index data from all references except the first
1353 void KeepOnlyFirstReferenceOffsets(void);
1354 // load index data for all references, return true if loaded OK
1355 // @saveData - save data in memory if true, just read & discard if false
1356 bool LoadAllReferences(bool saveData = true);
1357 // load first reference from file, return true if loaded OK
1358 // @saveData - save data in memory if true, just read & discard if false
1359 bool LoadFirstReference(bool saveData = true);
1360 // load header data from index file, return true if loaded OK
1361 bool LoadHeader(void);
1362 // position file pointer to desired reference begin, return true if skipped OK
1363 bool SkipToFirstReference(void);
1364 // write header to new index file
1365 bool WriteHeader(void);
1366 // write index data for all references to new index file
1367 bool WriteAllReferences(void);
1372 // -----------------------
1373 // index file operations
1375 // check index file magic number, return true if OK
1376 bool CheckMagicNumber(void);
1377 // check index file version, return true if OK
1378 bool CheckVersion(void);
1379 // return true if FILE* is open
1380 bool IsOpen(void) const;
1381 // load a single index entry from file, return true if loaded OK
1382 // @saveData - save data in memory if true, just read & discard if false
1383 bool LoadIndexEntry(const int& refId, bool saveData = true);
1384 // load a single reference from file, return true if loaded OK
1385 // @saveData - save data in memory if true, just read & discard if false
1386 bool LoadReference(const int& refId, bool saveData = true);
1387 // loads number of references, return true if loaded OK
1388 bool LoadReferenceCount(int& numReferences);
1389 // position file pointer to desired reference begin, return true if skipped OK
1390 bool SkipToReference(const int& refId);
1391 // write current reference index data to new index file
1392 bool WriteReferenceEntry(const BamToolsReferenceEntry& refEntry);
1393 // write current index offset entry to new index file
1394 bool WriteIndexEntry(const BamToolsIndexEntry& entry);
1396 // -----------------------
1397 // index data operations
1399 // clear all index offset data for desired reference
1400 void ClearReferenceOffsets(const int& refId);
1401 // calculate BAM file offset for desired region
1402 // return true if no error (*NOT* equivalent to "has alignments or valid offset")
1403 // check @hasAlignmentsInRegion to determine this status
1404 // @region - target region
1405 // @offset - resulting seek target
1406 // @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status
1407 bool GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion);
1408 // returns true if index cache has data for desired reference
1409 bool IsDataLoaded(const int& refId) const;
1410 // clears index data from all references except the one specified
1411 void KeepOnlyReferenceOffsets(const int& refId);
1412 // saves an index offset entry in memory
1413 void SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry);
1414 // pre-allocates size for offset vector
1415 void SetOffsetCount(const int& refId, const int& offsetCount);
1416 // initializes index data structure to hold @count references
1417 void SetReferenceCount(const int& count);
1421 BamToolsIndex::BamToolsIndexPrivate::BamToolsIndexPrivate(BamToolsIndex* parent)
1424 , m_dataBeginOffset(0)
1425 , m_hasFullDataCache(false)
1427 , m_outputVersion(BTI_1_2) // latest version - used for writing new index files
1429 m_isBigEndian = BamTools::SystemIsBigEndian();
1433 BamToolsIndex::BamToolsIndexPrivate::~BamToolsIndexPrivate(void) {
1437 // creates index data (in-memory) from current reader data
1438 bool BamToolsIndex::BamToolsIndexPrivate::Build(void) {
1440 // localize parent data
1441 if ( m_parent == 0 ) return false;
1442 BamReader* reader = m_parent->m_reader;
1443 BgzfData* mBGZF = m_parent->m_BGZF;
1445 // be sure reader & BGZF file are valid & open for reading
1446 if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
1449 // move file pointer to beginning of alignments
1450 if ( !reader->Rewind() ) return false;
1452 // initialize index data structure with space for all references
1453 const int numReferences = (int)m_parent->m_references.size();
1454 m_indexData.clear();
1455 m_hasFullDataCache = false;
1456 SetReferenceCount(numReferences);
1458 // set up counters and markers
1459 int32_t currentBlockCount = 0;
1460 int64_t currentAlignmentOffset = mBGZF->Tell();
1461 int32_t blockRefId = 0;
1462 int32_t blockMaxEndPosition = 0;
1463 int64_t blockStartOffset = currentAlignmentOffset;
1464 int32_t blockStartPosition = -1;
1466 // plow through alignments, storing index entries
1468 while ( reader->GetNextAlignmentCore(al) ) {
1470 // if block contains data (not the first time through) AND alignment is on a new reference
1471 if ( currentBlockCount > 0 && al.RefID != blockRefId ) {
1473 // store previous data
1474 BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
1475 SaveOffsetEntry(blockRefId, entry);
1477 // intialize new block for current alignment's reference
1478 currentBlockCount = 0;
1479 blockMaxEndPosition = al.GetEndPosition();
1480 blockStartOffset = currentAlignmentOffset;
1483 // if beginning of block, save first alignment's refID & position
1484 if ( currentBlockCount == 0 ) {
1485 blockRefId = al.RefID;
1486 blockStartPosition = al.Position;
1489 // increment block counter
1490 ++currentBlockCount;
1492 // check end position
1493 int32_t alignmentEndPosition = al.GetEndPosition();
1494 if ( alignmentEndPosition > blockMaxEndPosition )
1495 blockMaxEndPosition = alignmentEndPosition;
1497 // if block is full, get offset for next block, reset currentBlockCount
1498 if ( currentBlockCount == m_blockSize ) {
1499 BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
1500 SaveOffsetEntry(blockRefId, entry);
1501 blockStartOffset = mBGZF->Tell();
1502 currentBlockCount = 0;
1505 // not the best name, but for the next iteration, this value will be the offset of the *current* alignment
1506 // necessary because we won't know if this next alignment is on a new reference until we actually read it
1507 currentAlignmentOffset = mBGZF->Tell();
1510 // store final block with data
1511 BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
1512 SaveOffsetEntry(blockRefId, entry);
1515 m_hasFullDataCache = true;
1517 // return success/failure of rewind
1518 return reader->Rewind();
1521 // check index file magic number, return true if OK
1522 bool BamToolsIndex::BamToolsIndexPrivate::CheckMagicNumber(void) {
1524 // see if index is valid BAM index
1526 size_t elementsRead = fread(magic, 1, 4, m_parent->m_indexStream);
1527 if ( elementsRead != 4 ) return false;
1528 if ( strncmp(magic, "BTI\1", 4) != 0 ) {
1529 fprintf(stderr, "Problem with index file - invalid format.\n");
1537 // check index file version, return true if OK
1538 bool BamToolsIndex::BamToolsIndexPrivate::CheckVersion(void) {
1540 // read version from file
1541 size_t elementsRead = fread(&m_inputVersion, sizeof(m_inputVersion), 1, m_parent->m_indexStream);
1542 if ( elementsRead != 1 ) return false;
1543 if ( m_isBigEndian ) SwapEndian_32(m_inputVersion);
1545 // if version is negative, or zero
1546 if ( m_inputVersion <= 0 ) {
1547 fprintf(stderr, "Problem with index file - invalid version.\n");
1551 // if version is newer than can be supported by this version of bamtools
1552 else if ( m_inputVersion > m_outputVersion ) {
1553 fprintf(stderr, "Problem with index file - attempting to use an outdated version of BamTools with a newer index file.\n");
1554 fprintf(stderr, "Please update BamTools to a more recent version to support this index file.\n");
1558 // ------------------------------------------------------------------
1559 // check for deprecated, unsupported versions
1560 // (typically whose format did not accomodate a particular bug fix)
1562 else if ( (Version)m_inputVersion == BTI_1_0 ) {
1563 fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to accessing data near reference ends.\n");
1564 fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n");
1568 else if ( (Version)m_inputVersion == BTI_1_1 ) {
1569 fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to handling empty references.\n");
1570 fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n");
1578 // clear all current index offset data in memory
1579 void BamToolsIndex::BamToolsIndexPrivate::ClearAllData(void) {
1580 BamToolsIndexData::const_iterator indexIter = m_indexData.begin();
1581 BamToolsIndexData::const_iterator indexEnd = m_indexData.end();
1582 for ( ; indexIter != indexEnd; ++indexIter ) {
1583 const int& refId = (*indexIter).first;
1584 ClearReferenceOffsets(refId);
1588 // clear all index offset data for desired reference
1589 void BamToolsIndex::BamToolsIndexPrivate::ClearReferenceOffsets(const int& refId) {
1590 if ( m_indexData.find(refId) == m_indexData.end() ) return;
1591 vector<BamToolsIndexEntry>& offsets = m_indexData.at(refId).Offsets;
1593 m_hasFullDataCache = false;
1596 // return file position after header metadata
1597 const off_t BamToolsIndex::BamToolsIndexPrivate::DataBeginOffset(void) const {
1598 return m_dataBeginOffset;
1601 // calculate BAM file offset for desired region
1602 // return true if no error (*NOT* equivalent to "has alignments or valid offset")
1603 // check @hasAlignmentsInRegion to determine this status
1604 // @region - target region
1605 // @offset - resulting seek target
1606 // @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status
1607 // N.B. - ignores isRightBoundSpecified
1608 bool BamToolsIndex::BamToolsIndexPrivate::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) {
1610 // return false if leftBound refID is not found in index data
1611 if ( m_indexData.find(region.LeftRefID) == m_indexData.end()) return false;
1613 // load index data for region if not already cached
1614 if ( !IsDataLoaded(region.LeftRefID) ) {
1615 bool loadedOk = true;
1616 loadedOk &= SkipToReference(region.LeftRefID);
1617 loadedOk &= LoadReference(region.LeftRefID);
1618 if ( !loadedOk ) return false;
1621 // localize index data for this reference (& sanity check that data actually exists)
1622 const vector<BamToolsIndexEntry>& referenceOffsets = m_indexData.at(region.LeftRefID).Offsets;
1623 if ( referenceOffsets.empty() ) return false;
1625 // -------------------------------------------------------
1626 // calculate nearest index to jump to
1628 // save first offset
1629 offset = (*referenceOffsets.begin()).StartOffset;
1631 // iterate over offsets entries on this reference
1632 vector<BamToolsIndexEntry>::const_iterator indexIter = referenceOffsets.begin();
1633 vector<BamToolsIndexEntry>::const_iterator indexEnd = referenceOffsets.end();
1634 for ( ; indexIter != indexEnd; ++indexIter ) {
1635 const BamToolsIndexEntry& entry = (*indexIter);
1636 // break if alignment 'entry' overlaps region
1637 if ( entry.MaxEndPosition >= region.LeftPosition ) break;
1638 offset = (*indexIter).StartOffset;
1641 // set flag based on whether an index entry was found for this region
1642 *hasAlignmentsInRegion = ( indexIter != indexEnd );
1644 // if cache mode set to none, dump the data we just loaded
1645 if (m_parent->m_cacheMode == BamIndex::NoIndexCaching )
1646 ClearReferenceOffsets(region.LeftRefID);
1652 // returns whether reference has alignments or no
1653 bool BamToolsIndex::BamToolsIndexPrivate::HasAlignments(const int& refId) const {
1654 if ( m_indexData.find(refId) == m_indexData.end()) return false;
1655 const BamToolsReferenceEntry& refEntry = m_indexData.at(refId);
1656 return refEntry.HasAlignments;
1659 // return true if all index data is cached
1660 bool BamToolsIndex::BamToolsIndexPrivate::HasFullDataCache(void) const {
1661 return m_hasFullDataCache;
1664 // returns true if index cache has data for desired reference
1665 bool BamToolsIndex::BamToolsIndexPrivate::IsDataLoaded(const int& refId) const {
1667 if ( m_indexData.find(refId) == m_indexData.end() ) return false; // unknown reference
1668 const BamToolsReferenceEntry& refEntry = m_indexData.at(refId);
1669 if ( !refEntry.HasAlignments ) return true; // no data period
1671 // return whether offsets list contains data
1672 return !refEntry.Offsets.empty();
1675 // attempts to use index to jump to region; returns success/fail
1676 bool BamToolsIndex::BamToolsIndexPrivate::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) {
1679 *hasAlignmentsInRegion = false;
1681 // localize parent data
1682 if ( m_parent == 0 ) return false;
1683 BamReader* reader = m_parent->m_reader;
1684 BgzfData* mBGZF = m_parent->m_BGZF;
1685 RefVector& references = m_parent->m_references;
1687 // check valid BamReader state
1688 if ( reader == 0 || mBGZF == 0 || !reader->IsOpen() ) {
1689 fprintf(stderr, "ERROR: Could not jump: invalid BamReader state.\n");
1693 // make sure left-bound position is valid
1694 if ( region.LeftPosition > references.at(region.LeftRefID).RefLength )
1697 // calculate nearest offset to jump to
1699 if ( !GetOffset(region, offset, hasAlignmentsInRegion) ) {
1700 fprintf(stderr, "ERROR: Could not jump - unable to calculate offset for specified region.\n");
1704 // return success/failure of seek
1705 return mBGZF->Seek(offset);
1708 // clears index data from all references except the first
1709 void BamToolsIndex::BamToolsIndexPrivate::KeepOnlyFirstReferenceOffsets(void) {
1710 BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
1711 KeepOnlyReferenceOffsets( (*indexBegin).first );
1714 // clears index data from all references except the one specified
1715 void BamToolsIndex::BamToolsIndexPrivate::KeepOnlyReferenceOffsets(const int& refId) {
1716 BamToolsIndexData::iterator mapIter = m_indexData.begin();
1717 BamToolsIndexData::iterator mapEnd = m_indexData.end();
1718 for ( ; mapIter != mapEnd; ++mapIter ) {
1719 const int entryRefId = (*mapIter).first;
1720 if ( entryRefId != refId )
1721 ClearReferenceOffsets(entryRefId);
1725 // load index data for all references, return true if loaded OK
1726 bool BamToolsIndex::BamToolsIndexPrivate::LoadAllReferences(bool saveData) {
1728 // skip if data already loaded
1729 if ( m_hasFullDataCache ) return true;
1731 // read in number of references
1732 int32_t numReferences;
1733 if ( !LoadReferenceCount(numReferences) ) return false;
1734 //SetReferenceCount(numReferences);
1736 // iterate over reference entries
1737 bool loadedOk = true;
1738 for ( int i = 0; i < numReferences; ++i )
1739 loadedOk &= LoadReference(i, saveData);
1742 if ( loadedOk && saveData )
1743 m_hasFullDataCache = true;
1745 // return success/failure of load
1749 // load header data from index file, return true if loaded OK
1750 bool BamToolsIndex::BamToolsIndexPrivate::LoadHeader(void) {
1752 // check magic number
1753 if ( !CheckMagicNumber() ) return false;
1755 // check BTI version
1756 if ( !CheckVersion() ) return false;
1758 // read in block size
1759 size_t elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, m_parent->m_indexStream);
1760 if ( elementsRead != 1 ) return false;
1761 if ( m_isBigEndian ) SwapEndian_32(m_blockSize);
1763 // store offset of beginning of data
1764 m_dataBeginOffset = ftell64(m_parent->m_indexStream);
1766 // return success/failure of load
1767 return (elementsRead == 1);
1770 // load a single index entry from file, return true if loaded OK
1771 // @saveData - save data in memory if true, just read & discard if false
1772 bool BamToolsIndex::BamToolsIndexPrivate::LoadIndexEntry(const int& refId, bool saveData) {
1774 // read in index entry data members
1775 size_t elementsRead = 0;
1776 BamToolsIndexEntry entry;
1777 elementsRead += fread(&entry.MaxEndPosition, sizeof(entry.MaxEndPosition), 1, m_parent->m_indexStream);
1778 elementsRead += fread(&entry.StartOffset, sizeof(entry.StartOffset), 1, m_parent->m_indexStream);
1779 elementsRead += fread(&entry.StartPosition, sizeof(entry.StartPosition), 1, m_parent->m_indexStream);
1780 if ( elementsRead != 3 ) {
1781 cerr << "Error reading index entry. Expected 3 elements, read in: " << elementsRead << endl;
1785 // swap endian-ness if necessary
1786 if ( m_isBigEndian ) {
1787 SwapEndian_32(entry.MaxEndPosition);
1788 SwapEndian_64(entry.StartOffset);
1789 SwapEndian_32(entry.StartPosition);
1794 SaveOffsetEntry(refId, entry);
1796 // return success/failure of load
1800 // load a single reference from file, return true if loaded OK
1801 // @saveData - save data in memory if true, just read & discard if false
1802 bool BamToolsIndex::BamToolsIndexPrivate::LoadFirstReference(bool saveData) {
1803 BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
1804 return LoadReference( (*indexBegin).first, saveData );
1807 // load a single reference from file, return true if loaded OK
1808 // @saveData - save data in memory if true, just read & discard if false
1809 bool BamToolsIndex::BamToolsIndexPrivate::LoadReference(const int& refId, bool saveData) {
1811 // read in number of offsets for this reference
1812 uint32_t numOffsets;
1813 size_t elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, m_parent->m_indexStream);
1814 if ( elementsRead != 1 ) return false;
1815 if ( m_isBigEndian ) SwapEndian_32(numOffsets);
1817 // initialize offsets container for this reference
1818 SetOffsetCount(refId, (int)numOffsets);
1820 // iterate over offset entries
1821 for ( unsigned int j = 0; j < numOffsets; ++j )
1822 LoadIndexEntry(refId, saveData);
1824 // return success/failure of load
1828 // loads number of references, return true if loaded OK
1829 bool BamToolsIndex::BamToolsIndexPrivate::LoadReferenceCount(int& numReferences) {
1831 size_t elementsRead = 0;
1833 // read reference count
1834 elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_parent->m_indexStream);
1835 if ( m_isBigEndian ) SwapEndian_32(numReferences);
1837 // return success/failure of load
1838 return ( elementsRead == 1 );
1841 // saves an index offset entry in memory
1842 void BamToolsIndex::BamToolsIndexPrivate::SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry) {
1843 BamToolsReferenceEntry& refEntry = m_indexData[refId];
1844 refEntry.HasAlignments = true;
1845 refEntry.Offsets.push_back(entry);
1848 // pre-allocates size for offset vector
1849 void BamToolsIndex::BamToolsIndexPrivate::SetOffsetCount(const int& refId, const int& offsetCount) {
1850 BamToolsReferenceEntry& refEntry = m_indexData[refId];
1851 refEntry.Offsets.reserve(offsetCount);
1852 refEntry.HasAlignments = ( offsetCount > 0);
1855 // initializes index data structure to hold @count references
1856 void BamToolsIndex::BamToolsIndexPrivate::SetReferenceCount(const int& count) {
1857 for ( int i = 0; i < count; ++i )
1858 m_indexData[i].HasAlignments = false;
1861 // position file pointer to first reference begin, return true if skipped OK
1862 bool BamToolsIndex::BamToolsIndexPrivate::SkipToFirstReference(void) {
1863 BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
1864 return SkipToReference( (*indexBegin).first );
1867 // position file pointer to desired reference begin, return true if skipped OK
1868 bool BamToolsIndex::BamToolsIndexPrivate::SkipToReference(const int& refId) {
1871 if ( !m_parent->Rewind() ) return false;
1873 // read in number of references
1874 int32_t numReferences;
1875 size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_parent->m_indexStream);
1876 if ( elementsRead != 1 ) return false;
1877 if ( m_isBigEndian ) SwapEndian_32(numReferences);
1879 // iterate over reference entries
1880 bool skippedOk = true;
1881 int currentRefId = 0;
1882 while (currentRefId != refId) {
1883 skippedOk &= LoadReference(currentRefId, false);
1887 // return success/failure of skip
1891 // write header to new index file
1892 bool BamToolsIndex::BamToolsIndexPrivate::WriteHeader(void) {
1894 size_t elementsWritten = 0;
1896 // write BTI index format 'magic number'
1897 elementsWritten += fwrite("BTI\1", 1, 4, m_parent->m_indexStream);
1899 // write BTI index format version
1900 int32_t currentVersion = (int32_t)m_outputVersion;
1901 if ( m_isBigEndian ) SwapEndian_32(currentVersion);
1902 elementsWritten += fwrite(¤tVersion, sizeof(currentVersion), 1, m_parent->m_indexStream);
1905 int32_t blockSize = m_blockSize;
1906 if ( m_isBigEndian ) SwapEndian_32(blockSize);
1907 elementsWritten += fwrite(&blockSize, sizeof(blockSize), 1, m_parent->m_indexStream);
1909 // store offset of beginning of data
1910 m_dataBeginOffset = ftell64(m_parent->m_indexStream);
1912 // return success/failure of write
1913 return ( elementsWritten == 6 );
1916 // write index data for all references to new index file
1917 bool BamToolsIndex::BamToolsIndexPrivate::WriteAllReferences(void) {
1919 size_t elementsWritten = 0;
1921 // write number of references
1922 int32_t numReferences = (int32_t)m_indexData.size();
1923 if ( m_isBigEndian ) SwapEndian_32(numReferences);
1924 elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, m_parent->m_indexStream);
1926 // iterate through references in index
1928 BamToolsIndexData::const_iterator refIter = m_indexData.begin();
1929 BamToolsIndexData::const_iterator refEnd = m_indexData.end();
1930 for ( ; refIter != refEnd; ++refIter )
1931 refOk &= WriteReferenceEntry( (*refIter).second );
1933 return ( (elementsWritten == 1) && refOk );
1936 // write current reference index data to new index file
1937 bool BamToolsIndex::BamToolsIndexPrivate::WriteReferenceEntry(const BamToolsReferenceEntry& refEntry) {
1939 size_t elementsWritten = 0;
1941 // write number of offsets listed for this reference
1942 uint32_t numOffsets = refEntry.Offsets.size();
1943 if ( m_isBigEndian ) SwapEndian_32(numOffsets);
1944 elementsWritten += fwrite(&numOffsets, sizeof(numOffsets), 1, m_parent->m_indexStream);
1946 // iterate over offset entries
1947 bool entriesOk = true;
1948 vector<BamToolsIndexEntry>::const_iterator offsetIter = refEntry.Offsets.begin();
1949 vector<BamToolsIndexEntry>::const_iterator offsetEnd = refEntry.Offsets.end();
1950 for ( ; offsetIter != offsetEnd; ++offsetIter )
1951 entriesOk &= WriteIndexEntry( (*offsetIter) );
1953 return ( (elementsWritten == 1) && entriesOk );
1956 // write current index offset entry to new index file
1957 bool BamToolsIndex::BamToolsIndexPrivate::WriteIndexEntry(const BamToolsIndexEntry& entry) {
1960 int32_t maxEndPosition = entry.MaxEndPosition;
1961 int64_t startOffset = entry.StartOffset;
1962 int32_t startPosition = entry.StartPosition;
1964 // swap endian-ness if necessary
1965 if ( m_isBigEndian ) {
1966 SwapEndian_32(maxEndPosition);
1967 SwapEndian_64(startOffset);
1968 SwapEndian_32(startPosition);
1971 // write the reference index entry
1972 size_t elementsWritten = 0;
1973 elementsWritten += fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, m_parent->m_indexStream);
1974 elementsWritten += fwrite(&startOffset, sizeof(startOffset), 1, m_parent->m_indexStream);
1975 elementsWritten += fwrite(&startPosition, sizeof(startPosition), 1, m_parent->m_indexStream);
1976 return ( elementsWritten == 3 );
1979 // ---------------------------------------------------
1980 // BamToolsIndex implementation
1982 BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader)
1983 : BamIndex(bgzf, reader)
1985 d = new BamToolsIndexPrivate(this);
1988 BamToolsIndex::~BamToolsIndex(void) {
1993 // BamIndex interface implementation
1994 bool BamToolsIndex::Build(void) { return d->Build(); }
1995 void BamToolsIndex::ClearAllData(void) { d->ClearAllData(); }
1996 const off_t BamToolsIndex::DataBeginOffset(void) const { return d->DataBeginOffset(); }
1997 bool BamToolsIndex::HasAlignments(const int& referenceID) const { return d->HasAlignments(referenceID); }
1998 bool BamToolsIndex::HasFullDataCache(void) const { return d->HasFullDataCache(); }
1999 bool BamToolsIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { return d->Jump(region, hasAlignmentsInRegion); }
2000 void BamToolsIndex::KeepOnlyFirstReferenceOffsets(void) { d->KeepOnlyFirstReferenceOffsets(); }
2001 bool BamToolsIndex::LoadAllReferences(bool saveData) { return d->LoadAllReferences(saveData); }
2002 bool BamToolsIndex::LoadFirstReference(bool saveData) { return d->LoadFirstReference(saveData); }
2003 bool BamToolsIndex::LoadHeader(void) { return d->LoadHeader(); }
2004 bool BamToolsIndex::SkipToFirstReference(void) { return d->SkipToFirstReference(); }
2005 bool BamToolsIndex::WriteAllReferences(void) { return d->WriteAllReferences(); }
2006 bool BamToolsIndex::WriteHeader(void) { return d->WriteHeader(); }