1 // ***************************************************************************
2 // BamIndex.cpp (c) 2009 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 21 October 2010 (DB)
7 // ---------------------------------------------------------------------------
8 // Provides index functionality - both for the default (standardized) BAM
9 // index format (.bai) as well as a BamTools-specific (nonstandard) index
11 // ***************************************************************************
19 #include "BamReader.h"
22 using namespace BamTools;
24 // -------------------------------
25 // BamIndex implementation
28 BamIndex::BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader)
31 , m_cacheMode(BamIndex::LimitedIndexCaching)
34 if ( m_reader && m_reader->IsOpen() )
35 m_references = m_reader->GetReferenceData();
39 BamIndex::~BamIndex(void) {
41 fclose(m_indexStream);
44 // return true if FILE* is open
45 bool BamIndex::IsOpen(void) const {
46 return ( m_indexStream != 0 );
49 // loads existing data from file into memory
50 bool BamIndex::Load(const string& filename) {
52 // open index file, abort on error
53 if ( !OpenIndexFile(filename, "rb") ) {
54 fprintf(stderr, "ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str());
59 if ( !LoadHeader() ) {
60 fclose(m_indexStream);
64 // load reference data (but only keep in memory if full caching requested)
65 bool saveInitialLoad = ( m_cacheMode == BamIndex::FullIndexCaching );
66 if ( !LoadAllReferences(saveInitialLoad) ) {
67 fclose(m_indexStream);
71 // update index cache based on selected mode
78 // opens index file for reading/writing, return true if opened OK
79 bool BamIndex::OpenIndexFile(const string& filename, const string& mode) {
80 m_indexStream = fopen(filename.c_str(), mode.c_str());
81 return ( m_indexStream != 0 );
84 // rewind index file to beginning of index data, return true if rewound OK
85 bool BamIndex::Rewind(void) {
86 return ( fseek64(m_indexStream, DataBeginOffset(), SEEK_SET) == 0 );
89 // change the index caching behavior
90 void BamIndex::SetCacheMode(const BamIndexCacheMode mode) {
91 if ( mode != m_cacheMode ) {
97 // updates in-memory cache of index data, depending on current cache mode
98 void BamIndex::UpdateCache(void) {
100 // skip if file not open
101 if ( !IsOpen() ) return;
103 // reflect requested cache mode behavior
104 switch ( m_cacheMode ) {
106 case (BamIndex::FullIndexCaching) :
108 LoadAllReferences(true);
111 case (BamIndex::LimitedIndexCaching) :
112 if ( HasFullDataCache() )
113 KeepOnlyFirstReferenceOffsets();
116 SkipToFirstReference();
117 LoadFirstReference(true);
120 case(BamIndex::NoIndexCaching) :
129 // writes in-memory index data out to file
130 bool BamIndex::Write(const string& bamFilename) {
132 // open index file for writing
133 string indexFilename = bamFilename + Extension();
134 if ( !OpenIndexFile(indexFilename, "wb") ) {
135 fprintf(stderr, "ERROR: Could not open file to save index.\n");
139 // write index header data
140 if ( !WriteHeader() ) {
141 fprintf(stderr, "ERROR: There was a problem writing index metadata to new index file.\n");
142 fflush(m_indexStream);
143 fclose(m_indexStream);
147 // write main index data
148 if ( !WriteAllReferences() ) {
149 fprintf(stderr, "ERROR: There was a problem writing index data to new index file.\n");
150 fflush(m_indexStream);
151 fclose(m_indexStream);
155 // flush any remaining output, rewind file, and return success
156 fflush(m_indexStream);
157 fclose(m_indexStream);
159 // re-open index file for later reading
160 if ( !OpenIndexFile(indexFilename, "rb") ) {
161 fprintf(stderr, "ERROR: Could not open newly created index file for reading.\n");
165 // return success/failure of write
169 // #########################################################################################
170 // #########################################################################################
172 // -------------------------------
173 // BamStandardIndex structs & typedefs
177 // BAM index constants
178 const int MAX_BIN = 37450; // =(8^6-1)/7+1
179 const int BAM_LIDX_SHIFT = 14;
181 // --------------------------------------------------
182 // BamStandardIndex data structures & typedefs
190 Chunk(const uint64_t& start = 0,
191 const uint64_t& stop = 0)
197 bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) {
198 return lhs.Start < rhs.Start;
201 typedef vector<Chunk> ChunkVector;
202 typedef map<uint32_t, ChunkVector> BamBinMap;
203 typedef vector<uint64_t> LinearOffsetVector;
205 struct ReferenceIndex {
209 LinearOffsetVector Offsets;
213 ReferenceIndex(const BamBinMap& binMap = BamBinMap(),
214 const LinearOffsetVector& offsets = LinearOffsetVector(),
215 const bool hasAlignments = false)
218 , HasAlignments(hasAlignments)
222 typedef map<int32_t, ReferenceIndex> BamStandardIndexData;
224 } // namespace BamTools
226 // -------------------------------
227 // BamStandardIndexPrivate implementation
229 struct BamStandardIndex::BamStandardIndexPrivate {
232 BamStandardIndex* m_parent;
235 BamStandardIndexData m_indexData;
236 off_t m_dataBeginOffset;
237 bool m_hasFullDataCache;
241 BamStandardIndexPrivate(BamStandardIndex* parent);
242 ~BamStandardIndexPrivate(void);
244 // parent interface methods
247 // creates index data (in-memory) from current reader data
249 // clear all current index offset data in memory
250 void ClearAllData(void);
251 // return file position after header metadata
252 const off_t DataBeginOffset(void) const;
253 // returns whether reference has alignments or no
254 bool HasAlignments(const int& referenceID) const;
255 // return true if all index data is cached
256 bool HasFullDataCache(void) const;
257 // attempts to use index to jump to region; returns success/fail
258 bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
259 // clears index data from all references except the first reference
260 void KeepOnlyFirstReferenceOffsets(void);
261 // load index data for all references, return true if loaded OK
262 // @saveData - save data in memory if true, just read & discard if false
263 bool LoadAllReferences(bool saveData = true);
264 // load first reference from file, return true if loaded OK
265 // @saveData - save data in memory if true, just read & discard if false
266 bool LoadFirstReference(bool saveData = true);
267 // load header data from index file, return true if loaded OK
268 bool LoadHeader(void);
269 // position file pointer to first reference begin, return true if skipped OK
270 bool SkipToFirstReference(void);
271 // write header to new index file
272 bool WriteHeader(void);
273 // write index data for all references to new index file
274 bool WriteAllReferences(void);
279 // -----------------------
280 // index file operations
282 // check index file magic number, return true if OK
283 bool CheckMagicNumber(void);
284 // check index file version, return true if OK
285 bool CheckVersion(void);
286 // load a single index bin entry from file, return true if loaded OK
287 // @saveData - save data in memory if true, just read & discard if false
288 bool LoadBin(ReferenceIndex& refEntry, bool saveData = true);
289 bool LoadBins(ReferenceIndex& refEntry, bool saveData = true);
290 // load a single index bin entry from file, return true if loaded OK
291 // @saveData - save data in memory if true, just read & discard if false
292 bool LoadChunk(ChunkVector& chunks, bool saveData = true);
293 bool LoadChunks(ChunkVector& chunks, bool saveData = true);
294 // load a single index linear offset entry from file, return true if loaded OK
295 // @saveData - save data in memory if true, just read & discard if false
296 bool LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData = true);
297 // load a single reference from file, return true if loaded OK
298 // @saveData - save data in memory if true, just read & discard if false
299 bool LoadReference(const int& refId, bool saveData = true);
300 // loads number of references, return true if loaded OK
301 bool LoadReferenceCount(int& numReferences);
302 // position file pointer to desired reference begin, return true if skipped OK
303 bool SkipToReference(const int& refId);
304 // write index data for bin to new index file
305 bool WriteBin(const uint32_t& binId, const ChunkVector& chunks);
306 // write index data for bins to new index file
307 bool WriteBins(const BamBinMap& bins);
308 // write index data for chunk entry to new index file
309 bool WriteChunk(const Chunk& chunk);
310 // write index data for chunk entry to new index file
311 bool WriteChunks(const ChunkVector& chunks);
312 // write index data for linear offsets entry to new index file
313 bool WriteLinearOffsets(const LinearOffsetVector& offsets);
314 // write index data single reference to new index file
315 bool WriteReference(const ReferenceIndex& refEntry);
317 // -----------------------
318 // index data operations
320 // calculate bins that overlap region
321 int BinsFromRegion(const BamRegion& region,
322 const bool isRightBoundSpecified,
323 uint16_t bins[BamTools::MAX_BIN]);
324 // clear all index offset data for desired reference
325 void ClearReferenceOffsets(const int& refId);
326 // calculates offset(s) for a given region
327 bool GetOffsets(const BamRegion& region,
328 const bool isRightBoundSpecified,
329 vector<int64_t>& offsets,
330 bool* hasAlignmentsInRegion);
331 // returns true if index cache has data for desired reference
332 bool IsDataLoaded(const int& refId) const;
333 // clears index data from all references except the one specified
334 void KeepOnlyReferenceOffsets(const int& refId);
335 // simplifies index by merging 'chunks'
336 void MergeChunks(void);
337 // saves BAM bin entry for index
338 void SaveBinEntry(BamBinMap& binMap,
339 const uint32_t& saveBin,
340 const uint64_t& saveOffset,
341 const uint64_t& lastOffset);
342 // saves linear offset entry for index
343 void SaveLinearOffset(LinearOffsetVector& offsets,
344 const BamAlignment& bAlignment,
345 const uint64_t& lastOffset);
346 // initializes index data structure to hold @count references
347 void SetReferenceCount(const int& count);
351 BamStandardIndex::BamStandardIndexPrivate::BamStandardIndexPrivate(BamStandardIndex* parent)
353 , m_dataBeginOffset(0)
354 , m_hasFullDataCache(false)
356 m_isBigEndian = BamTools::SystemIsBigEndian();
359 BamStandardIndex::BamStandardIndexPrivate::~BamStandardIndexPrivate(void) {
363 // calculate bins that overlap region
364 int BamStandardIndex::BamStandardIndexPrivate::BinsFromRegion(const BamRegion& region,
365 const bool isRightBoundSpecified,
366 uint16_t bins[MAX_BIN])
368 // get region boundaries
369 uint32_t begin = (unsigned int)region.LeftPosition;
372 // if right bound specified AND left&right bounds are on same reference
373 // OK to use right bound position
374 if ( isRightBoundSpecified && ( region.LeftRefID == region.RightRefID ) )
375 end = (unsigned int)region.RightPosition;
377 // otherwise, use end of left bound reference as cutoff
379 end = (unsigned int)m_parent->m_references.at(region.LeftRefID).RefLength - 1;
381 // initialize list, bin '0' always a valid bin
385 // get rest of bins that contain this region
387 for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { bins[i++] = k; }
388 for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { bins[i++] = k; }
389 for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { bins[i++] = k; }
390 for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { bins[i++] = k; }
391 for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { bins[i++] = k; }
393 // return number of bins stored
397 // creates index data (in-memory) from current reader data
398 bool BamStandardIndex::BamStandardIndexPrivate::Build(void) {
400 // localize parent data
401 if ( m_parent == 0 ) return false;
402 BamReader* reader = m_parent->m_reader;
403 BgzfData* mBGZF = m_parent->m_BGZF;
405 // be sure reader & BGZF file are valid & open for reading
406 if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
409 // move file pointer to beginning of alignments
412 // get reference count, reserve index space
413 const int numReferences = (int)m_parent->m_references.size();
415 m_hasFullDataCache = false;
416 SetReferenceCount(numReferences);
418 // sets default constant for bin, ID, offset, coordinate variables
419 const uint32_t defaultValue = 0xffffffffu;
422 uint32_t saveBin(defaultValue);
423 uint32_t lastBin(defaultValue);
426 int32_t saveRefID(defaultValue);
427 int32_t lastRefID(defaultValue);
430 uint64_t saveOffset = mBGZF->Tell();
431 uint64_t lastOffset = saveOffset;
434 int32_t lastCoordinate = defaultValue;
436 BamAlignment bAlignment;
437 while ( reader->GetNextAlignmentCore(bAlignment) ) {
439 // change of chromosome, save ID, reset bin
440 if ( lastRefID != bAlignment.RefID ) {
441 lastRefID = bAlignment.RefID;
442 lastBin = defaultValue;
445 // if lastCoordinate greater than BAM position - file not sorted properly
446 else if ( lastCoordinate > bAlignment.Position ) {
447 fprintf(stderr, "BAM file not properly sorted:\n");
448 fprintf(stderr, "Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(),
449 lastCoordinate, bAlignment.Position, bAlignment.RefID);
453 // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions)
454 if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) {
456 // save linear offset entry (matched to BAM entry refID)
457 ReferenceIndex& refIndex = m_indexData.at(bAlignment.RefID);
458 LinearOffsetVector& offsets = refIndex.Offsets;
459 SaveLinearOffset(offsets, bAlignment, lastOffset);
462 // if current BamAlignment bin != lastBin, "then possibly write the binning index"
463 if ( bAlignment.Bin != lastBin ) {
465 // if not first time through
466 if ( saveBin != defaultValue ) {
468 // save Bam bin entry
469 ReferenceIndex& refIndex = m_indexData.at(saveRefID);
470 BamBinMap& binMap = refIndex.Bins;
471 SaveBinEntry(binMap, saveBin, saveOffset, lastOffset);
475 saveOffset = lastOffset;
478 saveBin = bAlignment.Bin;
479 lastBin = bAlignment.Bin;
482 saveRefID = bAlignment.RefID;
484 // if invalid RefID, break out
485 if ( saveRefID < 0 ) break;
488 // make sure that current file pointer is beyond lastOffset
489 if ( mBGZF->Tell() <= (int64_t)lastOffset ) {
490 fprintf(stderr, "Error in BGZF offsets.\n");
495 lastOffset = mBGZF->Tell();
497 // update lastCoordinate
498 lastCoordinate = bAlignment.Position;
501 // save any leftover BAM data (as long as refID is valid)
502 if ( saveRefID >= 0 ) {
503 // save Bam bin entry
504 ReferenceIndex& refIndex = m_indexData.at(saveRefID);
505 BamBinMap& binMap = refIndex.Bins;
506 SaveBinEntry(binMap, saveBin, saveOffset, lastOffset);
509 // simplify index by merging chunks
512 // iterate through references in index
513 // sort offsets in linear offset vector
514 BamStandardIndexData::iterator indexIter = m_indexData.begin();
515 BamStandardIndexData::iterator indexEnd = m_indexData.end();
516 for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) {
518 // get reference index data
519 ReferenceIndex& refIndex = (*indexIter).second;
520 LinearOffsetVector& offsets = refIndex.Offsets;
522 // sort linear offsets
523 sort(offsets.begin(), offsets.end());
526 // rewind file pointer to beginning of alignments, return success/fail
527 return reader->Rewind();
530 // check index file magic number, return true if OK
531 bool BamStandardIndex::BamStandardIndexPrivate::CheckMagicNumber(void) {
533 // read in magic number
535 size_t elementsRead = fread(magic, sizeof(char), 4, m_parent->m_indexStream);
537 // compare to expected value
538 if ( strncmp(magic, "BAI\1", 4) != 0 ) {
539 fprintf(stderr, "Problem with index file - invalid format.\n");
540 fclose(m_parent->m_indexStream);
544 // return success/failure of load
545 return (elementsRead == 4);
548 // clear all current index offset data in memory
549 void BamStandardIndex::BamStandardIndexPrivate::ClearAllData(void) {
550 BamStandardIndexData::const_iterator indexIter = m_indexData.begin();
551 BamStandardIndexData::const_iterator indexEnd = m_indexData.end();
552 for ( ; indexIter != indexEnd; ++indexIter ) {
553 const int& refId = (*indexIter).first;
554 ClearReferenceOffsets(refId);
558 // clear all index offset data for desired reference
559 void BamStandardIndex::BamStandardIndexPrivate::ClearReferenceOffsets(const int& refId) {
561 // look up desired reference, skip if not found
562 if ( m_indexData.find(refId) == m_indexData.end() ) return;
564 // clear reference data
565 ReferenceIndex& refEntry = m_indexData.at(refId);
566 refEntry.Bins.clear();
567 refEntry.Offsets.clear();
570 m_hasFullDataCache = false;
573 // return file position after header metadata
574 const off_t BamStandardIndex::BamStandardIndexPrivate::DataBeginOffset(void) const {
575 return m_dataBeginOffset;
578 // calculates offset(s) for a given region
579 bool BamStandardIndex::BamStandardIndexPrivate::GetOffsets(const BamRegion& region,
580 const bool isRightBoundSpecified,
581 vector<int64_t>& offsets,
582 bool* hasAlignmentsInRegion)
584 // return false if leftBound refID is not found in index data
585 if ( m_indexData.find(region.LeftRefID) == m_indexData.end() )
588 // load index data for region if not already cached
589 if ( !IsDataLoaded(region.LeftRefID) ) {
590 bool loadedOk = true;
591 loadedOk &= SkipToReference(region.LeftRefID);
592 loadedOk &= LoadReference(region.LeftRefID);
593 if ( !loadedOk ) return false;
596 // calculate which bins overlap this region
597 uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
598 int numBins = BinsFromRegion(region, isRightBoundSpecified, bins);
600 // get bins for this reference
601 const ReferenceIndex& refIndex = m_indexData.at(region.LeftRefID);
602 const BamBinMap& binMap = refIndex.Bins;
604 // get minimum offset to consider
605 const LinearOffsetVector& linearOffsets = refIndex.Offsets;
606 const uint64_t minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() )
607 ? 0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT);
609 // store all alignment 'chunk' starts (file offsets) for bins in this region
610 for ( int i = 0; i < numBins; ++i ) {
612 const uint16_t binKey = bins[i];
613 map<uint32_t, ChunkVector>::const_iterator binIter = binMap.find(binKey);
614 if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) {
616 // iterate over chunks
617 const ChunkVector& chunks = (*binIter).second;
618 std::vector<Chunk>::const_iterator chunksIter = chunks.begin();
619 std::vector<Chunk>::const_iterator chunksEnd = chunks.end();
620 for ( ; chunksIter != chunksEnd; ++chunksIter) {
622 // if valid chunk found, store its file offset
623 const Chunk& chunk = (*chunksIter);
624 if ( chunk.Stop > minOffset )
625 offsets.push_back( chunk.Start );
633 // sort the offsets before returning
634 sort(offsets.begin(), offsets.end());
636 // set flag & return success
637 *hasAlignmentsInRegion = (offsets.size() != 0 );
641 // returns whether reference has alignments or no
642 bool BamStandardIndex::BamStandardIndexPrivate::HasAlignments(const int& refId) const {
643 if ( m_indexData.find(refId) == m_indexData.end()) return false;
644 const ReferenceIndex& refEntry = m_indexData.at(refId);
645 return refEntry.HasAlignments;
648 // return true if all index data is cached
649 bool BamStandardIndex::BamStandardIndexPrivate::HasFullDataCache(void) const {
650 return m_hasFullDataCache;
653 // returns true if index cache has data for desired reference
654 bool BamStandardIndex::BamStandardIndexPrivate::IsDataLoaded(const int& refId) const {
655 if ( m_indexData.find(refId) == m_indexData.end() ) return false; // unknown reference
656 const ReferenceIndex& refEntry = m_indexData.at(refId);
657 if ( !refEntry.HasAlignments ) return true; // no data period
658 // return whether bin map contains data
659 return ( !refEntry.Bins.empty() );
662 // attempts to use index to jump to region; returns success/fail
663 bool BamStandardIndex::BamStandardIndexPrivate::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) {
665 // localize parent data
666 if ( m_parent == 0 ) return false;
667 BamReader* reader = m_parent->m_reader;
668 BgzfData* mBGZF = m_parent->m_BGZF;
669 RefVector& references = m_parent->m_references;
671 // be sure reader & BGZF file are valid & open for reading
672 if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
675 // make sure left-bound position is valid
676 if ( region.LeftPosition > references.at(region.LeftRefID).RefLength )
679 // calculate offsets for this region
680 // if failed, print message, set flag, and return failure
681 vector<int64_t> offsets;
682 if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets, hasAlignmentsInRegion) ) {
683 fprintf(stderr, "ERROR: Could not jump: unable to calculate offset(s) for specified region.\n");
684 *hasAlignmentsInRegion = false;
688 // iterate through offsets
689 BamAlignment bAlignment;
691 for ( vector<int64_t>::const_iterator o = offsets.begin(); o != offsets.end(); ++o) {
693 // attempt seek & load first available alignment
694 // set flag to true if data exists
695 result &= mBGZF->Seek(*o);
696 *hasAlignmentsInRegion = reader->GetNextAlignmentCore(bAlignment);
698 // if this alignment corresponds to desired position
699 // return success of seeking back to the offset before the 'current offset' (to cover overlaps)
700 if ( ((bAlignment.RefID == region.LeftRefID) &&
701 ((bAlignment.Position + bAlignment.Length) > region.LeftPosition)) ||
702 (bAlignment.RefID > region.LeftRefID) )
704 if ( o != offsets.begin() ) --o;
705 return mBGZF->Seek(*o);
709 // if error in jumping, print message & set flag
711 fprintf(stderr, "ERROR: Could not jump: unable to determine correct offset for specified region.\n");
712 *hasAlignmentsInRegion = false;
715 // return success/failure
719 // clears index data from all references except the first
720 void BamStandardIndex::BamStandardIndexPrivate::KeepOnlyFirstReferenceOffsets(void) {
721 BamStandardIndexData::const_iterator indexBegin = m_indexData.begin();
722 KeepOnlyReferenceOffsets((*indexBegin).first);
725 // clears index data from all references except the one specified
726 void BamStandardIndex::BamStandardIndexPrivate::KeepOnlyReferenceOffsets(const int& refId) {
727 BamStandardIndexData::iterator mapIter = m_indexData.begin();
728 BamStandardIndexData::iterator mapEnd = m_indexData.end();
729 for ( ; mapIter != mapEnd; ++mapIter ) {
730 const int entryRefId = (*mapIter).first;
731 if ( entryRefId != refId )
732 ClearReferenceOffsets(entryRefId);
736 bool BamStandardIndex::BamStandardIndexPrivate::LoadAllReferences(bool saveData) {
738 // skip if data already loaded
739 if ( m_hasFullDataCache ) return true;
741 // get number of reference sequences
742 uint32_t numReferences;
743 if ( !LoadReferenceCount((int&)numReferences) )
746 // iterate over reference entries
747 bool loadedOk = true;
748 for ( int i = 0; i < (int)numReferences; ++i )
749 loadedOk &= LoadReference(i, saveData);
752 if ( loadedOk && saveData )
753 m_hasFullDataCache = true;
755 // return success/failure of loading references
759 // load header data from index file, return true if loaded OK
760 bool BamStandardIndex::BamStandardIndexPrivate::LoadHeader(void) {
762 bool loadedOk = CheckMagicNumber();
764 // store offset of beginning of data
765 m_dataBeginOffset = ftell64(m_parent->m_indexStream);
767 // return success/failure of load
771 // load a single index bin entry from file, return true if loaded OK
772 // @saveData - save data in memory if true, just read & discard if false
773 bool BamStandardIndex::BamStandardIndexPrivate::LoadBin(ReferenceIndex& refEntry, bool saveData) {
775 size_t elementsRead = 0;
779 elementsRead += fread(&binId, sizeof(binId), 1, m_parent->m_indexStream);
780 if ( m_isBigEndian ) SwapEndian_32(binId);
782 // load alignment chunks for this bin
784 bool chunksOk = LoadChunks(chunks, saveData);
787 if ( chunksOk && saveData )
788 refEntry.Bins.insert(pair<uint32_t, ChunkVector>(binId, chunks));
790 // return success/failure of load
791 return ( (elementsRead == 1) && chunksOk );
794 bool BamStandardIndex::BamStandardIndexPrivate::LoadBins(ReferenceIndex& refEntry, bool saveData) {
796 size_t elementsRead = 0;
798 // get number of bins
800 elementsRead += fread(&numBins, sizeof(numBins), 1, m_parent->m_indexStream);
801 if ( m_isBigEndian ) SwapEndian_32(numBins);
804 refEntry.HasAlignments = ( numBins != 0 );
808 for ( int i = 0; i < numBins; ++i )
809 binsOk &= LoadBin(refEntry, saveData);
811 // return success/failure of load
812 return ( (elementsRead == 1) && binsOk );
815 // load a single index bin entry from file, return true if loaded OK
816 // @saveData - save data in memory if true, just read & discard if false
817 bool BamStandardIndex::BamStandardIndexPrivate::LoadChunk(ChunkVector& chunks, bool saveData) {
819 size_t elementsRead = 0;
821 // read in chunk data
824 elementsRead += fread(&start, sizeof(start), 1, m_parent->m_indexStream);
825 elementsRead += fread(&stop, sizeof(stop), 1, m_parent->m_indexStream);
827 // swap endian-ness if necessary
828 if ( m_isBigEndian ) {
829 SwapEndian_64(start);
833 // save data if requested
834 if ( saveData ) chunks.push_back( Chunk(start, stop) );
836 // return success/failure of load
837 return ( elementsRead == 2 );
840 bool BamStandardIndex::BamStandardIndexPrivate::LoadChunks(ChunkVector& chunks, bool saveData) {
842 size_t elementsRead = 0;
844 // read in number of chunks
846 elementsRead += fread(&numChunks, sizeof(numChunks), 1, m_parent->m_indexStream);
847 if ( m_isBigEndian ) SwapEndian_32(numChunks);
849 // initialize space for chunks if we're storing this data
850 if ( saveData ) chunks.reserve(numChunks);
852 // iterate over chunks
853 bool chunksOk = true;
854 for ( int i = 0; i < (int)numChunks; ++i )
855 chunksOk &= LoadChunk(chunks, saveData);
858 sort( chunks.begin(), chunks.end(), ChunkLessThan );
860 // return success/failure of load
861 return ( (elementsRead == 1) && chunksOk );
864 // load a single index linear offset entry from file, return true if loaded OK
865 // @saveData - save data in memory if true, just read & discard if false
866 bool BamStandardIndex::BamStandardIndexPrivate::LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData) {
868 size_t elementsRead = 0;
870 // read in number of linear offsets
871 int32_t numLinearOffsets;
872 elementsRead += fread(&numLinearOffsets, sizeof(numLinearOffsets), 1, m_parent->m_indexStream);
873 if ( m_isBigEndian ) SwapEndian_32(numLinearOffsets);
875 // set up destination vector (if we're saving the data)
876 LinearOffsetVector linearOffsets;
877 if ( saveData ) linearOffsets.reserve(numLinearOffsets);
879 // iterate over linear offsets
880 uint64_t linearOffset;
881 for ( int i = 0; i < numLinearOffsets; ++i ) {
882 elementsRead += fread(&linearOffset, sizeof(linearOffset), 1, m_parent->m_indexStream);
883 if ( m_isBigEndian ) SwapEndian_64(linearOffset);
884 if ( saveData ) linearOffsets.push_back(linearOffset);
887 // sort linear offsets
888 sort ( linearOffsets.begin(), linearOffsets.end() );
890 // save in reference index entry if desired
891 if ( saveData ) refEntry.Offsets = linearOffsets;
893 // return success/failure of load
894 return ( elementsRead == (size_t)(numLinearOffsets + 1) );
897 bool BamStandardIndex::BamStandardIndexPrivate::LoadFirstReference(bool saveData) {
898 BamStandardIndexData::const_iterator indexBegin = m_indexData.begin();
899 return LoadReference((*indexBegin).first, saveData);
902 // load a single reference from file, return true if loaded OK
903 // @saveData - save data in memory if true, just read & discard if false
904 bool BamStandardIndex::BamStandardIndexPrivate::LoadReference(const int& refId, bool saveData) {
906 // if reference not previously loaded, create new entry
907 if ( m_indexData.find(refId) == m_indexData.end() ) {
908 ReferenceIndex newEntry;
909 newEntry.HasAlignments = false;
910 m_indexData.insert( pair<int32_t, ReferenceIndex>(refId, newEntry) );
913 // load reference data
914 ReferenceIndex& entry = m_indexData.at(refId);
915 bool loadedOk = true;
916 loadedOk &= LoadBins(entry, saveData);
917 loadedOk &= LoadLinearOffsets(entry, saveData);
921 // loads number of references, return true if loaded OK
922 bool BamStandardIndex::BamStandardIndexPrivate::LoadReferenceCount(int& numReferences) {
924 size_t elementsRead = 0;
926 // read reference count
927 elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_parent->m_indexStream);
928 if ( m_isBigEndian ) SwapEndian_32(numReferences);
930 // return success/failure of load
931 return ( elementsRead == 1 );
934 // merges 'alignment chunks' in BAM bin (used for index building)
935 void BamStandardIndex::BamStandardIndexPrivate::MergeChunks(void) {
937 // iterate over reference enties
938 BamStandardIndexData::iterator indexIter = m_indexData.begin();
939 BamStandardIndexData::iterator indexEnd = m_indexData.end();
940 for ( ; indexIter != indexEnd; ++indexIter ) {
942 // get BAM bin map for this reference
943 ReferenceIndex& refIndex = (*indexIter).second;
944 BamBinMap& bamBinMap = refIndex.Bins;
946 // iterate over BAM bins
947 BamBinMap::iterator binIter = bamBinMap.begin();
948 BamBinMap::iterator binEnd = bamBinMap.end();
949 for ( ; binIter != binEnd; ++binIter ) {
951 // get chunk vector for this bin
952 ChunkVector& binChunks = (*binIter).second;
953 if ( binChunks.size() == 0 ) continue;
955 ChunkVector mergedChunks;
956 mergedChunks.push_back( binChunks[0] );
958 // iterate over chunks
960 ChunkVector::iterator chunkIter = binChunks.begin();
961 ChunkVector::iterator chunkEnd = binChunks.end();
962 for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) {
964 // get 'currentChunk' based on numeric index
965 Chunk& currentChunk = mergedChunks[i];
967 // get iteratorChunk based on vector iterator
968 Chunk& iteratorChunk = (*chunkIter);
970 // if chunk ends where (iterator) chunk starts, then merge
971 if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 )
972 currentChunk.Stop = iteratorChunk.Stop;
976 // set currentChunk + 1 to iteratorChunk
977 mergedChunks.push_back(iteratorChunk);
982 // saved merged chunk vector
983 (*binIter).second = mergedChunks;
988 // saves BAM bin entry for index
989 void BamStandardIndex::BamStandardIndexPrivate::SaveBinEntry(BamBinMap& binMap,
990 const uint32_t& saveBin,
991 const uint64_t& saveOffset,
992 const uint64_t& lastOffset)
995 BamBinMap::iterator binIter = binMap.find(saveBin);
998 Chunk newChunk(saveOffset, lastOffset);
1000 // if entry doesn't exist
1001 if ( binIter == binMap.end() ) {
1002 ChunkVector newChunks;
1003 newChunks.push_back(newChunk);
1004 binMap.insert( pair<uint32_t, ChunkVector>(saveBin, newChunks));
1009 ChunkVector& binChunks = (*binIter).second;
1010 binChunks.push_back( newChunk );
1014 // saves linear offset entry for index
1015 void BamStandardIndex::BamStandardIndexPrivate::SaveLinearOffset(LinearOffsetVector& offsets,
1016 const BamAlignment& bAlignment,
1017 const uint64_t& lastOffset)
1019 // get converted offsets
1020 int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT;
1021 int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT;
1023 // resize vector if necessary
1024 int oldSize = offsets.size();
1025 int newSize = endOffset + 1;
1026 if ( oldSize < newSize )
1027 offsets.resize(newSize, 0);
1030 for( int i = beginOffset + 1; i <= endOffset; ++i ) {
1031 if ( offsets[i] == 0 )
1032 offsets[i] = lastOffset;
1036 // initializes index data structure to hold @count references
1037 void BamStandardIndex::BamStandardIndexPrivate::SetReferenceCount(const int& count) {
1038 for ( int i = 0; i < count; ++i )
1039 m_indexData[i].HasAlignments = false;
1042 bool BamStandardIndex::BamStandardIndexPrivate::SkipToFirstReference(void) {
1043 BamStandardIndexData::const_iterator indexBegin = m_indexData.begin();
1044 return SkipToReference( (*indexBegin).first );
1047 // position file pointer to desired reference begin, return true if skipped OK
1048 bool BamStandardIndex::BamStandardIndexPrivate::SkipToReference(const int& refId) {
1051 if ( !m_parent->Rewind() ) return false;
1053 // read in number of references
1054 uint32_t numReferences;
1055 size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_parent->m_indexStream);
1056 if ( elementsRead != 1 ) return false;
1057 if ( m_isBigEndian ) SwapEndian_32(numReferences);
1059 // iterate over reference entries
1060 bool skippedOk = true;
1061 int currentRefId = 0;
1062 while (currentRefId != refId) {
1063 skippedOk &= LoadReference(currentRefId, false);
1071 // write header to new index file
1072 bool BamStandardIndex::BamStandardIndexPrivate::WriteHeader(void) {
1074 size_t elementsWritten = 0;
1076 // write magic number
1077 elementsWritten += fwrite("BAI\1", sizeof(char), 4, m_parent->m_indexStream);
1079 // store offset of beginning of data
1080 m_dataBeginOffset = ftell64(m_parent->m_indexStream);
1082 // return success/failure of write
1083 return (elementsWritten == 4);
1086 // write index data for all references to new index file
1087 bool BamStandardIndex::BamStandardIndexPrivate::WriteAllReferences(void) {
1089 size_t elementsWritten = 0;
1091 // write number of reference sequences
1092 int32_t numReferenceSeqs = m_indexData.size();
1093 if ( m_isBigEndian ) SwapEndian_32(numReferenceSeqs);
1094 elementsWritten += fwrite(&numReferenceSeqs, sizeof(numReferenceSeqs), 1, m_parent->m_indexStream);
1096 // iterate over reference sequences
1098 BamStandardIndexData::const_iterator indexIter = m_indexData.begin();
1099 BamStandardIndexData::const_iterator indexEnd = m_indexData.end();
1100 for ( ; indexIter != indexEnd; ++ indexIter )
1101 refsOk &= WriteReference( (*indexIter).second );
1103 // return success/failure of write
1104 return ( (elementsWritten == 1) && refsOk );
1107 // write index data for bin to new index file
1108 bool BamStandardIndex::BamStandardIndexPrivate::WriteBin(const uint32_t& binId, const ChunkVector& chunks) {
1110 size_t elementsWritten = 0;
1113 uint32_t binKey = binId;
1114 if ( m_isBigEndian ) SwapEndian_32(binKey);
1115 elementsWritten += fwrite(&binKey, sizeof(binKey), 1, m_parent->m_indexStream);
1118 bool chunksOk = WriteChunks(chunks);
1120 // return success/failure of write
1121 return ( (elementsWritten == 1) && chunksOk );
1124 // write index data for bins to new index file
1125 bool BamStandardIndex::BamStandardIndexPrivate::WriteBins(const BamBinMap& bins) {
1127 size_t elementsWritten = 0;
1129 // write number of bins
1130 int32_t binCount = bins.size();
1131 if ( m_isBigEndian ) SwapEndian_32(binCount);
1132 elementsWritten += fwrite(&binCount, sizeof(binCount), 1, m_parent->m_indexStream);
1134 // iterate over bins
1136 BamBinMap::const_iterator binIter = bins.begin();
1137 BamBinMap::const_iterator binEnd = bins.end();
1138 for ( ; binIter != binEnd; ++binIter )
1139 binsOk &= WriteBin( (*binIter).first, (*binIter).second );
1141 // return success/failure of write
1142 return ( (elementsWritten == 1) && binsOk );
1145 // write index data for chunk entry to new index file
1146 bool BamStandardIndex::BamStandardIndexPrivate::WriteChunk(const Chunk& chunk) {
1148 size_t elementsWritten = 0;
1150 // localize alignment chunk offsets
1151 uint64_t start = chunk.Start;
1152 uint64_t stop = chunk.Stop;
1154 // swap endian-ness if necessary
1155 if ( m_isBigEndian ) {
1156 SwapEndian_64(start);
1157 SwapEndian_64(stop);
1160 // write to index file
1161 elementsWritten += fwrite(&start, sizeof(start), 1, m_parent->m_indexStream);
1162 elementsWritten += fwrite(&stop, sizeof(stop), 1, m_parent->m_indexStream);
1164 // return success/failure of write
1165 return ( elementsWritten == 2 );
1168 // write index data for chunk entry to new index file
1169 bool BamStandardIndex::BamStandardIndexPrivate::WriteChunks(const ChunkVector& chunks) {
1171 size_t elementsWritten = 0;
1174 int32_t chunkCount = chunks.size();
1175 if ( m_isBigEndian ) SwapEndian_32(chunkCount);
1176 elementsWritten += fwrite(&chunkCount, sizeof(chunkCount), 1, m_parent->m_indexStream);
1178 // iterate over chunks
1179 bool chunksOk = true;
1180 ChunkVector::const_iterator chunkIter = chunks.begin();
1181 ChunkVector::const_iterator chunkEnd = chunks.end();
1182 for ( ; chunkIter != chunkEnd; ++chunkIter )
1183 chunksOk &= WriteChunk( (*chunkIter) );
1185 // return success/failure of write
1186 return ( (elementsWritten == 1) && chunksOk );
1189 // write index data for linear offsets entry to new index file
1190 bool BamStandardIndex::BamStandardIndexPrivate::WriteLinearOffsets(const LinearOffsetVector& offsets) {
1192 size_t elementsWritten = 0;
1194 // write number of linear offsets
1195 int32_t offsetCount = offsets.size();
1196 if ( m_isBigEndian ) SwapEndian_32(offsetCount);
1197 elementsWritten += fwrite(&offsetCount, sizeof(offsetCount), 1, m_parent->m_indexStream);
1199 // iterate over linear offsets
1200 LinearOffsetVector::const_iterator offsetIter = offsets.begin();
1201 LinearOffsetVector::const_iterator offsetEnd = offsets.end();
1202 for ( ; offsetIter != offsetEnd; ++offsetIter ) {
1204 // write linear offset
1205 uint64_t linearOffset = (*offsetIter);
1206 if ( m_isBigEndian ) SwapEndian_64(linearOffset);
1207 elementsWritten += fwrite(&linearOffset, sizeof(linearOffset), 1, m_parent->m_indexStream);
1210 // return success/failure of write
1211 return ( elementsWritten == (size_t)(offsetCount + 1) );
1214 // write index data for a single reference to new index file
1215 bool BamStandardIndex::BamStandardIndexPrivate::WriteReference(const ReferenceIndex& refEntry) {
1217 refOk &= WriteBins(refEntry.Bins);
1218 refOk &= WriteLinearOffsets(refEntry.Offsets);
1222 // ---------------------------------------------------------------
1223 // BamStandardIndex implementation
1225 BamStandardIndex::BamStandardIndex(BgzfData* bgzf, BamReader* reader)
1226 : BamIndex(bgzf, reader)
1228 d = new BamStandardIndexPrivate(this);
1231 BamStandardIndex::~BamStandardIndex(void) {
1236 // BamIndex interface implementation
1237 bool BamStandardIndex::Build(void) { return d->Build(); }
1238 void BamStandardIndex::ClearAllData(void) { d->ClearAllData(); }
1239 const off_t BamStandardIndex::DataBeginOffset(void) const { return d->DataBeginOffset(); }
1240 bool BamStandardIndex::HasAlignments(const int& referenceID) const { return d->HasAlignments(referenceID); }
1241 bool BamStandardIndex::HasFullDataCache(void) const { return d->HasFullDataCache(); }
1242 bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { return d->Jump(region, hasAlignmentsInRegion); }
1243 void BamStandardIndex::KeepOnlyFirstReferenceOffsets(void) { d->KeepOnlyFirstReferenceOffsets(); }
1244 bool BamStandardIndex::LoadAllReferences(bool saveData) { return d->LoadAllReferences(saveData); }
1245 bool BamStandardIndex::LoadFirstReference(bool saveData) { return d->LoadFirstReference(saveData); }
1246 bool BamStandardIndex::LoadHeader(void) { return d->LoadHeader(); }
1247 bool BamStandardIndex::SkipToFirstReference(void) { return d->SkipToFirstReference(); }
1248 bool BamStandardIndex::WriteAllReferences(void) { return d->WriteAllReferences(); }
1249 bool BamStandardIndex::WriteHeader(void) { return d->WriteHeader(); }
1251 // #########################################################################################
1252 // #########################################################################################
1254 // ---------------------------------------------------
1255 // BamToolsIndex structs & typedefs
1257 namespace BamTools {
1259 // individual index offset entry
1260 struct BamToolsIndexEntry {
1263 int32_t MaxEndPosition;
1264 int64_t StartOffset;
1265 int32_t StartPosition;
1268 BamToolsIndexEntry(const int32_t& maxEndPosition = 0,
1269 const int64_t& startOffset = 0,
1270 const int32_t& startPosition = 0)
1271 : MaxEndPosition(maxEndPosition)
1272 , StartOffset(startOffset)
1273 , StartPosition(startPosition)
1277 // reference index entry
1278 struct BamToolsReferenceEntry {
1282 vector<BamToolsIndexEntry> Offsets;
1285 BamToolsReferenceEntry(void)
1286 : HasAlignments(false)
1290 // the actual index data structure
1291 typedef map<int, BamToolsReferenceEntry> BamToolsIndexData;
1293 } // namespace BamTools
1295 // ---------------------------------------------------
1296 // BamToolsIndexPrivate implementation
1298 struct BamToolsIndex::BamToolsIndexPrivate {
1300 // keep a list of any supported versions here
1301 // (might be useful later to handle any 'legacy' versions if the format changes)
1302 // listed for example like: BTI_1_0 = 1, BTI_1_1 = 2, BTI_1_2 = 3, BTI_2_0 = 4, and so on
1304 // so a change introduced in (hypothetical) BTI_1_2 would be handled from then on by:
1306 // if ( indexVersion >= BTI_1_2 )
1310 enum Version { BTI_1_0 = 1
1316 BamToolsIndex* m_parent;
1319 int32_t m_blockSize;
1320 BamToolsIndexData m_indexData;
1321 off_t m_dataBeginOffset;
1322 bool m_hasFullDataCache;
1324 int32_t m_inputVersion; // Version is serialized as int
1325 Version m_outputVersion;
1328 BamToolsIndexPrivate(BamToolsIndex* parent);
1329 ~BamToolsIndexPrivate(void);
1331 // parent interface methods
1334 // creates index data (in-memory) from current reader data
1336 // clear all current index offset data in memory
1337 void ClearAllData(void);
1338 // return file position after header metadata
1339 const off_t DataBeginOffset(void) const;
1340 // returns whether reference has alignments or no
1341 bool HasAlignments(const int& referenceID) const;
1342 // return true if all index data is cached
1343 bool HasFullDataCache(void) const;
1344 // attempts to use index to jump to region; returns success/fail
1345 bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
1346 // clears index data from all references except the first
1347 void KeepOnlyFirstReferenceOffsets(void);
1348 // load index data for all references, return true if loaded OK
1349 // @saveData - save data in memory if true, just read & discard if false
1350 bool LoadAllReferences(bool saveData = true);
1351 // load first reference from file, return true if loaded OK
1352 // @saveData - save data in memory if true, just read & discard if false
1353 bool LoadFirstReference(bool saveData = true);
1354 // load header data from index file, return true if loaded OK
1355 bool LoadHeader(void);
1356 // position file pointer to desired reference begin, return true if skipped OK
1357 bool SkipToFirstReference(void);
1358 // write header to new index file
1359 bool WriteHeader(void);
1360 // write index data for all references to new index file
1361 bool WriteAllReferences(void);
1366 // -----------------------
1367 // index file operations
1369 // check index file magic number, return true if OK
1370 bool CheckMagicNumber(void);
1371 // check index file version, return true if OK
1372 bool CheckVersion(void);
1373 // return true if FILE* is open
1374 bool IsOpen(void) const;
1375 // load a single index entry from file, return true if loaded OK
1376 // @saveData - save data in memory if true, just read & discard if false
1377 bool LoadIndexEntry(const int& refId, bool saveData = true);
1378 // load a single reference from file, return true if loaded OK
1379 // @saveData - save data in memory if true, just read & discard if false
1380 bool LoadReference(const int& refId, bool saveData = true);
1381 // loads number of references, return true if loaded OK
1382 bool LoadReferenceCount(int& numReferences);
1383 // position file pointer to desired reference begin, return true if skipped OK
1384 bool SkipToReference(const int& refId);
1385 // write current reference index data to new index file
1386 bool WriteReferenceEntry(const BamToolsReferenceEntry& refEntry);
1387 // write current index offset entry to new index file
1388 bool WriteIndexEntry(const BamToolsIndexEntry& entry);
1390 // -----------------------
1391 // index data operations
1393 // clear all index offset data for desired reference
1394 void ClearReferenceOffsets(const int& refId);
1395 // calculate BAM file offset for desired region
1396 // return true if no error (*NOT* equivalent to "has alignments or valid offset")
1397 // check @hasAlignmentsInRegion to determine this status
1398 // @region - target region
1399 // @offset - resulting seek target
1400 // @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status
1401 bool GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion);
1402 // returns true if index cache has data for desired reference
1403 bool IsDataLoaded(const int& refId) const;
1404 // clears index data from all references except the one specified
1405 void KeepOnlyReferenceOffsets(const int& refId);
1406 // saves an index offset entry in memory
1407 void SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry);
1408 // pre-allocates size for offset vector
1409 void SetOffsetCount(const int& refId, const int& offsetCount);
1410 // initializes index data structure to hold @count references
1411 void SetReferenceCount(const int& count);
1415 BamToolsIndex::BamToolsIndexPrivate::BamToolsIndexPrivate(BamToolsIndex* parent)
1418 , m_dataBeginOffset(0)
1419 , m_hasFullDataCache(false)
1421 , m_outputVersion(BTI_1_2) // latest version - used for writing new index files
1423 m_isBigEndian = BamTools::SystemIsBigEndian();
1427 BamToolsIndex::BamToolsIndexPrivate::~BamToolsIndexPrivate(void) {
1431 // creates index data (in-memory) from current reader data
1432 bool BamToolsIndex::BamToolsIndexPrivate::Build(void) {
1434 // localize parent data
1435 if ( m_parent == 0 ) return false;
1436 BamReader* reader = m_parent->m_reader;
1437 BgzfData* mBGZF = m_parent->m_BGZF;
1439 // be sure reader & BGZF file are valid & open for reading
1440 if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
1443 // move file pointer to beginning of alignments
1444 if ( !reader->Rewind() ) return false;
1446 // initialize index data structure with space for all references
1447 const int numReferences = (int)m_parent->m_references.size();
1448 m_indexData.clear();
1449 m_hasFullDataCache = false;
1450 SetReferenceCount(numReferences);
1452 // set up counters and markers
1453 int32_t currentBlockCount = 0;
1454 int64_t currentAlignmentOffset = mBGZF->Tell();
1455 int32_t blockRefId = 0;
1456 int32_t blockMaxEndPosition = 0;
1457 int64_t blockStartOffset = currentAlignmentOffset;
1458 int32_t blockStartPosition = -1;
1460 // plow through alignments, storing index entries
1462 while ( reader->GetNextAlignmentCore(al) ) {
1464 // if block contains data (not the first time through) AND alignment is on a new reference
1465 if ( currentBlockCount > 0 && al.RefID != blockRefId ) {
1467 // store previous data
1468 BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
1469 SaveOffsetEntry(blockRefId, entry);
1471 // intialize new block for current alignment's reference
1472 currentBlockCount = 0;
1473 blockMaxEndPosition = al.GetEndPosition();
1474 blockStartOffset = currentAlignmentOffset;
1477 // if beginning of block, save first alignment's refID & position
1478 if ( currentBlockCount == 0 ) {
1479 blockRefId = al.RefID;
1480 blockStartPosition = al.Position;
1483 // increment block counter
1484 ++currentBlockCount;
1486 // check end position
1487 int32_t alignmentEndPosition = al.GetEndPosition();
1488 if ( alignmentEndPosition > blockMaxEndPosition )
1489 blockMaxEndPosition = alignmentEndPosition;
1491 // if block is full, get offset for next block, reset currentBlockCount
1492 if ( currentBlockCount == m_blockSize ) {
1493 BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
1494 SaveOffsetEntry(blockRefId, entry);
1495 blockStartOffset = mBGZF->Tell();
1496 currentBlockCount = 0;
1499 // not the best name, but for the next iteration, this value will be the offset of the *current* alignment
1500 // necessary because we won't know if this next alignment is on a new reference until we actually read it
1501 currentAlignmentOffset = mBGZF->Tell();
1504 // store final block with data
1505 BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition);
1506 SaveOffsetEntry(blockRefId, entry);
1509 m_hasFullDataCache = true;
1511 // return success/failure of rewind
1512 return reader->Rewind();
1515 // check index file magic number, return true if OK
1516 bool BamToolsIndex::BamToolsIndexPrivate::CheckMagicNumber(void) {
1518 // see if index is valid BAM index
1520 size_t elementsRead = fread(magic, 1, 4, m_parent->m_indexStream);
1521 if ( elementsRead != 4 ) return false;
1522 if ( strncmp(magic, "BTI\1", 4) != 0 ) {
1523 fprintf(stderr, "Problem with index file - invalid format.\n");
1531 // check index file version, return true if OK
1532 bool BamToolsIndex::BamToolsIndexPrivate::CheckVersion(void) {
1534 // read version from file
1535 size_t elementsRead = fread(&m_inputVersion, sizeof(m_inputVersion), 1, m_parent->m_indexStream);
1536 if ( elementsRead != 1 ) return false;
1537 if ( m_isBigEndian ) SwapEndian_32(m_inputVersion);
1539 // if version is negative, or zero
1540 if ( m_inputVersion <= 0 ) {
1541 fprintf(stderr, "Problem with index file - invalid version.\n");
1545 // if version is newer than can be supported by this version of bamtools
1546 else if ( m_inputVersion > m_outputVersion ) {
1547 fprintf(stderr, "Problem with index file - attempting to use an outdated version of BamTools with a newer index file.\n");
1548 fprintf(stderr, "Please update BamTools to a more recent version to support this index file.\n");
1552 // ------------------------------------------------------------------
1553 // check for deprecated, unsupported versions
1554 // (typically whose format did not accomodate a particular bug fix)
1556 else if ( (Version)m_inputVersion == BTI_1_0 ) {
1557 fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to accessing data near reference ends.\n");
1558 fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n");
1562 else if ( (Version)m_inputVersion == BTI_1_1 ) {
1563 fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to handling empty references.\n");
1564 fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n");
1572 // clear all current index offset data in memory
1573 void BamToolsIndex::BamToolsIndexPrivate::ClearAllData(void) {
1574 BamToolsIndexData::const_iterator indexIter = m_indexData.begin();
1575 BamToolsIndexData::const_iterator indexEnd = m_indexData.end();
1576 for ( ; indexIter != indexEnd; ++indexIter ) {
1577 const int& refId = (*indexIter).first;
1578 ClearReferenceOffsets(refId);
1582 // clear all index offset data for desired reference
1583 void BamToolsIndex::BamToolsIndexPrivate::ClearReferenceOffsets(const int& refId) {
1584 if ( m_indexData.find(refId) == m_indexData.end() ) return;
1585 vector<BamToolsIndexEntry>& offsets = m_indexData.at(refId).Offsets;
1587 m_hasFullDataCache = false;
1590 // return file position after header metadata
1591 const off_t BamToolsIndex::BamToolsIndexPrivate::DataBeginOffset(void) const {
1592 return m_dataBeginOffset;
1595 // calculate BAM file offset for desired region
1596 // return true if no error (*NOT* equivalent to "has alignments or valid offset")
1597 // check @hasAlignmentsInRegion to determine this status
1598 // @region - target region
1599 // @offset - resulting seek target
1600 // @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status
1601 // N.B. - ignores isRightBoundSpecified
1602 bool BamToolsIndex::BamToolsIndexPrivate::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) {
1604 // return false if leftBound refID is not found in index data
1605 if ( m_indexData.find(region.LeftRefID) == m_indexData.end()) return false;
1607 // load index data for region if not already cached
1608 if ( !IsDataLoaded(region.LeftRefID) ) {
1609 bool loadedOk = true;
1610 loadedOk &= SkipToReference(region.LeftRefID);
1611 loadedOk &= LoadReference(region.LeftRefID);
1612 if ( !loadedOk ) return false;
1615 // localize index data for this reference (& sanity check that data actually exists)
1616 const vector<BamToolsIndexEntry>& referenceOffsets = m_indexData.at(region.LeftRefID).Offsets;
1617 if ( referenceOffsets.empty() ) return false;
1619 // -------------------------------------------------------
1620 // calculate nearest index to jump to
1622 // save first offset
1623 offset = (*referenceOffsets.begin()).StartOffset;
1625 // iterate over offsets entries on this reference
1626 vector<BamToolsIndexEntry>::const_iterator indexIter = referenceOffsets.begin();
1627 vector<BamToolsIndexEntry>::const_iterator indexEnd = referenceOffsets.end();
1628 for ( ; indexIter != indexEnd; ++indexIter ) {
1629 const BamToolsIndexEntry& entry = (*indexIter);
1630 // break if alignment 'entry' overlaps region
1631 if ( entry.MaxEndPosition >= region.LeftPosition ) break;
1632 offset = (*indexIter).StartOffset;
1635 // set flag based on whether an index entry was found for this region
1636 *hasAlignmentsInRegion = ( indexIter != indexEnd );
1640 // returns whether reference has alignments or no
1641 bool BamToolsIndex::BamToolsIndexPrivate::HasAlignments(const int& refId) const {
1642 if ( m_indexData.find(refId) == m_indexData.end()) return false;
1643 const BamToolsReferenceEntry& refEntry = m_indexData.at(refId);
1644 return refEntry.HasAlignments;
1647 // return true if all index data is cached
1648 bool BamToolsIndex::BamToolsIndexPrivate::HasFullDataCache(void) const {
1649 return m_hasFullDataCache;
1652 // returns true if index cache has data for desired reference
1653 bool BamToolsIndex::BamToolsIndexPrivate::IsDataLoaded(const int& refId) const {
1655 if ( m_indexData.find(refId) == m_indexData.end() ) return false; // unknown reference
1656 const BamToolsReferenceEntry& refEntry = m_indexData.at(refId);
1657 if ( !refEntry.HasAlignments ) return true; // no data period
1659 // return whether offsets list contains data
1660 return !refEntry.Offsets.empty();
1663 // attempts to use index to jump to region; returns success/fail
1664 bool BamToolsIndex::BamToolsIndexPrivate::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) {
1667 *hasAlignmentsInRegion = false;
1669 // localize parent data
1670 if ( m_parent == 0 ) return false;
1671 BamReader* reader = m_parent->m_reader;
1672 BgzfData* mBGZF = m_parent->m_BGZF;
1673 RefVector& references = m_parent->m_references;
1675 // check valid BamReader state
1676 if ( reader == 0 || mBGZF == 0 || !reader->IsOpen() ) {
1677 fprintf(stderr, "ERROR: Could not jump: invalid BamReader state.\n");
1681 // make sure left-bound position is valid
1682 if ( region.LeftPosition > references.at(region.LeftRefID).RefLength )
1685 // calculate nearest offset to jump to
1687 if ( !GetOffset(region, offset, hasAlignmentsInRegion) ) {
1688 fprintf(stderr, "ERROR: Could not jump - unable to calculate offset for specified region.\n");
1692 // return success/failure of seek
1693 return mBGZF->Seek(offset);
1696 // clears index data from all references except the first
1697 void BamToolsIndex::BamToolsIndexPrivate::KeepOnlyFirstReferenceOffsets(void) {
1698 BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
1699 KeepOnlyReferenceOffsets( (*indexBegin).first );
1702 // clears index data from all references except the one specified
1703 void BamToolsIndex::BamToolsIndexPrivate::KeepOnlyReferenceOffsets(const int& refId) {
1704 BamToolsIndexData::iterator mapIter = m_indexData.begin();
1705 BamToolsIndexData::iterator mapEnd = m_indexData.end();
1706 for ( ; mapIter != mapEnd; ++mapIter ) {
1707 const int entryRefId = (*mapIter).first;
1708 if ( entryRefId != refId )
1709 ClearReferenceOffsets(entryRefId);
1713 // load index data for all references, return true if loaded OK
1714 bool BamToolsIndex::BamToolsIndexPrivate::LoadAllReferences(bool saveData) {
1716 // skip if data already loaded
1717 if ( m_hasFullDataCache ) return true;
1719 // read in number of references
1720 int32_t numReferences;
1721 if ( !LoadReferenceCount(numReferences) ) return false;
1722 //SetReferenceCount(numReferences);
1724 // iterate over reference entries
1725 bool loadedOk = true;
1726 for ( int i = 0; i < numReferences; ++i )
1727 loadedOk &= LoadReference(i, saveData);
1730 if ( loadedOk && saveData )
1731 m_hasFullDataCache = true;
1733 // return success/failure of load
1737 // load header data from index file, return true if loaded OK
1738 bool BamToolsIndex::BamToolsIndexPrivate::LoadHeader(void) {
1740 // check magic number
1741 if ( !CheckMagicNumber() ) return false;
1743 // check BTI version
1744 if ( !CheckVersion() ) return false;
1746 // read in block size
1747 size_t elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, m_parent->m_indexStream);
1748 if ( elementsRead != 1 ) return false;
1749 if ( m_isBigEndian ) SwapEndian_32(m_blockSize);
1751 // store offset of beginning of data
1752 m_dataBeginOffset = ftell64(m_parent->m_indexStream);
1754 // return success/failure of load
1755 return (elementsRead == 1);
1758 // load a single index entry from file, return true if loaded OK
1759 // @saveData - save data in memory if true, just read & discard if false
1760 bool BamToolsIndex::BamToolsIndexPrivate::LoadIndexEntry(const int& refId, bool saveData) {
1762 // read in index entry data members
1763 size_t elementsRead = 0;
1764 BamToolsIndexEntry entry;
1765 elementsRead += fread(&entry.MaxEndPosition, sizeof(entry.MaxEndPosition), 1, m_parent->m_indexStream);
1766 elementsRead += fread(&entry.StartOffset, sizeof(entry.StartOffset), 1, m_parent->m_indexStream);
1767 elementsRead += fread(&entry.StartPosition, sizeof(entry.StartPosition), 1, m_parent->m_indexStream);
1768 if ( elementsRead != 3 ) {
1769 cerr << "Error reading index entry. Expected 3 elements, read in: " << elementsRead << endl;
1773 // swap endian-ness if necessary
1774 if ( m_isBigEndian ) {
1775 SwapEndian_32(entry.MaxEndPosition);
1776 SwapEndian_64(entry.StartOffset);
1777 SwapEndian_32(entry.StartPosition);
1782 SaveOffsetEntry(refId, entry);
1784 // return success/failure of load
1788 // load a single reference from file, return true if loaded OK
1789 // @saveData - save data in memory if true, just read & discard if false
1790 bool BamToolsIndex::BamToolsIndexPrivate::LoadFirstReference(bool saveData) {
1791 BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
1792 return LoadReference( (*indexBegin).first, saveData );
1795 // load a single reference from file, return true if loaded OK
1796 // @saveData - save data in memory if true, just read & discard if false
1797 bool BamToolsIndex::BamToolsIndexPrivate::LoadReference(const int& refId, bool saveData) {
1799 // read in number of offsets for this reference
1800 uint32_t numOffsets;
1801 size_t elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, m_parent->m_indexStream);
1802 if ( elementsRead != 1 ) return false;
1803 if ( m_isBigEndian ) SwapEndian_32(numOffsets);
1805 // initialize offsets container for this reference
1806 SetOffsetCount(refId, (int)numOffsets);
1808 // iterate over offset entries
1809 for ( unsigned int j = 0; j < numOffsets; ++j )
1810 LoadIndexEntry(refId, saveData);
1812 // return success/failure of load
1816 // loads number of references, return true if loaded OK
1817 bool BamToolsIndex::BamToolsIndexPrivate::LoadReferenceCount(int& numReferences) {
1819 size_t elementsRead = 0;
1821 // read reference count
1822 elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_parent->m_indexStream);
1823 if ( m_isBigEndian ) SwapEndian_32(numReferences);
1825 // return success/failure of load
1826 return ( elementsRead == 1 );
1829 // saves an index offset entry in memory
1830 void BamToolsIndex::BamToolsIndexPrivate::SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry) {
1831 BamToolsReferenceEntry& refEntry = m_indexData[refId];
1832 refEntry.HasAlignments = true;
1833 refEntry.Offsets.push_back(entry);
1836 // pre-allocates size for offset vector
1837 void BamToolsIndex::BamToolsIndexPrivate::SetOffsetCount(const int& refId, const int& offsetCount) {
1838 BamToolsReferenceEntry& refEntry = m_indexData[refId];
1839 refEntry.Offsets.reserve(offsetCount);
1840 refEntry.HasAlignments = ( offsetCount > 0);
1843 // initializes index data structure to hold @count references
1844 void BamToolsIndex::BamToolsIndexPrivate::SetReferenceCount(const int& count) {
1845 for ( int i = 0; i < count; ++i )
1846 m_indexData[i].HasAlignments = false;
1849 // position file pointer to first reference begin, return true if skipped OK
1850 bool BamToolsIndex::BamToolsIndexPrivate::SkipToFirstReference(void) {
1851 BamToolsIndexData::const_iterator indexBegin = m_indexData.begin();
1852 return SkipToReference( (*indexBegin).first );
1855 // position file pointer to desired reference begin, return true if skipped OK
1856 bool BamToolsIndex::BamToolsIndexPrivate::SkipToReference(const int& refId) {
1859 if ( !m_parent->Rewind() ) return false;
1861 // read in number of references
1862 int32_t numReferences;
1863 size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_parent->m_indexStream);
1864 if ( elementsRead != 1 ) return false;
1865 if ( m_isBigEndian ) SwapEndian_32(numReferences);
1867 // iterate over reference entries
1868 bool skippedOk = true;
1869 int currentRefId = 0;
1870 while (currentRefId != refId) {
1871 skippedOk &= LoadReference(currentRefId, false);
1875 // return success/failure of skip
1879 // write header to new index file
1880 bool BamToolsIndex::BamToolsIndexPrivate::WriteHeader(void) {
1882 size_t elementsWritten = 0;
1884 // write BTI index format 'magic number'
1885 elementsWritten += fwrite("BTI\1", 1, 4, m_parent->m_indexStream);
1887 // write BTI index format version
1888 int32_t currentVersion = (int32_t)m_outputVersion;
1889 if ( m_isBigEndian ) SwapEndian_32(currentVersion);
1890 elementsWritten += fwrite(¤tVersion, sizeof(currentVersion), 1, m_parent->m_indexStream);
1893 int32_t blockSize = m_blockSize;
1894 if ( m_isBigEndian ) SwapEndian_32(blockSize);
1895 elementsWritten += fwrite(&blockSize, sizeof(blockSize), 1, m_parent->m_indexStream);
1897 // store offset of beginning of data
1898 m_dataBeginOffset = ftell64(m_parent->m_indexStream);
1900 // return success/failure of write
1901 return ( elementsWritten == 6 );
1904 // write index data for all references to new index file
1905 bool BamToolsIndex::BamToolsIndexPrivate::WriteAllReferences(void) {
1907 size_t elementsWritten = 0;
1909 // write number of references
1910 int32_t numReferences = (int32_t)m_indexData.size();
1911 if ( m_isBigEndian ) SwapEndian_32(numReferences);
1912 elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, m_parent->m_indexStream);
1914 // iterate through references in index
1916 BamToolsIndexData::const_iterator refIter = m_indexData.begin();
1917 BamToolsIndexData::const_iterator refEnd = m_indexData.end();
1918 for ( ; refIter != refEnd; ++refIter )
1919 refOk &= WriteReferenceEntry( (*refIter).second );
1921 return ( (elementsWritten == 1) && refOk );
1924 // write current reference index data to new index file
1925 bool BamToolsIndex::BamToolsIndexPrivate::WriteReferenceEntry(const BamToolsReferenceEntry& refEntry) {
1927 size_t elementsWritten = 0;
1929 // write number of offsets listed for this reference
1930 uint32_t numOffsets = refEntry.Offsets.size();
1931 if ( m_isBigEndian ) SwapEndian_32(numOffsets);
1932 elementsWritten += fwrite(&numOffsets, sizeof(numOffsets), 1, m_parent->m_indexStream);
1934 // iterate over offset entries
1935 bool entriesOk = true;
1936 vector<BamToolsIndexEntry>::const_iterator offsetIter = refEntry.Offsets.begin();
1937 vector<BamToolsIndexEntry>::const_iterator offsetEnd = refEntry.Offsets.end();
1938 for ( ; offsetIter != offsetEnd; ++offsetIter )
1939 entriesOk &= WriteIndexEntry( (*offsetIter) );
1941 return ( (elementsWritten == 1) && entriesOk );
1944 // write current index offset entry to new index file
1945 bool BamToolsIndex::BamToolsIndexPrivate::WriteIndexEntry(const BamToolsIndexEntry& entry) {
1948 int32_t maxEndPosition = entry.MaxEndPosition;
1949 int64_t startOffset = entry.StartOffset;
1950 int32_t startPosition = entry.StartPosition;
1952 // swap endian-ness if necessary
1953 if ( m_isBigEndian ) {
1954 SwapEndian_32(maxEndPosition);
1955 SwapEndian_64(startOffset);
1956 SwapEndian_32(startPosition);
1959 // write the reference index entry
1960 size_t elementsWritten = 0;
1961 elementsWritten += fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, m_parent->m_indexStream);
1962 elementsWritten += fwrite(&startOffset, sizeof(startOffset), 1, m_parent->m_indexStream);
1963 elementsWritten += fwrite(&startPosition, sizeof(startPosition), 1, m_parent->m_indexStream);
1964 return ( elementsWritten == 3 );
1967 // ---------------------------------------------------
1968 // BamToolsIndex implementation
1970 BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader)
1971 : BamIndex(bgzf, reader)
1973 d = new BamToolsIndexPrivate(this);
1976 BamToolsIndex::~BamToolsIndex(void) {
1981 // BamIndex interface implementation
1982 bool BamToolsIndex::Build(void) { return d->Build(); }
1983 void BamToolsIndex::ClearAllData(void) { d->ClearAllData(); }
1984 const off_t BamToolsIndex::DataBeginOffset(void) const { return d->DataBeginOffset(); }
1985 bool BamToolsIndex::HasAlignments(const int& referenceID) const { return d->HasAlignments(referenceID); }
1986 bool BamToolsIndex::HasFullDataCache(void) const { return d->HasFullDataCache(); }
1987 bool BamToolsIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { return d->Jump(region, hasAlignmentsInRegion); }
1988 void BamToolsIndex::KeepOnlyFirstReferenceOffsets(void) { d->KeepOnlyFirstReferenceOffsets(); }
1989 bool BamToolsIndex::LoadAllReferences(bool saveData) { return d->LoadAllReferences(saveData); }
1990 bool BamToolsIndex::LoadFirstReference(bool saveData) { return d->LoadFirstReference(saveData); }
1991 bool BamToolsIndex::LoadHeader(void) { return d->LoadHeader(); }
1992 bool BamToolsIndex::SkipToFirstReference(void) { return d->SkipToFirstReference(); }
1993 bool BamToolsIndex::WriteAllReferences(void) { return d->WriteAllReferences(); }
1994 bool BamToolsIndex::WriteHeader(void) { return d->WriteHeader(); }