// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 18 September 2010 (DB)
+// Last modified: 9 October 2010 (DB)
// ---------------------------------------------------------------------------
// Provides index functionality - both for the default (standardized) BAM
// index format (.bai) as well as a BamTools-specific (nonstandard) index
m_references = m_reader->GetReferenceData();
}
-bool BamIndex::HasAlignments(const int& referenceID) {
-
- // return false if invalid ID
- if ( (referenceID < 0) || (referenceID >= (int)m_references.size()) )
- return false;
-
- // else return status of reference (has alignments?)
- else
- return m_references.at(referenceID).RefHasAlignments;
-}
-
// #########################################################################################
// #########################################################################################
// creates index data (in-memory) from current reader data
bool Build(void);
+ // returns whether reference has alignments or no
+ bool HasAlignments(const int& referenceID) const;
// attempts to use index to jump to region; returns success/fail
bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
- // loads existing data from file into memory
+ // loads existing data from file into memory
bool Load(const std::string& filename);
// writes in-memory index data out to file
// N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
// calculate bins that overlap region
int BinsFromRegion(const BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[BamTools::MAX_BIN]);
// calculates offset(s) for a given region
- bool GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets);
+ bool GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets, bool* hasAlignmentsInRegion);
// saves BAM bin entry for index
void InsertBinEntry(BamBinMap& binMap, const uint32_t& saveBin, const uint64_t& saveOffset, const uint64_t& lastOffset);
// saves linear offset entry for index
return i;
}
+// creates index data (in-memory) from current reader data
bool BamStandardIndex::BamStandardIndexPrivate::Build(void) {
// localize parent data
if ( m_parent == 0 ) return false;
- BamReader* reader = m_parent->m_reader;
- BgzfData* mBGZF = m_parent->m_BGZF;
- RefVector& references = m_parent->m_references;
+ BamReader* reader = m_parent->m_reader;
+ BgzfData* mBGZF = m_parent->m_BGZF;
// be sure reader & BGZF file are valid & open for reading
if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
reader->Rewind();
// get reference count, reserve index space
- int numReferences = (int)references.size();
+ const int numReferences = (int)m_parent->m_references.size();
for ( int i = 0; i < numReferences; ++i )
m_indexData.push_back(ReferenceIndex());
if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) {
// save linear offset entry (matched to BAM entry refID)
- ReferenceIndex& refIndex = m_indexData.at(bAlignment.RefID);
+ ReferenceIndex& refIndex = m_indexData.at(bAlignment.RefID);
LinearOffsetVector& offsets = refIndex.Offsets;
InsertLinearOffset(offsets, bAlignment, lastOffset);
}
MergeChunks();
// iterate through references in index
- // store whether reference has data &
// sort offsets in linear offset vector
BamStandardIndexData::iterator indexIter = m_indexData.begin();
BamStandardIndexData::iterator indexEnd = m_indexData.end();
// get reference index data
ReferenceIndex& refIndex = (*indexIter);
- BamBinMap& binMap = refIndex.Bins;
LinearOffsetVector& offsets = refIndex.Offsets;
- // store whether reference has alignments or no
- references[i].RefHasAlignments = ( binMap.size() > 0 );
-
// sort linear offsets
sort(offsets.begin(), offsets.end());
}
return reader->Rewind();
}
-bool BamStandardIndex::BamStandardIndexPrivate::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets) {
+// calculates offset(s) for a given region
+bool BamStandardIndex::BamStandardIndexPrivate::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets, bool* hasAlignmentsInRegion) {
// calculate which bins overlap this region
uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
// sort the offsets before returning
sort(offsets.begin(), offsets.end());
- // return whether any offsets were found
- return ( offsets.size() != 0 );
+ // set flag & return success
+ *hasAlignmentsInRegion = (offsets.size() != 0 );
+ return true;
+}
+
+// returns whether reference has alignments or no
+bool BamStandardIndex::BamStandardIndexPrivate::HasAlignments(const int& referenceID) const {
+
+ // ID not found
+ if ( referenceID < 0 || referenceID >= (int)m_indexData.size() )
+ return false;
+
+ // return whether reference contains data
+ const ReferenceIndex& refIndex = m_indexData.at(referenceID);
+ return !( refIndex.Bins.empty() );
}
// saves BAM bin entry for index
}
}
+// attempts to use index to jump to region; returns success/fail
bool BamStandardIndex::BamStandardIndexPrivate::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) {
// localize parent data
if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
return false;
- // see if left-bound reference of region has alignments
- if ( !m_parent->HasAlignments(region.LeftRefID) ) return false;
-
// make sure left-bound position is valid
if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) return false;
// calculate offsets for this region
// if failed, print message, set flag, and return failure
vector<int64_t> offsets;
- if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets) ) {
- fprintf(stderr, "ERROR: Could not jump: unable to calculate offset for specified region.\n");
+ if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets, hasAlignmentsInRegion) ) {
+ fprintf(stderr, "ERROR: Could not jump: unable to calculate offset(s) for specified region.\n");
*hasAlignmentsInRegion = false;
return false;
}
// if error in jumping, print message & set flag
if ( !result ) {
- fprintf(stderr, "ERROR: Could not jump: unable to calculate offset for specified region.\n");
+ fprintf(stderr, "ERROR: Could not jump: unable to determine correct offset for specified region.\n");
*hasAlignmentsInRegion = false;
}
return result;
}
+// loads existing data from file into memory
bool BamStandardIndex::BamStandardIndexPrivate::Load(const string& filename) {
// localize parent data
if ( m_parent == 0 ) return false;
const bool isBigEndian = m_parent->m_isBigEndian;
- RefVector& references = m_parent->m_references;
// open index file, abort on error
FILE* indexStream = fopen(filename.c_str(), "rb");
elementsRead = fread(&numBins, 4, 1, indexStream);
if ( isBigEndian ) SwapEndian_32(numBins);
- if ( numBins > 0 ) {
- RefData& refEntry = references[i];
- refEntry.RefHasAlignments = true;
- }
-
// intialize BinVector
BamBinMap binMap;
if ( m_parent == 0 ) return false;
const bool isBigEndian = m_parent->m_isBigEndian;
+ // open index file
string indexFilename = bamFilename + ".bai";
FILE* indexStream = fopen(indexFilename.c_str(), "wb");
if ( indexStream == 0 ) {
// creates index data (in-memory) from current reader data
bool BamStandardIndex::Build(void) { return d->Build(); }
+// returns whether reference has alignments or no
+bool BamStandardIndex::HasAlignments(const int& referenceID) const { return d->HasAlignments(referenceID); }
+
// attempts to use index to jump to region; returns success/fail
bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { return d->Jump(region, hasAlignmentsInRegion); }
// keep a list of any supported versions here
// (might be useful later to handle any 'legacy' versions if the format changes)
- // listed for example like: BTI_1_1 = 2, BTI_1_2 = 3, BTI_2_0 = 4, and so on
+ // listed for example like: BTI_1_0 = 1, BTI_1_1 = 2, BTI_1_2 = 3, BTI_2_0 = 4, and so on
//
// so a change introduced in (hypothetical) BTI_1_2 would be handled from then on by:
//
// do something new
// else
// do the old thing
+
enum Version { BTI_1_0 = 1
, BTI_1_1
+ , BTI_1_2
};
// -------------------------
// data members
+
BamToolsIndexData m_indexData;
BamToolsIndex* m_parent;
int32_t m_blockSize;
BamToolsIndexPrivate(BamToolsIndex* parent)
: m_parent(parent)
, m_blockSize(1000)
- , m_version(BTI_1_1) // latest version - used for writing new index files
+ , m_version(BTI_1_2) // latest version - used for writing new index files
{ }
~BamToolsIndexPrivate(void) { }
// creates index data (in-memory) from current reader data
bool Build(void);
// returns supported file extension
- const std::string Extension(void) const { return std::string(".bti"); }
+ const string Extension(void) const { return string(".bti"); }
+ // returns whether reference has alignments or no
+ bool HasAlignments(const int& referenceID) const;
// attempts to use index to jump to region; returns success/fail
bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
- // loads existing data from file into memory
+ // loads existing data from file into memory
bool Load(const std::string& filename);
// writes in-memory index data out to file
// N.B. - (this is the original BAM filename, method will modify it to use applicable extension)
bool GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion);
};
+// creates index data (in-memory) from current reader data
bool BamToolsIndex::BamToolsIndexPrivate::Build(void) {
// localize parent data
if ( m_parent == 0 ) return false;
- BamReader* reader = m_parent->m_reader;
- BgzfData* mBGZF = m_parent->m_BGZF;
- RefVector& references = m_parent->m_references;
+ BamReader* reader = m_parent->m_reader;
+ BgzfData* mBGZF = m_parent->m_BGZF;
// be sure reader & BGZF file are valid & open for reading
if ( reader == 0 || mBGZF == 0 || !mBGZF->IsOpen )
if ( !reader->Rewind() )
return false;
+ // initialize index data structure with space for all references
+ const int numReferences = (int)m_parent->m_references.size();
+ for ( int i = 0; i < numReferences; ++i )
+ m_indexData[i].clear();
+
// set up counters and markers
int32_t currentBlockCount = 0;
int64_t currentAlignmentOffset = mBGZF->Tell();
// if block contains data (not the first time through) AND alignment is on a new reference
if ( currentBlockCount > 0 && al.RefID != blockRefId ) {
- // make sure reference flag is set
- references[blockRefId].RefHasAlignments = true;
-
- // store previous data
- m_indexData[blockRefId].push_back( BamToolsIndexEntry(blockMaxEndPosition, blockStartOffset, blockStartPosition) );
-
- // intialize new block
+ // store previous data
+ m_indexData[blockRefId].push_back( BamToolsIndexEntry(blockMaxEndPosition, blockStartOffset, blockStartPosition) );
+
+ // intialize new block for current alignment's reference
currentBlockCount = 0;
blockMaxEndPosition = al.GetEndPosition();
blockStartOffset = currentAlignmentOffset;
// increment block counter
++currentBlockCount;
-
+
// check end position
int32_t alignmentEndPosition = al.GetEndPosition();
if ( alignmentEndPosition > blockMaxEndPosition )
// if block is full, get offset for next block, reset currentBlockCount
if ( currentBlockCount == m_blockSize ) {
-
- // make sure reference flag is set
- references[blockRefId].RefHasAlignments = true;
-
m_indexData[blockRefId].push_back( BamToolsIndexEntry(blockMaxEndPosition, blockStartOffset, blockStartPosition) );
blockStartOffset = mBGZF->Tell();
currentBlockCount = 0;
currentAlignmentOffset = mBGZF->Tell();
}
- // store final block
+ // store final block with data
m_indexData[blockRefId].push_back( BamToolsIndexEntry(blockMaxEndPosition, blockStartOffset, blockStartPosition) );
// attempt to rewind back to begnning of alignments
return true;
}
+// returns whether reference has alignments or no
+bool BamToolsIndex::BamToolsIndexPrivate::HasAlignments(const int& referenceID) const {
+ if ( m_indexData.find(referenceID) == m_indexData.end() )
+ return false;
+ else
+ return ( !m_indexData.at(referenceID).empty() );
+}
+
+// attempts to use index to jump to region; returns success/fail
bool BamToolsIndex::BamToolsIndexPrivate::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) {
// clear flag
return false;
}
- // see if left-bound reference of region has alignments
- if ( !m_parent->HasAlignments(region.LeftRefID) ) return false;
-
// make sure left-bound position is valid
if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) return false;
fprintf(stderr, "ERROR: Could not jump - unable to calculate offset for specified region.\n");
return false;
}
-
+
// attempt seek in file, return success/fail
return mBGZF->Seek(offset);
}
+// loads existing data from file into memory
bool BamToolsIndex::BamToolsIndexPrivate::Load(const string& filename) {
// localize parent data
if ( m_parent == 0 ) return false;
const bool isBigEndian = m_parent->m_isBigEndian;
- RefVector& references = m_parent->m_references;
// open index file, abort on error
FILE* indexStream = fopen(filename.c_str(), "rb");
fclose(indexStream);
return false;
}
-
+
// read in version
int32_t indexVersion;
elementsRead = fread(&indexVersion, sizeof(indexVersion), 1, indexStream);
return false;
}
- if ( (Version)indexVersion < BTI_1_1 ) {
+ if ( (Version)indexVersion == BTI_1_0 ) {
fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to accessing data near reference ends.\n");
fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n");
fclose(indexStream);
exit(1);
}
+
+ if ( (Version)indexVersion == BTI_1_1 ) {
+ fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to handling empty references.\n");
+ fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n");
+ fclose(indexStream);
+ exit(1);
+ }
// read in block size
elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, indexStream);
int32_t numReferences;
elementsRead = fread(&numReferences, sizeof(numReferences), 1, indexStream);
if ( isBigEndian ) SwapEndian_32(numReferences);
-
+
// iterate over reference entries
for ( int i = 0; i < numReferences; ++i ) {
// save refID, offsetVector entry into data structure
m_indexData.insert( make_pair(i, offsets) );
-
- // set ref.HasAlignments flag
- references[i].RefHasAlignments = (numOffsets != 0);
}
// close index file and return
// creates index data (in-memory) from current reader data
bool BamToolsIndex::Build(void) { return d->Build(); }
+// returns whether reference has alignments or no
+bool BamToolsIndex::HasAlignments(const int& referenceID) const { return d->HasAlignments(referenceID); }
+
// attempts to use index to jump to region; returns success/fail
// a "successful" jump indicates no error, but not whether this region has data
// * thus, the method sets a flag to indicate whether there are alignments
-// ***************************************************************************\r
-// BamReader.cpp (c) 2009 Derek Barnett, Michael Str�mberg\r
-// Marth Lab, Department of Biology, Boston College\r
-// All rights reserved.\r
-// ---------------------------------------------------------------------------\r
-// Last modified: 17 September 2010 (DB)\r
-// ---------------------------------------------------------------------------\r
-// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
-// Institute.\r
-// ---------------------------------------------------------------------------\r
-// Provides the basic functionality for reading BAM files\r
-// ***************************************************************************\r
-\r
-// C++ includes\r
-#include <algorithm>\r
-#include <iterator>\r
-#include <string>\r
-#include <vector>\r
-#include <iostream>\r
-#include "BGZF.h"\r
-#include "BamReader.h"\r
-#include "BamIndex.h"\r
-using namespace BamTools;\r
-using namespace std;\r
-\r
-struct BamReader::BamReaderPrivate {\r
-\r
- // -------------------------------\r
- // structs, enums, typedefs\r
- // -------------------------------\r
- enum RegionState { BEFORE_REGION = 0\r
- , WITHIN_REGION\r
- , AFTER_REGION\r
- };\r
-\r
- // -------------------------------\r
- // data members\r
- // -------------------------------\r
-\r
- // general file data\r
- BgzfData mBGZF;\r
- string HeaderText;\r
- BamIndex* Index;\r
- RefVector References;\r
- bool IsIndexLoaded;\r
- int64_t AlignmentsBeginOffset;\r
- string Filename;\r
- string IndexFilename;\r
- \r
- // system data\r
- bool IsBigEndian;\r
-\r
- // user-specified region values\r
- BamRegion Region;\r
- bool HasAlignmentsInRegion;\r
-\r
- // parent BamReader\r
- BamReader* Parent;\r
- \r
- // BAM character constants\r
- const char* DNA_LOOKUP;\r
- const char* CIGAR_LOOKUP;\r
-\r
- // -------------------------------\r
- // constructor & destructor\r
- // -------------------------------\r
- BamReaderPrivate(BamReader* parent);\r
- ~BamReaderPrivate(void);\r
-\r
- // -------------------------------\r
- // "public" interface\r
- // -------------------------------\r
-\r
- // file operations\r
- void Close(void);\r
- bool Open(const std::string& filename, \r
- const std::string& indexFilename, \r
- const bool lookForIndex, \r
- const bool preferStandardIndex);\r
- bool Rewind(void);\r
- bool SetRegion(const BamRegion& region);\r
-\r
- // access alignment data\r
- bool GetNextAlignment(BamAlignment& bAlignment);\r
- bool GetNextAlignmentCore(BamAlignment& bAlignment);\r
-\r
- // access auxiliary data\r
- int GetReferenceID(const string& refName) const;\r
-\r
- // index operations\r
- bool CreateIndex(bool useStandardIndex);\r
-\r
- // -------------------------------\r
- // internal methods\r
- // -------------------------------\r
-\r
- // *** reading alignments and auxiliary data *** //\r
-\r
- // fills out character data for BamAlignment data\r
- bool BuildCharData(BamAlignment& bAlignment);\r
- // checks to see if alignment overlaps current region\r
- RegionState IsOverlap(BamAlignment& bAlignment);\r
- // retrieves header text from BAM file\r
- void LoadHeaderData(void);\r
- // retrieves BAM alignment under file pointer\r
- bool LoadNextAlignment(BamAlignment& bAlignment);\r
- // builds reference data structure from BAM file\r
- void LoadReferenceData(void);\r
-\r
- // *** index file handling *** //\r
-\r
- // clear out inernal index data structure\r
- void ClearIndex(void);\r
- // loads index from BAM index file\r
- bool LoadIndex(const bool lookForIndex, const bool preferStandardIndex);\r
-};\r
-\r
-// -----------------------------------------------------\r
-// BamReader implementation (wrapper around BRPrivate)\r
-// -----------------------------------------------------\r
-// constructor\r
-BamReader::BamReader(void) {\r
- d = new BamReaderPrivate(this);\r
-}\r
-\r
-// destructor\r
-BamReader::~BamReader(void) {\r
- delete d;\r
- d = 0;\r
-}\r
-\r
-// file operations\r
-void BamReader::Close(void) { d->Close(); }\r
-bool BamReader::IsIndexLoaded(void) const { return d->IsIndexLoaded; }\r
-bool BamReader::IsOpen(void) const { return d->mBGZF.IsOpen; }\r
-bool BamReader::Jump(int refID, int position) { return d->SetRegion( BamRegion(refID, position) ); }\r
-bool BamReader::Open(const std::string& filename, \r
- const std::string& indexFilename, \r
- const bool lookForIndex, \r
- const bool preferStandardIndex) \r
-{ \r
- return d->Open(filename, indexFilename, lookForIndex, preferStandardIndex); \r
-}\r
-bool BamReader::Rewind(void) { return d->Rewind(); }\r
-bool BamReader::SetRegion(const BamRegion& region) { return d->SetRegion(region); }\r
-bool BamReader::SetRegion(const int& leftRefID, const int& leftBound, const int& rightRefID, const int& rightBound) {\r
- return d->SetRegion( BamRegion(leftRefID, leftBound, rightRefID, rightBound) );\r
-}\r
-\r
-// access alignment data\r
-bool BamReader::GetNextAlignment(BamAlignment& bAlignment) { return d->GetNextAlignment(bAlignment); }\r
-bool BamReader::GetNextAlignmentCore(BamAlignment& bAlignment) { return d->GetNextAlignmentCore(bAlignment); }\r
-\r
-// access auxiliary data\r
-const string BamReader::GetHeaderText(void) const { return d->HeaderText; }\r
-int BamReader::GetReferenceCount(void) const { return d->References.size(); }\r
-const RefVector& BamReader::GetReferenceData(void) const { return d->References; }\r
-int BamReader::GetReferenceID(const string& refName) const { return d->GetReferenceID(refName); }\r
-const std::string BamReader::GetFilename(void) const { return d->Filename; }\r
-\r
-// index operations\r
-bool BamReader::CreateIndex(bool useStandardIndex) { return d->CreateIndex(useStandardIndex); }\r
-\r
-// -----------------------------------------------------\r
-// BamReaderPrivate implementation\r
-// -----------------------------------------------------\r
-\r
-// constructor\r
-BamReader::BamReaderPrivate::BamReaderPrivate(BamReader* parent)\r
- : Index(0)\r
- , IsIndexLoaded(false)\r
- , AlignmentsBeginOffset(0)\r
- , Parent(parent)\r
- , DNA_LOOKUP("=ACMGRSVTWYHKDBN")\r
- , CIGAR_LOOKUP("MIDNSHP")\r
-{ \r
- IsBigEndian = SystemIsBigEndian();\r
-}\r
-\r
-// destructor\r
-BamReader::BamReaderPrivate::~BamReaderPrivate(void) {\r
- Close();\r
-}\r
-\r
-bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) {\r
- \r
- // calculate character lengths/offsets\r
- const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;\r
- const unsigned int seqDataOffset = bAlignment.SupportData.QueryNameLength + (bAlignment.SupportData.NumCigarOperations * 4);\r
- const unsigned int qualDataOffset = seqDataOffset + (bAlignment.SupportData.QuerySequenceLength+1)/2;\r
- const unsigned int tagDataOffset = qualDataOffset + bAlignment.SupportData.QuerySequenceLength;\r
- const unsigned int tagDataLength = dataLength - tagDataOffset;\r
- \r
- // set up char buffers\r
- const char* allCharData = bAlignment.SupportData.AllCharData.data();\r
- const char* seqData = ((const char*)allCharData) + seqDataOffset;\r
- const char* qualData = ((const char*)allCharData) + qualDataOffset;\r
- char* tagData = ((char*)allCharData) + tagDataOffset;\r
- \r
- // store alignment name (relies on null char in name as terminator)\r
- bAlignment.Name.assign((const char*)(allCharData)); \r
-\r
- // save query sequence\r
- bAlignment.QueryBases.clear();\r
- bAlignment.QueryBases.reserve(bAlignment.SupportData.QuerySequenceLength);\r
- for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) {\r
- char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];\r
- bAlignment.QueryBases.append(1, singleBase);\r
- }\r
- \r
- // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character\r
- bAlignment.Qualities.clear();\r
- bAlignment.Qualities.reserve(bAlignment.SupportData.QuerySequenceLength);\r
- for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) {\r
- char singleQuality = (char)(qualData[i]+33);\r
- bAlignment.Qualities.append(1, singleQuality);\r
- }\r
- \r
- // if QueryBases is empty (and this is a allowed case)\r
- if ( bAlignment.QueryBases.empty() ) \r
- bAlignment.AlignedBases = bAlignment.QueryBases;\r
- \r
- // if QueryBases contains data, then build AlignedBases using CIGAR data\r
- else {\r
- \r
- // resize AlignedBases\r
- bAlignment.AlignedBases.clear();\r
- bAlignment.AlignedBases.reserve(bAlignment.SupportData.QuerySequenceLength);\r
- \r
- // iterate over CigarOps\r
- int k = 0;\r
- vector<CigarOp>::const_iterator cigarIter = bAlignment.CigarData.begin();\r
- vector<CigarOp>::const_iterator cigarEnd = bAlignment.CigarData.end();\r
- for ( ; cigarIter != cigarEnd; ++cigarIter ) {\r
- \r
- const CigarOp& op = (*cigarIter);\r
- switch(op.Type) {\r
- \r
- case ('M') :\r
- case ('I') :\r
- bAlignment.AlignedBases.append(bAlignment.QueryBases.substr(k, op.Length)); // for 'M', 'I' - write bases\r
- // fall through\r
- \r
- case ('S') :\r
- k += op.Length; // for 'S' - soft clip, skip over query bases\r
- break;\r
- \r
- case ('D') :\r
- bAlignment.AlignedBases.append(op.Length, '-'); // for 'D' - write gap character\r
- break;\r
- \r
- case ('P') :\r
- bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character\r
- break;\r
- \r
- case ('N') :\r
- bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in original query sequence\r
- break;\r
- \r
- case ('H') :\r
- break; // for 'H' - hard clip, do nothing to AlignedBases, move to next op\r
- \r
- default:\r
- fprintf(stderr, "ERROR: Invalid Cigar op type\n"); // shouldn't get here\r
- exit(1);\r
- }\r
- }\r
- }\r
- \r
- // -----------------------\r
- // Added: 3-25-2010 DB\r
- // Fixed: endian-correctness for tag data\r
- // -----------------------\r
- if ( IsBigEndian ) {\r
- int i = 0;\r
- while ( (unsigned int)i < tagDataLength ) {\r
- \r
- i += 2; // skip tag type (e.g. "RG", "NM", etc)\r
- uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning \r
- ++i; // skip value type\r
- \r
- switch (type) {\r
- \r
- case('A') :\r
- case('C') : \r
- ++i;\r
- break;\r
-\r
- case('S') : \r
- SwapEndian_16p(&tagData[i]); \r
- i += sizeof(uint16_t);\r
- break;\r
- \r
- case('F') :\r
- case('I') : \r
- SwapEndian_32p(&tagData[i]);\r
- i += sizeof(uint32_t);\r
- break;\r
- \r
- case('D') : \r
- SwapEndian_64p(&tagData[i]);\r
- i += sizeof(uint64_t);\r
- break;\r
- \r
- case('H') :\r
- case('Z') : \r
- while (tagData[i]) { ++i; }\r
- ++i; // increment one more for null terminator\r
- break;\r
- \r
- default : \r
- fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here\r
- exit(1);\r
- }\r
- }\r
- }\r
- \r
- // store TagData\r
- bAlignment.TagData.clear();\r
- bAlignment.TagData.resize(tagDataLength);\r
- memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLength);\r
- \r
- // clear the core-only flag\r
- bAlignment.SupportData.HasCoreOnly = false;\r
- \r
- // return success\r
- return true;\r
-}\r
-\r
-// clear index data structure\r
-void BamReader::BamReaderPrivate::ClearIndex(void) {\r
- delete Index;\r
- Index = 0;\r
- IsIndexLoaded = false;\r
-}\r
-\r
-// closes the BAM file\r
-void BamReader::BamReaderPrivate::Close(void) {\r
- \r
- // close BGZF file stream\r
- mBGZF.Close();\r
- \r
- // clear out index data\r
- ClearIndex();\r
- \r
- // clear out header data\r
- HeaderText.clear();\r
- \r
- // clear out region flags\r
- Region.clear();\r
-}\r
-\r
-// creates index for BAM file, saves to file\r
-// default behavior is to create the BAM standard index (".bai")\r
-// set flag to false to create the BamTools-specific index (".bti")\r
-bool BamReader::BamReaderPrivate::CreateIndex(bool useStandardIndex) {\r
-\r
- // clear out prior index data\r
- ClearIndex();\r
- \r
- // create index based on type requested\r
- if ( useStandardIndex ) \r
- Index = new BamStandardIndex(&mBGZF, Parent, IsBigEndian);\r
- // create BamTools 'custom' index\r
- else\r
- Index = new BamToolsIndex(&mBGZF, Parent, IsBigEndian);\r
- \r
- // build new index\r
- bool ok = true;\r
- ok &= Index->Build();\r
- IsIndexLoaded = ok;\r
- \r
- // attempt to save index data to file\r
- ok &= Index->Write(Filename); \r
- \r
- // return success/fail of both building & writing index\r
- return ok;\r
-}\r
-\r
-// get next alignment (from specified region, if given)\r
-bool BamReader::BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) {\r
-\r
- // if valid alignment found, attempt to parse char data, and return success/failure\r
- if ( GetNextAlignmentCore(bAlignment) )\r
- return BuildCharData(bAlignment);\r
- \r
- // no valid alignment found\r
- else\r
- return false;\r
-}\r
-\r
-// retrieves next available alignment core data (returns success/fail)\r
-// ** DOES NOT parse any character data (read name, bases, qualities, tag data)\r
-// these can be accessed, if necessary, from the supportData \r
-// useful for operations requiring ONLY positional or other alignment-related information\r
-bool BamReader::BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) {\r
-\r
- // if region is set but has no alignments\r
- if ( !Region.isNull() && !HasAlignmentsInRegion ) \r
- return false;\r
- \r
- // if valid alignment available\r
- if ( LoadNextAlignment(bAlignment) ) {\r
-\r
- // set core-only flag\r
- bAlignment.SupportData.HasCoreOnly = true;\r
- \r
- // if region not specified, return success\r
- if ( !Region.isLeftBoundSpecified() ) return true;\r
-\r
- // determine region state (before, within, after)\r
- BamReader::BamReaderPrivate::RegionState state = IsOverlap(bAlignment);\r
- \r
- // if alignment lies after region, return false\r
- if ( state == AFTER_REGION ) return false;\r
-\r
- while ( state != WITHIN_REGION ) {\r
- // if no valid alignment available (likely EOF) return failure\r
- if ( !LoadNextAlignment(bAlignment) ) return false;\r
- // if alignment lies after region, return false (no available read within region)\r
- state = IsOverlap(bAlignment);\r
- if ( state == AFTER_REGION ) return false;\r
- }\r
-\r
- // return success (alignment found that overlaps region)\r
- return true;\r
- }\r
-\r
- // no valid alignment\r
- else\r
- return false;\r
-}\r
-\r
-// returns RefID for given RefName (returns References.size() if not found)\r
-int BamReader::BamReaderPrivate::GetReferenceID(const string& refName) const {\r
-\r
- // retrieve names from reference data\r
- vector<string> refNames;\r
- RefVector::const_iterator refIter = References.begin();\r
- RefVector::const_iterator refEnd = References.end();\r
- for ( ; refIter != refEnd; ++refIter) \r
- refNames.push_back( (*refIter).RefName );\r
-\r
- // return 'index-of' refName ( if not found, returns refNames.size() )\r
- return distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName));\r
-}\r
-\r
-// returns region state - whether alignment ends before, overlaps, or starts after currently specified region\r
-// this *internal* method should ONLY called when (at least) IsLeftBoundSpecified == true\r
-BamReader::BamReaderPrivate::RegionState BamReader::BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) {\r
- \r
- // --------------------------------------------------\r
- // check alignment start against right bound cutoff\r
- \r
- // if full region of interest was given\r
- if ( Region.isRightBoundSpecified() ) {\r
- \r
- // read starts on right bound reference, but AFTER right bound position\r
- if ( bAlignment.RefID == Region.RightRefID && bAlignment.Position > Region.RightPosition )\r
- return AFTER_REGION;\r
- \r
- // if read starts on reference AFTER right bound, return false\r
- if ( bAlignment.RefID > Region.RightRefID ) \r
- return AFTER_REGION;\r
- }\r
- \r
- // --------------------------------------------------------\r
- // no right bound given OR read starts before right bound\r
- // so, check if it overlaps left bound \r
- \r
- // if read starts on left bound reference AND after left boundary, return success\r
- if ( bAlignment.RefID == Region.LeftRefID && bAlignment.Position >= Region.LeftPosition)\r
- return WITHIN_REGION;\r
- \r
- // if read is on any reference sequence before left bound, return false\r
- if ( bAlignment.RefID < Region.LeftRefID )\r
- return BEFORE_REGION;\r
-\r
- // --------------------------------------------------------\r
- // read is on left bound reference, but starts before left bound position\r
-\r
- // if it overlaps, return WITHIN_REGION\r
- if ( bAlignment.GetEndPosition() >= Region.LeftPosition )\r
- return WITHIN_REGION;\r
- // else begins before left bound position\r
- else\r
- return BEFORE_REGION;\r
-}\r
-\r
-// load BAM header data\r
-void BamReader::BamReaderPrivate::LoadHeaderData(void) {\r
-\r
- // check to see if proper BAM header\r
- char buffer[4];\r
- if (mBGZF.Read(buffer, 4) != 4) {\r
- fprintf(stderr, "Could not read header type\n");\r
- exit(1);\r
- }\r
-\r
- if (strncmp(buffer, "BAM\001", 4)) {\r
- fprintf(stderr, "wrong header type!\n");\r
- exit(1);\r
- }\r
-\r
- // get BAM header text length\r
- mBGZF.Read(buffer, 4);\r
- unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer);\r
- if ( IsBigEndian ) SwapEndian_32(headerTextLength); \r
- \r
- // get BAM header text\r
- char* headerText = (char*)calloc(headerTextLength + 1, 1);\r
- mBGZF.Read(headerText, headerTextLength);\r
- HeaderText = (string)((const char*)headerText);\r
-\r
- // clean up calloc-ed temp variable\r
- free(headerText);\r
-}\r
-\r
-// load existing index data from BAM index file (".bti" OR ".bai"), return success/fail\r
-bool BamReader::BamReaderPrivate::LoadIndex(const bool lookForIndex, const bool preferStandardIndex) {\r
-\r
- // clear out any existing index data\r
- ClearIndex();\r
-\r
- // if no index filename provided, so we need to look for available index files\r
- if ( IndexFilename.empty() ) {\r
- \r
- // attempt to load BamIndex based on current Filename provided & preferStandardIndex flag\r
- const BamIndex::PreferredIndexType type = (preferStandardIndex ? BamIndex::STANDARD : BamIndex::BAMTOOLS);\r
- Index = BamIndex::FromBamFilename(Filename, &mBGZF, Parent, IsBigEndian, type);\r
- \r
- // if null, return failure\r
- if ( Index == 0 ) return false;\r
- \r
- // generate proper IndexFilename based on type of index created\r
- IndexFilename = Filename + Index->Extension();\r
- }\r
- \r
- else {\r
- // attempt to load BamIndex based on IndexFilename provided by client\r
- Index = BamIndex::FromIndexFilename(IndexFilename, &mBGZF, Parent, IsBigEndian);\r
- \r
- // if null, return failure\r
- if ( Index == 0 ) return false; \r
- }\r
- \r
- // an index file was found\r
- // return success of loading the index data from file\r
- IsIndexLoaded = Index->Load(IndexFilename);\r
- return IsIndexLoaded;\r
-}\r
-\r
-// populates BamAlignment with alignment data under file pointer, returns success/fail\r
-bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) {\r
-\r
- // read in the 'block length' value, make sure it's not zero\r
- char buffer[4];\r
- mBGZF.Read(buffer, 4);\r
- bAlignment.SupportData.BlockLength = BgzfData::UnpackUnsignedInt(buffer);\r
- if ( IsBigEndian ) { SwapEndian_32(bAlignment.SupportData.BlockLength); }\r
- if ( bAlignment.SupportData.BlockLength == 0 ) return false;\r
-\r
- // read in core alignment data, make sure the right size of data was read\r
- char x[BAM_CORE_SIZE];\r
- if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) return false; \r
-\r
- if ( IsBigEndian ) {\r
- for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) ) \r
- SwapEndian_32p(&x[i]); \r
- }\r
- \r
- // set BamAlignment 'core' and 'support' data\r
- bAlignment.RefID = BgzfData::UnpackSignedInt(&x[0]); \r
- bAlignment.Position = BgzfData::UnpackSignedInt(&x[4]);\r
- \r
- unsigned int tempValue = BgzfData::UnpackUnsignedInt(&x[8]);\r
- bAlignment.Bin = tempValue >> 16;\r
- bAlignment.MapQuality = tempValue >> 8 & 0xff;\r
- bAlignment.SupportData.QueryNameLength = tempValue & 0xff;\r
-\r
- tempValue = BgzfData::UnpackUnsignedInt(&x[12]);\r
- bAlignment.AlignmentFlag = tempValue >> 16;\r
- bAlignment.SupportData.NumCigarOperations = tempValue & 0xffff;\r
-\r
- bAlignment.SupportData.QuerySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]);\r
- bAlignment.MateRefID = BgzfData::UnpackSignedInt(&x[20]);\r
- bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]);\r
- bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[28]);\r
- \r
- // set BamAlignment length\r
- bAlignment.Length = bAlignment.SupportData.QuerySequenceLength;\r
- \r
- // read in character data - make sure proper data size was read\r
- bool readCharDataOK = false;\r
- const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;\r
- char* allCharData = (char*)calloc(sizeof(char), dataLength);\r
- \r
- if ( mBGZF.Read(allCharData, dataLength) == (signed int)dataLength) { \r
- \r
- // store 'allCharData' in supportData structure\r
- bAlignment.SupportData.AllCharData.assign((const char*)allCharData, dataLength);\r
- \r
- // set success flag\r
- readCharDataOK = true;\r
- \r
- // save CIGAR ops \r
- // need to calculate this here so that BamAlignment::GetEndPosition() performs correctly, \r
- // even when BamReader::GetNextAlignmentCore() is called \r
- const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength;\r
- uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);\r
- CigarOp op;\r
- bAlignment.CigarData.clear();\r
- bAlignment.CigarData.reserve(bAlignment.SupportData.NumCigarOperations);\r
- for (unsigned int i = 0; i < bAlignment.SupportData.NumCigarOperations; ++i) {\r
-\r
- // swap if necessary\r
- if ( IsBigEndian ) SwapEndian_32(cigarData[i]);\r
- \r
- // build CigarOp structure\r
- op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);\r
- op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];\r
-\r
- // save CigarOp\r
- bAlignment.CigarData.push_back(op);\r
- }\r
- }\r
-\r
- free(allCharData);\r
- return readCharDataOK;\r
-}\r
-\r
-// loads reference data from BAM file\r
-void BamReader::BamReaderPrivate::LoadReferenceData(void) {\r
-\r
- // get number of reference sequences\r
- char buffer[4];\r
- mBGZF.Read(buffer, 4);\r
- unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer);\r
- if ( IsBigEndian ) SwapEndian_32(numberRefSeqs);\r
- if ( numberRefSeqs == 0 ) return;\r
- References.reserve((int)numberRefSeqs);\r
-\r
- // iterate over all references in header\r
- for (unsigned int i = 0; i != numberRefSeqs; ++i) {\r
-\r
- // get length of reference name\r
- mBGZF.Read(buffer, 4);\r
- unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer);\r
- if ( IsBigEndian ) SwapEndian_32(refNameLength);\r
- char* refName = (char*)calloc(refNameLength, 1);\r
-\r
- // get reference name and reference sequence length\r
- mBGZF.Read(refName, refNameLength);\r
- mBGZF.Read(buffer, 4);\r
- int refLength = BgzfData::UnpackSignedInt(buffer);\r
- if ( IsBigEndian ) SwapEndian_32(refLength); \r
-\r
- // store data for reference\r
- RefData aReference;\r
- aReference.RefName = (string)((const char*)refName);\r
- aReference.RefLength = refLength;\r
- References.push_back(aReference);\r
-\r
- // clean up calloc-ed temp variable\r
- free(refName);\r
- }\r
-}\r
-\r
-// opens BAM file (and index)\r
-bool BamReader::BamReaderPrivate::Open(const string& filename, const string& indexFilename, const bool lookForIndex, const bool preferStandardIndex) {\r
-\r
- // store filenames\r
- Filename = filename;\r
- IndexFilename = indexFilename;\r
-\r
- // open the BGZF file for reading, return false on failure\r
- if ( !mBGZF.Open(filename, "rb") ) return false; \r
- \r
- // retrieve header text & reference data\r
- LoadHeaderData();\r
- LoadReferenceData();\r
-\r
- // store file offset of first alignment\r
- AlignmentsBeginOffset = mBGZF.Tell();\r
-\r
- // if no index filename provided \r
- if ( IndexFilename.empty() ) {\r
- \r
- // client did not specify that index SHOULD be found\r
- // useful for cases where sequential access is all that is required\r
- if ( !lookForIndex ) return true; \r
- \r
- // otherwise, look for index file, return success/fail\r
- return LoadIndex(lookForIndex, preferStandardIndex) ;\r
- }\r
- \r
- // client supplied an index filename\r
- // attempt to load index data, return success/fail\r
- return LoadIndex(lookForIndex, preferStandardIndex); \r
-}\r
-\r
-// returns BAM file pointer to beginning of alignment data\r
-bool BamReader::BamReaderPrivate::Rewind(void) {\r
- \r
- // rewind to first alignment, return false if unable to seek\r
- if ( !mBGZF.Seek(AlignmentsBeginOffset) ) return false;\r
- \r
- // retrieve first alignment data, return false if unable to read\r
- BamAlignment al;\r
- if ( !LoadNextAlignment(al) ) return false;\r
- \r
- // reset default region info using first alignment in file\r
- Region.clear();\r
- Region.LeftRefID = al.RefID;\r
- Region.LeftPosition = al.Position;\r
- HasAlignmentsInRegion = true;\r
-\r
- // rewind back to beginning of first alignment\r
- // return success/fail of seek\r
- return mBGZF.Seek(AlignmentsBeginOffset);\r
-}\r
-\r
-// asks Index to attempt a Jump() to specified region\r
-// returns success/failure\r
-bool BamReader::BamReaderPrivate::SetRegion(const BamRegion& region) {\r
- \r
- // clear out any prior BamReader region data\r
- //\r
- // N.B. - this is cleared so that BamIndex now has free reign to call\r
- // GetNextAlignmentCore() and do overlap checking without worrying about BamReader \r
- // performing any overlap checking of its own and moving on to the next read... Calls \r
- // to GetNextAlignmentCore() with no Region set, simply return the next alignment.\r
- // This ensures that the Index is able to do just that. (All without exposing \r
- // LoadNextAlignment() to the public API, and potentially confusing clients with the nomenclature)\r
- Region.clear();\r
- \r
- // check for existing index \r
- if ( !IsIndexLoaded || Index == 0 ) return false; \r
- \r
- // attempt jump to user-specified region return false if jump could not be performed at all\r
- // (invalid index, unknown reference, etc)\r
- //\r
- // Index::Jump() is allowed to modify the HasAlignmentsInRegion flag\r
- // * This covers case where a region is requested that lies beyond the last alignment on a reference\r
- // If this occurs, any subsequent calls to GetNexAlignment[Core] simply return false\r
- // BamMultiReader is then able to successfully pull alignments from a region from multiple files\r
- // even if one or more have no data.\r
- if ( !Index->Jump(region, &HasAlignmentsInRegion) ) return false;\r
- \r
- // if jump successful, save region data & return success\r
- Region = region;\r
- return true;\r
-}\r
+// ***************************************************************************
+// BamReader.cpp (c) 2009 Derek Barnett, Michael Str�mberg
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 9 October 2010 (DB)
+// ---------------------------------------------------------------------------
+// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad
+// Institute.
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for reading BAM files
+// ***************************************************************************
+
+// C++ includes
+#include <algorithm>
+#include <iterator>
+#include <string>
+#include <vector>
+#include <iostream>
+#include "BGZF.h"
+#include "BamReader.h"
+#include "BamIndex.h"
+using namespace BamTools;
+using namespace std;
+
+struct BamReader::BamReaderPrivate {
+
+ // -------------------------------
+ // structs, enums, typedefs
+ // -------------------------------
+ enum RegionState { BEFORE_REGION = 0
+ , WITHIN_REGION
+ , AFTER_REGION
+ };
+
+ // -------------------------------
+ // data members
+ // -------------------------------
+
+ // general file data
+ BgzfData mBGZF;
+ string HeaderText;
+ BamIndex* Index;
+ RefVector References;
+ bool IsIndexLoaded;
+ int64_t AlignmentsBeginOffset;
+ string Filename;
+ string IndexFilename;
+
+ // system data
+ bool IsBigEndian;
+
+ // user-specified region values
+ BamRegion Region;
+ bool HasAlignmentsInRegion;
+
+ // parent BamReader
+ BamReader* Parent;
+
+ // BAM character constants
+ const char* DNA_LOOKUP;
+ const char* CIGAR_LOOKUP;
+
+ // -------------------------------
+ // constructor & destructor
+ // -------------------------------
+ BamReaderPrivate(BamReader* parent);
+ ~BamReaderPrivate(void);
+
+ // -------------------------------
+ // "public" interface
+ // -------------------------------
+
+ // file operations
+ void Close(void);
+ bool Open(const std::string& filename,
+ const std::string& indexFilename,
+ const bool lookForIndex,
+ const bool preferStandardIndex);
+ bool Rewind(void);
+ bool SetRegion(const BamRegion& region);
+
+ // access alignment data
+ bool GetNextAlignment(BamAlignment& bAlignment);
+ bool GetNextAlignmentCore(BamAlignment& bAlignment);
+
+ // access auxiliary data
+ int GetReferenceID(const string& refName) const;
+
+ // index operations
+ bool CreateIndex(bool useStandardIndex);
+
+ // -------------------------------
+ // internal methods
+ // -------------------------------
+
+ // *** reading alignments and auxiliary data *** //
+
+ // adjusts requested region if necessary (depending on where data actually begins)
+ void AdjustRegion(BamRegion& region);
+ // fills out character data for BamAlignment data
+ bool BuildCharData(BamAlignment& bAlignment);
+ // checks to see if alignment overlaps current region
+ RegionState IsOverlap(BamAlignment& bAlignment);
+ // retrieves header text from BAM file
+ void LoadHeaderData(void);
+ // retrieves BAM alignment under file pointer
+ bool LoadNextAlignment(BamAlignment& bAlignment);
+ // builds reference data structure from BAM file
+ void LoadReferenceData(void);
+ // mark references with 'HasAlignments' status
+ void MarkReferences(void);
+
+ // *** index file handling *** //
+
+ // clear out inernal index data structure
+ void ClearIndex(void);
+ // loads index from BAM index file
+ bool LoadIndex(const bool lookForIndex, const bool preferStandardIndex);
+};
+
+// -----------------------------------------------------
+// BamReader implementation (wrapper around BRPrivate)
+// -----------------------------------------------------
+// constructor
+BamReader::BamReader(void) {
+ d = new BamReaderPrivate(this);
+}
+
+// destructor
+BamReader::~BamReader(void) {
+ delete d;
+ d = 0;
+}
+
+// file operations
+void BamReader::Close(void) { d->Close(); }
+bool BamReader::IsIndexLoaded(void) const { return d->IsIndexLoaded; }
+bool BamReader::IsOpen(void) const { return d->mBGZF.IsOpen; }
+bool BamReader::Jump(int refID, int position) { return d->SetRegion( BamRegion(refID, position) ); }
+bool BamReader::Open(const std::string& filename,
+ const std::string& indexFilename,
+ const bool lookForIndex,
+ const bool preferStandardIndex)
+{
+ return d->Open(filename, indexFilename, lookForIndex, preferStandardIndex);
+}
+bool BamReader::Rewind(void) { return d->Rewind(); }
+bool BamReader::SetRegion(const BamRegion& region) { return d->SetRegion(region); }
+bool BamReader::SetRegion(const int& leftRefID, const int& leftBound, const int& rightRefID, const int& rightBound) {
+ return d->SetRegion( BamRegion(leftRefID, leftBound, rightRefID, rightBound) );
+}
+
+// access alignment data
+bool BamReader::GetNextAlignment(BamAlignment& bAlignment) { return d->GetNextAlignment(bAlignment); }
+bool BamReader::GetNextAlignmentCore(BamAlignment& bAlignment) { return d->GetNextAlignmentCore(bAlignment); }
+
+// access auxiliary data
+const string BamReader::GetHeaderText(void) const { return d->HeaderText; }
+int BamReader::GetReferenceCount(void) const { return d->References.size(); }
+const RefVector& BamReader::GetReferenceData(void) const { return d->References; }
+int BamReader::GetReferenceID(const string& refName) const { return d->GetReferenceID(refName); }
+const std::string BamReader::GetFilename(void) const { return d->Filename; }
+
+// index operations
+bool BamReader::CreateIndex(bool useStandardIndex) { return d->CreateIndex(useStandardIndex); }
+
+// -----------------------------------------------------
+// BamReaderPrivate implementation
+// -----------------------------------------------------
+
+// constructor
+BamReader::BamReaderPrivate::BamReaderPrivate(BamReader* parent)
+ : Index(0)
+ , IsIndexLoaded(false)
+ , AlignmentsBeginOffset(0)
+ , HasAlignmentsInRegion(true)
+ , Parent(parent)
+ , DNA_LOOKUP("=ACMGRSVTWYHKDBN")
+ , CIGAR_LOOKUP("MIDNSHP")
+{
+ IsBigEndian = SystemIsBigEndian();
+}
+
+// destructor
+BamReader::BamReaderPrivate::~BamReaderPrivate(void) {
+ Close();
+}
+
+// adjusts requested region if necessary (depending on where data actually begins)
+void BamReader::BamReaderPrivate::AdjustRegion(BamRegion& region) {
+
+ // check for valid index first
+ if ( Index == 0 ) return;
+
+ // see if any references in region have alignments
+ HasAlignmentsInRegion = false;
+ int currentId = region.LeftRefID;
+ while ( currentId <= region.RightRefID ) {
+ HasAlignmentsInRegion = Index->HasAlignments(currentId);
+ if ( HasAlignmentsInRegion ) break;
+ ++currentId;
+ }
+
+ // if no data found on any reference in region
+ if ( !HasAlignmentsInRegion ) return;
+
+ // if left bound of desired region had no data, use first reference that had data
+ // otherwise, leave requested region as-is
+ if ( currentId != region.LeftRefID ) {
+ region.LeftRefID = currentId;
+ region.LeftPosition = 0;
+ }
+}
+
+// fills out character data for BamAlignment data
+bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) {
+
+ // calculate character lengths/offsets
+ const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;
+ const unsigned int seqDataOffset = bAlignment.SupportData.QueryNameLength + (bAlignment.SupportData.NumCigarOperations * 4);
+ const unsigned int qualDataOffset = seqDataOffset + (bAlignment.SupportData.QuerySequenceLength+1)/2;
+ const unsigned int tagDataOffset = qualDataOffset + bAlignment.SupportData.QuerySequenceLength;
+ const unsigned int tagDataLength = dataLength - tagDataOffset;
+
+ // set up char buffers
+ const char* allCharData = bAlignment.SupportData.AllCharData.data();
+ const char* seqData = ((const char*)allCharData) + seqDataOffset;
+ const char* qualData = ((const char*)allCharData) + qualDataOffset;
+ char* tagData = ((char*)allCharData) + tagDataOffset;
+
+ // store alignment name (relies on null char in name as terminator)
+ bAlignment.Name.assign((const char*)(allCharData));
+
+ // save query sequence
+ bAlignment.QueryBases.clear();
+ bAlignment.QueryBases.reserve(bAlignment.SupportData.QuerySequenceLength);
+ for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) {
+ char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];
+ bAlignment.QueryBases.append(1, singleBase);
+ }
+
+ // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character
+ bAlignment.Qualities.clear();
+ bAlignment.Qualities.reserve(bAlignment.SupportData.QuerySequenceLength);
+ for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) {
+ char singleQuality = (char)(qualData[i]+33);
+ bAlignment.Qualities.append(1, singleQuality);
+ }
+
+ // if QueryBases is empty (and this is a allowed case)
+ if ( bAlignment.QueryBases.empty() )
+ bAlignment.AlignedBases = bAlignment.QueryBases;
+
+ // if QueryBases contains data, then build AlignedBases using CIGAR data
+ else {
+
+ // resize AlignedBases
+ bAlignment.AlignedBases.clear();
+ bAlignment.AlignedBases.reserve(bAlignment.SupportData.QuerySequenceLength);
+
+ // iterate over CigarOps
+ int k = 0;
+ vector<CigarOp>::const_iterator cigarIter = bAlignment.CigarData.begin();
+ vector<CigarOp>::const_iterator cigarEnd = bAlignment.CigarData.end();
+ for ( ; cigarIter != cigarEnd; ++cigarIter ) {
+
+ const CigarOp& op = (*cigarIter);
+ switch(op.Type) {
+
+ case ('M') :
+ case ('I') :
+ bAlignment.AlignedBases.append(bAlignment.QueryBases.substr(k, op.Length)); // for 'M', 'I' - write bases
+ // fall through
+
+ case ('S') :
+ k += op.Length; // for 'S' - soft clip, skip over query bases
+ break;
+
+ case ('D') :
+ bAlignment.AlignedBases.append(op.Length, '-'); // for 'D' - write gap character
+ break;
+
+ case ('P') :
+ bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character
+ break;
+
+ case ('N') :
+ bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in original query sequence
+ break;
+
+ case ('H') :
+ break; // for 'H' - hard clip, do nothing to AlignedBases, move to next op
+
+ default:
+ fprintf(stderr, "ERROR: Invalid Cigar op type\n"); // shouldn't get here
+ exit(1);
+ }
+ }
+ }
+
+ // -----------------------
+ // Added: 3-25-2010 DB
+ // Fixed: endian-correctness for tag data
+ // -----------------------
+ if ( IsBigEndian ) {
+ int i = 0;
+ while ( (unsigned int)i < tagDataLength ) {
+
+ i += 2; // skip tag type (e.g. "RG", "NM", etc)
+ uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning
+ ++i; // skip value type
+
+ switch (type) {
+
+ case('A') :
+ case('C') :
+ ++i;
+ break;
+
+ case('S') :
+ SwapEndian_16p(&tagData[i]);
+ i += sizeof(uint16_t);
+ break;
+
+ case('F') :
+ case('I') :
+ SwapEndian_32p(&tagData[i]);
+ i += sizeof(uint32_t);
+ break;
+
+ case('D') :
+ SwapEndian_64p(&tagData[i]);
+ i += sizeof(uint64_t);
+ break;
+
+ case('H') :
+ case('Z') :
+ while (tagData[i]) { ++i; }
+ ++i; // increment one more for null terminator
+ break;
+
+ default :
+ fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here
+ exit(1);
+ }
+ }
+ }
+
+ // store TagData
+ bAlignment.TagData.clear();
+ bAlignment.TagData.resize(tagDataLength);
+ memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLength);
+
+ // clear the core-only flag
+ bAlignment.SupportData.HasCoreOnly = false;
+
+ // return success
+ return true;
+}
+
+// clear index data structure
+void BamReader::BamReaderPrivate::ClearIndex(void) {
+ delete Index;
+ Index = 0;
+ IsIndexLoaded = false;
+}
+
+// closes the BAM file
+void BamReader::BamReaderPrivate::Close(void) {
+
+ // close BGZF file stream
+ mBGZF.Close();
+
+ // clear out index data
+ ClearIndex();
+
+ // clear out header data
+ HeaderText.clear();
+
+ // clear out region flags
+ Region.clear();
+}
+
+// creates index for BAM file, saves to file
+// default behavior is to create the BAM standard index (".bai")
+// set flag to false to create the BamTools-specific index (".bti")
+bool BamReader::BamReaderPrivate::CreateIndex(bool useStandardIndex) {
+
+ // clear out prior index data
+ ClearIndex();
+
+ // create index based on type requested
+ if ( useStandardIndex )
+ Index = new BamStandardIndex(&mBGZF, Parent, IsBigEndian);
+ // create BamTools 'custom' index
+ else
+ Index = new BamToolsIndex(&mBGZF, Parent, IsBigEndian);
+
+ // build new index
+ bool ok = true;
+ ok &= Index->Build();
+ IsIndexLoaded = ok;
+
+ // mark empty references
+ MarkReferences();
+
+ // attempt to save index data to file
+ ok &= Index->Write(Filename);
+
+ // return success/fail of both building & writing index
+ return ok;
+}
+
+// get next alignment (from specified region, if given)
+bool BamReader::BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) {
+
+ // if valid alignment found, attempt to parse char data, and return success/failure
+ if ( GetNextAlignmentCore(bAlignment) )
+ return BuildCharData(bAlignment);
+
+ // no valid alignment found
+ else return false;
+}
+
+// retrieves next available alignment core data (returns success/fail)
+// ** DOES NOT parse any character data (read name, bases, qualities, tag data)
+// these can be accessed, if necessary, from the supportData
+// useful for operations requiring ONLY positional or other alignment-related information
+bool BamReader::BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) {
+
+ // if region is set but has no alignments
+ if ( !Region.isNull() && !HasAlignmentsInRegion )
+ return false;
+
+ // if valid alignment available
+ if ( LoadNextAlignment(bAlignment) ) {
+
+ // set core-only flag
+ bAlignment.SupportData.HasCoreOnly = true;
+
+ // if region not specified with at least a left boundary, return success
+ if ( !Region.isLeftBoundSpecified() ) return true;
+
+ // determine region state (before, within, after)
+ BamReader::BamReaderPrivate::RegionState state = IsOverlap(bAlignment);
+
+ // if alignment lies after region, return false
+ if ( state == AFTER_REGION ) return false;
+
+ while ( state != WITHIN_REGION ) {
+ // if no valid alignment available (likely EOF) return failure
+ if ( !LoadNextAlignment(bAlignment) ) return false;
+ // if alignment lies after region, return false (no available read within region)
+ state = IsOverlap(bAlignment);
+ if ( state == AFTER_REGION ) return false;
+ }
+
+ // return success (alignment found that overlaps region)
+ return true;
+ }
+
+ // no valid alignment
+ else return false;
+}
+
+// returns RefID for given RefName (returns References.size() if not found)
+int BamReader::BamReaderPrivate::GetReferenceID(const string& refName) const {
+
+ // retrieve names from reference data
+ vector<string> refNames;
+ RefVector::const_iterator refIter = References.begin();
+ RefVector::const_iterator refEnd = References.end();
+ for ( ; refIter != refEnd; ++refIter)
+ refNames.push_back( (*refIter).RefName );
+
+ // return 'index-of' refName ( if not found, returns refNames.size() )
+ return distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName));
+}
+
+// returns region state - whether alignment ends before, overlaps, or starts after currently specified region
+// this *internal* method should ONLY called when (at least) IsLeftBoundSpecified == true
+BamReader::BamReaderPrivate::RegionState BamReader::BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) {
+
+ // if alignment is on any reference sequence before left bound
+ if ( bAlignment.RefID < Region.LeftRefID ) return BEFORE_REGION;
+
+ // if alignment starts on left bound reference
+ else if ( bAlignment.RefID == Region.LeftRefID ) {
+
+ // if alignment starts at or after left boundary
+ if ( bAlignment.Position >= Region.LeftPosition) {
+
+ // if right boundary is specified AND
+ // left/right boundaries are on same reference AND
+ // alignment starts past right boundary
+ if ( Region.isRightBoundSpecified() &&
+ Region.LeftRefID == Region.RightRefID &&
+ bAlignment.Position > Region.RightPosition )
+ return AFTER_REGION;
+
+ // otherwise, alignment is within region
+ return WITHIN_REGION;
+ }
+
+ // alignment starts before left boundary
+ else {
+ // check if alignment overlaps left boundary
+ if ( bAlignment.GetEndPosition() >= Region.LeftPosition ) return WITHIN_REGION;
+ else return BEFORE_REGION;
+ }
+ }
+
+ // alignment starts on a reference after the left bound
+ else {
+
+ // if region has a right boundary
+ if ( Region.isRightBoundSpecified() ) {
+
+ // alignment is on reference between boundaries
+ if ( bAlignment.RefID < Region.RightRefID ) return WITHIN_REGION;
+
+ // alignment is on reference after right boundary
+ else if ( bAlignment.RefID > Region.RightRefID ) return AFTER_REGION;
+
+ // alignment is on right bound reference
+ else {
+ // check if alignment starts before or at right boundary
+ if ( bAlignment.Position <= Region.RightPosition ) return WITHIN_REGION;
+ else return AFTER_REGION;
+ }
+ }
+
+ // otherwise, alignment is after left bound reference, but there is no right boundary
+ else return WITHIN_REGION;
+ }
+}
+
+// load BAM header data
+void BamReader::BamReaderPrivate::LoadHeaderData(void) {
+
+ // check to see if proper BAM header
+ char buffer[4];
+ if (mBGZF.Read(buffer, 4) != 4) {
+ fprintf(stderr, "Could not read header type\n");
+ exit(1);
+ }
+
+ if (strncmp(buffer, "BAM\001", 4)) {
+ fprintf(stderr, "wrong header type!\n");
+ exit(1);
+ }
+
+ // get BAM header text length
+ mBGZF.Read(buffer, 4);
+ unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer);
+ if ( IsBigEndian ) SwapEndian_32(headerTextLength);
+
+ // get BAM header text
+ char* headerText = (char*)calloc(headerTextLength + 1, 1);
+ mBGZF.Read(headerText, headerTextLength);
+ HeaderText = (string)((const char*)headerText);
+
+ // clean up calloc-ed temp variable
+ free(headerText);
+}
+
+// load existing index data from BAM index file (".bti" OR ".bai"), return success/fail
+bool BamReader::BamReaderPrivate::LoadIndex(const bool lookForIndex, const bool preferStandardIndex) {
+
+ // clear out any existing index data
+ ClearIndex();
+
+ // if no index filename provided, so we need to look for available index files
+ if ( IndexFilename.empty() ) {
+
+ // attempt to load BamIndex based on current Filename provided & preferStandardIndex flag
+ const BamIndex::PreferredIndexType type = (preferStandardIndex ? BamIndex::STANDARD : BamIndex::BAMTOOLS);
+ Index = BamIndex::FromBamFilename(Filename, &mBGZF, Parent, IsBigEndian, type);
+
+ // if null, return failure
+ if ( Index == 0 ) return false;
+
+ // generate proper IndexFilename based on type of index created
+ IndexFilename = Filename + Index->Extension();
+ }
+
+ else {
+
+ // attempt to load BamIndex based on IndexFilename provided by client
+ Index = BamIndex::FromIndexFilename(IndexFilename, &mBGZF, Parent, IsBigEndian);
+
+ // if null, return failure
+ if ( Index == 0 ) return false;
+ }
+
+ // an index file was found
+ // return success of loading the index data from file
+ IsIndexLoaded = Index->Load(IndexFilename);
+
+ // mark empty references
+ MarkReferences();
+
+ // return index status
+ return IsIndexLoaded;
+}
+
+// populates BamAlignment with alignment data under file pointer, returns success/fail
+bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) {
+
+ // read in the 'block length' value, make sure it's not zero
+ char buffer[4];
+ mBGZF.Read(buffer, 4);
+ bAlignment.SupportData.BlockLength = BgzfData::UnpackUnsignedInt(buffer);
+ if ( IsBigEndian ) { SwapEndian_32(bAlignment.SupportData.BlockLength); }
+ if ( bAlignment.SupportData.BlockLength == 0 ) return false;
+
+ // read in core alignment data, make sure the right size of data was read
+ char x[BAM_CORE_SIZE];
+ if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) return false;
+
+ if ( IsBigEndian ) {
+ for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) )
+ SwapEndian_32p(&x[i]);
+ }
+
+ // set BamAlignment 'core' and 'support' data
+ bAlignment.RefID = BgzfData::UnpackSignedInt(&x[0]);
+ bAlignment.Position = BgzfData::UnpackSignedInt(&x[4]);
+
+ unsigned int tempValue = BgzfData::UnpackUnsignedInt(&x[8]);
+ bAlignment.Bin = tempValue >> 16;
+ bAlignment.MapQuality = tempValue >> 8 & 0xff;
+ bAlignment.SupportData.QueryNameLength = tempValue & 0xff;
+
+ tempValue = BgzfData::UnpackUnsignedInt(&x[12]);
+ bAlignment.AlignmentFlag = tempValue >> 16;
+ bAlignment.SupportData.NumCigarOperations = tempValue & 0xffff;
+
+ bAlignment.SupportData.QuerySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]);
+ bAlignment.MateRefID = BgzfData::UnpackSignedInt(&x[20]);
+ bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]);
+ bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[28]);
+
+ // set BamAlignment length
+ bAlignment.Length = bAlignment.SupportData.QuerySequenceLength;
+
+ // read in character data - make sure proper data size was read
+ bool readCharDataOK = false;
+ const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;
+ char* allCharData = (char*)calloc(sizeof(char), dataLength);
+
+ if ( mBGZF.Read(allCharData, dataLength) == (signed int)dataLength) {
+
+ // store 'allCharData' in supportData structure
+ bAlignment.SupportData.AllCharData.assign((const char*)allCharData, dataLength);
+
+ // set success flag
+ readCharDataOK = true;
+
+ // save CIGAR ops
+ // need to calculate this here so that BamAlignment::GetEndPosition() performs correctly,
+ // even when BamReader::GetNextAlignmentCore() is called
+ const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength;
+ uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);
+ CigarOp op;
+ bAlignment.CigarData.clear();
+ bAlignment.CigarData.reserve(bAlignment.SupportData.NumCigarOperations);
+ for (unsigned int i = 0; i < bAlignment.SupportData.NumCigarOperations; ++i) {
+
+ // swap if necessary
+ if ( IsBigEndian ) SwapEndian_32(cigarData[i]);
+
+ // build CigarOp structure
+ op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);
+ op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];
+
+ // save CigarOp
+ bAlignment.CigarData.push_back(op);
+ }
+ }
+
+ free(allCharData);
+ return readCharDataOK;
+}
+
+// loads reference data from BAM file
+void BamReader::BamReaderPrivate::LoadReferenceData(void) {
+
+ // get number of reference sequences
+ char buffer[4];
+ mBGZF.Read(buffer, 4);
+ unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer);
+ if ( IsBigEndian ) SwapEndian_32(numberRefSeqs);
+ if ( numberRefSeqs == 0 ) return;
+ References.reserve((int)numberRefSeqs);
+
+ // iterate over all references in header
+ for (unsigned int i = 0; i != numberRefSeqs; ++i) {
+
+ // get length of reference name
+ mBGZF.Read(buffer, 4);
+ unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer);
+ if ( IsBigEndian ) SwapEndian_32(refNameLength);
+ char* refName = (char*)calloc(refNameLength, 1);
+
+ // get reference name and reference sequence length
+ mBGZF.Read(refName, refNameLength);
+ mBGZF.Read(buffer, 4);
+ int refLength = BgzfData::UnpackSignedInt(buffer);
+ if ( IsBigEndian ) SwapEndian_32(refLength);
+
+ // store data for reference
+ RefData aReference;
+ aReference.RefName = (string)((const char*)refName);
+ aReference.RefLength = refLength;
+ References.push_back(aReference);
+
+ // clean up calloc-ed temp variable
+ free(refName);
+ }
+}
+
+// mark references with no alignment data
+void BamReader::BamReaderPrivate::MarkReferences(void) {
+
+ // ensure index is available
+ if ( Index == 0 || !IsIndexLoaded ) return;
+
+ // mark empty references
+ for ( int i = 0; i < (int)References.size(); ++i )
+ References.at(i).RefHasAlignments = Index->HasAlignments(i);
+}
+
+// opens BAM file (and index)
+bool BamReader::BamReaderPrivate::Open(const string& filename, const string& indexFilename, const bool lookForIndex, const bool preferStandardIndex) {
+
+ // store filenames
+ Filename = filename;
+ IndexFilename = indexFilename;
+
+ // open the BGZF file for reading, return false on failure
+ if ( !mBGZF.Open(filename, "rb") ) return false;
+
+ // retrieve header text & reference data
+ LoadHeaderData();
+ LoadReferenceData();
+
+ // store file offset of first alignment
+ AlignmentsBeginOffset = mBGZF.Tell();
+
+ // if no index filename provided
+ if ( IndexFilename.empty() ) {
+
+ // client did not specify that index SHOULD be found
+ // useful for cases where sequential access is all that is required
+ if ( !lookForIndex ) return true;
+
+ // otherwise, look for index file, return success/fail
+ return LoadIndex(lookForIndex, preferStandardIndex) ;
+ }
+
+ // client supplied an index filename
+ // attempt to load index data, return success/fail
+ return LoadIndex(lookForIndex, preferStandardIndex);
+}
+
+// returns BAM file pointer to beginning of alignment data
+bool BamReader::BamReaderPrivate::Rewind(void) {
+
+ // rewind to first alignment, return false if unable to seek
+ if ( !mBGZF.Seek(AlignmentsBeginOffset) ) return false;
+
+ // retrieve first alignment data, return false if unable to read
+ BamAlignment al;
+ if ( !LoadNextAlignment(al) ) return false;
+
+ // reset default region info using first alignment in file
+ Region.clear();
+ HasAlignmentsInRegion = true;
+
+ // rewind back to beginning of first alignment
+ // return success/fail of seek
+ return mBGZF.Seek(AlignmentsBeginOffset);
+}
+
+// asks Index to attempt a Jump() to specified region
+// returns success/failure
+bool BamReader::BamReaderPrivate::SetRegion(const BamRegion& region) {
+
+ // clear out any prior BamReader region data
+ //
+ // N.B. - this is cleared so that BamIndex now has free reign to call
+ // GetNextAlignmentCore() and do overlap checking without worrying about BamReader
+ // performing any overlap checking of its own and moving on to the next read... Calls
+ // to GetNextAlignmentCore() with no Region set, simply return the next alignment.
+ // This ensures that the Index is able to do just that. (All without exposing
+ // LoadNextAlignment() to the public API, and potentially confusing clients with the nomenclature)
+ Region.clear();
+
+ // check for existing index
+ if ( !IsIndexLoaded || Index == 0 ) return false;
+
+ // adjust region if necessary to reflect where data actually begins
+ BamRegion adjustedRegion(region);
+ AdjustRegion(adjustedRegion);
+
+ // if no data present, return true
+ // not an error, but BamReader knows that no data is there for future alignment access
+ // (this is useful in a MultiBamReader setting where some BAM files may lack data in regions
+ // that other BAMs have data)
+ if ( !HasAlignmentsInRegion ) {
+ Region = adjustedRegion;
+ return true;
+ }
+
+ // attempt jump to user-specified region return false if jump could not be performed at all
+ // (invalid index, unknown reference, etc)
+ //
+ // Index::Jump() is allowed to modify the HasAlignmentsInRegion flag
+ // * This covers case where a region is requested that lies beyond the last alignment on a reference
+ // If this occurs, any subsequent calls to GetNexAlignment[Core] simply return false
+ // BamMultiReader is then able to successfully pull alignments from a region from multiple files
+ // even if one or more have no data.
+ if ( !Index->Jump(adjustedRegion, &HasAlignmentsInRegion) )
+ return false;
+
+ // save region and return success
+ Region = adjustedRegion;
+ return true;
+}