// Marth Lab, Department of Biology, Boston College\r
// All rights reserved.\r
// ---------------------------------------------------------------------------\r
-// Last modified: 15 July 2010 (DB)\r
+// Last modified: 7 September 2010 (DB)\r
// ---------------------------------------------------------------------------\r
// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
// Institute.\r
\r
// file operations\r
void Close(void);\r
- bool Jump(int refID, int position = 0);\r
- bool Open(const string& filename, const string& indexFilename = "");\r
+ bool Jump(int refID, int position);\r
+ bool Open(const std::string& filename, \r
+ const std::string& indexFilename, \r
+ const bool lookForIndex, \r
+ const bool preferStandardIndex);\r
bool Rewind(void);\r
bool SetRegion(const BamRegion& region);\r
\r
int GetReferenceID(const string& refName) const;\r
\r
// index operations\r
- bool CreateIndex(bool useDefaultIndex);\r
+ bool CreateIndex(bool useStandardIndex);\r
\r
// -------------------------------\r
// internal methods\r
// clear out inernal index data structure\r
void ClearIndex(void);\r
// loads index from BAM index file\r
- bool LoadIndex(void);\r
+ bool LoadIndex(const bool lookForIndex, const bool preferStandardIndex);\r
};\r
\r
// -----------------------------------------------------\r
\r
// file operations\r
void BamReader::Close(void) { d->Close(); }\r
+bool BamReader::IsIndexLoaded(void) const { return d->IsIndexLoaded; }\r
bool BamReader::IsOpen(void) const { return d->mBGZF.IsOpen; }\r
-bool BamReader::Jump(int refID, int position) { \r
+bool BamReader::Jump(int refID, int position) \r
+{ \r
d->Region.LeftRefID = refID;\r
d->Region.LeftPosition = position;\r
d->IsLeftBoundSpecified = true;\r
d->IsRightBoundSpecified = false;\r
return d->Jump(refID, position); \r
}\r
-bool BamReader::Open(const string& filename, const string& indexFilename) { return d->Open(filename, indexFilename); }\r
+bool BamReader::Open(const std::string& filename, \r
+ const std::string& indexFilename, \r
+ const bool lookForIndex, \r
+ const bool preferStandardIndex) \r
+{ \r
+ return d->Open(filename, indexFilename, lookForIndex, preferStandardIndex); \r
+}\r
bool BamReader::Rewind(void) { return d->Rewind(); }\r
bool BamReader::SetRegion(const BamRegion& region) { return d->SetRegion(region); }\r
bool BamReader::SetRegion(const int& leftRefID, const int& leftBound, const int& rightRefID, const int& rightBound) {\r
const std::string BamReader::GetFilename(void) const { return d->Filename; }\r
\r
// index operations\r
-bool BamReader::CreateIndex(bool useDefaultIndex) { return d->CreateIndex(useDefaultIndex); }\r
+bool BamReader::CreateIndex(bool useStandardIndex) { return d->CreateIndex(useStandardIndex); }\r
\r
// -----------------------------------------------------\r
// BamReaderPrivate implementation\r
\r
// calculate character lengths/offsets\r
const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;\r
- const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength;\r
const unsigned int seqDataOffset = bAlignment.SupportData.QueryNameLength + (bAlignment.SupportData.NumCigarOperations * 4);\r
const unsigned int qualDataOffset = seqDataOffset + (bAlignment.SupportData.QuerySequenceLength+1)/2;\r
const unsigned int tagDataOffset = qualDataOffset + bAlignment.SupportData.QuerySequenceLength;\r
\r
// set up char buffers\r
const char* allCharData = bAlignment.SupportData.AllCharData.data();\r
- uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);\r
const char* seqData = ((const char*)allCharData) + seqDataOffset;\r
const char* qualData = ((const char*)allCharData) + qualDataOffset;\r
char* tagData = ((char*)allCharData) + tagDataOffset;\r
\r
- // store alignment name (depends on null char as terminator)\r
+ // store alignment name (relies on null char in name as terminator)\r
bAlignment.Name.assign((const char*)(allCharData)); \r
- \r
- // save CigarOps \r
- CigarOp op;\r
- bAlignment.CigarData.clear();\r
- bAlignment.CigarData.reserve(bAlignment.SupportData.NumCigarOperations);\r
- for (unsigned int i = 0; i < bAlignment.SupportData.NumCigarOperations; ++i) {\r
-\r
- // swap if necessary\r
- if ( IsBigEndian ) { SwapEndian_32(cigarData[i]); }\r
- \r
- // build CigarOp structure\r
- op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);\r
- op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];\r
\r
- // save CigarOp\r
- bAlignment.CigarData.push_back(op);\r
- }\r
- \r
- \r
// save query sequence\r
bAlignment.QueryBases.clear();\r
bAlignment.QueryBases.reserve(bAlignment.SupportData.QuerySequenceLength);\r
void BamReader::BamReaderPrivate::ClearIndex(void) {\r
delete NewIndex;\r
NewIndex = 0;\r
+ IsIndexLoaded = false;\r
}\r
\r
// closes the BAM file\r
IsRegionSpecified = false;\r
}\r
\r
-// create BAM index from BAM file (keep structure in memory) and write to default index output file\r
-bool BamReader::BamReaderPrivate::CreateIndex(bool useDefaultIndex) {\r
+// creates index for BAM file, saves to file\r
+// default behavior is to create the BAM standard index (".bai")\r
+// set flag to false to create the BamTools-specific index (".bti")\r
+bool BamReader::BamReaderPrivate::CreateIndex(bool useStandardIndex) {\r
\r
// clear out prior index data\r
ClearIndex();\r
\r
- // create default index\r
- if ( useDefaultIndex )\r
- NewIndex = new BamDefaultIndex(&mBGZF, Parent, IsBigEndian);\r
+ // create index based on type requested\r
+ if ( useStandardIndex ) \r
+ NewIndex = new BamStandardIndex(&mBGZF, Parent, IsBigEndian);\r
// create BamTools 'custom' index\r
else\r
NewIndex = new BamToolsIndex(&mBGZF, Parent, IsBigEndian);\r
\r
+ // build new index\r
bool ok = true;\r
ok &= NewIndex->Build();\r
+ IsIndexLoaded = ok;\r
+ \r
+ // attempt to save index data to file\r
ok &= NewIndex->Write(Filename); \r
\r
- // return success/fail\r
+ // return success/fail of both building & writing index\r
return ok;\r
}\r
\r
BamReader::BamReaderPrivate::RegionState state = IsOverlap(bAlignment);\r
\r
// if alignment lies after region, return false\r
- if ( state == AFTER_REGION ) \r
- return false;\r
+ if ( state == AFTER_REGION ) return false;\r
\r
while ( state != WITHIN_REGION ) {\r
// if no valid alignment available (likely EOF) return failure\r
if ( !LoadNextAlignment(bAlignment) ) return false;\r
// if alignment lies after region, return false (no available read within region)\r
state = IsOverlap(bAlignment);\r
- if ( state == AFTER_REGION) return false;\r
- \r
+ if ( state == AFTER_REGION ) return false;\r
}\r
\r
// return success (alignment found that overlaps region)\r
vector<string> refNames;\r
RefVector::const_iterator refIter = References.begin();\r
RefVector::const_iterator refEnd = References.end();\r
- for ( ; refIter != refEnd; ++refIter) {\r
+ for ( ; refIter != refEnd; ++refIter) \r
refNames.push_back( (*refIter).RefName );\r
- }\r
\r
// return 'index-of' refName ( if not found, returns refNames.size() )\r
return distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName));\r
\r
// -----------------------------------------------------------------------\r
// check for existing index \r
- if ( NewIndex == 0 ) return false; \r
+ if ( !IsIndexLoaded || NewIndex == 0 ) return false; \r
// see if reference has alignments\r
if ( !NewIndex->HasAlignments(refID) ) return false; \r
// make sure position is valid\r
// get BAM header text length\r
mBGZF.Read(buffer, 4);\r
unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer);\r
- if ( IsBigEndian ) { SwapEndian_32(headerTextLength); }\r
+ if ( IsBigEndian ) SwapEndian_32(headerTextLength); \r
\r
// get BAM header text\r
char* headerText = (char*)calloc(headerTextLength + 1, 1);\r
free(headerText);\r
}\r
\r
-// load existing index data from BAM index file (".bai"), return success/fail\r
-bool BamReader::BamReaderPrivate::LoadIndex(void) {\r
+// load existing index data from BAM index file (".bti" OR ".bai"), return success/fail\r
+bool BamReader::BamReaderPrivate::LoadIndex(const bool lookForIndex, const bool preferStandardIndex) {\r
\r
// clear out any existing index data\r
ClearIndex();\r
\r
- // skip if index file empty\r
- if ( IndexFilename.empty() )\r
- return false;\r
-\r
- // check supplied filename for index type\r
- size_t defaultExtensionFound = IndexFilename.find(".bai");\r
- size_t customExtensionFound = IndexFilename.find(".bti");\r
- \r
- // if SAM/BAM default (".bai")\r
- if ( defaultExtensionFound != string::npos )\r
- NewIndex = new BamDefaultIndex(&mBGZF, Parent, IsBigEndian);\r
- \r
- // if BamTools custom index (".bti")\r
- else if ( customExtensionFound != string::npos )\r
- NewIndex = new BamToolsIndex(&mBGZF, Parent, IsBigEndian);\r
+ // if no index filename provided, so we need to look for available index files\r
+ if ( IndexFilename.empty() ) {\r
+ \r
+ // attempt to load BamIndex based on current Filename provided & preferStandardIndex flag\r
+ const BamIndex::PreferredIndexType type = (preferStandardIndex ? BamIndex::STANDARD : BamIndex::BAMTOOLS);\r
+ NewIndex = BamIndex::FromBamFilename(Filename, &mBGZF, Parent, IsBigEndian, type);\r
+ \r
+ // if null, return failure\r
+ if ( NewIndex == 0 ) return false;\r
+ \r
+ // generate proper IndexFilename based on type of index created\r
+ IndexFilename = Filename + NewIndex->Extension();\r
+ }\r
\r
- // else unknown\r
else {\r
- fprintf(stderr, "ERROR: Unknown index file extension.\n");\r
- return false;\r
+ // attempt to load BamIndex based on IndexFilename provided by client\r
+ NewIndex = BamIndex::FromIndexFilename(IndexFilename, &mBGZF, Parent, IsBigEndian);\r
+ \r
+ // if null, return failure\r
+ if ( NewIndex == 0 ) return false; \r
}\r
\r
- // return success of loading index data\r
- return NewIndex->Load(IndexFilename);\r
+ // an index file was found\r
+ // return success of loading the index data from file\r
+ IsIndexLoaded = NewIndex->Load(IndexFilename);\r
+ return IsIndexLoaded;\r
}\r
\r
// populates BamAlignment with alignment data under file pointer, returns success/fail\r
mBGZF.Read(buffer, 4);\r
bAlignment.SupportData.BlockLength = BgzfData::UnpackUnsignedInt(buffer);\r
if ( IsBigEndian ) { SwapEndian_32(bAlignment.SupportData.BlockLength); }\r
- if ( bAlignment.SupportData.BlockLength == 0 ) { return false; }\r
+ if ( bAlignment.SupportData.BlockLength == 0 ) return false;\r
\r
// read in core alignment data, make sure the right size of data was read\r
char x[BAM_CORE_SIZE];\r
- if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }\r
+ if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) return false; \r
\r
if ( IsBigEndian ) {\r
- for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) ) { \r
- SwapEndian_32p(&x[i]); \r
- }\r
+ for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) ) \r
+ SwapEndian_32p(&x[i]); \r
}\r
\r
// set BamAlignment 'core' and 'support' data\r
\r
// set success flag\r
readCharDataOK = true;\r
+ \r
+ // save CIGAR ops \r
+ // need to calculate this here so that BamAlignment::GetEndPosition() performs correctly, \r
+ // even when BamReader::GetNextAlignmentCore() is called \r
+ const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength;\r
+ uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);\r
+ CigarOp op;\r
+ bAlignment.CigarData.clear();\r
+ bAlignment.CigarData.reserve(bAlignment.SupportData.NumCigarOperations);\r
+ for (unsigned int i = 0; i < bAlignment.SupportData.NumCigarOperations; ++i) {\r
+\r
+ // swap if necessary\r
+ if ( IsBigEndian ) SwapEndian_32(cigarData[i]);\r
+ \r
+ // build CigarOp structure\r
+ op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);\r
+ op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];\r
+\r
+ // save CigarOp\r
+ bAlignment.CigarData.push_back(op);\r
+ }\r
}\r
\r
free(allCharData);\r
char buffer[4];\r
mBGZF.Read(buffer, 4);\r
unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer);\r
- if ( IsBigEndian ) { SwapEndian_32(numberRefSeqs); }\r
- if (numberRefSeqs == 0) { return; }\r
+ if ( IsBigEndian ) SwapEndian_32(numberRefSeqs);\r
+ if ( numberRefSeqs == 0 ) return;\r
References.reserve((int)numberRefSeqs);\r
\r
// iterate over all references in header\r
// get length of reference name\r
mBGZF.Read(buffer, 4);\r
unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer);\r
- if ( IsBigEndian ) { SwapEndian_32(refNameLength); }\r
+ if ( IsBigEndian ) SwapEndian_32(refNameLength);\r
char* refName = (char*)calloc(refNameLength, 1);\r
\r
// get reference name and reference sequence length\r
mBGZF.Read(refName, refNameLength);\r
mBGZF.Read(buffer, 4);\r
int refLength = BgzfData::UnpackSignedInt(buffer);\r
- if ( IsBigEndian ) { SwapEndian_32(refLength); }\r
+ if ( IsBigEndian ) SwapEndian_32(refLength); \r
\r
// store data for reference\r
RefData aReference;\r
}\r
\r
// opens BAM file (and index)\r
-bool BamReader::BamReaderPrivate::Open(const string& filename, const string& indexFilename) {\r
+bool BamReader::BamReaderPrivate::Open(const string& filename, const string& indexFilename, const bool lookForIndex, const bool preferStandardIndex) {\r
\r
+ // store filenames\r
Filename = filename;\r
IndexFilename = indexFilename;\r
\r
// open the BGZF file for reading, return false on failure\r
- if ( !mBGZF.Open(filename, "rb") ) \r
- return false;\r
+ if ( !mBGZF.Open(filename, "rb") ) return false; \r
\r
// retrieve header text & reference data\r
LoadHeaderData();\r
// store file offset of first alignment\r
AlignmentsBeginOffset = mBGZF.Tell();\r
\r
- // open index file & load index data (if exists)\r
- if ( !IndexFilename.empty() )\r
- LoadIndex();\r
+ // if no index filename provided \r
+ if ( IndexFilename.empty() ) {\r
+ \r
+ // client did not specify that index SHOULD be found\r
+ // useful for cases where sequential access is all that is required\r
+ if ( !lookForIndex ) return true; \r
+ \r
+ // otherwise, look for index file, return success/fail\r
+ return LoadIndex(lookForIndex, preferStandardIndex) ;\r
+ }\r
\r
- // return success\r
- return true;\r
+ // client supplied an index filename\r
+ // attempt to load index data, return success/fail\r
+ return LoadIndex(lookForIndex, preferStandardIndex); \r
}\r
\r
// returns BAM file pointer to beginning of alignment data\r
Region = region;\r
\r
// set flags\r
- if ( region.LeftRefID >= 0 && region.LeftPosition >= 0 ) \r
- IsLeftBoundSpecified = true;\r
- if ( region.RightRefID >= 0 && region.RightPosition >= 0 ) \r
- IsRightBoundSpecified = true;\r
+ if ( region.LeftRefID >= 0 && region.LeftPosition >= 0 ) IsLeftBoundSpecified = true;\r
+ if ( region.RightRefID >= 0 && region.RightPosition >= 0 ) IsRightBoundSpecified = true;\r
\r
// attempt jump to beginning of region, return success/fail of Jump()\r
return Jump( Region.LeftRefID, Region.LeftPosition );\r