// Marth Lab, Department of Biology, Boston College\r
// All rights reserved.\r
// ---------------------------------------------------------------------------\r
-// Last modified: 30 August 2010 (DB)\r
+// Last modified: 10 September 2010 (DB)\r
// ---------------------------------------------------------------------------\r
// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
// Institute.\r
// general file data\r
BgzfData mBGZF;\r
string HeaderText;\r
- //BamIndex Index;\r
- BamIndex* NewIndex;\r
+ BamIndex* Index;\r
RefVector References;\r
bool IsIndexLoaded;\r
int64_t AlignmentsBeginOffset;\r
bool IsBigEndian;\r
\r
// user-specified region values\r
- BamRegion Region;\r
- bool IsLeftBoundSpecified;\r
- bool IsRightBoundSpecified;\r
- \r
- bool IsRegionSpecified;\r
+ BamRegion Region; \r
int CurrentRefID;\r
int CurrentLeft;\r
\r
\r
// file operations\r
void Close(void);\r
- bool Jump(int refID, int position = 0);\r
- bool Open(const string& filename, const string& indexFilename = "");\r
+ bool Open(const std::string& filename, \r
+ const std::string& indexFilename, \r
+ const bool lookForIndex, \r
+ const bool preferStandardIndex);\r
bool Rewind(void);\r
bool SetRegion(const BamRegion& region);\r
\r
int GetReferenceID(const string& refName) const;\r
\r
// index operations\r
- bool CreateIndex(bool useDefaultIndex);\r
+ bool CreateIndex(bool useStandardIndex);\r
\r
// -------------------------------\r
// internal methods\r
// clear out inernal index data structure\r
void ClearIndex(void);\r
// loads index from BAM index file\r
- bool LoadIndex(void);\r
+ bool LoadIndex(const bool lookForIndex, const bool preferStandardIndex);\r
};\r
\r
// -----------------------------------------------------\r
void BamReader::Close(void) { d->Close(); }\r
bool BamReader::IsIndexLoaded(void) const { return d->IsIndexLoaded; }\r
bool BamReader::IsOpen(void) const { return d->mBGZF.IsOpen; }\r
-bool BamReader::Jump(int refID, int position) { \r
- d->Region.LeftRefID = refID;\r
- d->Region.LeftPosition = position;\r
- d->IsLeftBoundSpecified = true;\r
- d->IsRightBoundSpecified = false;\r
- return d->Jump(refID, position); \r
+bool BamReader::Jump(int refID, int position) { return d->SetRegion( BamRegion(refID, position) ); }\r
+bool BamReader::Open(const std::string& filename, \r
+ const std::string& indexFilename, \r
+ const bool lookForIndex, \r
+ const bool preferStandardIndex) \r
+{ \r
+ return d->Open(filename, indexFilename, lookForIndex, preferStandardIndex); \r
}\r
-bool BamReader::Open(const string& filename, const string& indexFilename) { return d->Open(filename, indexFilename); }\r
bool BamReader::Rewind(void) { return d->Rewind(); }\r
bool BamReader::SetRegion(const BamRegion& region) { return d->SetRegion(region); }\r
bool BamReader::SetRegion(const int& leftRefID, const int& leftBound, const int& rightRefID, const int& rightBound) {\r
const std::string BamReader::GetFilename(void) const { return d->Filename; }\r
\r
// index operations\r
-bool BamReader::CreateIndex(bool useDefaultIndex) { return d->CreateIndex(useDefaultIndex); }\r
+bool BamReader::CreateIndex(bool useStandardIndex) { return d->CreateIndex(useStandardIndex); }\r
\r
// -----------------------------------------------------\r
// BamReaderPrivate implementation\r
\r
// constructor\r
BamReader::BamReaderPrivate::BamReaderPrivate(BamReader* parent)\r
- : NewIndex(0)\r
+ : Index(0)\r
, IsIndexLoaded(false)\r
, AlignmentsBeginOffset(0)\r
- , IsLeftBoundSpecified(false)\r
- , IsRightBoundSpecified(false)\r
- , IsRegionSpecified(false)\r
, CurrentRefID(0)\r
, CurrentLeft(0)\r
, Parent(parent)\r
\r
// calculate character lengths/offsets\r
const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;\r
-// const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength;\r
const unsigned int seqDataOffset = bAlignment.SupportData.QueryNameLength + (bAlignment.SupportData.NumCigarOperations * 4);\r
const unsigned int qualDataOffset = seqDataOffset + (bAlignment.SupportData.QuerySequenceLength+1)/2;\r
const unsigned int tagDataOffset = qualDataOffset + bAlignment.SupportData.QuerySequenceLength;\r
\r
// set up char buffers\r
const char* allCharData = bAlignment.SupportData.AllCharData.data();\r
-// uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);\r
const char* seqData = ((const char*)allCharData) + seqDataOffset;\r
const char* qualData = ((const char*)allCharData) + qualDataOffset;\r
char* tagData = ((char*)allCharData) + tagDataOffset;\r
\r
// store alignment name (relies on null char in name as terminator)\r
bAlignment.Name.assign((const char*)(allCharData)); \r
- \r
-// // save CigarOps \r
-// CigarOp op;\r
-// bAlignment.CigarData.clear();\r
-// bAlignment.CigarData.reserve(bAlignment.SupportData.NumCigarOperations);\r
-// for (unsigned int i = 0; i < bAlignment.SupportData.NumCigarOperations; ++i) {\r
-// \r
-// // swap if necessary\r
-// if ( IsBigEndian ) { SwapEndian_32(cigarData[i]); }\r
-// \r
-// // build CigarOp structure\r
-// op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);\r
-// op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];\r
-// \r
-// // save CigarOp\r
-// bAlignment.CigarData.push_back(op);\r
-// }\r
- \r
+\r
// save query sequence\r
bAlignment.QueryBases.clear();\r
bAlignment.QueryBases.reserve(bAlignment.SupportData.QuerySequenceLength);\r
break; // for 'H' - hard clip, do nothing to AlignedBases, move to next op\r
\r
default:\r
- printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here\r
+ fprintf(stderr, "ERROR: Invalid Cigar op type\n"); // shouldn't get here\r
exit(1);\r
}\r
}\r
break;\r
\r
default : \r
- printf("ERROR: Invalid tag value type\n"); // shouldn't get here\r
+ fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here\r
exit(1);\r
}\r
}\r
\r
// clear index data structure\r
void BamReader::BamReaderPrivate::ClearIndex(void) {\r
- delete NewIndex;\r
- NewIndex = 0;\r
+ delete Index;\r
+ Index = 0;\r
IsIndexLoaded = false;\r
}\r
\r
HeaderText.clear();\r
\r
// clear out region flags\r
- IsLeftBoundSpecified = false;\r
- IsRightBoundSpecified = false;\r
- IsRegionSpecified = false;\r
+ Region.clear();\r
}\r
\r
-// create BAM index from BAM file (keep structure in memory) and write to default index output file\r
-bool BamReader::BamReaderPrivate::CreateIndex(bool useDefaultIndex) {\r
+// creates index for BAM file, saves to file\r
+// default behavior is to create the BAM standard index (".bai")\r
+// set flag to false to create the BamTools-specific index (".bti")\r
+bool BamReader::BamReaderPrivate::CreateIndex(bool useStandardIndex) {\r
\r
// clear out prior index data\r
ClearIndex();\r
\r
- // create default index\r
- if ( useDefaultIndex )\r
- NewIndex = new BamDefaultIndex(&mBGZF, Parent, IsBigEndian);\r
+ // create index based on type requested\r
+ if ( useStandardIndex ) \r
+ Index = new BamStandardIndex(&mBGZF, Parent, IsBigEndian);\r
// create BamTools 'custom' index\r
else\r
- NewIndex = new BamToolsIndex(&mBGZF, Parent, IsBigEndian);\r
+ Index = new BamToolsIndex(&mBGZF, Parent, IsBigEndian);\r
\r
// build new index\r
bool ok = true;\r
- ok &= NewIndex->Build();\r
+ ok &= Index->Build();\r
IsIndexLoaded = ok;\r
\r
// attempt to save index data to file\r
- ok &= NewIndex->Write(Filename); \r
+ ok &= Index->Write(Filename); \r
\r
// return success/fail of both building & writing index\r
return ok;\r
bAlignment.SupportData.HasCoreOnly = true;\r
\r
// if region not specified, return success\r
- if ( !IsLeftBoundSpecified ) return true;\r
+ if ( !Region.isLeftBoundSpecified() ) return true;\r
\r
// determine region state (before, within, after)\r
BamReader::BamReaderPrivate::RegionState state = IsOverlap(bAlignment);\r
\r
// if alignment lies after region, return false\r
- if ( state == AFTER_REGION ) \r
- return false;\r
+ if ( state == AFTER_REGION ) return false;\r
\r
while ( state != WITHIN_REGION ) {\r
// if no valid alignment available (likely EOF) return failure\r
if ( !LoadNextAlignment(bAlignment) ) return false;\r
// if alignment lies after region, return false (no available read within region)\r
state = IsOverlap(bAlignment);\r
- if ( state == AFTER_REGION) return false;\r
- \r
+ if ( state == AFTER_REGION ) return false;\r
}\r
\r
// return success (alignment found that overlaps region)\r
vector<string> refNames;\r
RefVector::const_iterator refIter = References.begin();\r
RefVector::const_iterator refEnd = References.end();\r
- for ( ; refIter != refEnd; ++refIter) {\r
+ for ( ; refIter != refEnd; ++refIter) \r
refNames.push_back( (*refIter).RefName );\r
- }\r
\r
// return 'index-of' refName ( if not found, returns refNames.size() )\r
return distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName));\r
// check alignment start against right bound cutoff\r
\r
// if full region of interest was given\r
- if ( IsRightBoundSpecified ) {\r
+ if ( Region.isRightBoundSpecified() ) {\r
\r
// read starts on right bound reference, but AFTER right bound position\r
if ( bAlignment.RefID == Region.RightRefID && bAlignment.Position > Region.RightPosition )\r
return BEFORE_REGION;\r
}\r
\r
-// jumps to specified region(refID, leftBound) in BAM file, returns success/fail\r
-bool BamReader::BamReaderPrivate::Jump(int refID, int position) {\r
-\r
- // -----------------------------------------------------------------------\r
- // check for existing index \r
- if ( !IsIndexLoaded || NewIndex == 0 ) return false; \r
- // see if reference has alignments\r
- if ( !NewIndex->HasAlignments(refID) ) return false; \r
- // make sure position is valid\r
- if ( position > References.at(refID).RefLength ) return false;\r
- \r
- // determine possible offsets\r
- vector<int64_t> offsets;\r
- if ( !NewIndex->GetOffsets(Region, IsRightBoundSpecified, offsets) ) {\r
- printf("ERROR: Could not jump: unable to calculate offset for specified region.\n");\r
- return false;\r
- }\r
- \r
- // iterate through offsets\r
- BamAlignment bAlignment;\r
- bool result = true;\r
- for ( vector<int64_t>::const_iterator o = offsets.begin(); o != offsets.end(); ++o) {\r
- \r
- // attempt seek & load first available alignment\r
- result &= mBGZF.Seek(*o);\r
- LoadNextAlignment(bAlignment);\r
- \r
- // if this alignment corresponds to desired position\r
- // return success of seeking back to 'current offset'\r
- if ( (bAlignment.RefID == refID && bAlignment.Position + bAlignment.Length > position) || (bAlignment.RefID > refID) ) {\r
- if ( o != offsets.begin() ) --o;\r
- return mBGZF.Seek(*o);\r
- }\r
- }\r
- \r
- return result;\r
-}\r
-\r
// load BAM header data\r
void BamReader::BamReaderPrivate::LoadHeaderData(void) {\r
\r
// check to see if proper BAM header\r
char buffer[4];\r
if (mBGZF.Read(buffer, 4) != 4) {\r
- printf("Could not read header type\n");\r
+ fprintf(stderr, "Could not read header type\n");\r
exit(1);\r
}\r
\r
if (strncmp(buffer, "BAM\001", 4)) {\r
- printf("wrong header type!\n");\r
+ fprintf(stderr, "wrong header type!\n");\r
exit(1);\r
}\r
\r
// get BAM header text length\r
mBGZF.Read(buffer, 4);\r
unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer);\r
- if ( IsBigEndian ) { SwapEndian_32(headerTextLength); }\r
+ if ( IsBigEndian ) SwapEndian_32(headerTextLength); \r
\r
// get BAM header text\r
char* headerText = (char*)calloc(headerTextLength + 1, 1);\r
free(headerText);\r
}\r
\r
-// load existing index data from BAM index file (".bai"), return success/fail\r
-bool BamReader::BamReaderPrivate::LoadIndex(void) {\r
+// load existing index data from BAM index file (".bti" OR ".bai"), return success/fail\r
+bool BamReader::BamReaderPrivate::LoadIndex(const bool lookForIndex, const bool preferStandardIndex) {\r
\r
// clear out any existing index data\r
ClearIndex();\r
\r
- // skip if index file empty\r
- if ( IndexFilename.empty() )\r
- return false;\r
-\r
- // check supplied filename for index type\r
- size_t defaultExtensionFound = IndexFilename.find(".bai");\r
- size_t customExtensionFound = IndexFilename.find(".bti");\r
- \r
- // if SAM/BAM default (".bai")\r
- if ( defaultExtensionFound != string::npos )\r
- NewIndex = new BamDefaultIndex(&mBGZF, Parent, IsBigEndian);\r
- \r
- // if BamTools custom index (".bti")\r
- else if ( customExtensionFound != string::npos )\r
- NewIndex = new BamToolsIndex(&mBGZF, Parent, IsBigEndian);\r
+ // if no index filename provided, so we need to look for available index files\r
+ if ( IndexFilename.empty() ) {\r
+ \r
+ // attempt to load BamIndex based on current Filename provided & preferStandardIndex flag\r
+ const BamIndex::PreferredIndexType type = (preferStandardIndex ? BamIndex::STANDARD : BamIndex::BAMTOOLS);\r
+ Index = BamIndex::FromBamFilename(Filename, &mBGZF, Parent, IsBigEndian, type);\r
+ \r
+ // if null, return failure\r
+ if ( Index == 0 ) return false;\r
+ \r
+ // generate proper IndexFilename based on type of index created\r
+ IndexFilename = Filename + Index->Extension();\r
+ }\r
\r
- // else unknown\r
else {\r
- printf("ERROR: Unknown index file extension.\n");\r
- return false;\r
+ // attempt to load BamIndex based on IndexFilename provided by client\r
+ Index = BamIndex::FromIndexFilename(IndexFilename, &mBGZF, Parent, IsBigEndian);\r
+ \r
+ // if null, return failure\r
+ if ( Index == 0 ) return false; \r
}\r
\r
- // return success of loading index data from file\r
- IsIndexLoaded = NewIndex->Load(IndexFilename);\r
+ // an index file was found\r
+ // return success of loading the index data from file\r
+ IsIndexLoaded = Index->Load(IndexFilename);\r
return IsIndexLoaded;\r
}\r
\r
mBGZF.Read(buffer, 4);\r
bAlignment.SupportData.BlockLength = BgzfData::UnpackUnsignedInt(buffer);\r
if ( IsBigEndian ) { SwapEndian_32(bAlignment.SupportData.BlockLength); }\r
- if ( bAlignment.SupportData.BlockLength == 0 ) { return false; }\r
+ if ( bAlignment.SupportData.BlockLength == 0 ) return false;\r
\r
// read in core alignment data, make sure the right size of data was read\r
char x[BAM_CORE_SIZE];\r
- if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }\r
+ if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) return false; \r
\r
if ( IsBigEndian ) {\r
- for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) ) { \r
- SwapEndian_32p(&x[i]); \r
- }\r
+ for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) ) \r
+ SwapEndian_32p(&x[i]); \r
}\r
\r
// set BamAlignment 'core' and 'support' data\r
char buffer[4];\r
mBGZF.Read(buffer, 4);\r
unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer);\r
- if ( IsBigEndian ) { SwapEndian_32(numberRefSeqs); }\r
- if (numberRefSeqs == 0) { return; }\r
+ if ( IsBigEndian ) SwapEndian_32(numberRefSeqs);\r
+ if ( numberRefSeqs == 0 ) return;\r
References.reserve((int)numberRefSeqs);\r
\r
// iterate over all references in header\r
// get length of reference name\r
mBGZF.Read(buffer, 4);\r
unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer);\r
- if ( IsBigEndian ) { SwapEndian_32(refNameLength); }\r
+ if ( IsBigEndian ) SwapEndian_32(refNameLength);\r
char* refName = (char*)calloc(refNameLength, 1);\r
\r
// get reference name and reference sequence length\r
mBGZF.Read(refName, refNameLength);\r
mBGZF.Read(buffer, 4);\r
int refLength = BgzfData::UnpackSignedInt(buffer);\r
- if ( IsBigEndian ) { SwapEndian_32(refLength); }\r
+ if ( IsBigEndian ) SwapEndian_32(refLength); \r
\r
// store data for reference\r
RefData aReference;\r
}\r
\r
// opens BAM file (and index)\r
-bool BamReader::BamReaderPrivate::Open(const string& filename, const string& indexFilename) {\r
+bool BamReader::BamReaderPrivate::Open(const string& filename, const string& indexFilename, const bool lookForIndex, const bool preferStandardIndex) {\r
\r
+ // store filenames\r
Filename = filename;\r
IndexFilename = indexFilename;\r
\r
// open the BGZF file for reading, return false on failure\r
- if ( !mBGZF.Open(filename, "rb") ) \r
- return false;\r
+ if ( !mBGZF.Open(filename, "rb") ) return false; \r
\r
// retrieve header text & reference data\r
LoadHeaderData();\r
// store file offset of first alignment\r
AlignmentsBeginOffset = mBGZF.Tell();\r
\r
- // open index file & load index data (if exists)\r
- if ( !IndexFilename.empty() )\r
- LoadIndex();\r
+ // if no index filename provided \r
+ if ( IndexFilename.empty() ) {\r
+ \r
+ // client did not specify that index SHOULD be found\r
+ // useful for cases where sequential access is all that is required\r
+ if ( !lookForIndex ) return true; \r
+ \r
+ // otherwise, look for index file, return success/fail\r
+ return LoadIndex(lookForIndex, preferStandardIndex) ;\r
+ }\r
\r
- // return success\r
- return true;\r
+ // client supplied an index filename\r
+ // attempt to load index data, return success/fail\r
+ return LoadIndex(lookForIndex, preferStandardIndex); \r
}\r
\r
// returns BAM file pointer to beginning of alignment data\r
bool BamReader::BamReaderPrivate::Rewind(void) {\r
\r
- // rewind to first alignment\r
+ // rewind to first alignment, return false if unable to seek\r
if ( !mBGZF.Seek(AlignmentsBeginOffset) ) return false;\r
\r
- // retrieve first alignment data\r
+ // retrieve first alignment data, return false if unable to read\r
BamAlignment al;\r
if ( !LoadNextAlignment(al) ) return false;\r
\r
// reset default region info using first alignment in file\r
- Region.LeftRefID = al.RefID;\r
- Region.LeftPosition = al.Position;\r
- Region.RightRefID = -1;\r
- Region.RightPosition = -1;\r
- IsLeftBoundSpecified = false;\r
- IsRightBoundSpecified = false; \r
-\r
- // rewind back to before first alignment\r
+ Region.clear();\r
+ Region.LeftRefID = al.RefID;\r
+ Region.LeftPosition = al.Position;\r
+\r
+ // rewind back to beginning of first alignment\r
// return success/fail of seek\r
return mBGZF.Seek(AlignmentsBeginOffset);\r
}\r
\r
-// sets a region of interest (with left & right bound reference/position)\r
-// attempts a Jump() to left bound as well\r
-// returns success/failure of Jump()\r
+// asks Index to attempt a Jump() to specified region\r
+// returns success/failure\r
bool BamReader::BamReaderPrivate::SetRegion(const BamRegion& region) {\r
+ \r
+ // clear out any prior BamReader region data\r
+ //\r
+ // N.B. - this is cleared so that BamIndex now has free reign to call\r
+ // GetNextAlignmentCore() and do overlap checking without worrying about BamReader \r
+ // performing any overlap checking of its own and moving on to the next read... Calls \r
+ // to GetNextAlignmentCore() with no Region set, simply return the next alignment.\r
+ // This ensures that the Index is able to do just that. (All without exposing \r
+ // LoadNextAlignment() to the public API, and potentially confusing clients with the nomenclature)\r
+ Region.clear();\r
+ \r
+ // check for existing index \r
+ if ( !IsIndexLoaded || Index == 0 ) return false; \r
\r
- // save region of interest\r
+ // attempt jump to user-specified region, return false if failed\r
+ if ( !Index->Jump(region) ) return false;\r
+ \r
+ // if successful, save region data locally, return success\r
Region = region;\r
- \r
- // set flags\r
- if ( region.LeftRefID >= 0 && region.LeftPosition >= 0 ) \r
- IsLeftBoundSpecified = true;\r
- if ( region.RightRefID >= 0 && region.RightPosition >= 0 ) \r
- IsRightBoundSpecified = true;\r
- \r
- // attempt jump to beginning of region, return success/fail of Jump()\r
- return Jump( Region.LeftRefID, Region.LeftPosition );\r
+ return true;\r
}\r