// Marth Lab, Department of Biology, Boston College\r
// All rights reserved.\r
// ---------------------------------------------------------------------------\r
-// Last modified: 14 April 2010 (DB)\r
+// Last modified: 8 June 2010 (DB)\r
// ---------------------------------------------------------------------------\r
// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
// Institute.\r
using namespace BamTools;\r
using namespace std;\r
\r
-namespace BamTools {\r
- struct BamAlignmentSupportData {\r
- string AllCharData;\r
- uint32_t BlockLength;\r
- uint32_t NumCigarOperations;\r
- uint32_t QueryNameLength;\r
- uint32_t QuerySequenceLength;\r
- };\r
-} // namespace BamTools\r
-\r
struct BamReader::BamReaderPrivate {\r
\r
// -------------------------------\r
int64_t AlignmentsBeginOffset;\r
string Filename;\r
string IndexFilename;\r
-
+ \r
// system data\r
bool IsBigEndian;\r
\r
// "public" interface\r
// -------------------------------\r
\r
- // flie operations\r
+ // file operations\r
void Close(void);\r
bool Jump(int refID, int position = 0);\r
void Open(const string& filename, const string& indexFilename = "");\r
\r
// access alignment data\r
bool GetNextAlignment(BamAlignment& bAlignment);\r
+ bool GetNextAlignmentCore(BamAlignment& bAlignment);\r
\r
// access auxiliary data\r
int GetReferenceID(const string& refName) const;\r
// calculate bins that overlap region ( left to reference end for now )\r
int BinsFromRegion(int refID, int left, uint16_t[MAX_BIN]);\r
// fills out character data for BamAlignment data\r
- bool BuildCharData(BamAlignment& bAlignment, const BamAlignmentSupportData& supportData);\r
+ bool BuildCharData(BamAlignment& bAlignment);\r
// calculate file offset for first alignment chunk overlapping 'left'\r
int64_t GetOffset(int refID, int left);\r
// checks to see if alignment overlaps current region\r
// retrieves header text from BAM file\r
void LoadHeaderData(void);\r
// retrieves BAM alignment under file pointer\r
- bool LoadNextAlignment(BamAlignment& bAlignment, BamAlignmentSupportData& supportData);\r
+ bool LoadNextAlignment(BamAlignment& bAlignment);\r
// builds reference data structure from BAM file\r
void LoadReferenceData(void);\r
\r
\r
// access alignment data\r
bool BamReader::GetNextAlignment(BamAlignment& bAlignment) { return d->GetNextAlignment(bAlignment); }\r
+bool BamReader::GetNextAlignmentCore(BamAlignment& bAlignment) { return d->GetNextAlignmentCore(bAlignment); }\r
\r
// access auxiliary data\r
const string BamReader::GetHeaderText(void) const { return d->HeaderText; }\r
int BamReader::GetReferenceCount(void) const { return d->References.size(); }\r
const RefVector BamReader::GetReferenceData(void) const { return d->References; }\r
int BamReader::GetReferenceID(const string& refName) const { return d->GetReferenceID(refName); }\r
+const std::string BamReader::GetFilename(void) const { return d->Filename; }\r
\r
// index operations\r
bool BamReader::CreateIndex(void) { return d->CreateIndex(); }\r
return i;\r
}\r
\r
-bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment, const BamAlignmentSupportData& supportData) {\r
+bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) {\r
\r
// calculate character lengths/offsets\r
- const unsigned int dataLength = supportData.BlockLength - BAM_CORE_SIZE;\r
- const unsigned int seqDataOffset = supportData.QueryNameLength + (supportData.NumCigarOperations * 4);\r
- const unsigned int qualDataOffset = seqDataOffset + (supportData.QuerySequenceLength+1)/2;\r
- const unsigned int tagDataOffset = qualDataOffset + supportData.QuerySequenceLength;\r
+ const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;\r
+ const unsigned int seqDataOffset = bAlignment.SupportData.QueryNameLength + (bAlignment.SupportData.NumCigarOperations * 4);\r
+ const unsigned int qualDataOffset = seqDataOffset + (bAlignment.SupportData.QuerySequenceLength+1)/2;\r
+ const unsigned int tagDataOffset = qualDataOffset + bAlignment.SupportData.QuerySequenceLength;\r
const unsigned int tagDataLength = dataLength - tagDataOffset;\r
\r
// set up char buffers\r
- const char* allCharData = supportData.AllCharData.data();\r
+ const char* allCharData = bAlignment.SupportData.AllCharData.data();\r
const char* seqData = ((const char*)allCharData) + seqDataOffset;\r
const char* qualData = ((const char*)allCharData) + qualDataOffset;\r
char* tagData = ((char*)allCharData) + tagDataOffset;\r
\r
// save query sequence\r
bAlignment.QueryBases.clear();\r
- bAlignment.QueryBases.reserve(supportData.QuerySequenceLength);\r
- for (unsigned int i = 0; i < supportData.QuerySequenceLength; ++i) {\r
+ bAlignment.QueryBases.reserve(bAlignment.SupportData.QuerySequenceLength);\r
+ for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) {\r
char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];\r
bAlignment.QueryBases.append(1, singleBase);\r
}\r
\r
// save qualities, converting from numeric QV to 'FASTQ-style' ASCII character\r
bAlignment.Qualities.clear();\r
- bAlignment.Qualities.reserve(supportData.QuerySequenceLength);\r
- for (unsigned int i = 0; i < supportData.QuerySequenceLength; ++i) {\r
+ bAlignment.Qualities.reserve(bAlignment.SupportData.QuerySequenceLength);\r
+ for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) {\r
char singleQuality = (char)(qualData[i]+33);\r
bAlignment.Qualities.append(1, singleQuality);\r
}\r
\r
// parse CIGAR to build 'AlignedBases'\r
bAlignment.AlignedBases.clear();\r
- bAlignment.AlignedBases.reserve(supportData.QuerySequenceLength);\r
+ bAlignment.AlignedBases.reserve(bAlignment.SupportData.QuerySequenceLength);\r
\r
int k = 0;\r
vector<CigarOp>::const_iterator cigarIter = bAlignment.CigarData.begin();\r
bAlignment.TagData.resize(tagDataLength);\r
memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLength);\r
\r
+ // set support data parsed flag\r
+ bAlignment.SupportData.IsParsed = true;\r
+ \r
// return success\r
return true;\r
}\r
// get next alignment (from specified region, if given)\r
bool BamReader::BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) {\r
\r
- BamAlignmentSupportData supportData;\r
- \r
// if valid alignment available\r
- if ( LoadNextAlignment(bAlignment, supportData) ) {\r
+ if ( LoadNextAlignment(bAlignment) ) {\r
\r
// if region not specified, return success\r
if ( !IsRegionSpecified ) { \r
- bool ok = BuildCharData(bAlignment, supportData);\r
+ bool ok = BuildCharData(bAlignment);\r
return ok; \r
}\r
\r
// load next alignment until region overlap is found\r
while ( !IsOverlap(bAlignment) ) {\r
// if no valid alignment available (likely EOF) return failure\r
- if ( !LoadNextAlignment(bAlignment, supportData) ) { return false; }\r
+ if ( !LoadNextAlignment(bAlignment) ) return false;\r
}\r
\r
// return success (alignment found that overlaps region)\r
- bool ok = BuildCharData(bAlignment, supportData);\r
+ bool ok = BuildCharData(bAlignment);\r
return ok;\r
}\r
\r
// no valid alignment\r
- else { return false; }\r
+ else \r
+ return false;\r
+}\r
+\r
+// retrieves next available alignment core data (returns success/fail)\r
+// ** DOES NOT parse any character data (bases, qualities, tag data)\r
+// these can be accessed, if necessary, from the supportData \r
+// useful for operations requiring ONLY positional or other alignment-related information\r
+bool BamReader::BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) {\r
+\r
+ // if valid alignment available\r
+ if ( LoadNextAlignment(bAlignment) ) {\r
+\r
+ // if region not specified, return success\r
+ if ( !IsRegionSpecified ) return true;\r
+\r
+ // load next alignment until region overlap is found\r
+ while ( !IsOverlap(bAlignment) ) {\r
+ // if no valid alignment available (likely EOF) return failure\r
+ if ( !LoadNextAlignment(bAlignment) ) return false;\r
+ }\r
+\r
+ // return success (alignment found that overlaps region)\r
+ return true;\r
+ }\r
+\r
+ // no valid alignment\r
+ else\r
+ return false;\r
}\r
\r
// calculate closest indexed file offset for region specified\r
const uint64_t& lastOffset)\r
{\r
// get converted offsets\r
- int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT;
+ int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT;\r
int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT;\r
\r
// resize vector if necessary\r
// read starts after left boundary\r
if ( bAlignment.Position >= CurrentLeft) { return true; }\r
\r
- // return whether alignment end overlaps left boundary
+ // return whether alignment end overlaps left boundary\r
return ( bAlignment.GetEndPosition() >= CurrentLeft );\r
}\r
\r
}\r
\r
// populates BamAlignment with alignment data under file pointer, returns success/fail\r
-bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment, BamAlignmentSupportData& supportData) {\r
+bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) {\r
\r
// read in the 'block length' value, make sure it's not zero\r
char buffer[4];\r
mBGZF.Read(buffer, 4);\r
- supportData.BlockLength = BgzfData::UnpackUnsignedInt(buffer);\r
- if ( IsBigEndian ) { SwapEndian_32(supportData.BlockLength); }\r
- if ( supportData.BlockLength == 0 ) { return false; }\r
+ bAlignment.SupportData.BlockLength = BgzfData::UnpackUnsignedInt(buffer);\r
+ if ( IsBigEndian ) { SwapEndian_32(bAlignment.SupportData.BlockLength); }\r
+ if ( bAlignment.SupportData.BlockLength == 0 ) { return false; }\r
\r
// read in core alignment data, make sure the right size of data was read\r
char x[BAM_CORE_SIZE];\r
unsigned int tempValue = BgzfData::UnpackUnsignedInt(&x[8]);\r
bAlignment.Bin = tempValue >> 16;\r
bAlignment.MapQuality = tempValue >> 8 & 0xff;\r
- supportData.QueryNameLength = tempValue & 0xff;\r
+ bAlignment.SupportData.QueryNameLength = tempValue & 0xff;\r
\r
tempValue = BgzfData::UnpackUnsignedInt(&x[12]);\r
bAlignment.AlignmentFlag = tempValue >> 16;\r
- supportData.NumCigarOperations = tempValue & 0xffff;\r
+ bAlignment.SupportData.NumCigarOperations = tempValue & 0xffff;\r
\r
- supportData.QuerySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]);\r
+ bAlignment.SupportData.QuerySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]);\r
bAlignment.MateRefID = BgzfData::UnpackSignedInt(&x[20]);\r
bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]);\r
bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[28]);\r
\r
// store 'all char data' and cigar ops\r
- const unsigned int dataLength = supportData.BlockLength - BAM_CORE_SIZE;\r
- const unsigned int cigarDataOffset = supportData.QueryNameLength;\r
+ const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;\r
+ const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength;\r
\r
char* allCharData = (char*)calloc(sizeof(char), dataLength);\r
uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);\r
\r
// store alignment name and length\r
bAlignment.Name.assign((const char*)(allCharData));\r
- bAlignment.Length = supportData.QuerySequenceLength;\r
+ bAlignment.Length = bAlignment.SupportData.QuerySequenceLength;\r
\r
// store remaining 'allCharData' in supportData structure\r
- supportData.AllCharData.assign((const char*)allCharData, dataLength);\r
+ bAlignment.SupportData.AllCharData.assign((const char*)allCharData, dataLength);\r
\r
// save CigarOps for BamAlignment\r
bAlignment.CigarData.clear();\r
- for (unsigned int i = 0; i < supportData.NumCigarOperations; ++i) {\r
+ for (unsigned int i = 0; i < bAlignment.SupportData.NumCigarOperations; ++i) {\r
\r
// swap if necessary\r
if ( IsBigEndian ) { SwapEndian_32(cigarData[i]); }\r