// Marth Lab, Department of Biology, Boston College\r
// All rights reserved.\r
// ---------------------------------------------------------------------------\r
-// Last modified: 29 March 2010 (DB)\r
+// Last modified: 15 July 2010 (DB)\r
// ---------------------------------------------------------------------------\r
// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
// Institute.\r
#include <iterator>\r
#include <string>\r
#include <vector>\r
+#include <iostream>\r
\r
// BamTools includes\r
#include "BGZF.h"\r
#include "BamReader.h"\r
+#include "BamIndex.h"\r
using namespace BamTools;\r
using namespace std;\r
\r
struct BamReader::BamReaderPrivate {\r
\r
+ // -------------------------------\r
+ // structs, enums, typedefs\r
+ // -------------------------------\r
+ enum RegionState { BEFORE_REGION = 0\r
+ , WITHIN_REGION\r
+ , AFTER_REGION\r
+ };\r
+\r
// -------------------------------\r
// data members\r
// -------------------------------\r
\r
- // general data\r
+ // general file data\r
BgzfData mBGZF;\r
string HeaderText;\r
- BamIndex Index;\r
+ //BamIndex Index;\r
+ BamIndex* NewIndex;\r
RefVector References;\r
bool IsIndexLoaded;\r
int64_t AlignmentsBeginOffset;\r
string Filename;\r
string IndexFilename;\r
\r
+ // system data\r
bool IsBigEndian;\r
\r
// user-specified region values\r
+ BamRegion Region;\r
+ bool IsLeftBoundSpecified;\r
+ bool IsRightBoundSpecified;\r
+ \r
bool IsRegionSpecified;\r
int CurrentRefID;\r
int CurrentLeft;\r
\r
+ // parent BamReader\r
+ BamReader* Parent;\r
+ \r
// BAM character constants\r
const char* DNA_LOOKUP;\r
const char* CIGAR_LOOKUP;\r
// -------------------------------\r
// constructor & destructor\r
// -------------------------------\r
- BamReaderPrivate(void);\r
+ BamReaderPrivate(BamReader* parent);\r
~BamReaderPrivate(void);\r
\r
// -------------------------------\r
// "public" interface\r
// -------------------------------\r
\r
- // flie operations\r
+ // file operations\r
void Close(void);\r
bool Jump(int refID, int position = 0);\r
- void Open(const string& filename, const string& indexFilename = "");\r
+ bool Open(const string& filename, const string& indexFilename = "");\r
bool Rewind(void);\r
+ bool SetRegion(const BamRegion& region);\r
\r
// access alignment data\r
bool GetNextAlignment(BamAlignment& bAlignment);\r
+ bool GetNextAlignmentCore(BamAlignment& bAlignment);\r
\r
// access auxiliary data\r
- const string GetHeaderText(void) const;\r
- const int GetReferenceCount(void) const;\r
- const RefVector GetReferenceData(void) const;\r
- const int GetReferenceID(const string& refName) const;\r
+ int GetReferenceID(const string& refName) const;\r
\r
// index operations\r
- bool CreateIndex(void);\r
+ bool CreateIndex(bool useDefaultIndex);\r
\r
// -------------------------------\r
// internal methods\r
\r
// *** reading alignments and auxiliary data *** //\r
\r
- // calculate bins that overlap region ( left to reference end for now )\r
- int BinsFromRegion(int refID, int left, uint16_t[MAX_BIN]);\r
- // calculates alignment end position based on starting position and provided CIGAR operations\r
- int CalculateAlignmentEnd(const int& position, const std::vector<CigarOp>& cigarData);\r
- // calculate file offset for first alignment chunk overlapping 'left'\r
- int64_t GetOffset(int refID, int left);\r
+ // fills out character data for BamAlignment data\r
+ bool BuildCharData(BamAlignment& bAlignment);\r
// checks to see if alignment overlaps current region\r
- bool IsOverlap(BamAlignment& bAlignment);\r
+ RegionState IsOverlap(BamAlignment& bAlignment);\r
// retrieves header text from BAM file\r
void LoadHeaderData(void);\r
// retrieves BAM alignment under file pointer\r
\r
// *** index file handling *** //\r
\r
- // calculates index for BAM file\r
- bool BuildIndex(void);\r
// clear out inernal index data structure\r
void ClearIndex(void);\r
- // saves BAM bin entry for index\r
- void InsertBinEntry(BamBinMap& binMap, const uint32_t& saveBin, const uint64_t& saveOffset, const uint64_t& lastOffset);\r
- // saves linear offset entry for index\r
- void InsertLinearOffset(LinearOffsetVector& offsets, const BamAlignment& bAlignment, const uint64_t& lastOffset);\r
// loads index from BAM index file\r
bool LoadIndex(void);\r
- // simplifies index by merging 'chunks'\r
- void MergeChunks(void);\r
- // round-up 32-bit integer to next power-of-2\r
- void Roundup32(int& value);\r
- // saves index to BAM index file\r
- bool WriteIndex(void);\r
};\r
\r
// -----------------------------------------------------\r
// BamReader implementation (wrapper around BRPrivate)\r
// -----------------------------------------------------\r
-\r
// constructor\r
BamReader::BamReader(void) {\r
- d = new BamReaderPrivate;\r
+ d = new BamReaderPrivate(this);\r
}\r
\r
// destructor\r
\r
// file operations\r
void BamReader::Close(void) { d->Close(); }\r
-bool BamReader::Jump(int refID, int position) { return d->Jump(refID, position); }\r
-void BamReader::Open(const string& filename, const string& indexFilename) { d->Open(filename, indexFilename); }\r
+bool BamReader::IsOpen(void) const { return d->mBGZF.IsOpen; }\r
+bool BamReader::Jump(int refID, int position) { \r
+ d->Region.LeftRefID = refID;\r
+ d->Region.LeftPosition = position;\r
+ d->IsLeftBoundSpecified = true;\r
+ d->IsRightBoundSpecified = false;\r
+ return d->Jump(refID, position); \r
+}\r
+bool BamReader::Open(const string& filename, const string& indexFilename) { return d->Open(filename, indexFilename); }\r
bool BamReader::Rewind(void) { return d->Rewind(); }\r
+bool BamReader::SetRegion(const BamRegion& region) { return d->SetRegion(region); }\r
+bool BamReader::SetRegion(const int& leftRefID, const int& leftBound, const int& rightRefID, const int& rightBound) {\r
+ return d->SetRegion( BamRegion(leftRefID, leftBound, rightRefID, rightBound) );\r
+}\r
\r
// access alignment data\r
bool BamReader::GetNextAlignment(BamAlignment& bAlignment) { return d->GetNextAlignment(bAlignment); }\r
+bool BamReader::GetNextAlignmentCore(BamAlignment& bAlignment) { return d->GetNextAlignmentCore(bAlignment); }\r
\r
// access auxiliary data\r
-const string BamReader::GetHeaderText(void) const { return d->HeaderText; }\r
-const int BamReader::GetReferenceCount(void) const { return d->References.size(); }\r
-const RefVector BamReader::GetReferenceData(void) const { return d->References; }\r
-const int BamReader::GetReferenceID(const string& refName) const { return d->GetReferenceID(refName); }\r
+const string BamReader::GetHeaderText(void) const { return d->HeaderText; }\r
+int BamReader::GetReferenceCount(void) const { return d->References.size(); }\r
+const RefVector& BamReader::GetReferenceData(void) const { return d->References; }\r
+int BamReader::GetReferenceID(const string& refName) const { return d->GetReferenceID(refName); }\r
+const std::string BamReader::GetFilename(void) const { return d->Filename; }\r
\r
// index operations\r
-bool BamReader::CreateIndex(void) { return d->CreateIndex(); }\r
+bool BamReader::CreateIndex(bool useDefaultIndex) { return d->CreateIndex(useDefaultIndex); }\r
\r
// -----------------------------------------------------\r
// BamReaderPrivate implementation\r
// -----------------------------------------------------\r
\r
// constructor\r
-BamReader::BamReaderPrivate::BamReaderPrivate(void)\r
- : IsIndexLoaded(false)\r
+BamReader::BamReaderPrivate::BamReaderPrivate(BamReader* parent)\r
+ : NewIndex(0)\r
+ , IsIndexLoaded(false)\r
, AlignmentsBeginOffset(0)\r
+ , IsLeftBoundSpecified(false)\r
+ , IsRightBoundSpecified(false)\r
, IsRegionSpecified(false)\r
, CurrentRefID(0)\r
, CurrentLeft(0)\r
+ , Parent(parent)\r
, DNA_LOOKUP("=ACMGRSVTWYHKDBN")\r
, CIGAR_LOOKUP("MIDNSHP")\r
{ \r
Close();\r
}\r
\r
-// calculate bins that overlap region ( left to reference end for now )\r
-int BamReader::BamReaderPrivate::BinsFromRegion(int refID, int left, uint16_t list[MAX_BIN]) {\r
-\r
- // get region boundaries\r
- uint32_t begin = (unsigned int)left;\r
- uint32_t end = (unsigned int)References.at(refID).RefLength - 1;\r
-\r
- // initialize list, bin '0' always a valid bin\r
- int i = 0;\r
- list[i++] = 0;\r
-\r
- // get rest of bins that contain this region\r
- unsigned int k;\r
- for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { list[i++] = k; }\r
- for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { list[i++] = k; }\r
- for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { list[i++] = k; }\r
- for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { list[i++] = k; }\r
- for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; }\r
-\r
- // return number of bins stored\r
- return i;\r
-}\r
-\r
-// populates BAM index data structure from BAM file data\r
-bool BamReader::BamReaderPrivate::BuildIndex(void) {\r
-\r
- // check to be sure file is open\r
- if (!mBGZF.IsOpen) { return false; }\r
-\r
- // move file pointer to beginning of alignments\r
- Rewind();\r
-\r
- // get reference count, reserve index space\r
- int numReferences = References.size();\r
- for ( int i = 0; i < numReferences; ++i ) {\r
- Index.push_back(ReferenceIndex());\r
+bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) {\r
+ \r
+ // calculate character lengths/offsets\r
+ const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;\r
+ const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength;\r
+ const unsigned int seqDataOffset = bAlignment.SupportData.QueryNameLength + (bAlignment.SupportData.NumCigarOperations * 4);\r
+ const unsigned int qualDataOffset = seqDataOffset + (bAlignment.SupportData.QuerySequenceLength+1)/2;\r
+ const unsigned int tagDataOffset = qualDataOffset + bAlignment.SupportData.QuerySequenceLength;\r
+ const unsigned int tagDataLength = dataLength - tagDataOffset;\r
+ \r
+ // set up char buffers\r
+ const char* allCharData = bAlignment.SupportData.AllCharData.data();\r
+ uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);\r
+ const char* seqData = ((const char*)allCharData) + seqDataOffset;\r
+ const char* qualData = ((const char*)allCharData) + qualDataOffset;\r
+ char* tagData = ((char*)allCharData) + tagDataOffset;\r
+ \r
+ // store alignment name (depends on null char as terminator)\r
+ bAlignment.Name.assign((const char*)(allCharData)); \r
+ \r
+ // save CigarOps \r
+ CigarOp op;\r
+ bAlignment.CigarData.clear();\r
+ bAlignment.CigarData.reserve(bAlignment.SupportData.NumCigarOperations);\r
+ for (unsigned int i = 0; i < bAlignment.SupportData.NumCigarOperations; ++i) {\r
+\r
+ // swap if necessary\r
+ if ( IsBigEndian ) { SwapEndian_32(cigarData[i]); }\r
+ \r
+ // build CigarOp structure\r
+ op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);\r
+ op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];\r
+\r
+ // save CigarOp\r
+ bAlignment.CigarData.push_back(op);\r
}\r
-\r
- // sets default constant for bin, ID, offset, coordinate variables\r
- const uint32_t defaultValue = 0xffffffffu;\r
-\r
- // bin data\r
- uint32_t saveBin(defaultValue);\r
- uint32_t lastBin(defaultValue);\r
-\r
- // reference ID data\r
- int32_t saveRefID(defaultValue);\r
- int32_t lastRefID(defaultValue);\r
-\r
- // offset data\r
- uint64_t saveOffset = mBGZF.Tell();\r
- uint64_t lastOffset = saveOffset;\r
-\r
- // coordinate data\r
- int32_t lastCoordinate = defaultValue;\r
-\r
- BamAlignment bAlignment;\r
- while( GetNextAlignment(bAlignment) ) {\r
-\r
- // change of chromosome, save ID, reset bin\r
- if ( lastRefID != bAlignment.RefID ) {\r
- lastRefID = bAlignment.RefID;\r
- lastBin = defaultValue;\r
- }\r
-\r
- // if lastCoordinate greater than BAM position - file not sorted properly\r
- else if ( lastCoordinate > bAlignment.Position ) {\r
- printf("BAM file not properly sorted:\n");\r
- printf("Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(), lastCoordinate, bAlignment.Position, bAlignment.RefID);\r
- exit(1);\r
- }\r
-\r
- // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions)\r
- if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) {\r
-\r
- // save linear offset entry (matched to BAM entry refID)\r
- ReferenceIndex& refIndex = Index.at(bAlignment.RefID);\r
- LinearOffsetVector& offsets = refIndex.Offsets;\r
- InsertLinearOffset(offsets, bAlignment, lastOffset);\r
- }\r
-\r
- // if current BamAlignment bin != lastBin, "then possibly write the binning index"\r
- if ( bAlignment.Bin != lastBin ) {\r
-\r
- // if not first time through\r
- if ( saveBin != defaultValue ) {\r
-\r
- // save Bam bin entry\r
- ReferenceIndex& refIndex = Index.at(saveRefID);\r
- BamBinMap& binMap = refIndex.Bins;\r
- InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);\r
- }\r
-\r
- // update saveOffset\r
- saveOffset = lastOffset;\r
-\r
- // update bin values\r
- saveBin = bAlignment.Bin;\r
- lastBin = bAlignment.Bin;\r
-\r
- // update saveRefID\r
- saveRefID = bAlignment.RefID;\r
-\r
- // if invalid RefID, break out (why?)\r
- if ( saveRefID < 0 ) { break; }\r
- }\r
-\r
- // make sure that current file pointer is beyond lastOffset\r
- if ( mBGZF.Tell() <= (int64_t)lastOffset ) {\r
- printf("Error in BGZF offsets.\n");\r
- exit(1);\r
- }\r
-\r
- // update lastOffset\r
- lastOffset = mBGZF.Tell();\r
-\r
- // update lastCoordinate\r
- lastCoordinate = bAlignment.Position;\r
+ \r
+ \r
+ // save query sequence\r
+ bAlignment.QueryBases.clear();\r
+ bAlignment.QueryBases.reserve(bAlignment.SupportData.QuerySequenceLength);\r
+ for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) {\r
+ char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];\r
+ bAlignment.QueryBases.append(1, singleBase);\r
}\r
-\r
- // save any leftover BAM data (as long as refID is valid)\r
- if ( saveRefID >= 0 ) {\r
- // save Bam bin entry\r
- ReferenceIndex& refIndex = Index.at(saveRefID);\r
- BamBinMap& binMap = refIndex.Bins;\r
- InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);\r
+ \r
+ // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character\r
+ bAlignment.Qualities.clear();\r
+ bAlignment.Qualities.reserve(bAlignment.SupportData.QuerySequenceLength);\r
+ for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) {\r
+ char singleQuality = (char)(qualData[i]+33);\r
+ bAlignment.Qualities.append(1, singleQuality);\r
}\r
-\r
- // simplify index by merging chunks\r
- MergeChunks();\r
-\r
- // iterate over references\r
- BamIndex::iterator indexIter = Index.begin();\r
- BamIndex::iterator indexEnd = Index.end();\r
- for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) {\r
-\r
- // get reference index data\r
- ReferenceIndex& refIndex = (*indexIter);\r
- BamBinMap& binMap = refIndex.Bins;\r
- LinearOffsetVector& offsets = refIndex.Offsets;\r
-\r
- // store whether reference has alignments or no\r
- References[i].RefHasAlignments = ( binMap.size() > 0 );\r
-\r
- // sort linear offsets\r
- sort(offsets.begin(), offsets.end());\r
+ \r
+ // if QueryBases is empty (and this is a allowed case)\r
+ if ( bAlignment.QueryBases.empty() ) \r
+ bAlignment.AlignedBases = bAlignment.QueryBases;\r
+ \r
+ // if QueryBases contains data, then build AlignedBases using CIGAR data\r
+ else {\r
+ \r
+ // resize AlignedBases\r
+ bAlignment.AlignedBases.clear();\r
+ bAlignment.AlignedBases.reserve(bAlignment.SupportData.QuerySequenceLength);\r
+ \r
+ // iterate over CigarOps\r
+ int k = 0;\r
+ vector<CigarOp>::const_iterator cigarIter = bAlignment.CigarData.begin();\r
+ vector<CigarOp>::const_iterator cigarEnd = bAlignment.CigarData.end();\r
+ for ( ; cigarIter != cigarEnd; ++cigarIter ) {\r
+ \r
+ const CigarOp& op = (*cigarIter);\r
+ switch(op.Type) {\r
+ \r
+ case ('M') :\r
+ case ('I') :\r
+ bAlignment.AlignedBases.append(bAlignment.QueryBases.substr(k, op.Length)); // for 'M', 'I' - write bases\r
+ // fall through\r
+ \r
+ case ('S') :\r
+ k += op.Length; // for 'S' - soft clip, skip over query bases\r
+ break;\r
+ \r
+ case ('D') :\r
+ bAlignment.AlignedBases.append(op.Length, '-'); // for 'D' - write gap character\r
+ break;\r
+ \r
+ case ('P') :\r
+ bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character\r
+ break;\r
+ \r
+ case ('N') :\r
+ bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in original query sequence\r
+ break;\r
+ \r
+ case ('H') :\r
+ break; // for 'H' - hard clip, do nothing to AlignedBases, move to next op\r
+ \r
+ default:\r
+ printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here\r
+ exit(1);\r
+ }\r
+ }\r
}\r
+ \r
+ // -----------------------\r
+ // Added: 3-25-2010 DB\r
+ // Fixed: endian-correctness for tag data\r
+ // -----------------------\r
+ if ( IsBigEndian ) {\r
+ int i = 0;\r
+ while ( (unsigned int)i < tagDataLength ) {\r
+ \r
+ i += 2; // skip tag type (e.g. "RG", "NM", etc)\r
+ uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning \r
+ ++i; // skip value type\r
+ \r
+ switch (type) {\r
+ \r
+ case('A') :\r
+ case('C') : \r
+ ++i;\r
+ break;\r
\r
-\r
- // rewind file pointer to beginning of alignments, return success/fail\r
- return Rewind();\r
-}\r
-\r
-// calculates alignment end position based on starting position and provided CIGAR operations\r
-int BamReader::BamReaderPrivate::CalculateAlignmentEnd(const int& position, const vector<CigarOp>& cigarData) {\r
-\r
- // initialize alignment end to starting position\r
- int alignEnd = position;\r
-\r
- // iterate over cigar operations\r
- vector<CigarOp>::const_iterator cigarIter = cigarData.begin();\r
- vector<CigarOp>::const_iterator cigarEnd = cigarData.end();\r
- for ( ; cigarIter != cigarEnd; ++cigarIter) {\r
- char cigarType = (*cigarIter).Type;\r
- if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) {\r
- alignEnd += (*cigarIter).Length;\r
+ case('S') : \r
+ SwapEndian_16p(&tagData[i]); \r
+ i += sizeof(uint16_t);\r
+ break;\r
+ \r
+ case('F') :\r
+ case('I') : \r
+ SwapEndian_32p(&tagData[i]);\r
+ i += sizeof(uint32_t);\r
+ break;\r
+ \r
+ case('D') : \r
+ SwapEndian_64p(&tagData[i]);\r
+ i += sizeof(uint64_t);\r
+ break;\r
+ \r
+ case('H') :\r
+ case('Z') : \r
+ while (tagData[i]) { ++i; }\r
+ ++i; // increment one more for null terminator\r
+ break;\r
+ \r
+ default : \r
+ printf("ERROR: Invalid tag value type\n"); // shouldn't get here\r
+ exit(1);\r
+ }\r
}\r
}\r
- return alignEnd;\r
+ \r
+ // store TagData\r
+ bAlignment.TagData.clear();\r
+ bAlignment.TagData.resize(tagDataLength);\r
+ memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLength);\r
+ \r
+ // clear the core-only flag\r
+ bAlignment.SupportData.HasCoreOnly = false;\r
+ \r
+ // return success\r
+ return true;\r
}\r
\r
-\r
// clear index data structure\r
void BamReader::BamReaderPrivate::ClearIndex(void) {\r
- Index.clear(); // sufficient ??\r
+ delete NewIndex;\r
+ NewIndex = 0;\r
}\r
\r
// closes the BAM file\r
void BamReader::BamReaderPrivate::Close(void) {\r
+ \r
+ // close BGZF file stream\r
mBGZF.Close();\r
+ \r
+ // clear out index data\r
ClearIndex();\r
+ \r
+ // clear out header data\r
HeaderText.clear();\r
- IsRegionSpecified = false;\r
+ \r
+ // clear out region flags\r
+ IsLeftBoundSpecified = false;\r
+ IsRightBoundSpecified = false;\r
+ IsRegionSpecified = false;\r
}\r
\r
// create BAM index from BAM file (keep structure in memory) and write to default index output file\r
-bool BamReader::BamReaderPrivate::CreateIndex(void) {\r
+bool BamReader::BamReaderPrivate::CreateIndex(bool useDefaultIndex) {\r
\r
- // clear out index\r
+ // clear out prior index data\r
ClearIndex();\r
-\r
- // build (& save) index from BAM file\r
+ \r
+ // create default index\r
+ if ( useDefaultIndex )\r
+ NewIndex = new BamDefaultIndex(&mBGZF, Parent, IsBigEndian);\r
+ // create BamTools 'custom' index\r
+ else\r
+ NewIndex = new BamToolsIndex(&mBGZF, Parent, IsBigEndian);\r
+ \r
bool ok = true;\r
- ok &= BuildIndex();\r
- ok &= WriteIndex();\r
-\r
+ ok &= NewIndex->Build();\r
+ ok &= NewIndex->Write(Filename); \r
+ \r
// return success/fail\r
return ok;\r
}\r
\r
-// returns RefID for given RefName (returns References.size() if not found)\r
-const int BamReader::BamReaderPrivate::GetReferenceID(const string& refName) const {\r
-\r
- // retrieve names from reference data\r
- vector<string> refNames;\r
- RefVector::const_iterator refIter = References.begin();\r
- RefVector::const_iterator refEnd = References.end();\r
- for ( ; refIter != refEnd; ++refIter) {\r
- refNames.push_back( (*refIter).RefName );\r
- }\r
+// get next alignment (from specified region, if given)\r
+bool BamReader::BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) {\r
\r
- // return 'index-of' refName ( if not found, returns refNames.size() )\r
- return distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName));\r
+ // if valid alignment found, attempt to parse char data, and return success/failure\r
+ if ( GetNextAlignmentCore(bAlignment) )\r
+ return BuildCharData(bAlignment);\r
+ \r
+ // no valid alignment found\r
+ else\r
+ return false;\r
}\r
\r
-// get next alignment (from specified region, if given)\r
-bool BamReader::BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) {\r
+// retrieves next available alignment core data (returns success/fail)\r
+// ** DOES NOT parse any character data (read name, bases, qualities, tag data)\r
+// these can be accessed, if necessary, from the supportData \r
+// useful for operations requiring ONLY positional or other alignment-related information\r
+bool BamReader::BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) {\r
\r
// if valid alignment available\r
if ( LoadNextAlignment(bAlignment) ) {\r
\r
+ // set core-only flag\r
+ bAlignment.SupportData.HasCoreOnly = true;\r
+ \r
// if region not specified, return success\r
- if ( !IsRegionSpecified ) { return true; }\r
+ if ( !IsLeftBoundSpecified ) return true;\r
+\r
+ // determine region state (before, within, after)\r
+ BamReader::BamReaderPrivate::RegionState state = IsOverlap(bAlignment);\r
+ \r
+ // if alignment lies after region, return false\r
+ if ( state == AFTER_REGION ) \r
+ return false;\r
\r
- // load next alignment until region overlap is found\r
- while ( !IsOverlap(bAlignment) ) {\r
+ while ( state != WITHIN_REGION ) {\r
// if no valid alignment available (likely EOF) return failure\r
- if ( !LoadNextAlignment(bAlignment) ) { return false; }\r
+ if ( !LoadNextAlignment(bAlignment) ) return false;\r
+ // if alignment lies after region, return false (no available read within region)\r
+ state = IsOverlap(bAlignment);\r
+ if ( state == AFTER_REGION) return false;\r
+ \r
}\r
\r
// return success (alignment found that overlaps region)\r
}\r
\r
// no valid alignment\r
- else { return false; }\r
+ else\r
+ return false;\r
}\r
\r
-// calculate closest indexed file offset for region specified\r
-int64_t BamReader::BamReaderPrivate::GetOffset(int refID, int left) {\r
-\r
- // calculate which bins overlap this region\r
- uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);\r
- int numBins = BinsFromRegion(refID, left, bins);\r
-\r
- // get bins for this reference\r
- const ReferenceIndex& refIndex = Index.at(refID);\r
- const BamBinMap& binMap = refIndex.Bins;\r
-\r
- // get minimum offset to consider\r
- const LinearOffsetVector& offsets = refIndex.Offsets;\r
- uint64_t minOffset = ( (unsigned int)(left>>BAM_LIDX_SHIFT) >= offsets.size() ) ? 0 : offsets.at(left>>BAM_LIDX_SHIFT);\r
-\r
- // store offsets to beginning of alignment 'chunks'\r
- std::vector<int64_t> chunkStarts;\r
-\r
- // store all alignment 'chunk' starts for bins in this region\r
- for (int i = 0; i < numBins; ++i ) {\r
- uint16_t binKey = bins[i];\r
-\r
- map<uint32_t, ChunkVector>::const_iterator binIter = binMap.find(binKey);\r
- if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) {\r
-\r
- const ChunkVector& chunks = (*binIter).second;\r
- std::vector<Chunk>::const_iterator chunksIter = chunks.begin();\r
- std::vector<Chunk>::const_iterator chunksEnd = chunks.end();\r
- for ( ; chunksIter != chunksEnd; ++chunksIter) {\r
- const Chunk& chunk = (*chunksIter);\r
- if ( chunk.Stop > minOffset ) {\r
- chunkStarts.push_back( chunk.Start );\r
- }\r
- }\r
- }\r
- }\r
-\r
- // clean up memory\r
- free(bins);\r
-\r
- // if no alignments found, else return smallest offset for alignment starts\r
- if ( chunkStarts.size() == 0 ) { return -1; }\r
- else { return *min_element(chunkStarts.begin(), chunkStarts.end()); }\r
-}\r
+// returns RefID for given RefName (returns References.size() if not found)\r
+int BamReader::BamReaderPrivate::GetReferenceID(const string& refName) const {\r
\r
-// saves BAM bin entry for index\r
-void BamReader::BamReaderPrivate::InsertBinEntry(BamBinMap& binMap,\r
- const uint32_t& saveBin,\r
- const uint64_t& saveOffset,\r
- const uint64_t& lastOffset)\r
-{\r
- // look up saveBin\r
- BamBinMap::iterator binIter = binMap.find(saveBin);\r
-\r
- // create new chunk\r
- Chunk newChunk(saveOffset, lastOffset);\r
-\r
- // if entry doesn't exist\r
- if ( binIter == binMap.end() ) {\r
- ChunkVector newChunks;\r
- newChunks.push_back(newChunk);\r
- binMap.insert( pair<uint32_t, ChunkVector>(saveBin, newChunks));\r
+ // retrieve names from reference data\r
+ vector<string> refNames;\r
+ RefVector::const_iterator refIter = References.begin();\r
+ RefVector::const_iterator refEnd = References.end();\r
+ for ( ; refIter != refEnd; ++refIter) {\r
+ refNames.push_back( (*refIter).RefName );\r
}\r
\r
- // otherwise\r
- else {\r
- ChunkVector& binChunks = (*binIter).second;\r
- binChunks.push_back( newChunk );\r
- }\r
+ // return 'index-of' refName ( if not found, returns refNames.size() )\r
+ return distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName));\r
}\r
\r
-// saves linear offset entry for index\r
-void BamReader::BamReaderPrivate::InsertLinearOffset(LinearOffsetVector& offsets,\r
- const BamAlignment& bAlignment,\r
- const uint64_t& lastOffset)\r
-{\r
- // get converted offsets\r
- int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT;\r
- int endOffset = ( CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData) - 1) >> BAM_LIDX_SHIFT;\r
-\r
- // resize vector if necessary\r
- int oldSize = offsets.size();\r
- int newSize = endOffset + 1;\r
- if ( oldSize < newSize ) { \r
- Roundup32(newSize);\r
- offsets.resize(newSize, 0);\r
- }\r
-\r
- // store offset\r
- for(int i = beginOffset + 1; i <= endOffset ; ++i) {\r
- if ( offsets[i] == 0) {\r
- offsets[i] = lastOffset;\r
- }\r
+// returns region state - whether alignment ends before, overlaps, or starts after currently specified region\r
+// this *internal* method should ONLY called when (at least) IsLeftBoundSpecified == true\r
+BamReader::BamReaderPrivate::RegionState BamReader::BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) {\r
+ \r
+ // --------------------------------------------------\r
+ // check alignment start against right bound cutoff\r
+ \r
+ // if full region of interest was given\r
+ if ( IsRightBoundSpecified ) {\r
+ \r
+ // read starts on right bound reference, but AFTER right bound position\r
+ if ( bAlignment.RefID == Region.RightRefID && bAlignment.Position > Region.RightPosition )\r
+ return AFTER_REGION;\r
+ \r
+ // if read starts on reference AFTER right bound, return false\r
+ if ( bAlignment.RefID > Region.RightRefID ) \r
+ return AFTER_REGION;\r
}\r
-}\r
-\r
-// returns whether alignment overlaps currently specified region (refID, leftBound)\r
-bool BamReader::BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) {\r
-\r
- // if on different reference sequence, quit\r
- if ( bAlignment.RefID != CurrentRefID ) { return false; }\r
-\r
- // read starts after left boundary\r
- if ( bAlignment.Position >= CurrentLeft) { return true; }\r
-\r
- // return whether alignment end overlaps left boundary\r
- return ( CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData) >= CurrentLeft );\r
+ \r
+ // --------------------------------------------------------\r
+ // no right bound given OR read starts before right bound\r
+ // so, check if it overlaps left bound \r
+ \r
+ // if read starts on left bound reference AND after left boundary, return success\r
+ if ( bAlignment.RefID == Region.LeftRefID && bAlignment.Position >= Region.LeftPosition)\r
+ return WITHIN_REGION;\r
+ \r
+ // if read is on any reference sequence before left bound, return false\r
+ if ( bAlignment.RefID < Region.LeftRefID )\r
+ return BEFORE_REGION;\r
+\r
+ // --------------------------------------------------------\r
+ // read is on left bound reference, but starts before left bound position\r
+\r
+ // if it overlaps, return WITHIN_REGION\r
+ if ( bAlignment.GetEndPosition() >= Region.LeftPosition )\r
+ return WITHIN_REGION;\r
+ // else begins before left bound position\r
+ else\r
+ return BEFORE_REGION;\r
}\r
\r
// jumps to specified region(refID, leftBound) in BAM file, returns success/fail\r
bool BamReader::BamReaderPrivate::Jump(int refID, int position) {\r
\r
- // if data exists for this reference and position is valid \r
- if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) {\r
-\r
- // set current region\r
- CurrentRefID = refID;\r
- CurrentLeft = position;\r
- IsRegionSpecified = true;\r
-\r
- // calculate offset\r
- int64_t offset = GetOffset(CurrentRefID, CurrentLeft);\r
-\r
- // if in valid offset, return failure\r
- if ( offset == -1 ) { return false; }\r
-\r
- // otherwise return success of seek operation\r
- else { return mBGZF.Seek(offset); }\r
+ // -----------------------------------------------------------------------\r
+ // check for existing index \r
+ if ( NewIndex == 0 ) return false; \r
+ // see if reference has alignments\r
+ if ( !NewIndex->HasAlignments(refID) ) return false; \r
+ // make sure position is valid\r
+ if ( position > References.at(refID).RefLength ) return false;\r
+ \r
+ // determine possible offsets\r
+ vector<int64_t> offsets;\r
+ if ( !NewIndex->GetOffsets(Region, IsRightBoundSpecified, offsets) ) {\r
+ printf("ERROR: Could not jump: unable to calculate offset for specified region.\n");\r
+ return false;\r
}\r
-\r
- // invalid jump request parameters, return failure\r
- return false;\r
+ \r
+ // iterate through offsets\r
+ BamAlignment bAlignment;\r
+ bool result = true;\r
+ for ( vector<int64_t>::const_iterator o = offsets.begin(); o != offsets.end(); ++o) {\r
+ \r
+ // attempt seek & load first available alignment\r
+ result &= mBGZF.Seek(*o);\r
+ LoadNextAlignment(bAlignment);\r
+ \r
+ // if this alignment corresponds to desired position\r
+ // return success of seeking back to 'current offset'\r
+ if ( (bAlignment.RefID == refID && bAlignment.Position + bAlignment.Length > position) || (bAlignment.RefID > refID) ) {\r
+ if ( o != offsets.begin() ) --o;\r
+ return mBGZF.Seek(*o);\r
+ }\r
+ }\r
+ \r
+ return result;\r
}\r
\r
// load BAM header data\r
// load existing index data from BAM index file (".bai"), return success/fail\r
bool BamReader::BamReaderPrivate::LoadIndex(void) {\r
\r
- // clear out index data\r
+ // clear out any existing index data\r
ClearIndex();\r
\r
// skip if index file empty\r
- if ( IndexFilename.empty() ) { return false; }\r
-\r
- // open index file, abort on error\r
- FILE* indexStream = fopen(IndexFilename.c_str(), "rb");\r
- if(!indexStream) {\r
- printf("ERROR: Unable to open the BAM index file %s for reading.\n", IndexFilename.c_str() );\r
+ if ( IndexFilename.empty() )\r
return false;\r
- }\r
\r
- size_t elementsRead = 0;\r
- \r
- // see if index is valid BAM index\r
- char magic[4];\r
- elementsRead = fread(magic, 1, 4, indexStream);\r
- if (strncmp(magic, "BAI\1", 4)) {\r
- printf("Problem with index file - invalid format.\n");\r
- fclose(indexStream);\r
+ // check supplied filename for index type\r
+ size_t defaultExtensionFound = IndexFilename.find(".bai");\r
+ size_t customExtensionFound = IndexFilename.find(".bti");\r
+ \r
+ // if SAM/BAM default (".bai")\r
+ if ( defaultExtensionFound != string::npos )\r
+ NewIndex = new BamDefaultIndex(&mBGZF, Parent, IsBigEndian);\r
+ \r
+ // if BamTools custom index (".bti")\r
+ else if ( customExtensionFound != string::npos )\r
+ NewIndex = new BamToolsIndex(&mBGZF, Parent, IsBigEndian);\r
+ \r
+ // else unknown\r
+ else {\r
+ printf("ERROR: Unknown index file extension.\n");\r
return false;\r
}\r
-\r
- // get number of reference sequences\r
- uint32_t numRefSeqs;\r
- elementsRead = fread(&numRefSeqs, 4, 1, indexStream);\r
- if ( IsBigEndian ) { SwapEndian_32(numRefSeqs); }\r
\r
- // intialize space for BamIndex data structure\r
- Index.reserve(numRefSeqs);\r
-\r
- // iterate over reference sequences\r
- for (unsigned int i = 0; i < numRefSeqs; ++i) {\r
-\r
- // get number of bins for this reference sequence\r
- int32_t numBins;\r
- elementsRead = fread(&numBins, 4, 1, indexStream);\r
- if ( IsBigEndian ) { SwapEndian_32(numBins); }\r
-\r
- if (numBins > 0) {\r
- RefData& refEntry = References[i];\r
- refEntry.RefHasAlignments = true;\r
- }\r
-\r
- // intialize BinVector\r
- BamBinMap binMap;\r
-\r
- // iterate over bins for that reference sequence\r
- for (int j = 0; j < numBins; ++j) {\r
-\r
- // get binID\r
- uint32_t binID;\r
- elementsRead = fread(&binID, 4, 1, indexStream);\r
-\r
- // get number of regionChunks in this bin\r
- uint32_t numChunks;\r
- elementsRead = fread(&numChunks, 4, 1, indexStream);\r
-\r
- if ( IsBigEndian ) { \r
- SwapEndian_32(binID);\r
- SwapEndian_32(numChunks);\r
- }\r
- \r
- // intialize ChunkVector\r
- ChunkVector regionChunks;\r
- regionChunks.reserve(numChunks);\r
-\r
- // iterate over regionChunks in this bin\r
- for (unsigned int k = 0; k < numChunks; ++k) {\r
-\r
- // get chunk boundaries (left, right)\r
- uint64_t left;\r
- uint64_t right;\r
- elementsRead = fread(&left, 8, 1, indexStream);\r
- elementsRead = fread(&right, 8, 1, indexStream);\r
-\r
- if ( IsBigEndian ) {\r
- SwapEndian_64(left);\r
- SwapEndian_64(right);\r
- }\r
- \r
- // save ChunkPair\r
- regionChunks.push_back( Chunk(left, right) );\r
- }\r
-\r
- // sort chunks for this bin\r
- sort( regionChunks.begin(), regionChunks.end(), ChunkLessThan );\r
-\r
- // save binID, chunkVector for this bin\r
- binMap.insert( pair<uint32_t, ChunkVector>(binID, regionChunks) );\r
- }\r
-\r
- // load linear index for this reference sequence\r
-\r
- // get number of linear offsets\r
- int32_t numLinearOffsets;\r
- elementsRead = fread(&numLinearOffsets, 4, 1, indexStream);\r
- if ( IsBigEndian ) { SwapEndian_32(numLinearOffsets); }\r
-\r
- // intialize LinearOffsetVector\r
- LinearOffsetVector offsets;\r
- offsets.reserve(numLinearOffsets);\r
-\r
- // iterate over linear offsets for this reference sequeence\r
- uint64_t linearOffset;\r
- for (int j = 0; j < numLinearOffsets; ++j) {\r
- // read a linear offset & store\r
- elementsRead = fread(&linearOffset, 8, 1, indexStream);\r
- if ( IsBigEndian ) { SwapEndian_64(linearOffset); }\r
- offsets.push_back(linearOffset);\r
- }\r
-\r
- // sort linear offsets\r
- sort( offsets.begin(), offsets.end() );\r
-\r
- // store index data for that reference sequence\r
- Index.push_back( ReferenceIndex(binMap, offsets) );\r
- }\r
-\r
- // close index file (.bai) and return\r
- fclose(indexStream);\r
- return true;\r
+ // return success of loading index data\r
+ return NewIndex->Load(IndexFilename);\r
}\r
\r
// populates BamAlignment with alignment data under file pointer, returns success/fail\r
// read in the 'block length' value, make sure it's not zero\r
char buffer[4];\r
mBGZF.Read(buffer, 4);\r
- unsigned int blockLength = BgzfData::UnpackUnsignedInt(buffer);\r
- if ( IsBigEndian ) { SwapEndian_32(blockLength); }\r
- if ( blockLength == 0 ) { return false; }\r
-\r
- // keep track of bytes read as method progresses\r
- int bytesRead = 4;\r
+ bAlignment.SupportData.BlockLength = BgzfData::UnpackUnsignedInt(buffer);\r
+ if ( IsBigEndian ) { SwapEndian_32(bAlignment.SupportData.BlockLength); }\r
+ if ( bAlignment.SupportData.BlockLength == 0 ) { return false; }\r
\r
// read in core alignment data, make sure the right size of data was read\r
- uint32_t x[8];\r
+ char x[BAM_CORE_SIZE];\r
if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }\r
- bytesRead += BAM_CORE_SIZE;\r
\r
if ( IsBigEndian ) {\r
- for ( int i = 0; i < 8; ++i ) { \r
- SwapEndian_32(x[i]); \r
+ for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) ) { \r
+ SwapEndian_32p(&x[i]); \r
}\r
}\r
\r
- // set BamAlignment 'core' data and character data lengths\r
- unsigned int tempValue;\r
- unsigned int queryNameLength;\r
- unsigned int numCigarOperations;\r
- unsigned int querySequenceLength;\r
-\r
+ // set BamAlignment 'core' and 'support' data\r
bAlignment.RefID = BgzfData::UnpackSignedInt(&x[0]); \r
- bAlignment.Position = BgzfData::UnpackSignedInt(&x[1]);\r
+ bAlignment.Position = BgzfData::UnpackSignedInt(&x[4]);\r
\r
- tempValue = BgzfData::UnpackUnsignedInt(&x[2]);\r
+ unsigned int tempValue = BgzfData::UnpackUnsignedInt(&x[8]);\r
bAlignment.Bin = tempValue >> 16;\r
bAlignment.MapQuality = tempValue >> 8 & 0xff;\r
- queryNameLength = tempValue & 0xff;\r
+ bAlignment.SupportData.QueryNameLength = tempValue & 0xff;\r
\r
- tempValue = BgzfData::UnpackUnsignedInt(&x[3]);\r
+ tempValue = BgzfData::UnpackUnsignedInt(&x[12]);\r
bAlignment.AlignmentFlag = tempValue >> 16;\r
- numCigarOperations = tempValue & 0xffff;\r
+ bAlignment.SupportData.NumCigarOperations = tempValue & 0xffff;\r
\r
- querySequenceLength = BgzfData::UnpackUnsignedInt(&x[4]);\r
- bAlignment.MateRefID = BgzfData::UnpackSignedInt(&x[5]);\r
- bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[6]);\r
- bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[7]);\r
+ bAlignment.SupportData.QuerySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]);\r
+ bAlignment.MateRefID = BgzfData::UnpackSignedInt(&x[20]);\r
+ bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]);\r
+ bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[28]);\r
\r
- // calculate lengths/offsets\r
- const unsigned int dataLength = blockLength - BAM_CORE_SIZE;\r
- const unsigned int cigarDataOffset = queryNameLength;\r
- const unsigned int seqDataOffset = cigarDataOffset + (numCigarOperations * 4);\r
- const unsigned int qualDataOffset = seqDataOffset + (querySequenceLength+1)/2;\r
- const unsigned int tagDataOffset = qualDataOffset + querySequenceLength;\r
- const unsigned int tagDataLen = dataLength - tagDataOffset;\r
-\r
- // set up destination buffers for character data\r
- char* allCharData = (char*)calloc(sizeof(char), dataLength);\r
- uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);\r
- char* seqData = ((char*)allCharData) + seqDataOffset;\r
- char* qualData = ((char*)allCharData) + qualDataOffset;\r
- char* tagData = ((char*)allCharData) + tagDataOffset;\r
-\r
- // get character data - make sure proper data size was read\r
- if ( mBGZF.Read(allCharData, dataLength) != (signed int)dataLength) { return false; }\r
- else {\r
-\r
- bytesRead += dataLength;\r
-\r
- // clear out any previous string data\r
- bAlignment.Name.clear(;)\r
- bAlignment.QueryBases.clear();\r
- bAlignment.Qualities.clear();\r
- bAlignment.AlignedBases.clear();\r
- bAlignment.CigarData.clear();\r
- bAlignment.TagData.clear();\r
-\r
- // save name\r
- bAlignment.Name = (string)((const char*)(allCharData));\r
-\r
- // save query sequence\r
- // -----------------------\r
- // Added: 3-25-2010 DWB\r
- // Improved: reduced repeated memory allocations as string grows\r
- bAlignment.QueryBases.reserve(querySequenceLength);\r
- // -----------------------\r
- \r
- for (unsigned int i = 0; i < querySequenceLength; ++i) {\r
- char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];\r
- bAlignment.QueryBases.append( 1, singleBase );\r
- }\r
-\r
- // save sequence length\r
- bAlignment.Length = bAlignment.QueryBases.length();\r
-\r
- // save qualities, convert from numeric QV to FASTQ character\r
- // -----------------------\r
- // Added: 3-25-2010 DWB\r
- // Improved: reduced repeated memory allocations as string grows\r
- bAlignment.Qualities.reserve(querySequenceLength);\r
- // -----------------------\r
- \r
- for (unsigned int i = 0; i < querySequenceLength; ++i) {\r
- char singleQuality = (char)(qualData[i]+33);\r
- bAlignment.Qualities.append( 1, singleQuality );\r
- }\r
-\r
- // save CIGAR-related data;\r
- // -----------------------\r
- // Added: 3-25-2010 DWB\r
- // Improved: reduced repeated memory allocations as string grows\r
- bAlignment.AlignedBases.reserve(querySequenceLength);\r
- // -----------------------\r
- \r
- int k = 0;\r
- for (unsigned int i = 0; i < numCigarOperations; ++i) {\r
-\r
- if ( IsBigEndian ) { SwapEndian_32(cigarData[i]); }\r
- \r
- // build CigarOp struct\r
- CigarOp op;\r
- op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);\r
- op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];\r
-\r
- // save CigarOp\r
- bAlignment.CigarData.push_back(op);\r
-\r
- // build AlignedBases string\r
- switch (op.Type) {\r
-\r
- case ('M') :\r
- case ('I') : \r
- bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases\r
- // fall through\r
- \r
- case ('S') : \r
- k += op.Length; // for 'S' - skip over query bases\r
- break;\r
-\r
- case ('D') : \r
- bAlignment.AlignedBases.append( op.Length, '-' ); // for 'D' - write gap character\r
- break;\r
-\r
- case ('P') : \r
- bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character;\r
- break;\r
-\r
- case ('N') : \r
- bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence\r
- // -----------------------\r
- // Removed: 3-25-2010 DWB\r
- // Contributed: ARQ\r
- // Fixed: compliance with actual 'N' definition in BAM spec\r
- // k += op.Length;\r
- // -----------------------\r
- break;\r
-\r
- case ('H') : \r
- break; // for 'H' - do nothing, move to next op\r
-\r
- default : \r
- printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here\r
- free(allCharData);\r
- exit(1);\r
- }\r
- }\r
-\r
- // -----------------------\r
- // Added: 3-25-2010 DWB\r
- // Fixed: endian-correctness for tag data\r
- // -----------------------\r
- if ( IsBigEndian ) {\r
- int i = 0;\r
- while ( i < tagDataLen ) {\r
- \r
- i += 2; // skip tag type (e.g. "RG", "NM", etc)\r
- uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning \r
- ++i; // skip value type\r
- \r
- switch (type) {\r
- \r
- case('A') :\r
- case('C') : \r
- ++i;\r
- break;\r
- \r
- case('S') : \r
- SwapEndian_16p(&tagData[i]); \r
- i+=2; // sizeof(uint16_t)\r
- break;\r
- \r
- case('F') :\r
- case('I') : \r
- SwapEndian_32p(&tagData[i]);\r
- i+=4; // sizeof(uint32_t)\r
- break;\r
- \r
- case('D') : \r
- SwapEndian_64p(&tagData[i]);\r
- i+=8; // sizeof(uint64_t)\r
- break;\r
- \r
- case('H') :\r
- case('Z') : \r
- while (tagData[i]) { ++i; }\r
- ++i; // increment one more for null terminator\r
- break;\r
- \r
- default : \r
- printf("ERROR: Invalid tag value type\n"); // shouldn't get here\r
- free(allCharData);\r
- exit(1); \r
- }\r
- }\r
- }\r
+ // set BamAlignment length\r
+ bAlignment.Length = bAlignment.SupportData.QuerySequenceLength;\r
+ \r
+ // read in character data - make sure proper data size was read\r
+ bool readCharDataOK = false;\r
+ const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE;\r
+ char* allCharData = (char*)calloc(sizeof(char), dataLength);\r
+ \r
+ if ( mBGZF.Read(allCharData, dataLength) == (signed int)dataLength) { \r
+ \r
+ // store 'allCharData' in supportData structure\r
+ bAlignment.SupportData.AllCharData.assign((const char*)allCharData, dataLength);\r
\r
- // store tag data\r
- bAlignment.TagData.resize(tagDataLen);\r
- memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLen);\r
+ // set success flag\r
+ readCharDataOK = true;\r
}\r
\r
free(allCharData);\r
- return true;\r
+ return readCharDataOK;\r
}\r
\r
// loads reference data from BAM file\r
}\r
}\r
\r
-// merges 'alignment chunks' in BAM bin (used for index building)\r
-void BamReader::BamReaderPrivate::MergeChunks(void) {\r
-\r
- // iterate over reference enties\r
- BamIndex::iterator indexIter = Index.begin();\r
- BamIndex::iterator indexEnd = Index.end();\r
- for ( ; indexIter != indexEnd; ++indexIter ) {\r
-\r
- // get BAM bin map for this reference\r
- ReferenceIndex& refIndex = (*indexIter);\r
- BamBinMap& bamBinMap = refIndex.Bins;\r
-\r
- // iterate over BAM bins\r
- BamBinMap::iterator binIter = bamBinMap.begin();\r
- BamBinMap::iterator binEnd = bamBinMap.end();\r
- for ( ; binIter != binEnd; ++binIter ) {\r
-\r
- // get chunk vector for this bin\r
- ChunkVector& binChunks = (*binIter).second;\r
- if ( binChunks.size() == 0 ) { continue; }\r
-\r
- ChunkVector mergedChunks;\r
- mergedChunks.push_back( binChunks[0] );\r
-\r
- // iterate over chunks\r
- int i = 0;\r
- ChunkVector::iterator chunkIter = binChunks.begin();\r
- ChunkVector::iterator chunkEnd = binChunks.end();\r
- for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) {\r
-\r
- // get 'currentChunk' based on numeric index\r
- Chunk& currentChunk = mergedChunks[i];\r
-\r
- // get iteratorChunk based on vector iterator\r
- Chunk& iteratorChunk = (*chunkIter);\r
-\r
- // if currentChunk.Stop(shifted) == iterator Chunk.Start(shifted)\r
- if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 ) {\r
-\r
- // set currentChunk.Stop to iteratorChunk.Stop\r
- currentChunk.Stop = iteratorChunk.Stop;\r
- }\r
-\r
- // otherwise\r
- else {\r
- // set currentChunk + 1 to iteratorChunk\r
- mergedChunks.push_back(iteratorChunk);\r
- ++i;\r
- }\r
- }\r
-\r
- // saved merged chunk vector\r
- (*binIter).second = mergedChunks;\r
- }\r
- }\r
-}\r
-\r
// opens BAM file (and index)\r
-void BamReader::BamReaderPrivate::Open(const string& filename, const string& indexFilename) {\r
+bool BamReader::BamReaderPrivate::Open(const string& filename, const string& indexFilename) {\r
\r
Filename = filename;\r
IndexFilename = indexFilename;\r
\r
- // open the BGZF file for reading, retrieve header text & reference data\r
- mBGZF.Open(filename, "rb");\r
+ // open the BGZF file for reading, return false on failure\r
+ if ( !mBGZF.Open(filename, "rb") ) \r
+ return false;\r
+ \r
+ // retrieve header text & reference data\r
LoadHeaderData();\r
LoadReferenceData();\r
\r
AlignmentsBeginOffset = mBGZF.Tell();\r
\r
// open index file & load index data (if exists)\r
- if ( !IndexFilename.empty() ) {\r
+ if ( !IndexFilename.empty() )\r
LoadIndex();\r
- }\r
+ \r
+ // return success\r
+ return true;\r
}\r
\r
// returns BAM file pointer to beginning of alignment data\r
bool BamReader::BamReaderPrivate::Rewind(void) {\r
-\r
- // find first reference that has alignments in the BAM file\r
- int refID = 0;\r
- int refCount = References.size();\r
- for ( ; refID < refCount; ++refID ) {\r
- if ( References.at(refID).RefHasAlignments ) { break; }\r
- }\r
-\r
- // store default bounds for first alignment\r
- CurrentRefID = refID;\r
- CurrentLeft = 0;\r
- IsRegionSpecified = false;\r
-\r
- // return success/failure of seek\r
+ \r
+ // rewind to first alignment\r
+ if ( !mBGZF.Seek(AlignmentsBeginOffset) ) return false;\r
+ \r
+ // retrieve first alignment data\r
+ BamAlignment al;\r
+ if ( !LoadNextAlignment(al) ) return false;\r
+ \r
+ // reset default region info using first alignment in file\r
+ Region.LeftRefID = al.RefID;\r
+ Region.LeftPosition = al.Position;\r
+ Region.RightRefID = -1;\r
+ Region.RightPosition = -1;\r
+ IsLeftBoundSpecified = false;\r
+ IsRightBoundSpecified = false; \r
+\r
+ // rewind back to before first alignment\r
+ // return success/fail of seek\r
return mBGZF.Seek(AlignmentsBeginOffset);\r
}\r
\r
-// rounds value up to next power-of-2 (used in index building)\r
-void BamReader::BamReaderPrivate::Roundup32(int& value) { \r
- --value;\r
- value |= value >> 1;\r
- value |= value >> 2;\r
- value |= value >> 4;\r
- value |= value >> 8;\r
- value |= value >> 16;\r
- ++value;\r
-}\r
-\r
-// saves index data to BAM index file (".bai"), returns success/fail\r
-bool BamReader::BamReaderPrivate::WriteIndex(void) {\r
-\r
- IndexFilename = Filename + ".bai";\r
- FILE* indexStream = fopen(IndexFilename.c_str(), "wb");\r
- if ( indexStream == 0 ) {\r
- printf("ERROR: Could not open file to save index\n");\r
- return false;\r
- }\r
-\r
- // write BAM index header\r
- fwrite("BAI\1", 1, 4, indexStream);\r
-\r
- // write number of reference sequences\r
- int32_t numReferenceSeqs = Index.size();\r
- if ( IsBigEndian ) { SwapEndian_32(numReferenceSeqs); }\r
- fwrite(&numReferenceSeqs, 4, 1, indexStream);\r
-\r
- // iterate over reference sequences\r
- BamIndex::const_iterator indexIter = Index.begin();\r
- BamIndex::const_iterator indexEnd = Index.end();\r
- for ( ; indexIter != indexEnd; ++ indexIter ) {\r
-\r
- // get reference index data\r
- const ReferenceIndex& refIndex = (*indexIter);\r
- const BamBinMap& binMap = refIndex.Bins;\r
- const LinearOffsetVector& offsets = refIndex.Offsets;\r
-\r
- // write number of bins\r
- int32_t binCount = binMap.size();\r
- if ( IsBigEndian ) { SwapEndian_32(binCount); }\r
- fwrite(&binCount, 4, 1, indexStream);\r
-\r
- // iterate over bins\r
- BamBinMap::const_iterator binIter = binMap.begin();\r
- BamBinMap::const_iterator binEnd = binMap.end();\r
- for ( ; binIter != binEnd; ++binIter ) {\r
-\r
- // get bin data (key and chunk vector)\r
- uint32_t binKey = (*binIter).first;\r
- const ChunkVector& binChunks = (*binIter).second;\r
-\r
- // save BAM bin key\r
- if ( IsBigEndian ) { SwapEndian_32(binKey); }\r
- fwrite(&binKey, 4, 1, indexStream);\r
-\r
- // save chunk count\r
- int32_t chunkCount = binChunks.size();\r
- if ( IsBigEndian ) { SwapEndian_32(chunkCount); }\r
- fwrite(&chunkCount, 4, 1, indexStream);\r
-\r
- // iterate over chunks\r
- ChunkVector::const_iterator chunkIter = binChunks.begin();\r
- ChunkVector::const_iterator chunkEnd = binChunks.end();\r
- for ( ; chunkIter != chunkEnd; ++chunkIter ) {\r
-\r
- // get current chunk data\r
- const Chunk& chunk = (*chunkIter);\r
- uint64_t start = chunk.Start;\r
- uint64_t stop = chunk.Stop;\r
-\r
- if ( IsBigEndian ) {\r
- SwapEndian_64(start);\r
- SwapEndian_64(stop);\r
- }\r
- \r
- // save chunk offsets\r
- fwrite(&start, 8, 1, indexStream);\r
- fwrite(&stop, 8, 1, indexStream);\r
- }\r
- }\r
-\r
- // write linear offsets size\r
- int32_t offsetSize = offsets.size();\r
- if ( IsBigEndian ) { SwapEndian_32(offsetSize); }\r
- fwrite(&offsetSize, 4, 1, indexStream);\r
-\r
- // iterate over linear offsets\r
- LinearOffsetVector::const_iterator offsetIter = offsets.begin();\r
- LinearOffsetVector::const_iterator offsetEnd = offsets.end();\r
- for ( ; offsetIter != offsetEnd; ++offsetIter ) {\r
-\r
- // write linear offset value\r
- uint64_t linearOffset = (*offsetIter);\r
- if ( IsBigEndian ) { SwapEndian_64(linearOffset); }\r
- fwrite(&linearOffset, 8, 1, indexStream);\r
- }\r
- }\r
-\r
- // flush buffer, close file, and return success\r
- fflush(indexStream);\r
- fclose(indexStream);\r
- return true;\r
+// sets a region of interest (with left & right bound reference/position)\r
+// attempts a Jump() to left bound as well\r
+// returns success/failure of Jump()\r
+bool BamReader::BamReaderPrivate::SetRegion(const BamRegion& region) {\r
+ \r
+ // save region of interest\r
+ Region = region;\r
+ \r
+ // set flags\r
+ if ( region.LeftRefID >= 0 && region.LeftPosition >= 0 ) \r
+ IsLeftBoundSpecified = true;\r
+ if ( region.RightRefID >= 0 && region.RightPosition >= 0 ) \r
+ IsRightBoundSpecified = true;\r
+ \r
+ // attempt jump to beginning of region, return success/fail of Jump()\r
+ return Jump( Region.LeftRefID, Region.LeftPosition );\r
}\r