// ***************************************************************************\r
-// BamReader.cpp (c) 2009 Derek Barnett, Michael Strömberg\r
+// BamReader.cpp (c) 2009 Derek Barnett, Michael Str�mberg\r
// Marth Lab, Department of Biology, Boston College\r
// All rights reserved.\r
// ---------------------------------------------------------------------------\r
-// Last modified: 8 December 2009 (DB)\r
+// Last modified: 8 June 2010 (DB)\r
// ---------------------------------------------------------------------------\r
// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
// Institute.\r
// data members\r
// -------------------------------\r
\r
- // general data\r
+ // general file data\r
BgzfData mBGZF;\r
string HeaderText;\r
BamIndex Index;\r
int64_t AlignmentsBeginOffset;\r
string Filename;\r
string IndexFilename;\r
+ \r
+ // system data\r
+ bool IsBigEndian;\r
\r
// user-specified region values\r
bool IsRegionSpecified;\r
// "public" interface\r
// -------------------------------\r
\r
- // flie operations\r
+ // file operations\r
void Close(void);\r
bool Jump(int refID, int position = 0);\r
void Open(const string& filename, const string& indexFilename = "");\r
\r
// access alignment data\r
bool GetNextAlignment(BamAlignment& bAlignment);\r
+ bool GetNextAlignmentCore(BamAlignment& bAlignment, BamAlignmentSupportData& supportData);\r
\r
// access auxiliary data\r
- const string GetHeaderText(void) const;\r
- const int GetReferenceCount(void) const;\r
- const RefVector GetReferenceData(void) const;\r
- const int GetReferenceID(const string& refName) const;\r
+ int GetReferenceID(const string& refName) const;\r
\r
// index operations\r
bool CreateIndex(void);\r
\r
// calculate bins that overlap region ( left to reference end for now )\r
int BinsFromRegion(int refID, int left, uint16_t[MAX_BIN]);\r
- // calculates alignment end position based on starting position and provided CIGAR operations\r
- int CalculateAlignmentEnd(const int& position, const std::vector<CigarOp>& cigarData);\r
+ // fills out character data for BamAlignment data\r
+ bool BuildCharData(BamAlignment& bAlignment, const BamAlignmentSupportData& supportData);\r
// calculate file offset for first alignment chunk overlapping 'left'\r
int64_t GetOffset(int refID, int left);\r
// checks to see if alignment overlaps current region\r
// retrieves header text from BAM file\r
void LoadHeaderData(void);\r
// retrieves BAM alignment under file pointer\r
- bool LoadNextAlignment(BamAlignment& bAlignment);\r
+ bool LoadNextAlignment(BamAlignment& bAlignment, BamAlignmentSupportData& supportData);\r
// builds reference data structure from BAM file\r
void LoadReferenceData(void);\r
\r
bool LoadIndex(void);\r
// simplifies index by merging 'chunks'\r
void MergeChunks(void);\r
- // round-up 32-bit integer to next power-of-2\r
- void Roundup32(int& value);\r
// saves index to BAM index file\r
bool WriteIndex(void);\r
};\r
\r
// access alignment data\r
bool BamReader::GetNextAlignment(BamAlignment& bAlignment) { return d->GetNextAlignment(bAlignment); }\r
+bool BamReader::GetNextAlignmentCore(BamAlignment& bAlignment, BamAlignmentSupportData& supportData) { return d->GetNextAlignmentCore(bAlignment, supportData); }\r
\r
// access auxiliary data\r
-const string BamReader::GetHeaderText(void) const { return d->HeaderText; }\r
-const int BamReader::GetReferenceCount(void) const { return d->References.size(); }\r
+const string BamReader::GetHeaderText(void) const { return d->HeaderText; }\r
+int BamReader::GetReferenceCount(void) const { return d->References.size(); }\r
const RefVector BamReader::GetReferenceData(void) const { return d->References; }\r
-const int BamReader::GetReferenceID(const string& refName) const { return d->GetReferenceID(refName); }\r
+int BamReader::GetReferenceID(const string& refName) const { return d->GetReferenceID(refName); }\r
+const std::string BamReader::GetFilename(void) const { return d->Filename; }\r
\r
// index operations\r
bool BamReader::CreateIndex(void) { return d->CreateIndex(); }\r
, CurrentLeft(0)\r
, DNA_LOOKUP("=ACMGRSVTWYHKDBN")\r
, CIGAR_LOOKUP("MIDNSHP")\r
-{ }\r
+{ \r
+ IsBigEndian = SystemIsBigEndian();\r
+}\r
\r
// destructor\r
BamReader::BamReaderPrivate::~BamReaderPrivate(void) {\r
int BamReader::BamReaderPrivate::BinsFromRegion(int refID, int left, uint16_t list[MAX_BIN]) {\r
\r
// get region boundaries\r
- int32_t begin = left;\r
- int32_t end = References.at(refID).RefLength - 1;\r
+ uint32_t begin = (unsigned int)left;\r
+ uint32_t end = (unsigned int)References.at(refID).RefLength - 1;\r
\r
// initialize list, bin '0' always a valid bin\r
int i = 0;\r
return i;\r
}\r
\r
+bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment, const BamAlignmentSupportData& supportData) {\r
+ \r
+ // calculate character lengths/offsets\r
+ const unsigned int dataLength = supportData.BlockLength - BAM_CORE_SIZE;\r
+ const unsigned int seqDataOffset = supportData.QueryNameLength + (supportData.NumCigarOperations * 4);\r
+ const unsigned int qualDataOffset = seqDataOffset + (supportData.QuerySequenceLength+1)/2;\r
+ const unsigned int tagDataOffset = qualDataOffset + supportData.QuerySequenceLength;\r
+ const unsigned int tagDataLength = dataLength - tagDataOffset;\r
+ \r
+ // set up char buffers\r
+ const char* allCharData = supportData.AllCharData.data();\r
+ const char* seqData = ((const char*)allCharData) + seqDataOffset;\r
+ const char* qualData = ((const char*)allCharData) + qualDataOffset;\r
+ char* tagData = ((char*)allCharData) + tagDataOffset;\r
+ \r
+ // save query sequence\r
+ bAlignment.QueryBases.clear();\r
+ bAlignment.QueryBases.reserve(supportData.QuerySequenceLength);\r
+ for (unsigned int i = 0; i < supportData.QuerySequenceLength; ++i) {\r
+ char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];\r
+ bAlignment.QueryBases.append(1, singleBase);\r
+ }\r
+ \r
+ // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character\r
+ bAlignment.Qualities.clear();\r
+ bAlignment.Qualities.reserve(supportData.QuerySequenceLength);\r
+ for (unsigned int i = 0; i < supportData.QuerySequenceLength; ++i) {\r
+ char singleQuality = (char)(qualData[i]+33);\r
+ bAlignment.Qualities.append(1, singleQuality);\r
+ }\r
+ \r
+ // parse CIGAR to build 'AlignedBases'\r
+ bAlignment.AlignedBases.clear();\r
+ bAlignment.AlignedBases.reserve(supportData.QuerySequenceLength);\r
+ \r
+ int k = 0;\r
+ vector<CigarOp>::const_iterator cigarIter = bAlignment.CigarData.begin();\r
+ vector<CigarOp>::const_iterator cigarEnd = bAlignment.CigarData.end();\r
+ for ( ; cigarIter != cigarEnd; ++cigarIter ) {\r
+ \r
+ const CigarOp& op = (*cigarIter);\r
+ switch(op.Type) {\r
+ \r
+ case ('M') :\r
+ case ('I') :\r
+ bAlignment.AlignedBases.append(bAlignment.QueryBases.substr(k, op.Length)); // for 'M', 'I' - write bases\r
+ // fall through\r
+ \r
+ case ('S') :\r
+ k += op.Length; // for 'S' - soft clip, skip over query bases\r
+ break;\r
+ \r
+ case ('D') :\r
+ bAlignment.AlignedBases.append(op.Length, '-'); // for 'D' - write gap character\r
+ break;\r
+ \r
+ case ('P') :\r
+ bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character\r
+ break;\r
+ \r
+ case ('N') :\r
+ bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in original query sequence\r
+ // k+=op.Length; \r
+ break;\r
+ \r
+ case ('H') :\r
+ break; // for 'H' - hard clip, do nothing to AlignedBases, move to next op\r
+ \r
+ default:\r
+ printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here\r
+ exit(1);\r
+ }\r
+ }\r
+ \r
+ // -----------------------\r
+ // Added: 3-25-2010 DWB\r
+ // Fixed: endian-correctness for tag data\r
+ // -----------------------\r
+ if ( IsBigEndian ) {\r
+ int i = 0;\r
+ while ( (unsigned int)i < tagDataLength ) {\r
+ \r
+ i += 2; // skip tag type (e.g. "RG", "NM", etc)\r
+ uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning \r
+ ++i; // skip value type\r
+ \r
+ switch (type) {\r
+ \r
+ case('A') :\r
+ case('C') : \r
+ ++i;\r
+ break;\r
+\r
+ case('S') : \r
+ SwapEndian_16p(&tagData[i]); \r
+ i+=2; // sizeof(uint16_t)\r
+ break;\r
+ \r
+ case('F') :\r
+ case('I') : \r
+ SwapEndian_32p(&tagData[i]);\r
+ i+=4; // sizeof(uint32_t)\r
+ break;\r
+ \r
+ case('D') : \r
+ SwapEndian_64p(&tagData[i]);\r
+ i+=8; // sizeof(uint64_t) \r
+ break;\r
+ \r
+ case('H') :\r
+ case('Z') : \r
+ while (tagData[i]) { ++i; }\r
+ ++i; // increment one more for null terminator\r
+ break;\r
+ \r
+ default : \r
+ printf("ERROR: Invalid tag value type\n"); // shouldn't get here\r
+ exit(1);\r
+ }\r
+ }\r
+ }\r
+ \r
+ // store TagData\r
+ bAlignment.TagData.clear();\r
+ bAlignment.TagData.resize(tagDataLength);\r
+ memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLength);\r
+ \r
+ // return success\r
+ return true;\r
+}\r
+\r
// populates BAM index data structure from BAM file data\r
bool BamReader::BamReaderPrivate::BuildIndex(void) {\r
\r
return Rewind();\r
}\r
\r
-// calculates alignment end position based on starting position and provided CIGAR operations\r
-int BamReader::BamReaderPrivate::CalculateAlignmentEnd(const int& position, const vector<CigarOp>& cigarData) {\r
-\r
- // initialize alignment end to starting position\r
- int alignEnd = position;\r
-\r
- // iterate over cigar operations\r
- vector<CigarOp>::const_iterator cigarIter = cigarData.begin();\r
- vector<CigarOp>::const_iterator cigarEnd = cigarData.end();\r
- for ( ; cigarIter != cigarEnd; ++cigarIter) {\r
- char cigarType = (*cigarIter).Type;\r
- if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) {\r
- alignEnd += (*cigarIter).Length;\r
- }\r
- }\r
- return alignEnd;\r
-}\r
-\r
\r
// clear index data structure\r
void BamReader::BamReaderPrivate::ClearIndex(void) {\r
// clear out index\r
ClearIndex();\r
\r
- // build (& save) index from BAM file\r
+ // build (& save) index from BAM file\r
bool ok = true;\r
ok &= BuildIndex();\r
ok &= WriteIndex();\r
\r
- // return success/fail\r
+ // return success/fail\r
return ok;\r
}\r
\r
// returns RefID for given RefName (returns References.size() if not found)\r
-const int BamReader::BamReaderPrivate::GetReferenceID(const string& refName) const {\r
+int BamReader::BamReaderPrivate::GetReferenceID(const string& refName) const {\r
\r
// retrieve names from reference data\r
vector<string> refNames;\r
// get next alignment (from specified region, if given)\r
bool BamReader::BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) {\r
\r
+ BamAlignmentSupportData supportData;\r
+ \r
// if valid alignment available\r
- if ( LoadNextAlignment(bAlignment) ) {\r
+ if ( LoadNextAlignment(bAlignment, supportData) ) {\r
\r
// if region not specified, return success\r
- if ( !IsRegionSpecified ) { return true; }\r
+ if ( !IsRegionSpecified ) { \r
+ bool ok = BuildCharData(bAlignment, supportData);\r
+ return ok; \r
+ }\r
\r
// load next alignment until region overlap is found\r
while ( !IsOverlap(bAlignment) ) {\r
// if no valid alignment available (likely EOF) return failure\r
- if ( !LoadNextAlignment(bAlignment) ) { return false; }\r
+ if ( !LoadNextAlignment(bAlignment, supportData) ) return false;\r
+ }\r
+\r
+ // return success (alignment found that overlaps region)\r
+ bool ok = BuildCharData(bAlignment, supportData);\r
+ return ok;\r
+ }\r
+\r
+ // no valid alignment\r
+ else \r
+ return false;\r
+}\r
+\r
+// retrieves next available alignment core data (returns success/fail)\r
+// ** DOES NOT parse any character data (bases, qualities, tag data)\r
+// these can be accessed, if necessary, from the supportData \r
+// useful for operations requiring ONLY positional or other alignment-related information\r
+bool BamReader::BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment, BamAlignmentSupportData& supportData) {\r
+\r
+ // if valid alignment available\r
+ if ( LoadNextAlignment(bAlignment, supportData) ) {\r
+\r
+ // if region not specified, return success\r
+ if ( !IsRegionSpecified ) return true;\r
+\r
+ // load next alignment until region overlap is found\r
+ while ( !IsOverlap(bAlignment) ) {\r
+ // if no valid alignment available (likely EOF) return failure\r
+ if ( !LoadNextAlignment(bAlignment, supportData) ) return false;\r
}\r
\r
// return success (alignment found that overlaps region)\r
}\r
\r
// no valid alignment\r
- else { return false; }\r
+ else\r
+ return false;\r
}\r
\r
// calculate closest indexed file offset for region specified\r
\r
// get minimum offset to consider\r
const LinearOffsetVector& offsets = refIndex.Offsets;\r
- uint64_t minOffset = ( (left>>BAM_LIDX_SHIFT) >= offsets.size() ) ? 0 : offsets.at(left>>BAM_LIDX_SHIFT);\r
+ uint64_t minOffset = ( (unsigned int)(left>>BAM_LIDX_SHIFT) >= offsets.size() ) ? 0 : offsets.at(left>>BAM_LIDX_SHIFT);\r
\r
// store offsets to beginning of alignment 'chunks'\r
std::vector<int64_t> chunkStarts;\r
{\r
// get converted offsets\r
int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT;\r
- int endOffset = ( CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData) - 1) >> BAM_LIDX_SHIFT;\r
+ int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT;\r
\r
// resize vector if necessary\r
int oldSize = offsets.size();\r
int newSize = endOffset + 1;\r
- if ( oldSize < newSize ) { \r
- Roundup32(newSize);\r
- offsets.resize(newSize, 0);\r
- }\r
+ if ( oldSize < newSize ) { offsets.resize(newSize, 0); }\r
\r
// store offset\r
for(int i = beginOffset + 1; i <= endOffset ; ++i) {\r
if ( bAlignment.Position >= CurrentLeft) { return true; }\r
\r
// return whether alignment end overlaps left boundary\r
- return ( CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData) >= CurrentLeft );\r
+ return ( bAlignment.GetEndPosition() >= CurrentLeft );\r
}\r
\r
// jumps to specified region(refID, leftBound) in BAM file, returns success/fail\r
// if data exists for this reference and position is valid \r
if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) {\r
\r
- // set current region\r
+ // set current region\r
CurrentRefID = refID;\r
CurrentLeft = position;\r
IsRegionSpecified = true;\r
\r
- // calculate offset\r
+ // calculate offset\r
int64_t offset = GetOffset(CurrentRefID, CurrentLeft);\r
\r
- // if in valid offset, return failure\r
+ // if in valid offset, return failure\r
if ( offset == -1 ) { return false; }\r
\r
- // otherwise return success of seek operation\r
+ // otherwise return success of seek operation\r
else { return mBGZF.Seek(offset); }\r
}\r
\r
- // invalid jump request parameters, return failure\r
+ // invalid jump request parameters, return failure\r
return false;\r
}\r
\r
\r
// get BAM header text length\r
mBGZF.Read(buffer, 4);\r
- const unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer);\r
-\r
+ unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer);\r
+ if ( IsBigEndian ) { SwapEndian_32(headerTextLength); }\r
+ \r
// get BAM header text\r
char* headerText = (char*)calloc(headerTextLength + 1, 1);\r
mBGZF.Read(headerText, headerTextLength);\r
return false;\r
}\r
\r
+ size_t elementsRead = 0;\r
+ \r
// see if index is valid BAM index\r
char magic[4];\r
- fread(magic, 1, 4, indexStream);\r
+ elementsRead = fread(magic, 1, 4, indexStream);\r
if (strncmp(magic, "BAI\1", 4)) {\r
printf("Problem with index file - invalid format.\n");\r
fclose(indexStream);\r
\r
// get number of reference sequences\r
uint32_t numRefSeqs;\r
- fread(&numRefSeqs, 4, 1, indexStream);\r
-\r
+ elementsRead = fread(&numRefSeqs, 4, 1, indexStream);\r
+ if ( IsBigEndian ) { SwapEndian_32(numRefSeqs); }\r
+ \r
// intialize space for BamIndex data structure\r
Index.reserve(numRefSeqs);\r
\r
\r
// get number of bins for this reference sequence\r
int32_t numBins;\r
- fread(&numBins, 4, 1, indexStream);\r
+ elementsRead = fread(&numBins, 4, 1, indexStream);\r
+ if ( IsBigEndian ) { SwapEndian_32(numBins); }\r
\r
if (numBins > 0) {\r
RefData& refEntry = References[i];\r
\r
// get binID\r
uint32_t binID;\r
- fread(&binID, 4, 1, indexStream);\r
+ elementsRead = fread(&binID, 4, 1, indexStream);\r
\r
// get number of regionChunks in this bin\r
uint32_t numChunks;\r
- fread(&numChunks, 4, 1, indexStream);\r
+ elementsRead = fread(&numChunks, 4, 1, indexStream);\r
\r
+ if ( IsBigEndian ) { \r
+ SwapEndian_32(binID);\r
+ SwapEndian_32(numChunks);\r
+ }\r
+ \r
// intialize ChunkVector\r
ChunkVector regionChunks;\r
regionChunks.reserve(numChunks);\r
// get chunk boundaries (left, right)\r
uint64_t left;\r
uint64_t right;\r
- fread(&left, 8, 1, indexStream);\r
- fread(&right, 8, 1, indexStream);\r
+ elementsRead = fread(&left, 8, 1, indexStream);\r
+ elementsRead = fread(&right, 8, 1, indexStream);\r
\r
+ if ( IsBigEndian ) {\r
+ SwapEndian_64(left);\r
+ SwapEndian_64(right);\r
+ }\r
+ \r
// save ChunkPair\r
regionChunks.push_back( Chunk(left, right) );\r
}\r
\r
// get number of linear offsets\r
int32_t numLinearOffsets;\r
- fread(&numLinearOffsets, 4, 1, indexStream);\r
+ elementsRead = fread(&numLinearOffsets, 4, 1, indexStream);\r
+ if ( IsBigEndian ) { SwapEndian_32(numLinearOffsets); }\r
\r
// intialize LinearOffsetVector\r
LinearOffsetVector offsets;\r
uint64_t linearOffset;\r
for (int j = 0; j < numLinearOffsets; ++j) {\r
// read a linear offset & store\r
- fread(&linearOffset, 8, 1, indexStream);\r
+ elementsRead = fread(&linearOffset, 8, 1, indexStream);\r
+ if ( IsBigEndian ) { SwapEndian_64(linearOffset); }\r
offsets.push_back(linearOffset);\r
}\r
\r
}\r
\r
// populates BamAlignment with alignment data under file pointer, returns success/fail\r
-bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) {\r
+bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment, BamAlignmentSupportData& supportData) {\r
\r
// read in the 'block length' value, make sure it's not zero\r
char buffer[4];\r
mBGZF.Read(buffer, 4);\r
- const unsigned int blockLength = BgzfData::UnpackUnsignedInt(buffer);\r
- if ( blockLength == 0 ) { return false; }\r
-\r
- // keep track of bytes read as method progresses\r
- int bytesRead = 4;\r
+ supportData.BlockLength = BgzfData::UnpackUnsignedInt(buffer);\r
+ if ( IsBigEndian ) { SwapEndian_32(supportData.BlockLength); }\r
+ if ( supportData.BlockLength == 0 ) { return false; }\r
\r
// read in core alignment data, make sure the right size of data was read\r
char x[BAM_CORE_SIZE];\r
if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }\r
- bytesRead += BAM_CORE_SIZE;\r
-\r
- // set BamAlignment 'core' data and character data lengths\r
- unsigned int tempValue;\r
- unsigned int queryNameLength;\r
- unsigned int numCigarOperations;\r
- unsigned int querySequenceLength;\r
\r
- bAlignment.RefID = BgzfData::UnpackSignedInt(&x[0]);\r
+ if ( IsBigEndian ) {\r
+ for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) ) { \r
+ SwapEndian_32p(&x[i]); \r
+ }\r
+ }\r
+ \r
+ // set BamAlignment 'core' and 'support' data\r
+ bAlignment.RefID = BgzfData::UnpackSignedInt(&x[0]); \r
bAlignment.Position = BgzfData::UnpackSignedInt(&x[4]);\r
-\r
- tempValue = BgzfData::UnpackUnsignedInt(&x[8]);\r
+ \r
+ unsigned int tempValue = BgzfData::UnpackUnsignedInt(&x[8]);\r
bAlignment.Bin = tempValue >> 16;\r
bAlignment.MapQuality = tempValue >> 8 & 0xff;\r
- queryNameLength = tempValue & 0xff;\r
+ supportData.QueryNameLength = tempValue & 0xff;\r
\r
- tempValue = BgzfData::UnpackUnsignedInt(&x[12]);\r
+ tempValue = BgzfData::UnpackUnsignedInt(&x[12]);\r
bAlignment.AlignmentFlag = tempValue >> 16;\r
- numCigarOperations = tempValue & 0xffff;\r
+ supportData.NumCigarOperations = tempValue & 0xffff;\r
\r
- querySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]);\r
+ supportData.QuerySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]);\r
bAlignment.MateRefID = BgzfData::UnpackSignedInt(&x[20]);\r
bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]);\r
bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[28]);\r
-\r
- // calculate lengths/offsets\r
- const unsigned int dataLength = blockLength - BAM_CORE_SIZE;\r
- const unsigned int cigarDataOffset = queryNameLength;\r
- const unsigned int seqDataOffset = cigarDataOffset + (numCigarOperations * 4);\r
- const unsigned int qualDataOffset = seqDataOffset + (querySequenceLength+1)/2;\r
- const unsigned int tagDataOffset = qualDataOffset + querySequenceLength;\r
- const unsigned int tagDataLen = dataLength - tagDataOffset;\r
-\r
- // set up destination buffers for character data\r
- char* allCharData = (char*)calloc(sizeof(char), dataLength);\r
- uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);\r
- char* seqData = ((char*)allCharData) + seqDataOffset;\r
- char* qualData = ((char*)allCharData) + qualDataOffset;\r
- char* tagData = ((char*)allCharData) + tagDataOffset;\r
-\r
- // get character data - make sure proper data size was read\r
+ \r
+ // store 'all char data' and cigar ops\r
+ const unsigned int dataLength = supportData.BlockLength - BAM_CORE_SIZE;\r
+ const unsigned int cigarDataOffset = supportData.QueryNameLength;\r
+ \r
+ char* allCharData = (char*)calloc(sizeof(char), dataLength);\r
+ uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);\r
+ \r
+ // read in character data - make sure proper data size was read\r
if ( mBGZF.Read(allCharData, dataLength) != (signed int)dataLength) { return false; }\r
else {\r
-\r
- bytesRead += dataLength;\r
-\r
- // clear out any previous string data\r
- bAlignment.Name.clear();\r
- bAlignment.QueryBases.clear();\r
- bAlignment.Qualities.clear();\r
- bAlignment.AlignedBases.clear();\r
+ \r
+ // store alignment name and length\r
+ bAlignment.Name.assign((const char*)(allCharData));\r
+ bAlignment.Length = supportData.QuerySequenceLength;\r
+ \r
+ // store remaining 'allCharData' in supportData structure\r
+ supportData.AllCharData.assign((const char*)allCharData, dataLength);\r
+ \r
+ // save CigarOps for BamAlignment\r
bAlignment.CigarData.clear();\r
- bAlignment.TagData.clear();\r
-\r
- // save name\r
- bAlignment.Name = (string)((const char*)(allCharData));\r
-\r
- // save query sequence\r
- for (unsigned int i = 0; i < querySequenceLength; ++i) {\r
- char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];\r
- bAlignment.QueryBases.append( 1, singleBase );\r
- }\r
-\r
- // save sequence length\r
- bAlignment.Length = bAlignment.QueryBases.length();\r
-\r
- // save qualities, convert from numeric QV to FASTQ character\r
- for (unsigned int i = 0; i < querySequenceLength; ++i) {\r
- char singleQuality = (char)(qualData[i]+33);\r
- bAlignment.Qualities.append( 1, singleQuality );\r
- }\r
-\r
- // save CIGAR-related data;\r
- int k = 0;\r
- for (unsigned int i = 0; i < numCigarOperations; ++i) {\r
+ for (unsigned int i = 0; i < supportData.NumCigarOperations; ++i) {\r
\r
- // build CigarOp struct\r
+ // swap if necessary\r
+ if ( IsBigEndian ) { SwapEndian_32(cigarData[i]); }\r
+ \r
+ // build CigarOp structure\r
CigarOp op;\r
op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);\r
op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];\r
\r
// save CigarOp\r
bAlignment.CigarData.push_back(op);\r
-\r
- // build AlignedBases string\r
- switch (op.Type) {\r
-\r
- case ('M') :\r
- case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases\r
- case ('S') : k += op.Length; // for 'S' - skip over query bases\r
- break;\r
-\r
- case ('D') : bAlignment.AlignedBases.append( op.Length, '-' ); // for 'D' - write gap character\r
- break;\r
-\r
- case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character;\r
- break;\r
-\r
- case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence\r
- k += op.Length;\r
- break;\r
-\r
- case ('H') : break; // for 'H' - do nothing, move to next op\r
-\r
- default : printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here\r
- exit(1);\r
- }\r
}\r
-\r
- // read in the tag data\r
- bAlignment.TagData.resize(tagDataLen);\r
- memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLen);\r
}\r
\r
free(allCharData);\r
// get number of reference sequences\r
char buffer[4];\r
mBGZF.Read(buffer, 4);\r
- const unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer);\r
+ unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer);\r
+ if ( IsBigEndian ) { SwapEndian_32(numberRefSeqs); }\r
if (numberRefSeqs == 0) { return; }\r
References.reserve((int)numberRefSeqs);\r
\r
\r
// get length of reference name\r
mBGZF.Read(buffer, 4);\r
- const unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer);\r
+ unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer);\r
+ if ( IsBigEndian ) { SwapEndian_32(refNameLength); }\r
char* refName = (char*)calloc(refNameLength, 1);\r
\r
// get reference name and reference sequence length\r
mBGZF.Read(refName, refNameLength);\r
mBGZF.Read(buffer, 4);\r
- const int refLength = BgzfData::UnpackSignedInt(buffer);\r
+ int refLength = BgzfData::UnpackSignedInt(buffer);\r
+ if ( IsBigEndian ) { SwapEndian_32(refLength); }\r
\r
// store data for reference\r
RefData aReference;\r
return mBGZF.Seek(AlignmentsBeginOffset);\r
}\r
\r
-// rounds value up to next power-of-2 (used in index building)\r
-void BamReader::BamReaderPrivate::Roundup32(int& value) { \r
- --value;\r
- value |= value >> 1;\r
- value |= value >> 2;\r
- value |= value >> 4;\r
- value |= value >> 8;\r
- value |= value >> 16;\r
- ++value;\r
-}\r
-\r
// saves index data to BAM index file (".bai"), returns success/fail\r
bool BamReader::BamReaderPrivate::WriteIndex(void) {\r
\r
\r
// write number of reference sequences\r
int32_t numReferenceSeqs = Index.size();\r
+ if ( IsBigEndian ) { SwapEndian_32(numReferenceSeqs); }\r
fwrite(&numReferenceSeqs, 4, 1, indexStream);\r
\r
// iterate over reference sequences\r
\r
// write number of bins\r
int32_t binCount = binMap.size();\r
+ if ( IsBigEndian ) { SwapEndian_32(binCount); }\r
fwrite(&binCount, 4, 1, indexStream);\r
\r
// iterate over bins\r
for ( ; binIter != binEnd; ++binIter ) {\r
\r
// get bin data (key and chunk vector)\r
- const uint32_t& binKey = (*binIter).first;\r
+ uint32_t binKey = (*binIter).first;\r
const ChunkVector& binChunks = (*binIter).second;\r
\r
// save BAM bin key\r
+ if ( IsBigEndian ) { SwapEndian_32(binKey); }\r
fwrite(&binKey, 4, 1, indexStream);\r
\r
// save chunk count\r
int32_t chunkCount = binChunks.size();\r
+ if ( IsBigEndian ) { SwapEndian_32(chunkCount); }\r
fwrite(&chunkCount, 4, 1, indexStream);\r
\r
// iterate over chunks\r
\r
// get current chunk data\r
const Chunk& chunk = (*chunkIter);\r
- const uint64_t& start = chunk.Start;\r
- const uint64_t& stop = chunk.Stop;\r
+ uint64_t start = chunk.Start;\r
+ uint64_t stop = chunk.Stop;\r
\r
+ if ( IsBigEndian ) {\r
+ SwapEndian_64(start);\r
+ SwapEndian_64(stop);\r
+ }\r
+ \r
// save chunk offsets\r
fwrite(&start, 8, 1, indexStream);\r
fwrite(&stop, 8, 1, indexStream);\r
\r
// write linear offsets size\r
int32_t offsetSize = offsets.size();\r
+ if ( IsBigEndian ) { SwapEndian_32(offsetSize); }\r
fwrite(&offsetSize, 4, 1, indexStream);\r
\r
// iterate over linear offsets\r
for ( ; offsetIter != offsetEnd; ++offsetIter ) {\r
\r
// write linear offset value\r
- const uint64_t& linearOffset = (*offsetIter);\r
+ uint64_t linearOffset = (*offsetIter);\r
+ if ( IsBigEndian ) { SwapEndian_64(linearOffset); }\r
fwrite(&linearOffset, 8, 1, indexStream);\r
}\r
}\r