// ***************************************************************************\r
-// BamAux.h (c) 2009 Derek Barnett, Michael Strömberg\r
+// BamAux.h (c) 2009 Derek Barnett, Michael Str�mberg\r
// Marth Lab, Department of Biology, Boston College\r
// All rights reserved.\r
// ---------------------------------------------------------------------------\r
-// Last modified: 11 Janaury 2010 (DB)\r
+// Last modified: 29 March 2010 (DB)\r
// ---------------------------------------------------------------------------\r
// Provides the basic constants, data structures, etc. for using BAM files\r
// ***************************************************************************\r
// Platform-specific type definitions\r
#ifndef BAMTOOLS_TYPES\r
#define BAMTOOLS_TYPES\r
- #ifdef _MSC_VER\r
- typedef char int8_t;\r
- typedef unsigned char uint8_t;\r
- typedef short int16_t;\r
- typedef unsigned short uint16_t;\r
- typedef int int32_t;\r
- typedef unsigned int uint32_t;\r
- typedef long long int64_t;\r
- typedef unsigned long long uint64_t;\r
- #else\r
- #include <stdint.h>\r
- #endif\r
+ #ifdef _MSC_VER\r
+ typedef char int8_t;\r
+ typedef unsigned char uint8_t;\r
+ typedef short int16_t;\r
+ typedef unsigned short uint16_t;\r
+ typedef int int32_t;\r
+ typedef unsigned int uint32_t;\r
+ typedef long long int64_t;\r
+ typedef unsigned long long uint64_t;\r
+ #else\r
+ #include <stdint.h>\r
+ #endif\r
#endif // BAMTOOLS_TYPES\r
\r
namespace BamTools {\r
// Returns true if alignment is second mate on read\r
bool IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); }\r
\r
- // Manipulate alignment flag\r
- public:\r
- // Sets "PCR duplicate" bit \r
+ // Manipulate alignment flag\r
+ public:\r
+ // Sets "PCR duplicate" bit \r
void SetIsDuplicate(bool ok) { if (ok) AlignmentFlag |= DUPLICATE; else AlignmentFlag &= ~DUPLICATE; }\r
// Sets "failed quality control" bit\r
void SetIsFailedQC(bool ok) { if (ok) AlignmentFlag |= QC_FAILED; else AlignmentFlag &= ~QC_FAILED; }\r
void SetIsProperPair(bool ok) { if (ok) AlignmentFlag |= PROPER_PAIR; else AlignmentFlag &= ~PROPER_PAIR; }\r
// Sets "alignment mapped to reverse strand" bit\r
void SetIsReverseStrand(bool ok) { if (ok) AlignmentFlag |= REVERSE; else AlignmentFlag &= ~REVERSE; }\r
- // Sets "position is primary alignment (determined by external app)"\r
+ // Sets "position is primary alignment (determined by external app)"\r
void SetIsSecondaryAlignment(bool ok) { if (ok) AlignmentFlag |= SECONDARY; else AlignmentFlag &= ~SECONDARY; }\r
// Sets "alignment is second mate on read" bit\r
void SetIsSecondMate(bool ok) { if (ok) AlignmentFlag |= READ_2; else AlignmentFlag &= ~READ_2; }\r
- // Sets "alignment is mapped" bit\r
+ // Sets "alignment is mapped" bit\r
void SetIsUnmapped(bool ok) { if (ok) AlignmentFlag |= UNMAPPED; else AlignmentFlag &= ~UNMAPPED; }\r
- \r
+\r
public:\r
\r
// get "RG" tag data\r
if ( !foundEditDistanceTag ) { return false; }\r
\r
// assign the editDistance value\r
- memcpy(&editDistance, pTagData, 1);\r
+ std::memcpy(&editDistance, pTagData, 1);\r
return true;\r
}\r
\r
++numBytesParsed;\r
++pTagData;\r
}\r
+ // ---------------------------\r
+ // Added: 3-25-2010 DWB\r
+ // Contributed: ARQ\r
+ // Fixed: error parsing variable length tag data\r
+ ++pTagData;\r
+ // ---------------------------\r
break;\r
\r
default:\r
struct RefData {\r
// data members\r
std::string RefName; // Name of reference sequence\r
- int RefLength; // Length of reference sequence\r
+ int32_t RefLength; // Length of reference sequence\r
bool RefHasAlignments; // True if BAM file contains alignments mapped to reference sequence\r
// constructor\r
RefData(void)\r
{ }\r
};\r
\r
-typedef std::vector<RefData> RefVector;\r
+typedef std::vector<RefData> RefVector;\r
typedef std::vector<BamAlignment> BamAlignmentVector;\r
\r
// ----------------------------------------------------------------\r
\r
typedef std::vector<ReferenceIndex> BamIndex;\r
\r
+// ----------------------------------------------------------------\r
+// Added: 3-35-2010 DWB\r
+// Fixed: Routines to provide endian-correctness\r
+// ----------------------------------------------------------------\r
+\r
+// returns true if system is big endian\r
+inline bool SystemIsBigEndian(void) {\r
+ const uint16_t one = 0x0001;\r
+ return ((*(char*) &one) == 0 );\r
+}\r
+\r
+// swaps endianness of 16-bit value 'in place'\r
+inline void SwapEndian_16(uint16_t& x) {\r
+ x = ((x >> 8) | (x << 8));\r
+}\r
+\r
+// swaps endianness of 32-bit value 'in-place'\r
+inline void SwapEndian_32(uint32_t& value) {\r
+ x = ( (x >> 24) | \r
+ ((x << 8) & 0x00FF0000) | \r
+ ((x >> 8) & 0x0000FF00) | \r
+ (x << 24)\r
+ );\r
+}\r
+\r
+// swaps endianness of 64-bit value 'in-place'\r
+inline void SwapEndian_64(uint64_t& value) {\r
+ x = ( (x >> 56) | \r
+ ((x << 40) & 0x00FF000000000000) |\r
+ ((x << 24) & 0x0000FF0000000000) |\r
+ ((x << 8) & 0x000000FF00000000) |\r
+ ((x >> 8) & 0x00000000FF000000) |\r
+ ((x >> 24) & 0x0000000000FF0000) |\r
+ ((x >> 40) & 0x000000000000FF00) |\r
+ (x << 56)\r
+ );\r
+}\r
+\r
+inline void SwapEndian_16p(char* data) {\r
+ uint16_t& value = (uint16_t&)*data; \r
+ SwapEndian_16(value);\r
+}\r
+\r
+inline void SwapEndian_32p(char* data) {\r
+ uint32_t& value = (uint32_t&)*data; \r
+ SwapEndian_32(value);\r
+}\r
+\r
+inline void SwapEndian_64p(char* data) {\r
+ uint64_t& value = (uint64_t&)*data; \r
+ SwapEndian_64(value);\r
+}\r
+\r
} // namespace BamTools\r
\r
#endif // BAMAUX_H\r
// ***************************************************************************\r
-// BamReader.cpp (c) 2009 Derek Barnett, Michael Strömberg\r
+// BamReader.cpp (c) 2009 Derek Barnett, Michael Str�mberg\r
// Marth Lab, Department of Biology, Boston College\r
// All rights reserved.\r
// ---------------------------------------------------------------------------\r
-// Last modified: 11 January 2010(DB)\r
+// Last modified: 29 March 2010 (DB)\r
// ---------------------------------------------------------------------------\r
// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
// Institute.\r
int64_t AlignmentsBeginOffset;\r
string Filename;\r
string IndexFilename;\r
+ \r
+ bool IsBigEndian;\r
\r
// user-specified region values\r
bool IsRegionSpecified;\r
, CurrentLeft(0)\r
, DNA_LOOKUP("=ACMGRSVTWYHKDBN")\r
, CIGAR_LOOKUP("MIDNSHP")\r
-{ }\r
+{ \r
+ IsBigEndian = SystemIsBigEndian();\r
+}\r
\r
// destructor\r
BamReader::BamReaderPrivate::~BamReaderPrivate(void) {\r
// clear out index\r
ClearIndex();\r
\r
- // build (& save) index from BAM file\r
+ // build (& save) index from BAM file\r
bool ok = true;\r
ok &= BuildIndex();\r
ok &= WriteIndex();\r
\r
- // return success/fail\r
+ // return success/fail\r
return ok;\r
}\r
\r
// if data exists for this reference and position is valid \r
if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) {\r
\r
- // set current region\r
+ // set current region\r
CurrentRefID = refID;\r
CurrentLeft = position;\r
IsRegionSpecified = true;\r
\r
- // calculate offset\r
+ // calculate offset\r
int64_t offset = GetOffset(CurrentRefID, CurrentLeft);\r
\r
- // if in valid offset, return failure\r
+ // if in valid offset, return failure\r
if ( offset == -1 ) { return false; }\r
\r
- // otherwise return success of seek operation\r
+ // otherwise return success of seek operation\r
else { return mBGZF.Seek(offset); }\r
}\r
\r
- // invalid jump request parameters, return failure\r
+ // invalid jump request parameters, return failure\r
return false;\r
}\r
\r
\r
// get BAM header text length\r
mBGZF.Read(buffer, 4);\r
- const unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer);\r
-\r
+ unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer);\r
+ if ( IsBigEndian ) { SwapEndian_32(headerTextLength); }\r
+ \r
// get BAM header text\r
char* headerText = (char*)calloc(headerTextLength + 1, 1);\r
mBGZF.Read(headerText, headerTextLength);\r
return false;\r
}\r
\r
- size_t elementsRead = 0;\r
+ size_t elementsRead = 0;\r
\r
// see if index is valid BAM index\r
char magic[4];\r
// get number of reference sequences\r
uint32_t numRefSeqs;\r
elementsRead = fread(&numRefSeqs, 4, 1, indexStream);\r
-\r
+ if ( IsBigEndian ) { SwapEndian_32(numRefSeqs); }\r
+ \r
// intialize space for BamIndex data structure\r
Index.reserve(numRefSeqs);\r
\r
// get number of bins for this reference sequence\r
int32_t numBins;\r
elementsRead = fread(&numBins, 4, 1, indexStream);\r
+ if ( IsBigEndian ) { SwapEndian_32(numBins); }\r
\r
if (numBins > 0) {\r
RefData& refEntry = References[i];\r
uint32_t numChunks;\r
elementsRead = fread(&numChunks, 4, 1, indexStream);\r
\r
+ if ( IsBigEndian ) { \r
+ SwapEndian_32(binID);\r
+ SwapEndian_32(numChunks);\r
+ }\r
+ \r
// intialize ChunkVector\r
ChunkVector regionChunks;\r
regionChunks.reserve(numChunks);\r
elementsRead = fread(&left, 8, 1, indexStream);\r
elementsRead = fread(&right, 8, 1, indexStream);\r
\r
+ if ( IsBigEndian ) {\r
+ SwapEndian_64(left);\r
+ SwapEndian_64(right);\r
+ }\r
+ \r
// save ChunkPair\r
regionChunks.push_back( Chunk(left, right) );\r
}\r
// get number of linear offsets\r
int32_t numLinearOffsets;\r
elementsRead = fread(&numLinearOffsets, 4, 1, indexStream);\r
+ if ( IsBigEndian ) { SwapEndian_32(numLinearOffsets); }\r
\r
// intialize LinearOffsetVector\r
LinearOffsetVector offsets;\r
for (int j = 0; j < numLinearOffsets; ++j) {\r
// read a linear offset & store\r
elementsRead = fread(&linearOffset, 8, 1, indexStream);\r
+ if ( IsBigEndian ) { SwapEndian_64(linearOffset); }\r
offsets.push_back(linearOffset);\r
}\r
\r
// read in the 'block length' value, make sure it's not zero\r
char buffer[4];\r
mBGZF.Read(buffer, 4);\r
- const unsigned int blockLength = BgzfData::UnpackUnsignedInt(buffer);\r
+ unsigned int blockLength = BgzfData::UnpackUnsignedInt(buffer);\r
+ if ( IsBigEndian ) { SwapEndian_32(blockLength); }\r
if ( blockLength == 0 ) { return false; }\r
\r
// keep track of bytes read as method progresses\r
int bytesRead = 4;\r
\r
// read in core alignment data, make sure the right size of data was read\r
- char x[BAM_CORE_SIZE];\r
+ uint32_t x[8];\r
if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }\r
bytesRead += BAM_CORE_SIZE;\r
\r
+ if ( IsBigEndian ) {\r
+ for ( int i = 0; i < 8; ++i ) { \r
+ SwapEndian_32(x[i]); \r
+ }\r
+ }\r
+ \r
// set BamAlignment 'core' data and character data lengths\r
unsigned int tempValue;\r
unsigned int queryNameLength;\r
unsigned int numCigarOperations;\r
unsigned int querySequenceLength;\r
\r
- bAlignment.RefID = BgzfData::UnpackSignedInt(&x[0]);\r
- bAlignment.Position = BgzfData::UnpackSignedInt(&x[4]);\r
-\r
- tempValue = BgzfData::UnpackUnsignedInt(&x[8]);\r
+ bAlignment.RefID = BgzfData::UnpackSignedInt(&x[0]); \r
+ bAlignment.Position = BgzfData::UnpackSignedInt(&x[1]);\r
+ \r
+ tempValue = BgzfData::UnpackUnsignedInt(&x[2]);\r
bAlignment.Bin = tempValue >> 16;\r
bAlignment.MapQuality = tempValue >> 8 & 0xff;\r
queryNameLength = tempValue & 0xff;\r
\r
- tempValue = BgzfData::UnpackUnsignedInt(&x[12]);\r
+ tempValue = BgzfData::UnpackUnsignedInt(&x[3]);\r
bAlignment.AlignmentFlag = tempValue >> 16;\r
numCigarOperations = tempValue & 0xffff;\r
\r
- querySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]);\r
- bAlignment.MateRefID = BgzfData::UnpackSignedInt(&x[20]);\r
- bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]);\r
- bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[28]);\r
-\r
+ querySequenceLength = BgzfData::UnpackUnsignedInt(&x[4]);\r
+ bAlignment.MateRefID = BgzfData::UnpackSignedInt(&x[5]);\r
+ bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[6]);\r
+ bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[7]);\r
+ \r
// calculate lengths/offsets\r
const unsigned int dataLength = blockLength - BAM_CORE_SIZE;\r
const unsigned int cigarDataOffset = queryNameLength;\r
bytesRead += dataLength;\r
\r
// clear out any previous string data\r
- bAlignment.Name.clear();\r
+ bAlignment.Name.clear(;)\r
bAlignment.QueryBases.clear();\r
bAlignment.Qualities.clear();\r
bAlignment.AlignedBases.clear();\r
bAlignment.Name = (string)((const char*)(allCharData));\r
\r
// save query sequence\r
+ // -----------------------\r
+ // Added: 3-25-2010 DWB\r
+ // Improved: reduced repeated memory allocations as string grows\r
+ bAlignment.QueryBases.reserve(querySequenceLength);\r
+ // -----------------------\r
+ \r
for (unsigned int i = 0; i < querySequenceLength; ++i) {\r
char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];\r
bAlignment.QueryBases.append( 1, singleBase );\r
bAlignment.Length = bAlignment.QueryBases.length();\r
\r
// save qualities, convert from numeric QV to FASTQ character\r
- for (unsigned int i = 0; i < querySequenceLength; ++i) {\r
+ // -----------------------\r
+ // Added: 3-25-2010 DWB\r
+ // Improved: reduced repeated memory allocations as string grows\r
+ bAlignment.Qualities.reserve(querySequenceLength);\r
+ // -----------------------\r
+ \r
+ for (unsigned int i = 0; i < querySequenceLength; ++i) {\r
char singleQuality = (char)(qualData[i]+33);\r
bAlignment.Qualities.append( 1, singleQuality );\r
}\r
\r
// save CIGAR-related data;\r
- int k = 0;\r
+ // -----------------------\r
+ // Added: 3-25-2010 DWB\r
+ // Improved: reduced repeated memory allocations as string grows\r
+ bAlignment.AlignedBases.reserve(querySequenceLength);\r
+ // -----------------------\r
+ \r
+ int k = 0;\r
for (unsigned int i = 0; i < numCigarOperations; ++i) {\r
\r
+ if ( IsBigEndian ) { SwapEndian_32(cigarData[i]); }\r
+ \r
// build CigarOp struct\r
CigarOp op;\r
op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);\r
switch (op.Type) {\r
\r
case ('M') :\r
- case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases\r
- case ('S') : k += op.Length; // for 'S' - skip over query bases\r
- break;\r
-\r
- case ('D') : bAlignment.AlignedBases.append( op.Length, '-' ); // for 'D' - write gap character\r
- break;\r
-\r
- case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character;\r
- break;\r
-\r
- case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence\r
- k += op.Length;\r
- break;\r
-\r
- case ('H') : break; // for 'H' - do nothing, move to next op\r
-\r
- default : printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here\r
- exit(1);\r
+ case ('I') : \r
+ bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases\r
+ // fall through\r
+ \r
+ case ('S') : \r
+ k += op.Length; // for 'S' - skip over query bases\r
+ break;\r
+\r
+ case ('D') : \r
+ bAlignment.AlignedBases.append( op.Length, '-' ); // for 'D' - write gap character\r
+ break;\r
+\r
+ case ('P') : \r
+ bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character;\r
+ break;\r
+\r
+ case ('N') : \r
+ bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence\r
+ // -----------------------\r
+ // Removed: 3-25-2010 DWB\r
+ // Contributed: ARQ\r
+ // Fixed: compliance with actual 'N' definition in BAM spec\r
+ // k += op.Length;\r
+ // -----------------------\r
+ break;\r
+\r
+ case ('H') : \r
+ break; // for 'H' - do nothing, move to next op\r
+\r
+ default : \r
+ printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here\r
+ free(allCharData);\r
+ exit(1);\r
}\r
}\r
\r
- // read in the tag data\r
+ // -----------------------\r
+ // Added: 3-25-2010 DWB\r
+ // Fixed: endian-correctness for tag data\r
+ // -----------------------\r
+ if ( IsBigEndian ) {\r
+ int i = 0;\r
+ while ( i < tagDataLen ) {\r
+ \r
+ i += 2; // skip tag type (e.g. "RG", "NM", etc)\r
+ uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning \r
+ ++i; // skip value type\r
+ \r
+ switch (type) {\r
+ \r
+ case('A') :\r
+ case('C') : \r
+ ++i;\r
+ break;\r
+ \r
+ case('S') : \r
+ SwapEndian_16p(&tagData[i]); \r
+ i+=2; // sizeof(uint16_t)\r
+ break;\r
+ \r
+ case('F') :\r
+ case('I') : \r
+ SwapEndian_32p(&tagData[i]);\r
+ i+=4; // sizeof(uint32_t)\r
+ break;\r
+ \r
+ case('D') : \r
+ SwapEndian_64p(&tagData[i]);\r
+ i+=8; // sizeof(uint64_t)\r
+ break;\r
+ \r
+ case('H') :\r
+ case('Z') : \r
+ while (tagData[i]) { ++i; }\r
+ ++i; // increment one more for null terminator\r
+ break;\r
+ \r
+ default : \r
+ printf("ERROR: Invalid tag value type\n"); // shouldn't get here\r
+ free(allCharData);\r
+ exit(1); \r
+ }\r
+ }\r
+ }\r
+ \r
+ // store tag data\r
bAlignment.TagData.resize(tagDataLen);\r
memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLen);\r
}\r
// get number of reference sequences\r
char buffer[4];\r
mBGZF.Read(buffer, 4);\r
- const unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer);\r
+ unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer);\r
+ if ( IsBigEndian ) { SwapEndian_32(numberRefSeqs); }\r
if (numberRefSeqs == 0) { return; }\r
References.reserve((int)numberRefSeqs);\r
\r
\r
// get length of reference name\r
mBGZF.Read(buffer, 4);\r
- const unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer);\r
+ unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer);\r
+ if ( IsBigEndian ) { SwapEndian_32(refNameLength); }\r
char* refName = (char*)calloc(refNameLength, 1);\r
\r
// get reference name and reference sequence length\r
mBGZF.Read(refName, refNameLength);\r
mBGZF.Read(buffer, 4);\r
- const int refLength = BgzfData::UnpackSignedInt(buffer);\r
+ int refLength = BgzfData::UnpackSignedInt(buffer);\r
+ if ( IsBigEndian ) { SwapEndian_32(refLength); }\r
\r
// store data for reference\r
RefData aReference;\r
\r
// write number of reference sequences\r
int32_t numReferenceSeqs = Index.size();\r
+ if ( IsBigEndian ) { SwapEndian_32(numReferenceSeqs); }\r
fwrite(&numReferenceSeqs, 4, 1, indexStream);\r
\r
// iterate over reference sequences\r
\r
// write number of bins\r
int32_t binCount = binMap.size();\r
+ if ( IsBigEndian ) { SwapEndian_32(binCount); }\r
fwrite(&binCount, 4, 1, indexStream);\r
\r
// iterate over bins\r
for ( ; binIter != binEnd; ++binIter ) {\r
\r
// get bin data (key and chunk vector)\r
- const uint32_t& binKey = (*binIter).first;\r
+ uint32_t binKey = (*binIter).first;\r
const ChunkVector& binChunks = (*binIter).second;\r
\r
// save BAM bin key\r
+ if ( IsBigEndian ) { SwapEndian_32(binKey); }\r
fwrite(&binKey, 4, 1, indexStream);\r
\r
// save chunk count\r
int32_t chunkCount = binChunks.size();\r
+ if ( IsBigEndian ) { SwapEndian_32(chunkCount); }\r
fwrite(&chunkCount, 4, 1, indexStream);\r
\r
// iterate over chunks\r
\r
// get current chunk data\r
const Chunk& chunk = (*chunkIter);\r
- const uint64_t& start = chunk.Start;\r
- const uint64_t& stop = chunk.Stop;\r
+ uint64_t start = chunk.Start;\r
+ uint64_t stop = chunk.Stop;\r
\r
+ if ( IsBigEndian ) {\r
+ SwapEndian_64(start);\r
+ SwapEndian_64(stop);\r
+ }\r
+ \r
// save chunk offsets\r
fwrite(&start, 8, 1, indexStream);\r
fwrite(&stop, 8, 1, indexStream);\r
\r
// write linear offsets size\r
int32_t offsetSize = offsets.size();\r
+ if ( IsBigEndian ) { SwapEndian_32(offsetSize); }\r
fwrite(&offsetSize, 4, 1, indexStream);\r
\r
// iterate over linear offsets\r
for ( ; offsetIter != offsetEnd; ++offsetIter ) {\r
\r
// write linear offset value\r
- const uint64_t& linearOffset = (*offsetIter);\r
+ uint64_t linearOffset = (*offsetIter);\r
+ if ( IsBigEndian ) { SwapEndian_64(linearOffset); }\r
fwrite(&linearOffset, 8, 1, indexStream);\r
}\r
}\r
// ***************************************************************************\r
-// BamWriter.cpp (c) 2009 Michael Strömberg, Derek Barnett\r
+// BamWriter.cpp (c) 2009 Michael Str�mberg, Derek Barnett\r
// Marth Lab, Department of Biology, Boston College\r
// All rights reserved.\r
// ---------------------------------------------------------------------------\r
-// Last modified: 8 December 2009 (DB)\r
+// Last modified: 29 March 2010 (DB)\r
// ---------------------------------------------------------------------------\r
// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
// Institute.\r
\r
// data members\r
BgzfData mBGZF;\r
-\r
+ bool IsBigEndian;\r
+ \r
+ \r
// constructor / destructor\r
- BamWriterPrivate(void) { }\r
+ BamWriterPrivate(void) { \r
+ IsBigEndian = SystemIsBigEndian(); \r
+ }\r
+ \r
~BamWriterPrivate(void) {\r
mBGZF.Close();\r
}\r
while(*pQuery) {\r
\r
switch(*pQuery) {\r
+ \r
case '=':\r
- nucleotideCode = 0;\r
- break;\r
+ nucleotideCode = 0;\r
+ break;\r
+ \r
case 'A':\r
- nucleotideCode = 1;\r
- break;\r
+ nucleotideCode = 1;\r
+ break;\r
+ \r
case 'C':\r
- nucleotideCode = 2;\r
- break;\r
+ nucleotideCode = 2;\r
+ break;\r
+ \r
case 'G':\r
- nucleotideCode = 4;\r
- break;\r
+ nucleotideCode = 4;\r
+ break;\r
+ \r
case 'T':\r
- nucleotideCode = 8;\r
- break;\r
+ nucleotideCode = 8;\r
+ break;\r
+ \r
case 'N':\r
- nucleotideCode = 15;\r
- break;\r
+ nucleotideCode = 15;\r
+ break;\r
+ \r
default:\r
- printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);\r
- exit(1);\r
+ printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);\r
+ exit(1);\r
}\r
\r
// pack the nucleotide code\r
mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH);\r
\r
// write the SAM header text length\r
- const unsigned int samHeaderLen = samHeader.size();\r
+ uint32_t samHeaderLen = samHeader.size();\r
+ if ( IsBigEndian ) { SwapEndian_32(samHeaderLen); }\r
mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT);\r
\r
// write the SAM header text\r
}\r
\r
// write the number of reference sequences\r
- const unsigned int numReferenceSequences = referenceSequences.size();\r
+ uint32_t numReferenceSequences = referenceSequences.size();\r
+ if ( IsBigEndian ) { SwapEndian_32(numReferenceSequences); }\r
mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);\r
\r
// =============================\r
for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {\r
\r
// write the reference sequence name length\r
- const unsigned int referenceSequenceNameLen = rsIter->RefName.size() + 1;\r
+ uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;\r
+ if ( IsBigEndian ) { SwapEndian_32(referenceSequenceNameLen); }\r
mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);\r
\r
// write the reference sequence name\r
mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);\r
\r
// write the reference sequence length\r
- mBGZF.Write((char*)&rsIter->RefLength, BT_SIZEOF_INT);\r
+ int32_t referenceLength = rsIter->RefLength;\r
+ if ( IsBigEndian ) { SwapEndian_32(referenceLength); }\r
+ mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);\r
}\r
}\r
\r
const unsigned int encodedQueryLen = encodedQuery.size();\r
\r
// store the tag data length\r
- const unsigned int tagDataLength = al.TagData.size() + 1;\r
-\r
+ // -------------------------------------------\r
+ // Modified: 3-25-2010 DWB\r
+ // Contributed: ARQ\r
+ // Fixed: "off by one" error when parsing tags in files produced by BamWriter\r
+ const unsigned int tagDataLength = al.TagData.size();\r
+ // original line: \r
+ // const unsigned int tagDataLength = al.TagData.size() + 1; \r
+ // -------------------------------------------\r
+ \r
// assign the BAM core data\r
- unsigned int buffer[8];\r
+ uint32_t buffer[8];\r
buffer[0] = al.RefID;\r
buffer[1] = al.Position;\r
buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | nameLen;\r
buffer[7] = al.InsertSize;\r
\r
// write the block size\r
- const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen + tagDataLength;\r
- const unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;\r
+ unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen + tagDataLength;\r
+ unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;\r
+ if ( IsBigEndian ) { SwapEndian_32(blockSize); }\r
mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
\r
// write the BAM core\r
+ if ( IsBigEndian ) { \r
+ for ( int i = 0; i < 8; ++i ) { \r
+ SwapEndian_32(buffer[i]); \r
+ } \r
+ }\r
mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
\r
// write the query name\r
mBGZF.Write(al.Name.c_str(), nameLen);\r
\r
// write the packed cigar\r
- mBGZF.Write(packedCigar.data(), packedCigarLen);\r
+ if ( IsBigEndian ) {\r
+ \r
+ char* cigarData = (char*)calloc(sizeof(char), packedCigarLen);\r
+ memcpy(cigarData, packedCigar.data(), packedCigarLen);\r
+ \r
+ for (unsigned int i = 0; i < packedCigarLen; ++i) {\r
+ if ( IsBigEndian ) { \r
+ SwapEndian_32(cigarData[i]); \r
+ }\r
+ }\r
+ \r
+ mBGZF.Write(cigarData, packedCigarLen);\r
+ free(cigarData);\r
+ \r
+ } else { \r
+ mBGZF.Write(packedCigar.data(), packedCigarLen);\r
+ }\r
\r
// write the encoded query sequence\r
mBGZF.Write(encodedQuery.data(), encodedQueryLen);\r
mBGZF.Write(pBaseQualities, queryLen);\r
\r
// write the read group tag\r
- mBGZF.Write(al.TagData.data(), tagDataLength);\r
+ if ( IsBigEndian ) {\r
+ \r
+ char* tagData = (char*)calloc(sizeof(char), tagDataLength);\r
+ memcpy(tagData, al.TagData.data(), tagDataLength);\r
+ \r
+ int i = 0;\r
+ while ( i < tagDataLength ) {\r
+ \r
+ i += 2; // skip tag type (e.g. "RG", "NM", etc)\r
+ uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning \r
+ ++i; // skip value type\r
+ \r
+ switch (type) {\r
+ \r
+ case('A') :\r
+ case('C') : \r
+ ++i;\r
+ break;\r
+ \r
+ case('S') : \r
+ SwapEndian_16p(&tagData[i]); \r
+ i+=2; // sizeof(uint16_t)\r
+ break;\r
+ \r
+ case('F') :\r
+ case('I') : \r
+ SwapEndian_32p(&tagData[i]);\r
+ i+=4; // sizeof(uint32_t)\r
+ break;\r
+ \r
+ case('D') : \r
+ SwapEndian_64p(&tagData[i]);\r
+ i+=8; // sizeof(uint64_t)\r
+ break;\r
+ \r
+ case('H') :\r
+ case('Z') : \r
+ while (tagData[i]) { ++i; }\r
+ ++i; // increment one more for null terminator\r
+ break;\r
+ \r
+ default : \r
+ printf("ERROR: Invalid tag value type\n"); // shouldn't get here\r
+ free(tagData);\r
+ exit(1); \r
+ }\r
+ }\r
+ \r
+ mBGZF.Write(tagData, tagDataLength);\r
+ free(tagData);\r
+ } else {\r
+ mBGZF.Write(al.TagData.data(), tagDataLength);\r
+ }\r
}\r