// ***************************************************************************\r
-// BamWriter.cpp (c) 2009 Michael Strömberg, Derek Barnett\r
+// BamWriter.cpp (c) 2009 Michael Str�mberg, Derek Barnett\r
// Marth Lab, Department of Biology, Boston College\r
// All rights reserved.\r
// ---------------------------------------------------------------------------\r
-// Last modified: 8 December 2009 (DB)\r
+// Last modified: 30 March 2010 (DB)\r
// ---------------------------------------------------------------------------\r
// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
// Institute.\r
\r
// data members\r
BgzfData mBGZF;\r
-\r
+ bool IsBigEndian;\r
+ \r
+ \r
// constructor / destructor\r
- BamWriterPrivate(void) { }\r
+ BamWriterPrivate(void) { \r
+ IsBigEndian = SystemIsBigEndian(); \r
+ }\r
+ \r
~BamWriterPrivate(void) {\r
mBGZF.Close();\r
}\r
while(*pQuery) {\r
\r
switch(*pQuery) {\r
+ \r
case '=':\r
- nucleotideCode = 0;\r
- break;\r
+ nucleotideCode = 0;\r
+ break;\r
+ \r
case 'A':\r
- nucleotideCode = 1;\r
- break;\r
+ nucleotideCode = 1;\r
+ break;\r
+ \r
case 'C':\r
- nucleotideCode = 2;\r
- break;\r
+ nucleotideCode = 2;\r
+ break;\r
+ \r
case 'G':\r
- nucleotideCode = 4;\r
- break;\r
+ nucleotideCode = 4;\r
+ break;\r
+ \r
case 'T':\r
- nucleotideCode = 8;\r
- break;\r
+ nucleotideCode = 8;\r
+ break;\r
+ \r
case 'N':\r
- nucleotideCode = 15;\r
- break;\r
+ nucleotideCode = 15;\r
+ break;\r
+ \r
default:\r
- printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);\r
- exit(1);\r
+ printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);\r
+ exit(1);\r
}\r
\r
// pack the nucleotide code\r
mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH);\r
\r
// write the SAM header text length\r
- const unsigned int samHeaderLen = samHeader.size();\r
+ uint32_t samHeaderLen = samHeader.size();\r
+ if ( IsBigEndian ) { SwapEndian_32(samHeaderLen); }\r
mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT);\r
\r
// write the SAM header text\r
}\r
\r
// write the number of reference sequences\r
- const unsigned int numReferenceSequences = referenceSequences.size();\r
+ uint32_t numReferenceSequences = referenceSequences.size();\r
+ if ( IsBigEndian ) { SwapEndian_32(numReferenceSequences); }\r
mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);\r
\r
// =============================\r
for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {\r
\r
// write the reference sequence name length\r
- const unsigned int referenceSequenceNameLen = rsIter->RefName.size() + 1;\r
+ uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;\r
+ if ( IsBigEndian ) { SwapEndian_32(referenceSequenceNameLen); }\r
mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);\r
\r
// write the reference sequence name\r
mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);\r
\r
// write the reference sequence length\r
- mBGZF.Write((char*)&rsIter->RefLength, BT_SIZEOF_INT);\r
+ int32_t referenceLength = rsIter->RefLength;\r
+ if ( IsBigEndian ) { SwapEndian_32(referenceLength); }\r
+ mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);\r
}\r
}\r
\r
const unsigned int encodedQueryLen = encodedQuery.size();\r
\r
// store the tag data length\r
- const unsigned int tagDataLength = al.TagData.size() + 1;\r
-\r
+ // -------------------------------------------\r
+ // Modified: 3-25-2010 DWB\r
+ // Contributed: ARQ\r
+ // Fixed: "off by one" error when parsing tags in files produced by BamWriter\r
+ const unsigned int tagDataLength = al.TagData.size();\r
+ // original line: \r
+ // const unsigned int tagDataLength = al.TagData.size() + 1; \r
+ // -------------------------------------------\r
+ \r
// assign the BAM core data\r
- unsigned int buffer[8];\r
+ uint32_t buffer[8];\r
buffer[0] = al.RefID;\r
buffer[1] = al.Position;\r
buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | nameLen;\r
buffer[7] = al.InsertSize;\r
\r
// write the block size\r
- const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen + tagDataLength;\r
- const unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;\r
+ unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen + tagDataLength;\r
+ unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;\r
+ if ( IsBigEndian ) { SwapEndian_32(blockSize); }\r
mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
\r
// write the BAM core\r
+ if ( IsBigEndian ) { \r
+ for ( int i = 0; i < 8; ++i ) { \r
+ SwapEndian_32(buffer[i]); \r
+ } \r
+ }\r
mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
\r
// write the query name\r
mBGZF.Write(al.Name.c_str(), nameLen);\r
\r
// write the packed cigar\r
- mBGZF.Write(packedCigar.data(), packedCigarLen);\r
+ if ( IsBigEndian ) {\r
+ \r
+ char* cigarData = (char*)calloc(sizeof(char), packedCigarLen);\r
+ memcpy(cigarData, packedCigar.data(), packedCigarLen);\r
+ \r
+ for (unsigned int i = 0; i < packedCigarLen; ++i) {\r
+ if ( IsBigEndian ) { \r
+ SwapEndian_32p(&cigarData[i]); \r
+ }\r
+ }\r
+ \r
+ mBGZF.Write(cigarData, packedCigarLen);\r
+ free(cigarData);\r
+ \r
+ } else { \r
+ mBGZF.Write(packedCigar.data(), packedCigarLen);\r
+ }\r
\r
// write the encoded query sequence\r
mBGZF.Write(encodedQuery.data(), encodedQueryLen);\r
mBGZF.Write(pBaseQualities, queryLen);\r
\r
// write the read group tag\r
- mBGZF.Write(al.TagData.data(), tagDataLength);\r
+ if ( IsBigEndian ) {\r
+ \r
+ char* tagData = (char*)calloc(sizeof(char), tagDataLength);\r
+ memcpy(tagData, al.TagData.data(), tagDataLength);\r
+ \r
+ int i = 0;\r
+ while ( (unsigned int)i < tagDataLength ) {\r
+ \r
+ i += 2; // skip tag type (e.g. "RG", "NM", etc)\r
+ uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning \r
+ ++i; // skip value type\r
+ \r
+ switch (type) {\r
+ \r
+ case('A') :\r
+ case('C') : \r
+ ++i;\r
+ break;\r
+ \r
+ case('S') : \r
+ SwapEndian_16p(&tagData[i]); \r
+ i+=2; // sizeof(uint16_t)\r
+ break;\r
+ \r
+ case('F') :\r
+ case('I') : \r
+ SwapEndian_32p(&tagData[i]);\r
+ i+=4; // sizeof(uint32_t)\r
+ break;\r
+ \r
+ case('D') : \r
+ SwapEndian_64p(&tagData[i]);\r
+ i+=8; // sizeof(uint64_t)\r
+ break;\r
+ \r
+ case('H') :\r
+ case('Z') : \r
+ while (tagData[i]) { ++i; }\r
+ ++i; // increment one more for null terminator\r
+ break;\r
+ \r
+ default : \r
+ printf("ERROR: Invalid tag value type\n"); // shouldn't get here\r
+ free(tagData);\r
+ exit(1); \r
+ }\r
+ }\r
+ \r
+ mBGZF.Write(tagData, tagDataLength);\r
+ free(tagData);\r
+ } else {\r
+ mBGZF.Write(al.TagData.data(), tagDataLength);\r
+ }\r
}\r