From 74536d1d4177b7527b8a0d1c1e95b22e233b8300 Mon Sep 17 00:00:00 2001 From: barnett Date: Tue, 30 Mar 2010 15:00:57 +0000 Subject: [PATCH] Fixed: off by 1 in BamWriter, variable tag parsing in BamAux, endian-correctness git-svn-id: svn+ssh://gene.bc.edu/home/subversion/Derek/BamTools/trunk@40 9efb377e-2e27-44b9-b91a-ec4abb80ed8b --- BamAux.h | 105 +++++++++++++++++----- BamReader.cpp | 234 ++++++++++++++++++++++++++++++++++++++------------ BamWriter.cpp | 156 ++++++++++++++++++++++++++------- 3 files changed, 388 insertions(+), 107 deletions(-) diff --git a/BamAux.h b/BamAux.h index fb04642..ea70927 100644 --- a/BamAux.h +++ b/BamAux.h @@ -1,9 +1,9 @@ // *************************************************************************** -// BamAux.h (c) 2009 Derek Barnett, Michael Strömberg +// BamAux.h (c) 2009 Derek Barnett, Michael Str�mberg // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 11 Janaury 2010 (DB) +// Last modified: 29 March 2010 (DB) // --------------------------------------------------------------------------- // Provides the basic constants, data structures, etc. for using BAM files // *************************************************************************** @@ -26,18 +26,18 @@ // Platform-specific type definitions #ifndef BAMTOOLS_TYPES #define BAMTOOLS_TYPES - #ifdef _MSC_VER - typedef char int8_t; - typedef unsigned char uint8_t; - typedef short int16_t; - typedef unsigned short uint16_t; - typedef int int32_t; - typedef unsigned int uint32_t; - typedef long long int64_t; - typedef unsigned long long uint64_t; - #else - #include - #endif + #ifdef _MSC_VER + typedef char int8_t; + typedef unsigned char uint8_t; + typedef short int16_t; + typedef unsigned short uint16_t; + typedef int int32_t; + typedef unsigned int uint32_t; + typedef long long int64_t; + typedef unsigned long long uint64_t; + #else + #include + #endif #endif // BAMTOOLS_TYPES namespace BamTools { @@ -91,9 +91,9 @@ struct BamAlignment { // Returns true if alignment is second mate on read bool IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); } - // Manipulate alignment flag - public: - // Sets "PCR duplicate" bit + // Manipulate alignment flag + public: + // Sets "PCR duplicate" bit void SetIsDuplicate(bool ok) { if (ok) AlignmentFlag |= DUPLICATE; else AlignmentFlag &= ~DUPLICATE; } // Sets "failed quality control" bit void SetIsFailedQC(bool ok) { if (ok) AlignmentFlag |= QC_FAILED; else AlignmentFlag &= ~QC_FAILED; } @@ -109,13 +109,13 @@ struct BamAlignment { void SetIsProperPair(bool ok) { if (ok) AlignmentFlag |= PROPER_PAIR; else AlignmentFlag &= ~PROPER_PAIR; } // Sets "alignment mapped to reverse strand" bit void SetIsReverseStrand(bool ok) { if (ok) AlignmentFlag |= REVERSE; else AlignmentFlag &= ~REVERSE; } - // Sets "position is primary alignment (determined by external app)" + // Sets "position is primary alignment (determined by external app)" void SetIsSecondaryAlignment(bool ok) { if (ok) AlignmentFlag |= SECONDARY; else AlignmentFlag &= ~SECONDARY; } // Sets "alignment is second mate on read" bit void SetIsSecondMate(bool ok) { if (ok) AlignmentFlag |= READ_2; else AlignmentFlag &= ~READ_2; } - // Sets "alignment is mapped" bit + // Sets "alignment is mapped" bit void SetIsUnmapped(bool ok) { if (ok) AlignmentFlag |= UNMAPPED; else AlignmentFlag &= ~UNMAPPED; } - + public: // get "RG" tag data @@ -187,7 +187,7 @@ struct BamAlignment { if ( !foundEditDistanceTag ) { return false; } // assign the editDistance value - memcpy(&editDistance, pTagData, 1); + std::memcpy(&editDistance, pTagData, 1); return true; } @@ -221,6 +221,12 @@ struct BamAlignment { ++numBytesParsed; ++pTagData; } + // --------------------------- + // Added: 3-25-2010 DWB + // Contributed: ARQ + // Fixed: error parsing variable length tag data + ++pTagData; + // --------------------------- break; default: @@ -274,7 +280,7 @@ struct CigarOp { struct RefData { // data members std::string RefName; // Name of reference sequence - int RefLength; // Length of reference sequence + int32_t RefLength; // Length of reference sequence bool RefHasAlignments; // True if BAM file contains alignments mapped to reference sequence // constructor RefData(void) @@ -283,7 +289,7 @@ struct RefData { { } }; -typedef std::vector RefVector; +typedef std::vector RefVector; typedef std::vector BamAlignmentVector; // ---------------------------------------------------------------- @@ -323,6 +329,59 @@ struct ReferenceIndex { typedef std::vector BamIndex; +// ---------------------------------------------------------------- +// Added: 3-35-2010 DWB +// Fixed: Routines to provide endian-correctness +// ---------------------------------------------------------------- + +// returns true if system is big endian +inline bool SystemIsBigEndian(void) { + const uint16_t one = 0x0001; + return ((*(char*) &one) == 0 ); +} + +// swaps endianness of 16-bit value 'in place' +inline void SwapEndian_16(uint16_t& x) { + x = ((x >> 8) | (x << 8)); +} + +// swaps endianness of 32-bit value 'in-place' +inline void SwapEndian_32(uint32_t& value) { + x = ( (x >> 24) | + ((x << 8) & 0x00FF0000) | + ((x >> 8) & 0x0000FF00) | + (x << 24) + ); +} + +// swaps endianness of 64-bit value 'in-place' +inline void SwapEndian_64(uint64_t& value) { + x = ( (x >> 56) | + ((x << 40) & 0x00FF000000000000) | + ((x << 24) & 0x0000FF0000000000) | + ((x << 8) & 0x000000FF00000000) | + ((x >> 8) & 0x00000000FF000000) | + ((x >> 24) & 0x0000000000FF0000) | + ((x >> 40) & 0x000000000000FF00) | + (x << 56) + ); +} + +inline void SwapEndian_16p(char* data) { + uint16_t& value = (uint16_t&)*data; + SwapEndian_16(value); +} + +inline void SwapEndian_32p(char* data) { + uint32_t& value = (uint32_t&)*data; + SwapEndian_32(value); +} + +inline void SwapEndian_64p(char* data) { + uint64_t& value = (uint64_t&)*data; + SwapEndian_64(value); +} + } // namespace BamTools #endif // BAMAUX_H diff --git a/BamReader.cpp b/BamReader.cpp index 5dd65f2..c29b14c 100644 --- a/BamReader.cpp +++ b/BamReader.cpp @@ -1,9 +1,9 @@ // *************************************************************************** -// BamReader.cpp (c) 2009 Derek Barnett, Michael Strömberg +// BamReader.cpp (c) 2009 Derek Barnett, Michael Str�mberg // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 11 January 2010(DB) +// Last modified: 29 March 2010 (DB) // --------------------------------------------------------------------------- // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad // Institute. @@ -38,6 +38,8 @@ struct BamReader::BamReaderPrivate { int64_t AlignmentsBeginOffset; string Filename; string IndexFilename; + + bool IsBigEndian; // user-specified region values bool IsRegionSpecified; @@ -163,7 +165,9 @@ BamReader::BamReaderPrivate::BamReaderPrivate(void) , CurrentLeft(0) , DNA_LOOKUP("=ACMGRSVTWYHKDBN") , CIGAR_LOOKUP("MIDNSHP") -{ } +{ + IsBigEndian = SystemIsBigEndian(); +} // destructor BamReader::BamReaderPrivate::~BamReaderPrivate(void) { @@ -361,12 +365,12 @@ bool BamReader::BamReaderPrivate::CreateIndex(void) { // clear out index ClearIndex(); - // build (& save) index from BAM file + // build (& save) index from BAM file bool ok = true; ok &= BuildIndex(); ok &= WriteIndex(); - // return success/fail + // return success/fail return ok; } @@ -523,22 +527,22 @@ bool BamReader::BamReaderPrivate::Jump(int refID, int position) { // if data exists for this reference and position is valid if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) { - // set current region + // set current region CurrentRefID = refID; CurrentLeft = position; IsRegionSpecified = true; - // calculate offset + // calculate offset int64_t offset = GetOffset(CurrentRefID, CurrentLeft); - // if in valid offset, return failure + // if in valid offset, return failure if ( offset == -1 ) { return false; } - // otherwise return success of seek operation + // otherwise return success of seek operation else { return mBGZF.Seek(offset); } } - // invalid jump request parameters, return failure + // invalid jump request parameters, return failure return false; } @@ -559,8 +563,9 @@ void BamReader::BamReaderPrivate::LoadHeaderData(void) { // get BAM header text length mBGZF.Read(buffer, 4); - const unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer); - + unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer); + if ( IsBigEndian ) { SwapEndian_32(headerTextLength); } + // get BAM header text char* headerText = (char*)calloc(headerTextLength + 1, 1); mBGZF.Read(headerText, headerTextLength); @@ -586,7 +591,7 @@ bool BamReader::BamReaderPrivate::LoadIndex(void) { return false; } - size_t elementsRead = 0; + size_t elementsRead = 0; // see if index is valid BAM index char magic[4]; @@ -600,7 +605,8 @@ bool BamReader::BamReaderPrivate::LoadIndex(void) { // get number of reference sequences uint32_t numRefSeqs; elementsRead = fread(&numRefSeqs, 4, 1, indexStream); - + if ( IsBigEndian ) { SwapEndian_32(numRefSeqs); } + // intialize space for BamIndex data structure Index.reserve(numRefSeqs); @@ -610,6 +616,7 @@ bool BamReader::BamReaderPrivate::LoadIndex(void) { // get number of bins for this reference sequence int32_t numBins; elementsRead = fread(&numBins, 4, 1, indexStream); + if ( IsBigEndian ) { SwapEndian_32(numBins); } if (numBins > 0) { RefData& refEntry = References[i]; @@ -630,6 +637,11 @@ bool BamReader::BamReaderPrivate::LoadIndex(void) { uint32_t numChunks; elementsRead = fread(&numChunks, 4, 1, indexStream); + if ( IsBigEndian ) { + SwapEndian_32(binID); + SwapEndian_32(numChunks); + } + // intialize ChunkVector ChunkVector regionChunks; regionChunks.reserve(numChunks); @@ -643,6 +655,11 @@ bool BamReader::BamReaderPrivate::LoadIndex(void) { elementsRead = fread(&left, 8, 1, indexStream); elementsRead = fread(&right, 8, 1, indexStream); + if ( IsBigEndian ) { + SwapEndian_64(left); + SwapEndian_64(right); + } + // save ChunkPair regionChunks.push_back( Chunk(left, right) ); } @@ -659,6 +676,7 @@ bool BamReader::BamReaderPrivate::LoadIndex(void) { // get number of linear offsets int32_t numLinearOffsets; elementsRead = fread(&numLinearOffsets, 4, 1, indexStream); + if ( IsBigEndian ) { SwapEndian_32(numLinearOffsets); } // intialize LinearOffsetVector LinearOffsetVector offsets; @@ -669,6 +687,7 @@ bool BamReader::BamReaderPrivate::LoadIndex(void) { for (int j = 0; j < numLinearOffsets; ++j) { // read a linear offset & store elementsRead = fread(&linearOffset, 8, 1, indexStream); + if ( IsBigEndian ) { SwapEndian_64(linearOffset); } offsets.push_back(linearOffset); } @@ -690,40 +709,47 @@ bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) { // read in the 'block length' value, make sure it's not zero char buffer[4]; mBGZF.Read(buffer, 4); - const unsigned int blockLength = BgzfData::UnpackUnsignedInt(buffer); + unsigned int blockLength = BgzfData::UnpackUnsignedInt(buffer); + if ( IsBigEndian ) { SwapEndian_32(blockLength); } if ( blockLength == 0 ) { return false; } // keep track of bytes read as method progresses int bytesRead = 4; // read in core alignment data, make sure the right size of data was read - char x[BAM_CORE_SIZE]; + uint32_t x[8]; if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; } bytesRead += BAM_CORE_SIZE; + if ( IsBigEndian ) { + for ( int i = 0; i < 8; ++i ) { + SwapEndian_32(x[i]); + } + } + // set BamAlignment 'core' data and character data lengths unsigned int tempValue; unsigned int queryNameLength; unsigned int numCigarOperations; unsigned int querySequenceLength; - bAlignment.RefID = BgzfData::UnpackSignedInt(&x[0]); - bAlignment.Position = BgzfData::UnpackSignedInt(&x[4]); - - tempValue = BgzfData::UnpackUnsignedInt(&x[8]); + bAlignment.RefID = BgzfData::UnpackSignedInt(&x[0]); + bAlignment.Position = BgzfData::UnpackSignedInt(&x[1]); + + tempValue = BgzfData::UnpackUnsignedInt(&x[2]); bAlignment.Bin = tempValue >> 16; bAlignment.MapQuality = tempValue >> 8 & 0xff; queryNameLength = tempValue & 0xff; - tempValue = BgzfData::UnpackUnsignedInt(&x[12]); + tempValue = BgzfData::UnpackUnsignedInt(&x[3]); bAlignment.AlignmentFlag = tempValue >> 16; numCigarOperations = tempValue & 0xffff; - querySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]); - bAlignment.MateRefID = BgzfData::UnpackSignedInt(&x[20]); - bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]); - bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[28]); - + querySequenceLength = BgzfData::UnpackUnsignedInt(&x[4]); + bAlignment.MateRefID = BgzfData::UnpackSignedInt(&x[5]); + bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[6]); + bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[7]); + // calculate lengths/offsets const unsigned int dataLength = blockLength - BAM_CORE_SIZE; const unsigned int cigarDataOffset = queryNameLength; @@ -746,7 +772,7 @@ bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) { bytesRead += dataLength; // clear out any previous string data - bAlignment.Name.clear(); + bAlignment.Name.clear(;) bAlignment.QueryBases.clear(); bAlignment.Qualities.clear(); bAlignment.AlignedBases.clear(); @@ -757,6 +783,12 @@ bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) { bAlignment.Name = (string)((const char*)(allCharData)); // save query sequence + // ----------------------- + // Added: 3-25-2010 DWB + // Improved: reduced repeated memory allocations as string grows + bAlignment.QueryBases.reserve(querySequenceLength); + // ----------------------- + for (unsigned int i = 0; i < querySequenceLength; ++i) { char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ]; bAlignment.QueryBases.append( 1, singleBase ); @@ -766,15 +798,29 @@ bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) { bAlignment.Length = bAlignment.QueryBases.length(); // save qualities, convert from numeric QV to FASTQ character - for (unsigned int i = 0; i < querySequenceLength; ++i) { + // ----------------------- + // Added: 3-25-2010 DWB + // Improved: reduced repeated memory allocations as string grows + bAlignment.Qualities.reserve(querySequenceLength); + // ----------------------- + + for (unsigned int i = 0; i < querySequenceLength; ++i) { char singleQuality = (char)(qualData[i]+33); bAlignment.Qualities.append( 1, singleQuality ); } // save CIGAR-related data; - int k = 0; + // ----------------------- + // Added: 3-25-2010 DWB + // Improved: reduced repeated memory allocations as string grows + bAlignment.AlignedBases.reserve(querySequenceLength); + // ----------------------- + + int k = 0; for (unsigned int i = 0; i < numCigarOperations; ++i) { + if ( IsBigEndian ) { SwapEndian_32(cigarData[i]); } + // build CigarOp struct CigarOp op; op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT); @@ -787,28 +833,92 @@ bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) { switch (op.Type) { case ('M') : - case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases - case ('S') : k += op.Length; // for 'S' - skip over query bases - break; - - case ('D') : bAlignment.AlignedBases.append( op.Length, '-' ); // for 'D' - write gap character - break; - - case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character; - break; - - case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence - k += op.Length; - break; - - case ('H') : break; // for 'H' - do nothing, move to next op - - default : printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here - exit(1); + case ('I') : + bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases + // fall through + + case ('S') : + k += op.Length; // for 'S' - skip over query bases + break; + + case ('D') : + bAlignment.AlignedBases.append( op.Length, '-' ); // for 'D' - write gap character + break; + + case ('P') : + bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character; + break; + + case ('N') : + bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence + // ----------------------- + // Removed: 3-25-2010 DWB + // Contributed: ARQ + // Fixed: compliance with actual 'N' definition in BAM spec + // k += op.Length; + // ----------------------- + break; + + case ('H') : + break; // for 'H' - do nothing, move to next op + + default : + printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here + free(allCharData); + exit(1); } } - // read in the tag data + // ----------------------- + // Added: 3-25-2010 DWB + // Fixed: endian-correctness for tag data + // ----------------------- + if ( IsBigEndian ) { + int i = 0; + while ( i < tagDataLen ) { + + i += 2; // skip tag type (e.g. "RG", "NM", etc) + uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning + ++i; // skip value type + + switch (type) { + + case('A') : + case('C') : + ++i; + break; + + case('S') : + SwapEndian_16p(&tagData[i]); + i+=2; // sizeof(uint16_t) + break; + + case('F') : + case('I') : + SwapEndian_32p(&tagData[i]); + i+=4; // sizeof(uint32_t) + break; + + case('D') : + SwapEndian_64p(&tagData[i]); + i+=8; // sizeof(uint64_t) + break; + + case('H') : + case('Z') : + while (tagData[i]) { ++i; } + ++i; // increment one more for null terminator + break; + + default : + printf("ERROR: Invalid tag value type\n"); // shouldn't get here + free(allCharData); + exit(1); + } + } + } + + // store tag data bAlignment.TagData.resize(tagDataLen); memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLen); } @@ -823,7 +933,8 @@ void BamReader::BamReaderPrivate::LoadReferenceData(void) { // get number of reference sequences char buffer[4]; mBGZF.Read(buffer, 4); - const unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer); + unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer); + if ( IsBigEndian ) { SwapEndian_32(numberRefSeqs); } if (numberRefSeqs == 0) { return; } References.reserve((int)numberRefSeqs); @@ -832,13 +943,15 @@ void BamReader::BamReaderPrivate::LoadReferenceData(void) { // get length of reference name mBGZF.Read(buffer, 4); - const unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer); + unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer); + if ( IsBigEndian ) { SwapEndian_32(refNameLength); } char* refName = (char*)calloc(refNameLength, 1); // get reference name and reference sequence length mBGZF.Read(refName, refNameLength); mBGZF.Read(buffer, 4); - const int refLength = BgzfData::UnpackSignedInt(buffer); + int refLength = BgzfData::UnpackSignedInt(buffer); + if ( IsBigEndian ) { SwapEndian_32(refLength); } // store data for reference RefData aReference; @@ -973,6 +1086,7 @@ bool BamReader::BamReaderPrivate::WriteIndex(void) { // write number of reference sequences int32_t numReferenceSeqs = Index.size(); + if ( IsBigEndian ) { SwapEndian_32(numReferenceSeqs); } fwrite(&numReferenceSeqs, 4, 1, indexStream); // iterate over reference sequences @@ -987,6 +1101,7 @@ bool BamReader::BamReaderPrivate::WriteIndex(void) { // write number of bins int32_t binCount = binMap.size(); + if ( IsBigEndian ) { SwapEndian_32(binCount); } fwrite(&binCount, 4, 1, indexStream); // iterate over bins @@ -995,14 +1110,16 @@ bool BamReader::BamReaderPrivate::WriteIndex(void) { for ( ; binIter != binEnd; ++binIter ) { // get bin data (key and chunk vector) - const uint32_t& binKey = (*binIter).first; + uint32_t binKey = (*binIter).first; const ChunkVector& binChunks = (*binIter).second; // save BAM bin key + if ( IsBigEndian ) { SwapEndian_32(binKey); } fwrite(&binKey, 4, 1, indexStream); // save chunk count int32_t chunkCount = binChunks.size(); + if ( IsBigEndian ) { SwapEndian_32(chunkCount); } fwrite(&chunkCount, 4, 1, indexStream); // iterate over chunks @@ -1012,9 +1129,14 @@ bool BamReader::BamReaderPrivate::WriteIndex(void) { // get current chunk data const Chunk& chunk = (*chunkIter); - const uint64_t& start = chunk.Start; - const uint64_t& stop = chunk.Stop; + uint64_t start = chunk.Start; + uint64_t stop = chunk.Stop; + if ( IsBigEndian ) { + SwapEndian_64(start); + SwapEndian_64(stop); + } + // save chunk offsets fwrite(&start, 8, 1, indexStream); fwrite(&stop, 8, 1, indexStream); @@ -1023,6 +1145,7 @@ bool BamReader::BamReaderPrivate::WriteIndex(void) { // write linear offsets size int32_t offsetSize = offsets.size(); + if ( IsBigEndian ) { SwapEndian_32(offsetSize); } fwrite(&offsetSize, 4, 1, indexStream); // iterate over linear offsets @@ -1031,7 +1154,8 @@ bool BamReader::BamReaderPrivate::WriteIndex(void) { for ( ; offsetIter != offsetEnd; ++offsetIter ) { // write linear offset value - const uint64_t& linearOffset = (*offsetIter); + uint64_t linearOffset = (*offsetIter); + if ( IsBigEndian ) { SwapEndian_64(linearOffset); } fwrite(&linearOffset, 8, 1, indexStream); } } diff --git a/BamWriter.cpp b/BamWriter.cpp index c834d45..075989a 100644 --- a/BamWriter.cpp +++ b/BamWriter.cpp @@ -1,9 +1,9 @@ // *************************************************************************** -// BamWriter.cpp (c) 2009 Michael Strömberg, Derek Barnett +// BamWriter.cpp (c) 2009 Michael Str�mberg, Derek Barnett // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 8 December 2009 (DB) +// Last modified: 29 March 2010 (DB) // --------------------------------------------------------------------------- // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad // Institute. @@ -21,9 +21,14 @@ struct BamWriter::BamWriterPrivate { // data members BgzfData mBGZF; - + bool IsBigEndian; + + // constructor / destructor - BamWriterPrivate(void) { } + BamWriterPrivate(void) { + IsBigEndian = SystemIsBigEndian(); + } + ~BamWriterPrivate(void) { mBGZF.Close(); } @@ -139,27 +144,34 @@ void BamWriter::BamWriterPrivate::EncodeQuerySequence(const string& query, strin while(*pQuery) { switch(*pQuery) { + case '=': - nucleotideCode = 0; - break; + nucleotideCode = 0; + break; + case 'A': - nucleotideCode = 1; - break; + nucleotideCode = 1; + break; + case 'C': - nucleotideCode = 2; - break; + nucleotideCode = 2; + break; + case 'G': - nucleotideCode = 4; - break; + nucleotideCode = 4; + break; + case 'T': - nucleotideCode = 8; - break; + nucleotideCode = 8; + break; + case 'N': - nucleotideCode = 15; - break; + nucleotideCode = 15; + break; + default: - printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery); - exit(1); + printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery); + exit(1); } // pack the nucleotide code @@ -193,7 +205,8 @@ void BamWriter::BamWriterPrivate::Open(const string& filename, const string& sam mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH); // write the SAM header text length - const unsigned int samHeaderLen = samHeader.size(); + uint32_t samHeaderLen = samHeader.size(); + if ( IsBigEndian ) { SwapEndian_32(samHeaderLen); } mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT); // write the SAM header text @@ -202,7 +215,8 @@ void BamWriter::BamWriterPrivate::Open(const string& filename, const string& sam } // write the number of reference sequences - const unsigned int numReferenceSequences = referenceSequences.size(); + uint32_t numReferenceSequences = referenceSequences.size(); + if ( IsBigEndian ) { SwapEndian_32(numReferenceSequences); } mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT); // ============================= @@ -213,14 +227,17 @@ void BamWriter::BamWriterPrivate::Open(const string& filename, const string& sam for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) { // write the reference sequence name length - const unsigned int referenceSequenceNameLen = rsIter->RefName.size() + 1; + uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1; + if ( IsBigEndian ) { SwapEndian_32(referenceSequenceNameLen); } mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT); // write the reference sequence name mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen); // write the reference sequence length - mBGZF.Write((char*)&rsIter->RefLength, BT_SIZEOF_INT); + int32_t referenceLength = rsIter->RefLength; + if ( IsBigEndian ) { SwapEndian_32(referenceLength); } + mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT); } } @@ -243,10 +260,17 @@ void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) { const unsigned int encodedQueryLen = encodedQuery.size(); // store the tag data length - const unsigned int tagDataLength = al.TagData.size() + 1; - + // ------------------------------------------- + // Modified: 3-25-2010 DWB + // Contributed: ARQ + // Fixed: "off by one" error when parsing tags in files produced by BamWriter + const unsigned int tagDataLength = al.TagData.size(); + // original line: + // const unsigned int tagDataLength = al.TagData.size() + 1; + // ------------------------------------------- + // assign the BAM core data - unsigned int buffer[8]; + uint32_t buffer[8]; buffer[0] = al.RefID; buffer[1] = al.Position; buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | nameLen; @@ -257,18 +281,40 @@ void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) { buffer[7] = al.InsertSize; // write the block size - const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen + tagDataLength; - const unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize; + unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen + tagDataLength; + unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize; + if ( IsBigEndian ) { SwapEndian_32(blockSize); } mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); // write the BAM core + if ( IsBigEndian ) { + for ( int i = 0; i < 8; ++i ) { + SwapEndian_32(buffer[i]); + } + } mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); // write the query name mBGZF.Write(al.Name.c_str(), nameLen); // write the packed cigar - mBGZF.Write(packedCigar.data(), packedCigarLen); + if ( IsBigEndian ) { + + char* cigarData = (char*)calloc(sizeof(char), packedCigarLen); + memcpy(cigarData, packedCigar.data(), packedCigarLen); + + for (unsigned int i = 0; i < packedCigarLen; ++i) { + if ( IsBigEndian ) { + SwapEndian_32(cigarData[i]); + } + } + + mBGZF.Write(cigarData, packedCigarLen); + free(cigarData); + + } else { + mBGZF.Write(packedCigar.data(), packedCigarLen); + } // write the encoded query sequence mBGZF.Write(encodedQuery.data(), encodedQueryLen); @@ -280,5 +326,57 @@ void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) { mBGZF.Write(pBaseQualities, queryLen); // write the read group tag - mBGZF.Write(al.TagData.data(), tagDataLength); + if ( IsBigEndian ) { + + char* tagData = (char*)calloc(sizeof(char), tagDataLength); + memcpy(tagData, al.TagData.data(), tagDataLength); + + int i = 0; + while ( i < tagDataLength ) { + + i += 2; // skip tag type (e.g. "RG", "NM", etc) + uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning + ++i; // skip value type + + switch (type) { + + case('A') : + case('C') : + ++i; + break; + + case('S') : + SwapEndian_16p(&tagData[i]); + i+=2; // sizeof(uint16_t) + break; + + case('F') : + case('I') : + SwapEndian_32p(&tagData[i]); + i+=4; // sizeof(uint32_t) + break; + + case('D') : + SwapEndian_64p(&tagData[i]); + i+=8; // sizeof(uint64_t) + break; + + case('H') : + case('Z') : + while (tagData[i]) { ++i; } + ++i; // increment one more for null terminator + break; + + default : + printf("ERROR: Invalid tag value type\n"); // shouldn't get here + free(tagData); + exit(1); + } + } + + mBGZF.Write(tagData, tagDataLength); + free(tagData); + } else { + mBGZF.Write(al.TagData.data(), tagDataLength); + } } -- 2.39.5