From a2400c1ecfd075271b14c2c83aae0dc42bf7e42a Mon Sep 17 00:00:00 2001 From: Derek Date: Thu, 15 Jul 2010 13:44:53 -0400 Subject: [PATCH] Modified handling of BamAlignmentSupportData. This fix should allow BamWriter::SaveAlignment to still handle BamAlignments retrieved using the 'standard' GetNextAlignment[Core] correctly, while adding support for BamAlignments **generated directly** in client code. Switched BASD::IsParsed to HasCoreOnly, which is false by default and only set by BamReader::GetMextAlignmentCore() - therefore the client should never to touch the BASD struct. --- BamAux.h | 9 ++-- BamReader.cpp | 9 ++-- BamWriter.cpp | 143 +++++++++++++++++++++++++++++++++----------------- 3 files changed, 107 insertions(+), 54 deletions(-) diff --git a/BamAux.h b/BamAux.h index 8156596..f742aeb 100644 --- a/BamAux.h +++ b/BamAux.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 16 June 2010 (DB) +// Last modified: 15 July 2010 (DB) // --------------------------------------------------------------------------- // Provides the basic constants, data structures, etc. for using BAM files // *************************************************************************** @@ -133,7 +133,8 @@ struct BamAlignment { int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned int32_t MatePosition; // Position (0-based) where alignment's mate starts int32_t InsertSize; // Mate-pair insert size - + + struct BamAlignmentSupportData { // data members @@ -142,7 +143,7 @@ struct BamAlignment { uint32_t NumCigarOperations; uint32_t QueryNameLength; uint32_t QuerySequenceLength; - bool IsParsed; + bool HasCoreOnly; // constructor BamAlignmentSupportData(void) @@ -150,7 +151,7 @@ struct BamAlignment { , NumCigarOperations(0) , QueryNameLength(0) , QuerySequenceLength(0) - , IsParsed(false) + , HasCoreOnly(false) { } }; diff --git a/BamReader.cpp b/BamReader.cpp index 578cccf..fcd17b1 100644 --- a/BamReader.cpp +++ b/BamReader.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 9 July 2010 (DB) +// Last modified: 15 July 2010 (DB) // --------------------------------------------------------------------------- // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad // Institute. @@ -350,8 +350,8 @@ bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) { bAlignment.TagData.resize(tagDataLength); memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLength); - // set support data parsed flag - bAlignment.SupportData.IsParsed = true; + // clear the core-only flag + bAlignment.SupportData.HasCoreOnly = false; // return success return true; @@ -423,6 +423,9 @@ bool BamReader::BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) // if valid alignment available if ( LoadNextAlignment(bAlignment) ) { + // set core-only flag + bAlignment.SupportData.HasCoreOnly = true; + // if region not specified, return success if ( !IsLeftBoundSpecified ) return true; diff --git a/BamWriter.cpp b/BamWriter.cpp index 5ca9b7d..a794dff 100644 --- a/BamWriter.cpp +++ b/BamWriter.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 30 March 2010 (DB) +// Last modified: 15 July 2010 (DB) // --------------------------------------------------------------------------- // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad // Institute. @@ -11,7 +11,8 @@ // Provides the basic functionality for producing BAM files // *************************************************************************** -// BGZF includes +#include + #include "BGZF.h" #include "BamWriter.h" using namespace BamTools; @@ -38,6 +39,7 @@ struct BamWriter::BamWriterPrivate { void SaveAlignment(const BamAlignment& al); // internal methods + const unsigned int CalculateMinimumBin(const int begin, int end) const; void CreatePackedCigar(const vector& cigarOperations, string& packedCigar); void EncodeQuerySequence(const string& query, string& encodedQuery); }; @@ -81,6 +83,17 @@ void BamWriter::BamWriterPrivate::Close(void) { mBGZF.Close(); } +// calculates minimum bin for a BAM alignment interval +const unsigned int BamWriter::BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const { + --end; + if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14); + if( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17); + if( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20); + if( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23); + if( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26); + return 0; +} + // creates a cigar string from the supplied alignment void BamWriter::BamWriterPrivate::CreatePackedCigar(const vector& cigarOperations, string& packedCigar) { @@ -242,84 +255,120 @@ void BamWriter::BamWriterPrivate::Open(const string& filename, const string& sam // saves the alignment to the alignment archive void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) { - // assign the BAM core data - uint32_t buffer[8]; - buffer[0] = al.RefID; - buffer[1] = al.Position; - buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength; - buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations; - buffer[4] = al.SupportData.QuerySequenceLength; - buffer[5] = al.MateRefID; - buffer[6] = al.MatePosition; - buffer[7] = al.InsertSize; - - // write the block size - unsigned int blockSize = al.SupportData.BlockLength; - if (IsBigEndian) SwapEndian_32(blockSize); - mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); - - // swap BAM core endian-ness, if necessary - if ( IsBigEndian ) { - for ( int i = 0; i < 8; ++i ) - SwapEndian_32(buffer[i]); + // if BamAlignment contains only the core data and a raw char data buffer + // (as a result of BamReader::GetNextAlignmentCore()) + if ( al.SupportData.HasCoreOnly ) { + + // write the block size + unsigned int blockSize = al.SupportData.BlockLength; + if (IsBigEndian) SwapEndian_32(blockSize); + mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); + + // assign the BAM core data + uint32_t buffer[8]; + buffer[0] = al.RefID; + buffer[1] = al.Position; + buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength; + buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations; + buffer[4] = al.SupportData.QuerySequenceLength; + buffer[5] = al.MateRefID; + buffer[6] = al.MatePosition; + buffer[7] = al.InsertSize; + + // swap BAM core endian-ness, if necessary + if ( IsBigEndian ) { + for ( int i = 0; i < 8; ++i ) + SwapEndian_32(buffer[i]); + } + + // write the BAM core + mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); + + // write the raw char data + mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE); } - // write the BAM core - mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); - - // if support data, not parsed out (resulted from BamReader::GetNextAlignmentCore() - // write the raw char data - if ( !al.SupportData.IsParsed ) - mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE); - - // re-pack (as needed) & write the parsed char data + // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc + // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code ) else { - - // initialize - const unsigned int nameLen = al.Name.size() + 1; - const unsigned int queryLen = al.QueryBases.size(); - const unsigned int tagDataLength = al.TagData.size(); + // calculate char lengths + const unsigned int nameLength = al.Name.size() + 1; + const unsigned int numCigarOperations = al.CigarData.size(); + const unsigned int queryLength = al.QueryBases.size(); + const unsigned int tagDataLength = al.TagData.size(); + + // no way to tell if BamAlignment.Bin is already defined (no default, invalid value) + // force calculation of Bin before storing + const int endPosition = al.GetEndPosition(); + const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition); + // create our packed cigar string string packedCigar; CreatePackedCigar(al.CigarData, packedCigar); - const unsigned int packedCigarLen = packedCigar.size(); + const unsigned int packedCigarLength = packedCigar.size(); // encode the query string encodedQuery; EncodeQuerySequence(al.QueryBases, encodedQuery); - const unsigned int encodedQueryLen = encodedQuery.size(); + const unsigned int encodedQueryLength = encodedQuery.size(); + // write the block size + const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength; + unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize; + if (IsBigEndian) SwapEndian_32(blockSize); + mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); + + // assign the BAM core data + uint32_t buffer[8]; + buffer[0] = al.RefID; + buffer[1] = al.Position; + buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength; + buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations; + buffer[4] = queryLength; + buffer[5] = al.MateRefID; + buffer[6] = al.MatePosition; + buffer[7] = al.InsertSize; + + // swap BAM core endian-ness, if necessary + if ( IsBigEndian ) { + for ( int i = 0; i < 8; ++i ) + SwapEndian_32(buffer[i]); + } + + // write the BAM core + mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); + // write the query name - mBGZF.Write(al.Name.c_str(), nameLen); + mBGZF.Write(al.Name.c_str(), nameLength); // write the packed cigar if ( IsBigEndian ) { - char* cigarData = (char*)calloc(sizeof(char), packedCigarLen); - memcpy(cigarData, packedCigar.data(), packedCigarLen); + char* cigarData = (char*)calloc(sizeof(char), packedCigarLength); + memcpy(cigarData, packedCigar.data(), packedCigarLength); - for (unsigned int i = 0; i < packedCigarLen; ++i) { + for (unsigned int i = 0; i < packedCigarLength; ++i) { if ( IsBigEndian ) SwapEndian_32p(&cigarData[i]); } - mBGZF.Write(cigarData, packedCigarLen); + mBGZF.Write(cigarData, packedCigarLength); free(cigarData); } else - mBGZF.Write(packedCigar.data(), packedCigarLen); + mBGZF.Write(packedCigar.data(), packedCigarLength); // write the encoded query sequence - mBGZF.Write(encodedQuery.data(), encodedQueryLen); + mBGZF.Write(encodedQuery.data(), encodedQueryLength); // write the base qualities string baseQualities(al.Qualities); char* pBaseQualities = (char*)al.Qualities.data(); - for(unsigned int i = 0; i < queryLen; i++) { + for(unsigned int i = 0; i < queryLength; i++) { pBaseQualities[i] -= 33; } - mBGZF.Write(pBaseQualities, queryLen); + mBGZF.Write(pBaseQualities, queryLength); // write the read group tag if ( IsBigEndian ) { -- 2.39.5