From 90f57dc99f0af143f50a0afef447b50048a556f3 Mon Sep 17 00:00:00 2001 From: derek Date: Tue, 11 Jan 2011 22:37:31 -0500 Subject: [PATCH] Update to BamTools API 0.9.2 with exposure of SamHeader object from BamReader and to BamWriter --- src/api/BamReader.cpp | 3 +- src/api/BamReader.h | 5 +- src/api/BamWriter.cpp | 13 +- src/api/BamWriter.h | 10 +- src/api/CMakeLists.txt | 3 +- src/api/internal/BamHeader_p.cpp | 155 ++++++++++ src/api/internal/BamHeader_p.h | 56 ++++ src/api/internal/BamReader_p.cpp | 60 +--- src/api/internal/BamReader_p.h | 10 +- src/api/internal/BamWriter_p.cpp | 472 +++++++++++++++---------------- src/api/internal/BamWriter_p.h | 7 + 11 files changed, 488 insertions(+), 306 deletions(-) create mode 100644 src/api/internal/BamHeader_p.cpp create mode 100644 src/api/internal/BamHeader_p.h diff --git a/src/api/BamReader.cpp b/src/api/BamReader.cpp index c357b9d..473dc89 100644 --- a/src/api/BamReader.cpp +++ b/src/api/BamReader.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 22 November 2010 (DB) +// Last modified: 11 January 2011 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for reading BAM files // *************************************************************************** @@ -55,6 +55,7 @@ bool BamReader::GetNextAlignment(BamAlignment& bAlignment) { return d->GetNextAl bool BamReader::GetNextAlignmentCore(BamAlignment& bAlignment) { return d->GetNextAlignmentCore(bAlignment); } // access auxiliary data +SamHeader BamReader::GetHeader(void) const { return d->GetSamHeader(); } const string BamReader::GetHeaderText(void) const { return d->GetHeaderText(); } int BamReader::GetReferenceCount(void) const { return d->References.size(); } const RefVector& BamReader::GetReferenceData(void) const { return d->References; } diff --git a/src/api/BamReader.h b/src/api/BamReader.h index c3452f6..d68ab6c 100644 --- a/src/api/BamReader.h +++ b/src/api/BamReader.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) +// Last modified: 11 January 2011 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for reading BAM files // *************************************************************************** @@ -14,6 +14,7 @@ #include #include #include +#include #include namespace BamTools { @@ -74,6 +75,8 @@ class API_EXPORT BamReader { // access auxiliary data // ---------------------- + // returns SamHeader object - see SamHeader.h for more info + SamHeader GetHeader(void) const; // returns SAM header text const std::string GetHeaderText(void) const; // returns number of reference sequences diff --git a/src/api/BamWriter.cpp b/src/api/BamWriter.cpp index e92d071..386755d 100644 --- a/src/api/BamWriter.cpp +++ b/src/api/BamWriter.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 22 November 2010 (DB) +// Last modified: 11 January 2011 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for producing BAM files // *************************************************************************** @@ -32,7 +32,7 @@ void BamWriter::Close(void) { d->Close(); } -// opens the alignment archive +// opens the alignment archive (using std::string SAM header) bool BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, @@ -41,6 +41,15 @@ bool BamWriter::Open(const string& filename, return d->Open(filename, samHeader, referenceSequences, isWriteUncompressed); } +// opens the alignment archive (using SamHeader object) +bool BamWriter::Open(const string& filename, + const SamHeader& samHeader, + const RefVector& referenceSequences, + bool isWriteUncompressed) +{ + return d->Open(filename, samHeader.ToString(), referenceSequences, isWriteUncompressed); +} + // saves the alignment to the alignment archive void BamWriter::SaveAlignment(const BamAlignment& al) { d->SaveAlignment(al); diff --git a/src/api/BamWriter.h b/src/api/BamWriter.h index 55f86f4..2d8b528 100644 --- a/src/api/BamWriter.h +++ b/src/api/BamWriter.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) +// Last modified: 11 January 2011 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for producing BAM files // *************************************************************************** @@ -13,6 +13,7 @@ #include #include +#include #include namespace BamTools { @@ -32,11 +33,16 @@ class API_EXPORT BamWriter { public: // closes the alignment archive void Close(void); - // opens the alignment archive + // opens the alignment archive (using std::string SAM header) bool Open(const std::string& filename, const std::string& samHeader, const BamTools::RefVector& referenceSequences, bool writeUncompressed = false); + // opens the alignment archive (using SamHeader object) + bool Open(const std::string& filename, + const SamHeader& samHeader, + const BamTools::RefVector& referenceSequences, + bool writeUncompressed = false); // saves the alignment to the alignment archive void SaveAlignment(const BamTools::BamAlignment& al); diff --git a/src/api/CMakeLists.txt b/src/api/CMakeLists.txt index 9e41c72..6aab198 100644 --- a/src/api/CMakeLists.txt +++ b/src/api/CMakeLists.txt @@ -24,6 +24,7 @@ set( BamToolsAPISources SamReadGroupDictionary.cpp SamSequence.cpp SamSequenceDictionary.cpp + internal/BamHeader_p.cpp internal/BamMultiReader_p.cpp internal/BamReader_p.cpp internal/BamStandardIndex_p.cpp @@ -36,7 +37,7 @@ set( BamToolsAPISources # create main BamTools API shared library add_library( BamTools SHARED ${BamToolsAPISources} ) -set_target_properties( BamTools PROPERTIES SOVERSION "0.9.1" ) +set_target_properties( BamTools PROPERTIES SOVERSION "0.9.2" ) set_target_properties( BamTools PROPERTIES OUTPUT_NAME "bamtools" ) # create main BamTools API static library diff --git a/src/api/internal/BamHeader_p.cpp b/src/api/internal/BamHeader_p.cpp new file mode 100644 index 0000000..c613b12 --- /dev/null +++ b/src/api/internal/BamHeader_p.cpp @@ -0,0 +1,155 @@ +// *************************************************************************** +// BamHeader_p.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 25 December 2010 (DB) +// --------------------------------------------------------------------------- +// Provides the basic functionality for handling BAM headers. +// *************************************************************************** + +#include +#include +#include +#include +using namespace BamTools; +using namespace BamTools::Internal; + +#include +#include +#include +#include +using namespace std; + +// --------------------------------- +// BamHeaderPrivate implementation + +struct BamHeader::BamHeaderPrivate { + + // data members + SamHeader* m_samHeader; + + // ctor + BamHeaderPrivate(void) + : m_samHeader(0) + { } + + // 'public' interface + bool Load(BgzfData* stream); + + // internal methods + bool CheckMagicNumber(BgzfData* stream); + bool ReadHeaderLength(BgzfData* stream, uint32_t& length); + bool ReadHeaderText(BgzfData* stream, const uint32_t& length); +}; + +bool BamHeader::BamHeaderPrivate::Load(BgzfData* stream) { + + // cannot load if invalid stream + if ( stream == 0 ) + return false; + + // cannot load if magic number is invalid + if ( !CheckMagicNumber(stream) ) + return false; + + // cannot load header if cannot read header length + uint32_t length(0); + if ( !ReadHeaderLength(stream, length) ) + return false; + + // cannot load header if cannot read header text + if ( !ReadHeaderText(stream, length) ) + return false; + + // otherwise, everything OK + return true; +} + +bool BamHeader::BamHeaderPrivate::CheckMagicNumber(BgzfData* stream) { + + // try to read magic number + char buffer[Constants::BAM_HEADER_MAGIC_SIZE]; + if ( stream->Read(buffer, Constants::BAM_HEADER_MAGIC_SIZE) != (int)Constants::BAM_HEADER_MAGIC_SIZE ) { + fprintf(stderr, "BAM header error - could not read magic number\n"); + return false; + } + + // validate magic number + if ( strncmp(buffer, Constants::BAM_HEADER_MAGIC, Constants::BAM_HEADER_MAGIC_SIZE) != 0 ) { + fprintf(stderr, "BAM header error - invalid magic number\n"); + return false; + } + + // all checks out + return true; +} + +bool BamHeader::BamHeaderPrivate::ReadHeaderLength(BgzfData* stream, uint32_t& length) { + + // attempt to read BAM header text length + char buffer[sizeof(uint32_t)]; + if ( stream->Read(buffer, sizeof(uint32_t)) != sizeof(uint32_t) ) { + fprintf(stderr, "BAM header error - could not read header length\n"); + return false; + } + + // convert char buffer to length, return success + length = BgzfData::UnpackUnsignedInt(buffer); + if ( BamTools::SystemIsBigEndian() ) + SwapEndian_32(length); + return true; +} + +bool BamHeader::BamHeaderPrivate::ReadHeaderText(BgzfData* stream, const uint32_t& length) { + + // set up destination buffer + char* headerText = (char*)calloc(length + 1, 1); + + // attempt to read header text + const unsigned bytesRead = stream->Read(headerText, length); + const bool readOk = ( bytesRead == length ); + if ( readOk ) + m_samHeader = new SamHeader( (string)((const char*)headerText) ); + else + fprintf(stderr, "BAM header error - could not read header text\n"); + + // clean up calloc-ed temp variable (on success or fail) + free(headerText); + + // return read success + return readOk; +} + +// -------------------------- +// BamHeader implementation + +BamHeader::BamHeader(void) + : d(new BamHeaderPrivate) +{ } + +BamHeader::~BamHeader(void) { + delete d; + d = 0; +} + +void BamHeader::Clear(void) { + delete d->m_samHeader; + d->m_samHeader = new SamHeader(""); +} + +bool BamHeader::IsValid(void) const { + return d->m_samHeader->IsValid(); +} + +bool BamHeader::Load(BgzfData* stream) { + return d->Load(stream); +} + +SamHeader BamHeader::ToSamHeader(void) const { + return *(d->m_samHeader); +} + +string BamHeader::ToString(void) const { + return d->m_samHeader->ToString(); +} diff --git a/src/api/internal/BamHeader_p.h b/src/api/internal/BamHeader_p.h new file mode 100644 index 0000000..352ce88 --- /dev/null +++ b/src/api/internal/BamHeader_p.h @@ -0,0 +1,56 @@ +// *************************************************************************** +// BamHeader_p.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 25 December 2010 (DB) +// --------------------------------------------------------------------------- +// Provides the basic functionality for handling BAM headers. +// *************************************************************************** + +#ifndef BAMHEADER_P_H +#define BAMHEADER_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. + +#include +#include + +namespace BamTools { + +class BgzfData; + +namespace Internal { + +class BamHeader { + + public: + BamHeader(void); + ~BamHeader(void); + + public: + void Clear(void); + bool IsValid(void) const; + bool Load(BgzfData* stream); + + public: + SamHeader ToSamHeader(void) const; + std::string ToString(void) const; + + private: + struct BamHeaderPrivate; + BamHeaderPrivate* d; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMHEADER_P_H diff --git a/src/api/internal/BamReader_p.cpp b/src/api/internal/BamReader_p.cpp index f14dcee..7aa18f3 100644 --- a/src/api/internal/BamReader_p.cpp +++ b/src/api/internal/BamReader_p.cpp @@ -3,13 +3,14 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 22 November 2010 (DB) +// Last modified: 11 January 2011 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for reading BAM files // *************************************************************************** #include #include +#include #include #include #include @@ -24,14 +25,13 @@ using namespace std; // constructor BamReaderPrivate::BamReaderPrivate(BamReader* parent) - : HeaderText("") - , Index(0) + : Index(0) , HasIndex(false) , AlignmentsBeginOffset(0) - // , m_header(0) , IndexCacheMode(BamIndex::LimitedIndexCaching) , HasAlignmentsInRegion(true) , Parent(parent) + , m_header(new BamHeader) , DNA_LOOKUP("=ACMGRSVTWYHKDBN") , CIGAR_LOOKUP("MIDNSHP") { @@ -40,7 +40,11 @@ BamReaderPrivate::BamReaderPrivate(BamReader* parent) // destructor BamReaderPrivate::~BamReaderPrivate(void) { + Close(); + + delete m_header; + m_header = 0; } // adjusts requested region if necessary (depending on where data actually begins) @@ -88,12 +92,7 @@ void BamReaderPrivate::Close(void) { ClearIndex(); // clear out header data - HeaderText.clear(); - - // if ( m_header ) { - // delete m_header; - // m_header = 0; - // } + m_header->Clear(); // clear out region flags Region.clear(); @@ -135,13 +134,11 @@ bool BamReaderPrivate::CreateIndex(bool useStandardIndex) { } const string BamReaderPrivate::GetHeaderText(void) const { + return m_header->ToString(); +} - return HeaderText; - - // if ( m_header ) - // return m_header->Text(); - // else - // return string(""); +SamHeader BamReaderPrivate::GetSamHeader(void) const { + return m_header->ToSamHeader(); } // get next alignment (from specified region, if given) @@ -278,36 +275,7 @@ BamReaderPrivate::RegionState BamReaderPrivate::IsOverlap(BamAlignment& bAlignme // load BAM header data void BamReaderPrivate::LoadHeaderData(void) { - - // m_header = new BamHeader(&mBGZF); - // bool headerLoadedOk = m_header->Load(); - // if ( !headerLoadedOk ) - // cerr << "BamReader could not load header" << endl; - - // check to see if proper BAM header - char buffer[4]; - if (mBGZF.Read(buffer, 4) != 4) { - fprintf(stderr, "Could not read header type\n"); - exit(1); - } - - if (strncmp(buffer, "BAM\001", 4)) { - fprintf(stderr, "wrong header type!\n"); - exit(1); - } - - // get BAM header text length - mBGZF.Read(buffer, 4); - unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer); - if ( IsBigEndian ) SwapEndian_32(headerTextLength); - - // get BAM header text - char* headerText = (char*)calloc(headerTextLength + 1, 1); - mBGZF.Read(headerText, headerTextLength); - HeaderText = (string)((const char*)headerText); - - // clean up calloc-ed temp variable - free(headerText); + m_header->Load(&mBGZF); } // load existing index data from BAM index file (".bti" OR ".bai"), return success/fail diff --git a/src/api/internal/BamReader_p.h b/src/api/internal/BamReader_p.h index ed3940a..79bcddb 100644 --- a/src/api/internal/BamReader_p.h +++ b/src/api/internal/BamReader_p.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) +// Last modified: 11 January 2011 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for reading BAM files // *************************************************************************** @@ -24,14 +24,18 @@ #include #include #include +#include #include namespace BamTools { class BamReader; +class SamHeader; namespace Internal { +class BamHeader; + class BamReaderPrivate { // enums @@ -63,6 +67,7 @@ class BamReaderPrivate { // access auxiliary data const std::string GetHeaderText(void) const; + SamHeader GetSamHeader(void) const; int GetReferenceID(const std::string& refName) const; // index operations @@ -101,7 +106,6 @@ class BamReaderPrivate { // general file data BgzfData mBGZF; - std::string HeaderText; BamIndex* Index; RefVector References; bool HasIndex; @@ -109,7 +113,6 @@ class BamReaderPrivate { std::string Filename; std::string IndexFilename; - // Internal::BamHeader* m_header; // index caching mode BamIndex::BamIndexCacheMode IndexCacheMode; @@ -123,6 +126,7 @@ class BamReaderPrivate { // parent BamReader BamReader* Parent; + BamHeader* m_header; // BAM character constants const char* DNA_LOOKUP; diff --git a/src/api/internal/BamWriter_p.cpp b/src/api/internal/BamWriter_p.cpp index a75eb44..90959b6 100644 --- a/src/api/internal/BamWriter_p.cpp +++ b/src/api/internal/BamWriter_p.cpp @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 22 November 2010 (DB) +// Last modified: 11 January 2011 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for producing BAM files // *************************************************************************** @@ -14,19 +14,16 @@ using namespace BamTools; using namespace BamTools::Internal; using namespace std; +// ctor BamWriterPrivate::BamWriterPrivate(void) { IsBigEndian = SystemIsBigEndian(); } +// dtor BamWriterPrivate::~BamWriterPrivate(void) { mBGZF.Close(); } -// closes the alignment archive -void BamWriterPrivate::Close(void) { - mBGZF.Close(); -} - // calculates minimum bin for a BAM alignment interval const unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const { --end; @@ -38,6 +35,11 @@ const unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int en return 0; } +// closes the alignment archive +void BamWriterPrivate::Close(void) { + mBGZF.Close(); +} + // creates a cigar string from the supplied alignment void BamWriterPrivate::CreatePackedCigar(const vector& cigarOperations, string& packedCigar) { @@ -52,35 +54,21 @@ void BamWriterPrivate::CreatePackedCigar(const vector& cigarOperations, vector::const_iterator coIter; for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) { - switch(coIter->Type) { - case 'M': - cigarOp = BAM_CMATCH; - break; - case 'I': - cigarOp = BAM_CINS; - break; - case 'D': - cigarOp = BAM_CDEL; - break; - case 'N': - cigarOp = BAM_CREF_SKIP; - break; - case 'S': - cigarOp = BAM_CSOFT_CLIP; - break; - case 'H': - cigarOp = BAM_CHARD_CLIP; - break; - case 'P': - cigarOp = BAM_CPAD; - break; - default: - fprintf(stderr, "ERROR: Unknown cigar operation found: %c\n", coIter->Type); - exit(1); - } - - *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp; - pPackedCigar++; + switch(coIter->Type) { + case 'M': cigarOp = BAM_CMATCH; break; + case 'I': cigarOp = BAM_CINS; break; + case 'D': cigarOp = BAM_CDEL; break; + case 'N': cigarOp = BAM_CREF_SKIP; break; + case 'S': cigarOp = BAM_CSOFT_CLIP; break; + case 'H': cigarOp = BAM_CHARD_CLIP; break; + case 'P': cigarOp = BAM_CPAD; break; + default: + fprintf(stderr, "ERROR: Unknown cigar operation found: %c\n", coIter->Type); + exit(1); + } + + *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp; + pPackedCigar++; } } @@ -99,49 +87,30 @@ void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQ while(*pQuery) { - switch(*pQuery) { - - case '=': - nucleotideCode = 0; - break; - - case 'A': - nucleotideCode = 1; - break; - - case 'C': - nucleotideCode = 2; - break; - - case 'G': - nucleotideCode = 4; - break; - - case 'T': - nucleotideCode = 8; - break; - - case 'N': - nucleotideCode = 15; - break; - - default: - fprintf(stderr, "ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery); - exit(1); - } - - // pack the nucleotide code - if(useHighWord) { - *pEncodedQuery = nucleotideCode << 4; - useHighWord = false; - } else { - *pEncodedQuery |= nucleotideCode; - pEncodedQuery++; - useHighWord = true; - } - - // increment the query position - pQuery++; + switch(*pQuery) { + case '=': nucleotideCode = 0; break; + case 'A': nucleotideCode = 1; break; + case 'C': nucleotideCode = 2; break; + case 'G': nucleotideCode = 4; break; + case 'T': nucleotideCode = 8; break; + case 'N': nucleotideCode = 15; break; + default: + fprintf(stderr, "ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery); + exit(1); + } + + // pack the nucleotide code + if(useHighWord) { + *pEncodedQuery = nucleotideCode << 4; + useHighWord = false; + } else { + *pEncodedQuery |= nucleotideCode; + pEncodedQuery++; + useHighWord = true; + } + + // increment the query position + pQuery++; } } @@ -153,7 +122,7 @@ bool BamWriterPrivate::Open(const string& filename, { // open the BGZF file for writing, return failure if error if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) ) - return false; + return false; // ================ // write the header @@ -182,21 +151,22 @@ bool BamWriterPrivate::Open(const string& filename, // write the sequence dictionary // ============================= - RefVector::const_iterator rsIter; - for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) { + RefVector::const_iterator rsIter = referenceSequences.begin(); + RefVector::const_iterator rsEnd = referenceSequences.end(); + for( ; rsIter != rsEnd; ++rsIter ) { - // write the reference sequence name length - uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1; - if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen); - mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT); + // write the reference sequence name length + uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1; + if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen); + mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT); - // write the reference sequence name - mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen); + // write the reference sequence name + mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen); - // write the reference sequence length - int32_t referenceLength = rsIter->RefLength; - if (IsBigEndian) SwapEndian_32(referenceLength); - mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT); + // write the reference sequence length + int32_t referenceLength = rsIter->RefLength; + if (IsBigEndian) SwapEndian_32(referenceLength); + mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT); } // return success @@ -210,170 +180,172 @@ void BamWriterPrivate::SaveAlignment(const BamAlignment& al) { // (as a result of BamReader::GetNextAlignmentCore()) if ( al.SupportData.HasCoreOnly ) { - // write the block size - unsigned int blockSize = al.SupportData.BlockLength; - if (IsBigEndian) SwapEndian_32(blockSize); - mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); - - // assign the BAM core data - uint32_t buffer[8]; - buffer[0] = al.RefID; - buffer[1] = al.Position; - buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength; - buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations; - buffer[4] = al.SupportData.QuerySequenceLength; - buffer[5] = al.MateRefID; - buffer[6] = al.MatePosition; - buffer[7] = al.InsertSize; - - // swap BAM core endian-ness, if necessary - if ( IsBigEndian ) { - for ( int i = 0; i < 8; ++i ) - SwapEndian_32(buffer[i]); - } - - // write the BAM core - mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); - - // write the raw char data - mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE); + // write the block size + unsigned int blockSize = al.SupportData.BlockLength; + if (IsBigEndian) SwapEndian_32(blockSize); + mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); + + // assign the BAM core data + uint32_t buffer[8]; + buffer[0] = al.RefID; + buffer[1] = al.Position; + buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength; + buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations; + buffer[4] = al.SupportData.QuerySequenceLength; + buffer[5] = al.MateRefID; + buffer[6] = al.MatePosition; + buffer[7] = al.InsertSize; + + // swap BAM core endian-ness, if necessary + if ( IsBigEndian ) { + for ( int i = 0; i < 8; ++i ) + SwapEndian_32(buffer[i]); + } + + // write the BAM core + mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); + + // write the raw char data + mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE); } // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code ) else { - // calculate char lengths - const unsigned int nameLength = al.Name.size() + 1; - const unsigned int numCigarOperations = al.CigarData.size(); - const unsigned int queryLength = al.QueryBases.size(); - const unsigned int tagDataLength = al.TagData.size(); - - // no way to tell if BamAlignment.Bin is already defined (no default, invalid value) - // force calculation of Bin before storing - const int endPosition = al.GetEndPosition(); - const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition); - - // create our packed cigar string - string packedCigar; - CreatePackedCigar(al.CigarData, packedCigar); - const unsigned int packedCigarLength = packedCigar.size(); - - // encode the query - string encodedQuery; - EncodeQuerySequence(al.QueryBases, encodedQuery); - const unsigned int encodedQueryLength = encodedQuery.size(); - - // write the block size - const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength; - unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize; - if (IsBigEndian) SwapEndian_32(blockSize); - mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); - - // assign the BAM core data - uint32_t buffer[8]; - buffer[0] = al.RefID; - buffer[1] = al.Position; - buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength; - buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations; - buffer[4] = queryLength; - buffer[5] = al.MateRefID; - buffer[6] = al.MatePosition; - buffer[7] = al.InsertSize; - - // swap BAM core endian-ness, if necessary - if ( IsBigEndian ) { - for ( int i = 0; i < 8; ++i ) - SwapEndian_32(buffer[i]); - } - - // write the BAM core - mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); - - // write the query name - mBGZF.Write(al.Name.c_str(), nameLength); - - // write the packed cigar - if ( IsBigEndian ) { - - char* cigarData = (char*)calloc(sizeof(char), packedCigarLength); - memcpy(cigarData, packedCigar.data(), packedCigarLength); - - for (unsigned int i = 0; i < packedCigarLength; ++i) { - if ( IsBigEndian ) - SwapEndian_32p(&cigarData[i]); - } - - mBGZF.Write(cigarData, packedCigarLength); - free(cigarData); - } - else - mBGZF.Write(packedCigar.data(), packedCigarLength); - - // write the encoded query sequence - mBGZF.Write(encodedQuery.data(), encodedQueryLength); - - // write the base qualities - string baseQualities(al.Qualities); - char* pBaseQualities = (char*)al.Qualities.data(); - for(unsigned int i = 0; i < queryLength; i++) { - pBaseQualities[i] -= 33; - } - mBGZF.Write(pBaseQualities, queryLength); - - // write the read group tag - if ( IsBigEndian ) { - - char* tagData = (char*)calloc(sizeof(char), tagDataLength); - memcpy(tagData, al.TagData.data(), tagDataLength); - - int i = 0; - while ( (unsigned int)i < tagDataLength ) { - - i += 2; // skip tag type (e.g. "RG", "NM", etc) - uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning - ++i; // skip value type - - switch (type) { - - case('A') : - case('C') : - ++i; - break; - - case('S') : - SwapEndian_16p(&tagData[i]); - i+=2; // sizeof(uint16_t) - break; - - case('F') : - case('I') : - SwapEndian_32p(&tagData[i]); - i+=4; // sizeof(uint32_t) - break; - - case('D') : - SwapEndian_64p(&tagData[i]); - i+=8; // sizeof(uint64_t) - break; - - case('H') : - case('Z') : - while (tagData[i]) { ++i; } - ++i; // increment one more for null terminator - break; - - default : - fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here - free(tagData); - exit(1); - } - } - - mBGZF.Write(tagData, tagDataLength); - free(tagData); - } - else - mBGZF.Write(al.TagData.data(), tagDataLength); + // calculate char lengths + const unsigned int nameLength = al.Name.size() + 1; + const unsigned int numCigarOperations = al.CigarData.size(); + const unsigned int queryLength = al.QueryBases.size(); + const unsigned int tagDataLength = al.TagData.size(); + + // no way to tell if BamAlignment.Bin is already defined (no default, invalid value) + // force calculation of Bin before storing + const int endPosition = al.GetEndPosition(); + const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition); + + // create our packed cigar string + string packedCigar; + CreatePackedCigar(al.CigarData, packedCigar); + const unsigned int packedCigarLength = packedCigar.size(); + + // encode the query + string encodedQuery; + EncodeQuerySequence(al.QueryBases, encodedQuery); + const unsigned int encodedQueryLength = encodedQuery.size(); + + // write the block size + const unsigned int dataBlockSize = nameLength + + packedCigarLength + + encodedQueryLength + + queryLength + + tagDataLength; + unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize; + if (IsBigEndian) SwapEndian_32(blockSize); + mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); + + // assign the BAM core data + uint32_t buffer[8]; + buffer[0] = al.RefID; + buffer[1] = al.Position; + buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength; + buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations; + buffer[4] = queryLength; + buffer[5] = al.MateRefID; + buffer[6] = al.MatePosition; + buffer[7] = al.InsertSize; + + // swap BAM core endian-ness, if necessary + if ( IsBigEndian ) { + for ( int i = 0; i < 8; ++i ) + SwapEndian_32(buffer[i]); + } + + // write the BAM core + mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); + + // write the query name + mBGZF.Write(al.Name.c_str(), nameLength); + + // write the packed cigar + if ( IsBigEndian ) { + + char* cigarData = (char*)calloc(sizeof(char), packedCigarLength); + memcpy(cigarData, packedCigar.data(), packedCigarLength); + + for (unsigned int i = 0; i < packedCigarLength; ++i) { + if ( IsBigEndian ) + SwapEndian_32p(&cigarData[i]); + } + + mBGZF.Write(cigarData, packedCigarLength); + free(cigarData); + } + else + mBGZF.Write(packedCigar.data(), packedCigarLength); + + // write the encoded query sequence + mBGZF.Write(encodedQuery.data(), encodedQueryLength); + + // write the base qualities + char* pBaseQualities = (char*)al.Qualities.data(); + for(unsigned int i = 0; i < queryLength; i++) + pBaseQualities[i] -= 33; // FASTQ conversion + mBGZF.Write(pBaseQualities, queryLength); + + // write the read group tag + if ( IsBigEndian ) { + + char* tagData = (char*)calloc(sizeof(char), tagDataLength); + memcpy(tagData, al.TagData.data(), tagDataLength); + + int i = 0; + while ( (unsigned int)i < tagDataLength ) { + + i += 2; // skip tag type (e.g. "RG", "NM", etc) + uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning + ++i; // skip value type + + switch (type) { + + case('A') : + case('C') : + ++i; + break; + + case('S') : + SwapEndian_16p(&tagData[i]); + i+=2; // sizeof(uint16_t) + break; + + case('F') : + case('I') : + SwapEndian_32p(&tagData[i]); + i+=4; // sizeof(uint32_t) + break; + + case('D') : + SwapEndian_64p(&tagData[i]); + i+=8; // sizeof(uint64_t) + break; + + case('H') : + case('Z') : + while (tagData[i]) { ++i; } + ++i; // increment one more for null terminator + break; + + default : + fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here + free(tagData); + exit(1); + } + } + + mBGZF.Write(tagData, tagDataLength); + free(tagData); + } + else + mBGZF.Write(al.TagData.data(), tagDataLength); } } diff --git a/src/api/internal/BamWriter_p.h b/src/api/internal/BamWriter_p.h index 4fe626a..6eecd73 100644 --- a/src/api/internal/BamWriter_p.h +++ b/src/api/internal/BamWriter_p.h @@ -27,6 +27,9 @@ #include namespace BamTools { + +class SamHeader; + namespace Internal { class BamWriterPrivate { @@ -43,6 +46,10 @@ class BamWriterPrivate { const std::string& samHeader, const BamTools::RefVector& referenceSequences, bool isWriteUncompressed); + bool Open(const std::string& filename, + const std::string& samHeader, + const BamTools::RefVector& referenceSequences, + bool isWriteUncompressed); void SaveAlignment(const BamAlignment& al); // internal methods -- 2.39.2