// ***************************************************************************\r
// BamWriter.cpp (c) 2009 Michael Str�mberg, Derek Barnett\r
// Marth Lab, Department of Biology, Boston College\r
-// All rights reserved.\r
// ---------------------------------------------------------------------------\r
-// Last modified: 17 August 2010 (DB)\r
-// ---------------------------------------------------------------------------\r
-// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
-// Institute.\r
+// Last modified: 10 October 2011 (DB)\r
// ---------------------------------------------------------------------------\r
// Provides the basic functionality for producing BAM files\r
// ***************************************************************************\r
\r
-#include <iostream>\r
-\r
-#include "BGZF.h"\r
-#include "BamWriter.h"\r
+#include "api/BamAlignment.h"\r
+#include "api/BamWriter.h"\r
+#include "api/SamHeader.h"\r
+#include "api/internal/BamWriter_p.h"\r
using namespace BamTools;\r
+using namespace BamTools::Internal;\r
using namespace std;\r
\r
-struct BamWriter::BamWriterPrivate {\r
-\r
- // data members\r
- BgzfData mBGZF;\r
- bool IsBigEndian;\r
- \r
- // constructor / destructor\r
- BamWriterPrivate(void) { \r
- IsBigEndian = SystemIsBigEndian(); \r
- }\r
- \r
- ~BamWriterPrivate(void) {\r
- mBGZF.Close();\r
- }\r
-\r
- // "public" interface\r
- void Close(void);\r
- bool Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed);\r
- void SaveAlignment(const BamAlignment& al);\r
-\r
- // internal methods\r
- const unsigned int CalculateMinimumBin(const int begin, int end) const;\r
- void CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar);\r
- void EncodeQuerySequence(const string& query, string& encodedQuery);\r
-};\r
-\r
-// -----------------------------------------------------\r
-// BamWriter implementation\r
-// -----------------------------------------------------\r
-\r
-// constructor\r
-BamWriter::BamWriter(void) {\r
- d = new BamWriterPrivate;\r
-}\r
-\r
-// destructor\r
+/*! \class BamTools::BamWriter\r
+ \brief Provides write access for generating BAM files.\r
+*/\r
+/*! \enum BamTools::BamWriter::CompressionMode\r
+ \brief This enum describes the compression behaviors for output BAM files.\r
+*/\r
+/*! \var BamWriter::CompressionMode BamWriter::Compressed\r
+ \brief Use normal BAM compression\r
+*/\r
+/*! \var BamWriter::CompressionMode BamWriter::Uncompressed\r
+ \brief Disable BAM compression\r
+\r
+ Useful in situations where the BAM data is streamed (e.g. piping).\r
+ It would be wasteful to compress, and then immediately decompress\r
+ the data.\r
+*/\r
+\r
+/*! \fn BamWriter::BamWriter(void)\r
+ \brief constructor\r
+*/\r
+BamWriter::BamWriter(void)\r
+ : d(new BamWriterPrivate)\r
+{ }\r
+\r
+/*! \fn BamWriter::~BamWriter(void)\r
+ \brief destructor\r
+*/\r
BamWriter::~BamWriter(void) {\r
delete d;\r
d = 0;\r
}\r
\r
-// closes the alignment archive\r
-void BamWriter::Close(void) { \r
- d->Close(); \r
+/*! \fn BamWriter::Close(void)\r
+ \brief Closes the current BAM file.\r
+ \sa Open()\r
+*/\r
+void BamWriter::Close(void) {\r
+ d->Close();\r
}\r
\r
-// opens the alignment archive\r
-bool BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed) {\r
- return d->Open(filename, samHeader, referenceSequences, isWriteUncompressed);\r
-}\r
-\r
-// saves the alignment to the alignment archive\r
-void BamWriter::SaveAlignment(const BamAlignment& al) { \r
- d->SaveAlignment(al);\r
-}\r
+/*! \fn std::string BamWriter::GetErrorString(void) const\r
+ \brief Returns a human-readable description of the last error that occurred\r
\r
-// -----------------------------------------------------\r
-// BamWriterPrivate implementation\r
-// -----------------------------------------------------\r
+ This method allows elimination of STDERR pollution. Developers of client code\r
+ may choose how the messages are displayed to the user, if at all.\r
\r
-// closes the alignment archive\r
-void BamWriter::BamWriterPrivate::Close(void) {\r
- mBGZF.Close();\r
+ \return error description\r
+*/\r
+std::string BamWriter::GetErrorString(void) const {\r
+ return d->GetErrorString();\r
}\r
\r
-// calculates minimum bin for a BAM alignment interval\r
-const unsigned int BamWriter::BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const { \r
- --end;\r
- if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14);\r
- if( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17);\r
- if( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20);\r
- if( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23);\r
- if( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26);\r
- return 0;\r
+/*! \fn bool BamWriter::IsOpen(void) const\r
+ \brief Returns \c true if BAM file is open for writing.\r
+ \sa Open()\r
+*/\r
+bool BamWriter::IsOpen(void) const {\r
+ return d->IsOpen();\r
}\r
\r
-// creates a cigar string from the supplied alignment\r
-void BamWriter::BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {\r
-\r
- // initialize\r
- const unsigned int numCigarOperations = cigarOperations.size();\r
- packedCigar.resize(numCigarOperations * BT_SIZEOF_INT);\r
-\r
- // pack the cigar data into the string\r
- unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();\r
-\r
- unsigned int cigarOp;\r
- vector<CigarOp>::const_iterator coIter;\r
- for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) {\r
-\r
- switch(coIter->Type) {\r
- case 'M':\r
- cigarOp = BAM_CMATCH;\r
- break;\r
- case 'I':\r
- cigarOp = BAM_CINS;\r
- break;\r
- case 'D':\r
- cigarOp = BAM_CDEL;\r
- break;\r
- case 'N':\r
- cigarOp = BAM_CREF_SKIP;\r
- break;\r
- case 'S':\r
- cigarOp = BAM_CSOFT_CLIP;\r
- break;\r
- case 'H':\r
- cigarOp = BAM_CHARD_CLIP;\r
- break;\r
- case 'P':\r
- cigarOp = BAM_CPAD;\r
- break;\r
- default:\r
- fprintf(stderr, "ERROR: Unknown cigar operation found: %c\n", coIter->Type);\r
- exit(1);\r
- }\r
-\r
- *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;\r
- pPackedCigar++;\r
- }\r
+/*! \fn bool BamWriter::Open(const std::string& filename,\r
+ const std::string& samHeaderText,\r
+ const RefVector& referenceSequences)\r
+ \brief Opens a BAM file for writing.\r
+\r
+ Will overwrite the BAM file if it already exists.\r
+\r
+ \param[in] filename name of output BAM file\r
+ \param[in] samHeaderText header data, as SAM-formatted string\r
+ \param[in] referenceSequences list of reference entries\r
+\r
+ \return \c true if opened successfully\r
+ \sa Close(), IsOpen(), BamReader::GetHeaderText(), BamReader::GetReferenceData()\r
+*/\r
+bool BamWriter::Open(const std::string& filename,\r
+ const std::string& samHeaderText,\r
+ const RefVector& referenceSequences)\r
+{\r
+ return d->Open(filename, samHeaderText, referenceSequences);\r
}\r
\r
-// encodes the supplied query sequence into 4-bit notation\r
-void BamWriter::BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {\r
+/*! \fn bool BamWriter::Open(const std::string& filename,\r
+ const SamHeader& samHeader,\r
+ const RefVector& referenceSequences)\r
+ \brief Opens a BAM file for writing.\r
\r
- // prepare the encoded query string\r
- const unsigned int queryLen = query.size();\r
- const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);\r
- encodedQuery.resize(encodedQueryLen);\r
- char* pEncodedQuery = (char*)encodedQuery.data();\r
- const char* pQuery = (const char*)query.data();\r
+ This is an overloaded function.\r
\r
- unsigned char nucleotideCode;\r
- bool useHighWord = true;\r
+ Will overwrite the BAM file if it already exists.\r
\r
- while(*pQuery) {\r
+ \param[in] filename name of output BAM file\r
+ \param[in] samHeader header data, wrapped in SamHeader object\r
+ \param[in] referenceSequences list of reference entries\r
\r
- switch(*pQuery) {\r
- \r
- case '=':\r
- nucleotideCode = 0;\r
- break;\r
- \r
- case 'A':\r
- nucleotideCode = 1;\r
- break;\r
- \r
- case 'C':\r
- nucleotideCode = 2;\r
- break;\r
- \r
- case 'G':\r
- nucleotideCode = 4;\r
- break;\r
- \r
- case 'T':\r
- nucleotideCode = 8;\r
- break;\r
- \r
- case 'N':\r
- nucleotideCode = 15;\r
- break;\r
- \r
- default:\r
- fprintf(stderr, "ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);\r
- exit(1);\r
- }\r
-\r
- // pack the nucleotide code\r
- if(useHighWord) {\r
- *pEncodedQuery = nucleotideCode << 4;\r
- useHighWord = false;\r
- } else {\r
- *pEncodedQuery |= nucleotideCode;\r
- pEncodedQuery++;\r
- useHighWord = true;\r
- }\r
-\r
- // increment the query position\r
- pQuery++;\r
- }\r
+ \return \c true if opened successfully\r
+ \sa Close(), IsOpen(), BamReader::GetHeader(), BamReader::GetReferenceData()\r
+*/\r
+bool BamWriter::Open(const std::string& filename,\r
+ const SamHeader& samHeader,\r
+ const RefVector& referenceSequences)\r
+{\r
+ return d->Open(filename, samHeader.ToString(), referenceSequences);\r
}\r
\r
-// opens the alignment archive\r
-bool BamWriter::BamWriterPrivate::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed) {\r
-\r
- // open the BGZF file for writing, return failure if error\r
- if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) )\r
- return false;\r
-\r
- // ================\r
- // write the header\r
- // ================\r
-\r
- // write the BAM signature\r
- const unsigned char SIGNATURE_LENGTH = 4;\r
- const char* BAM_SIGNATURE = "BAM\1";\r
- mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH);\r
-\r
- // write the SAM header text length\r
- uint32_t samHeaderLen = samHeader.size();\r
- if (IsBigEndian) SwapEndian_32(samHeaderLen);\r
- mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT);\r
-\r
- // write the SAM header text\r
- if(samHeaderLen > 0) \r
- mBGZF.Write(samHeader.data(), samHeaderLen);\r
-\r
- // write the number of reference sequences\r
- uint32_t numReferenceSequences = referenceSequences.size();\r
- if (IsBigEndian) SwapEndian_32(numReferenceSequences);\r
- mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);\r
+/*! \fn void BamWriter::SaveAlignment(const BamAlignment& alignment)\r
+ \brief Saves an alignment to the BAM file.\r
\r
- // =============================\r
- // write the sequence dictionary\r
- // =============================\r
-\r
- RefVector::const_iterator rsIter;\r
- for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {\r
-\r
- // write the reference sequence name length\r
- uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;\r
- if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen);\r
- mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);\r
-\r
- // write the reference sequence name\r
- mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);\r
-\r
- // write the reference sequence length\r
- int32_t referenceLength = rsIter->RefLength;\r
- if (IsBigEndian) SwapEndian_32(referenceLength);\r
- mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);\r
- }\r
- \r
- // return success\r
- return true;\r
+ \param[in] alignment BamAlignment record to save\r
+ \sa BamReader::GetNextAlignment(), BamReader::GetNextAlignmentCore()\r
+*/\r
+bool BamWriter::SaveAlignment(const BamAlignment& alignment) {\r
+ return d->SaveAlignment(alignment);\r
}\r
\r
-// saves the alignment to the alignment archive\r
-void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) {\r
-\r
- // if BamAlignment contains only the core data and a raw char data buffer\r
- // (as a result of BamReader::GetNextAlignmentCore())\r
- if ( al.SupportData.HasCoreOnly ) {\r
- \r
- // write the block size\r
- unsigned int blockSize = al.SupportData.BlockLength;\r
- if (IsBigEndian) SwapEndian_32(blockSize);\r
- mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
-\r
- // assign the BAM core data\r
- uint32_t buffer[8];\r
- buffer[0] = al.RefID;\r
- buffer[1] = al.Position;\r
- buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;\r
- buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;\r
- buffer[4] = al.SupportData.QuerySequenceLength;\r
- buffer[5] = al.MateRefID;\r
- buffer[6] = al.MatePosition;\r
- buffer[7] = al.InsertSize;\r
- \r
- // swap BAM core endian-ness, if necessary\r
- if ( IsBigEndian ) { \r
- for ( int i = 0; i < 8; ++i )\r
- SwapEndian_32(buffer[i]); \r
- }\r
- \r
- // write the BAM core\r
- mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
- \r
- // write the raw char data\r
- mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE); \r
- }\r
- \r
- // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc\r
- // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code )\r
- else {\r
- \r
- // calculate char lengths\r
- const unsigned int nameLength = al.Name.size() + 1;\r
- const unsigned int numCigarOperations = al.CigarData.size();\r
- const unsigned int queryLength = al.QueryBases.size();\r
- const unsigned int tagDataLength = al.TagData.size();\r
- \r
- // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)\r
- // force calculation of Bin before storing\r
- const int endPosition = al.GetEndPosition();\r
- const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);\r
- \r
- // create our packed cigar string\r
- string packedCigar;\r
- CreatePackedCigar(al.CigarData, packedCigar);\r
- const unsigned int packedCigarLength = packedCigar.size();\r
-\r
- // encode the query\r
- string encodedQuery;\r
- EncodeQuerySequence(al.QueryBases, encodedQuery);\r
- const unsigned int encodedQueryLength = encodedQuery.size(); \r
- \r
- // write the block size\r
- const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength;\r
- unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;\r
- if (IsBigEndian) SwapEndian_32(blockSize);\r
- mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
-\r
- // assign the BAM core data\r
- uint32_t buffer[8];\r
- buffer[0] = al.RefID;\r
- buffer[1] = al.Position;\r
- buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;\r
- buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;\r
- buffer[4] = queryLength;\r
- buffer[5] = al.MateRefID;\r
- buffer[6] = al.MatePosition;\r
- buffer[7] = al.InsertSize;\r
- \r
- // swap BAM core endian-ness, if necessary\r
- if ( IsBigEndian ) { \r
- for ( int i = 0; i < 8; ++i )\r
- SwapEndian_32(buffer[i]); \r
- }\r
- \r
- // write the BAM core\r
- mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
- \r
- // write the query name\r
- mBGZF.Write(al.Name.c_str(), nameLength);\r
+/*! \fn void BamWriter::SetCompressionMode(const BamWriter::CompressionMode& compressionMode)\r
+ \brief Sets the output compression mode.\r
\r
- // write the packed cigar\r
- if ( IsBigEndian ) {\r
- \r
- char* cigarData = (char*)calloc(sizeof(char), packedCigarLength);\r
- memcpy(cigarData, packedCigar.data(), packedCigarLength);\r
- \r
- for (unsigned int i = 0; i < packedCigarLength; ++i) {\r
- if ( IsBigEndian )\r
- SwapEndian_32p(&cigarData[i]); \r
- }\r
- \r
- mBGZF.Write(cigarData, packedCigarLength);\r
- free(cigarData); \r
- } \r
- else \r
- mBGZF.Write(packedCigar.data(), packedCigarLength);\r
+ Default mode is BamWriter::Compressed.\r
\r
- // write the encoded query sequence\r
- mBGZF.Write(encodedQuery.data(), encodedQueryLength);\r
+ \note Changing the compression mode is disabled on open files (i.e. the request will\r
+ be ignored). Be sure to call this function before opening the BAM file.\r
\r
- // write the base qualities\r
- string baseQualities(al.Qualities);\r
- char* pBaseQualities = (char*)al.Qualities.data();\r
- for(unsigned int i = 0; i < queryLength; i++) { \r
- pBaseQualities[i] -= 33; \r
- }\r
- mBGZF.Write(pBaseQualities, queryLength);\r
+ \code\r
+ BamWriter writer;\r
+ writer.SetCompressionMode(BamWriter::Uncompressed);\r
+ writer.Open( ... );\r
+ // ...\r
+ \endcode\r
\r
- // write the read group tag\r
- if ( IsBigEndian ) {\r
- \r
- char* tagData = (char*)calloc(sizeof(char), tagDataLength);\r
- memcpy(tagData, al.TagData.data(), tagDataLength);\r
- \r
- int i = 0;\r
- while ( (unsigned int)i < tagDataLength ) {\r
- \r
- i += 2; // skip tag type (e.g. "RG", "NM", etc)\r
- uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning \r
- ++i; // skip value type\r
- \r
- switch (type) {\r
- \r
- case('A') :\r
- case('C') : \r
- ++i;\r
- break;\r
- \r
- case('S') : \r
- SwapEndian_16p(&tagData[i]); \r
- i+=2; // sizeof(uint16_t)\r
- break;\r
- \r
- case('F') :\r
- case('I') : \r
- SwapEndian_32p(&tagData[i]);\r
- i+=4; // sizeof(uint32_t)\r
- break;\r
- \r
- case('D') : \r
- SwapEndian_64p(&tagData[i]);\r
- i+=8; // sizeof(uint64_t)\r
- break;\r
- \r
- case('H') :\r
- case('Z') : \r
- while (tagData[i]) { ++i; }\r
- ++i; // increment one more for null terminator\r
- break;\r
- \r
- default : \r
- fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here\r
- free(tagData);\r
- exit(1); \r
- }\r
- }\r
- \r
- mBGZF.Write(tagData, tagDataLength);\r
- free(tagData);\r
- } \r
- else \r
- mBGZF.Write(al.TagData.data(), tagDataLength); \r
- }\r
+ \param[in] compressionMode desired output compression behavior\r
+ \sa IsOpen(), Open()\r
+*/\r
+void BamWriter::SetCompressionMode(const BamWriter::CompressionMode& compressionMode) {\r
+ d->SetWriteCompressed( compressionMode == BamWriter::Compressed );\r
}\r