-// ***************************************************************************
-// BamWriter (c) 2009 Michael Strömberg
-// Marth Lab, Deptartment of Biology, Boston College
-// All rights reserved.
-// ---------------------------------------------------------------------------
-// Provides the basic functionality for producing BAM files
-// ***************************************************************************
-
-#include "BamWriter.h"
-
-// constructor
-BamWriter::BamWriter(void)
-{}
-
-// destructor
-BamWriter::~BamWriter(void) {
- if(mBGZF.IsOpen) BgzfClose();
-}
-
-// closes the BAM file
-void BamWriter::BgzfClose(void) {
-
- mBGZF.IsOpen = false;
-
- // flush the BGZF block
- BgzfFlushBlock();
-
- // flush and close
- fflush(mBGZF.Stream);
- fclose(mBGZF.Stream);
-}
-
-// compresses the current block
-int BamWriter::BgzfDeflateBlock(void) {
-
- // initialize the gzip header
- char* buffer = mBGZF.CompressedBlock;
- unsigned int bufferSize = mBGZF.CompressedBlockSize;
-
- memset(buffer, 0, 18);
- buffer[0] = GZIP_ID1;
- buffer[1] = (char)GZIP_ID2;
- buffer[2] = CM_DEFLATE;
- buffer[3] = FLG_FEXTRA;
- buffer[9] = (char)OS_UNKNOWN;
- buffer[10] = BGZF_XLEN;
- buffer[12] = BGZF_ID1;
- buffer[13] = BGZF_ID2;
- buffer[14] = BGZF_LEN;
-
- // loop to retry for blocks that do not compress enough
- int inputLength = mBGZF.BlockOffset;
- int compressedLength = 0;
-
- while(true) {
-
- z_stream zs;
- zs.zalloc = NULL;
- zs.zfree = NULL;
- zs.next_in = (Bytef*)mBGZF.UncompressedBlock;
- zs.avail_in = inputLength;
- zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];
- zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
-
- // initialize the zlib compression algorithm
- if(deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK) {
- printf("ERROR: zlib deflate initialization failed.\n");
- exit(1);
- }
-
- // compress the data
- int status = deflate(&zs, Z_FINISH);
- if(status != Z_STREAM_END) {
- deflateEnd(&zs);
-
- // reduce the input length and try again
- if(status == Z_OK) {
- inputLength -= 1024;
- if(inputLength < 0) {
- printf("ERROR: input reduction failed.\n");
- exit(1);
- }
- continue;
- }
-
- printf("ERROR: zlib deflate failed.\n");
- exit(1);
- }
-
- // finalize the compression routine
- if(deflateEnd(&zs) != Z_OK) {
- printf("ERROR: deflate end failed.\n");
- exit(1);
- }
-
- compressedLength = zs.total_out;
- compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
-
- if(compressedLength > MAX_BLOCK_SIZE) {
- printf("ERROR: deflate overflow.\n");
- exit(1);
- }
-
- break;
- }
-
- // store the compressed length
- BgzfPackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));
-
- // store the CRC32 checksum
- unsigned int crc = crc32(0, NULL, 0);
- crc = crc32(crc, (Bytef*)mBGZF.UncompressedBlock, inputLength);
- BgzfPackUnsignedInt(&buffer[compressedLength - 8], crc);
- BgzfPackUnsignedInt(&buffer[compressedLength - 4], inputLength);
-
- // ensure that we have less than a block of data left
- int remaining = mBGZF.BlockOffset - inputLength;
- if(remaining > 0) {
- if(remaining > inputLength) {
- printf("ERROR: remainder too large.\n");
- exit(1);
- }
-
- memcpy(mBGZF.UncompressedBlock, mBGZF.UncompressedBlock + inputLength, remaining);
- }
-
- mBGZF.BlockOffset = remaining;
- return compressedLength;
-}
-
-// flushes the data in the BGZF block
-void BamWriter::BgzfFlushBlock(void) {
-
- // flush all of the remaining blocks
- while(mBGZF.BlockOffset > 0) {
-
- // compress the data block
- int blockLength = BgzfDeflateBlock();
-
- // flush the data to our output stream
- int numBytesWritten = fwrite(mBGZF.CompressedBlock, 1, blockLength, mBGZF.Stream);
-
- if(numBytesWritten != blockLength) {
- printf("ERROR: Expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten);
- exit(1);
- }
-
- mBGZF.BlockAddress += blockLength;
- }
-}
-
-// opens the BAM file for writing
-void BamWriter::BgzfOpen(const string& filename) {
-
- mBGZF.Stream = fopen(filename.c_str(), "wb");
-
- if(!mBGZF.Stream) {
- printf("ERROR: Unable to open the BAM file (%s) for writing.\n", filename.c_str());
- exit(1);
- }
-
- mBGZF.IsOpen = true;
-}
-
-// writes the supplied data into the BGZF buffer
-unsigned int BamWriter::BgzfWrite(const char* data, const unsigned int dataLen) {
-
- // initialize
- unsigned int numBytesWritten = 0;
- const char* input = data;
- unsigned int blockLength = mBGZF.UncompressedBlockSize;
-
- // copy the data to the buffer
- while(numBytesWritten < dataLen) {
- unsigned int copyLength = min(blockLength - mBGZF.BlockOffset, dataLen - numBytesWritten);
- char* buffer = mBGZF.UncompressedBlock;
- memcpy(buffer + mBGZF.BlockOffset, input, copyLength);
-
- mBGZF.BlockOffset += copyLength;
- input += copyLength;
- numBytesWritten += copyLength;
-
- if(mBGZF.BlockOffset == blockLength) BgzfFlushBlock();
- }
-
- return numBytesWritten;
-}
-
-// closes the alignment archive
-void BamWriter::Close(void) {
- if(mBGZF.IsOpen) BgzfClose();
-}
-
-// creates a cigar string from the supplied alignment
-void BamWriter::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
-
- // initialize
- const unsigned int numCigarOperations = cigarOperations.size();
- packedCigar.resize(numCigarOperations * SIZEOF_INT);
-
- // pack the cigar data into the string
- unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
-
- unsigned int cigarOp;
- vector<CigarOp>::const_iterator coIter;
- for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); coIter++) {
-
- switch(coIter->Type) {
- case 'M':
- cigarOp = BAM_CMATCH;
- break;
- case 'I':
- cigarOp = BAM_CINS;
- break;
- case 'D':
- cigarOp = BAM_CDEL;
- break;
- case 'N':
- cigarOp = BAM_CREF_SKIP;
- break;
- case 'S':
- cigarOp = BAM_CSOFT_CLIP;
- break;
- case 'H':
- cigarOp = BAM_CHARD_CLIP;
- break;
- case 'P':
- cigarOp = BAM_CPAD;
- break;
- default:
- printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type);
- exit(1);
- }
-
- *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;
- pPackedCigar++;
- }
-}
-
-// encodes the supplied query sequence into 4-bit notation
-void BamWriter::EncodeQuerySequence(const string& query, string& encodedQuery) {
-
- // prepare the encoded query string
- const unsigned int queryLen = query.size();
- const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
- encodedQuery.resize(encodedQueryLen);
- char* pEncodedQuery = (char*)encodedQuery.data();
- const char* pQuery = (const char*)query.data();
-
- unsigned char nucleotideCode;
- bool useHighWord = true;
-
- while(*pQuery) {
-
- switch(*pQuery) {
- case '=':
- nucleotideCode = 0;
- break;
- case 'A':
- nucleotideCode = 1;
- break;
- case 'C':
- nucleotideCode = 2;
- break;
- case 'G':
- nucleotideCode = 4;
- break;
- case 'T':
- nucleotideCode = 8;
- break;
- case 'N':
- nucleotideCode = 15;
- break;
- default:
- printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
- exit(1);
- }
-
- // pack the nucleotide code
- if(useHighWord) {
- *pEncodedQuery = nucleotideCode << 4;
- useHighWord = false;
- } else {
- *pEncodedQuery |= nucleotideCode;
- pEncodedQuery++;
- useHighWord = true;
- }
-
- // increment the query position
- pQuery++;
- }
-}
-
-// opens the alignment archive
-void BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) {
-
- // open the BGZF file for writing
- BgzfOpen(filename);
-
- // ================
- // write the header
- // ================
-
- // write the BAM signature
- const unsigned char SIGNATURE_LENGTH = 4;
- const char* BAM_SIGNATURE = "BAM\1";
- BgzfWrite(BAM_SIGNATURE, SIGNATURE_LENGTH);
-
- // write the SAM header text length
- const unsigned int samHeaderLen = samHeader.size();
- BgzfWrite((char*)&samHeaderLen, SIZEOF_INT);
-
- // write the SAM header text
- if(samHeaderLen > 0) BgzfWrite(samHeader.data(), samHeaderLen);
-
- // write the number of reference sequences
- const unsigned int numReferenceSequences = referenceSequences.size();
- BgzfWrite((char*)&numReferenceSequences, SIZEOF_INT);
-
- // =============================
- // write the sequence dictionary
- // =============================
-
- RefVector::const_iterator rsIter;
- for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {
-
- // write the reference sequence name length
- const unsigned int referenceSequenceNameLen = rsIter->RefName.size() + 1;
- BgzfWrite((char*)&referenceSequenceNameLen, SIZEOF_INT);
-
- // write the reference sequence name
- BgzfWrite(rsIter->RefName.c_str(), referenceSequenceNameLen);
-
- // write the reference sequence length
- BgzfWrite((char*)&rsIter->RefLength, SIZEOF_INT);
- }
-}
-
-// saves the alignment to the alignment archive
-void BamWriter::SaveAlignment(const BamAlignment& al) {
-
- // initialize
- const unsigned int nameLen = al.Name.size() + 1;
- const unsigned int queryLen = al.QueryBases.size();
- const unsigned int numCigarOperations = al.CigarData.size();
-
- // create our packed cigar string
- string packedCigar;
- CreatePackedCigar(al.CigarData, packedCigar);
- const unsigned int packedCigarLen = packedCigar.size();
-
- // encode the query
- string encodedQuery;
- EncodeQuerySequence(al.QueryBases, encodedQuery);
- const unsigned int encodedQueryLen = encodedQuery.size();
-
- // create our read group tag
- // TODO: add read group tag support when it shows up in the BamAlignment struct
- //string readGroupTag;
- //const unsigned int readGroupTagLen = 3 + mReadGroupLUT[al.ReadGroupCode].NameLen + 1;
- //readGroupTag.resize(readGroupTagLen);
- //char* pReadGroupTag = (char*)readGroupTag.data();
- //sprintf(pReadGroupTag, "RGZ%s", mReadGroupLUT[al.ReadGroupCode].Name);
-
- // assign the BAM core data
- unsigned int buffer[8];
- buffer[0] = al.RefID;
- buffer[1] = al.Position;
- buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | nameLen;
- buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
- buffer[4] = queryLen;
- buffer[5] = al.MateRefID;
- buffer[6] = al.MatePosition;
- buffer[7] = al.InsertSize;
-
- // write the block size
- const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen /* + readGroupTagLen*/;
- const unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
-
- BgzfWrite((char*)&blockSize, SIZEOF_INT);
-
- // write the BAM core
- BgzfWrite((char*)&buffer, BAM_CORE_SIZE);
-
- // write the query name
- BgzfWrite(al.Name.c_str(), nameLen);
-
- // write the packed cigar
- BgzfWrite(packedCigar.data(), packedCigarLen);
-
- // write the encoded query sequence
- BgzfWrite(encodedQuery.data(), encodedQueryLen);
-
- // write the base qualities
- string baseQualities = al.Qualities;
- char* pBaseQualities = (char*)al.Qualities.data();
- for(unsigned int i = 0; i < queryLen; i++) pBaseQualities[i] -= 33;
- BgzfWrite(pBaseQualities, queryLen);
-
- // write the read group tag
- // TODO: add read group tag support when it shows up in the BamAlignment struct
- //BgzfWrite(readGroupTag.c_str(), readGroupTagLen);
-}
+// ***************************************************************************\r
+// BamWriter.cpp (c) 2009 Michael Str�mberg, Derek Barnett\r
+// Marth Lab, Department of Biology, Boston College\r
+// All rights reserved.\r
+// ---------------------------------------------------------------------------\r
+// Last modified: 15 July 2010 (DB)\r
+// ---------------------------------------------------------------------------\r
+// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
+// Institute.\r
+// ---------------------------------------------------------------------------\r
+// Provides the basic functionality for producing BAM files\r
+// ***************************************************************************\r
+\r
+#include <iostream>\r
+\r
+#include "BGZF.h"\r
+#include "BamWriter.h"\r
+using namespace BamTools;\r
+using namespace std;\r
+\r
+struct BamWriter::BamWriterPrivate {\r
+\r
+ // data members\r
+ BgzfData mBGZF;\r
+ bool IsBigEndian;\r
+ \r
+ // constructor / destructor\r
+ BamWriterPrivate(void) { \r
+ IsBigEndian = SystemIsBigEndian(); \r
+ }\r
+ \r
+ ~BamWriterPrivate(void) {\r
+ mBGZF.Close();\r
+ }\r
+\r
+ // "public" interface\r
+ void Close(void);\r
+ void Open(const string& filename, const string& samHeader, const RefVector& referenceSequences);\r
+ void SaveAlignment(const BamAlignment& al);\r
+\r
+ // internal methods\r
+ const unsigned int CalculateMinimumBin(const int begin, int end) const;\r
+ void CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar);\r
+ void EncodeQuerySequence(const string& query, string& encodedQuery);\r
+};\r
+\r
+// -----------------------------------------------------\r
+// BamWriter implementation\r
+// -----------------------------------------------------\r
+\r
+// constructor\r
+BamWriter::BamWriter(void) {\r
+ d = new BamWriterPrivate;\r
+}\r
+\r
+// destructor\r
+BamWriter::~BamWriter(void) {\r
+ delete d;\r
+ d = 0;\r
+}\r
+\r
+// closes the alignment archive\r
+void BamWriter::Close(void) { \r
+ d->Close(); \r
+}\r
+\r
+// opens the alignment archive\r
+void BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) {\r
+ d->Open(filename, samHeader, referenceSequences);\r
+}\r
+\r
+// saves the alignment to the alignment archive\r
+void BamWriter::SaveAlignment(const BamAlignment& al) { \r
+ d->SaveAlignment(al);\r
+}\r
+\r
+// -----------------------------------------------------\r
+// BamWriterPrivate implementation\r
+// -----------------------------------------------------\r
+\r
+// closes the alignment archive\r
+void BamWriter::BamWriterPrivate::Close(void) {\r
+ mBGZF.Close();\r
+}\r
+\r
+// calculates minimum bin for a BAM alignment interval\r
+const unsigned int BamWriter::BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const { \r
+ --end;\r
+ if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14);\r
+ if( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17);\r
+ if( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20);\r
+ if( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23);\r
+ if( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26);\r
+ return 0;\r
+}\r
+\r
+// creates a cigar string from the supplied alignment\r
+void BamWriter::BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {\r
+\r
+ // initialize\r
+ const unsigned int numCigarOperations = cigarOperations.size();\r
+ packedCigar.resize(numCigarOperations * BT_SIZEOF_INT);\r
+\r
+ // pack the cigar data into the string\r
+ unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();\r
+\r
+ unsigned int cigarOp;\r
+ vector<CigarOp>::const_iterator coIter;\r
+ for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) {\r
+\r
+ switch(coIter->Type) {\r
+ case 'M':\r
+ cigarOp = BAM_CMATCH;\r
+ break;\r
+ case 'I':\r
+ cigarOp = BAM_CINS;\r
+ break;\r
+ case 'D':\r
+ cigarOp = BAM_CDEL;\r
+ break;\r
+ case 'N':\r
+ cigarOp = BAM_CREF_SKIP;\r
+ break;\r
+ case 'S':\r
+ cigarOp = BAM_CSOFT_CLIP;\r
+ break;\r
+ case 'H':\r
+ cigarOp = BAM_CHARD_CLIP;\r
+ break;\r
+ case 'P':\r
+ cigarOp = BAM_CPAD;\r
+ break;\r
+ default:\r
+ printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type);\r
+ exit(1);\r
+ }\r
+\r
+ *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;\r
+ pPackedCigar++;\r
+ }\r
+}\r
+\r
+// encodes the supplied query sequence into 4-bit notation\r
+void BamWriter::BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {\r
+\r
+ // prepare the encoded query string\r
+ const unsigned int queryLen = query.size();\r
+ const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);\r
+ encodedQuery.resize(encodedQueryLen);\r
+ char* pEncodedQuery = (char*)encodedQuery.data();\r
+ const char* pQuery = (const char*)query.data();\r
+\r
+ unsigned char nucleotideCode;\r
+ bool useHighWord = true;\r
+\r
+ while(*pQuery) {\r
+\r
+ switch(*pQuery) {\r
+ \r
+ case '=':\r
+ nucleotideCode = 0;\r
+ break;\r
+ \r
+ case 'A':\r
+ nucleotideCode = 1;\r
+ break;\r
+ \r
+ case 'C':\r
+ nucleotideCode = 2;\r
+ break;\r
+ \r
+ case 'G':\r
+ nucleotideCode = 4;\r
+ break;\r
+ \r
+ case 'T':\r
+ nucleotideCode = 8;\r
+ break;\r
+ \r
+ case 'N':\r
+ nucleotideCode = 15;\r
+ break;\r
+ \r
+ default:\r
+ printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);\r
+ exit(1);\r
+ }\r
+\r
+ // pack the nucleotide code\r
+ if(useHighWord) {\r
+ *pEncodedQuery = nucleotideCode << 4;\r
+ useHighWord = false;\r
+ } else {\r
+ *pEncodedQuery |= nucleotideCode;\r
+ pEncodedQuery++;\r
+ useHighWord = true;\r
+ }\r
+\r
+ // increment the query position\r
+ pQuery++;\r
+ }\r
+}\r
+\r
+// opens the alignment archive\r
+void BamWriter::BamWriterPrivate::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) {\r
+\r
+ // open the BGZF file for writing\r
+ mBGZF.Open(filename, "wb");\r
+\r
+ // ================\r
+ // write the header\r
+ // ================\r
+\r
+ // write the BAM signature\r
+ const unsigned char SIGNATURE_LENGTH = 4;\r
+ const char* BAM_SIGNATURE = "BAM\1";\r
+ mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH);\r
+\r
+ // write the SAM header text length\r
+ uint32_t samHeaderLen = samHeader.size();\r
+ if (IsBigEndian) SwapEndian_32(samHeaderLen);\r
+ mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT);\r
+\r
+ // write the SAM header text\r
+ if(samHeaderLen > 0) \r
+ mBGZF.Write(samHeader.data(), samHeaderLen);\r
+\r
+ // write the number of reference sequences\r
+ uint32_t numReferenceSequences = referenceSequences.size();\r
+ if (IsBigEndian) SwapEndian_32(numReferenceSequences);\r
+ mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);\r
+\r
+ // =============================\r
+ // write the sequence dictionary\r
+ // =============================\r
+\r
+ RefVector::const_iterator rsIter;\r
+ for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {\r
+\r
+ // write the reference sequence name length\r
+ uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;\r
+ if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen);\r
+ mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);\r
+\r
+ // write the reference sequence name\r
+ mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);\r
+\r
+ // write the reference sequence length\r
+ int32_t referenceLength = rsIter->RefLength;\r
+ if (IsBigEndian) SwapEndian_32(referenceLength);\r
+ mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);\r
+ }\r
+}\r
+\r
+// saves the alignment to the alignment archive\r
+void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) {\r
+\r
+ // if BamAlignment contains only the core data and a raw char data buffer\r
+ // (as a result of BamReader::GetNextAlignmentCore())\r
+ if ( al.SupportData.HasCoreOnly ) {\r
+ \r
+ // write the block size\r
+ unsigned int blockSize = al.SupportData.BlockLength;\r
+ if (IsBigEndian) SwapEndian_32(blockSize);\r
+ mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
+\r
+ // assign the BAM core data\r
+ uint32_t buffer[8];\r
+ buffer[0] = al.RefID;\r
+ buffer[1] = al.Position;\r
+ buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;\r
+ buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;\r
+ buffer[4] = al.SupportData.QuerySequenceLength;\r
+ buffer[5] = al.MateRefID;\r
+ buffer[6] = al.MatePosition;\r
+ buffer[7] = al.InsertSize;\r
+ \r
+ // swap BAM core endian-ness, if necessary\r
+ if ( IsBigEndian ) { \r
+ for ( int i = 0; i < 8; ++i )\r
+ SwapEndian_32(buffer[i]); \r
+ }\r
+ \r
+ // write the BAM core\r
+ mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
+ \r
+ // write the raw char data\r
+ mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE); \r
+ }\r
+ \r
+ // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc\r
+ // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code )\r
+ else {\r
+ \r
+ // calculate char lengths\r
+ const unsigned int nameLength = al.Name.size() + 1;\r
+ const unsigned int numCigarOperations = al.CigarData.size();\r
+ const unsigned int queryLength = al.QueryBases.size();\r
+ const unsigned int tagDataLength = al.TagData.size();\r
+ \r
+ // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)\r
+ // force calculation of Bin before storing\r
+ const int endPosition = al.GetEndPosition();\r
+ const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);\r
+ \r
+ // create our packed cigar string\r
+ string packedCigar;\r
+ CreatePackedCigar(al.CigarData, packedCigar);\r
+ const unsigned int packedCigarLength = packedCigar.size();\r
+\r
+ // encode the query\r
+ string encodedQuery;\r
+ EncodeQuerySequence(al.QueryBases, encodedQuery);\r
+ const unsigned int encodedQueryLength = encodedQuery.size(); \r
+ \r
+ // write the block size\r
+ const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength;\r
+ unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;\r
+ if (IsBigEndian) SwapEndian_32(blockSize);\r
+ mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
+\r
+ // assign the BAM core data\r
+ uint32_t buffer[8];\r
+ buffer[0] = al.RefID;\r
+ buffer[1] = al.Position;\r
+ buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;\r
+ buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;\r
+ buffer[4] = queryLength;\r
+ buffer[5] = al.MateRefID;\r
+ buffer[6] = al.MatePosition;\r
+ buffer[7] = al.InsertSize;\r
+ \r
+ // swap BAM core endian-ness, if necessary\r
+ if ( IsBigEndian ) { \r
+ for ( int i = 0; i < 8; ++i )\r
+ SwapEndian_32(buffer[i]); \r
+ }\r
+ \r
+ // write the BAM core\r
+ mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
+ \r
+ // write the query name\r
+ mBGZF.Write(al.Name.c_str(), nameLength);\r
+\r
+ // write the packed cigar\r
+ if ( IsBigEndian ) {\r
+ \r
+ char* cigarData = (char*)calloc(sizeof(char), packedCigarLength);\r
+ memcpy(cigarData, packedCigar.data(), packedCigarLength);\r
+ \r
+ for (unsigned int i = 0; i < packedCigarLength; ++i) {\r
+ if ( IsBigEndian )\r
+ SwapEndian_32p(&cigarData[i]); \r
+ }\r
+ \r
+ mBGZF.Write(cigarData, packedCigarLength);\r
+ free(cigarData); \r
+ } \r
+ else \r
+ mBGZF.Write(packedCigar.data(), packedCigarLength);\r
+\r
+ // write the encoded query sequence\r
+ mBGZF.Write(encodedQuery.data(), encodedQueryLength);\r
+\r
+ // write the base qualities\r
+ string baseQualities(al.Qualities);\r
+ char* pBaseQualities = (char*)al.Qualities.data();\r
+ for(unsigned int i = 0; i < queryLength; i++) { \r
+ pBaseQualities[i] -= 33; \r
+ }\r
+ mBGZF.Write(pBaseQualities, queryLength);\r
+\r
+ // write the read group tag\r
+ if ( IsBigEndian ) {\r
+ \r
+ char* tagData = (char*)calloc(sizeof(char), tagDataLength);\r
+ memcpy(tagData, al.TagData.data(), tagDataLength);\r
+ \r
+ int i = 0;\r
+ while ( (unsigned int)i < tagDataLength ) {\r
+ \r
+ i += 2; // skip tag type (e.g. "RG", "NM", etc)\r
+ uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning \r
+ ++i; // skip value type\r
+ \r
+ switch (type) {\r
+ \r
+ case('A') :\r
+ case('C') : \r
+ ++i;\r
+ break;\r
+ \r
+ case('S') : \r
+ SwapEndian_16p(&tagData[i]); \r
+ i+=2; // sizeof(uint16_t)\r
+ break;\r
+ \r
+ case('F') :\r
+ case('I') : \r
+ SwapEndian_32p(&tagData[i]);\r
+ i+=4; // sizeof(uint32_t)\r
+ break;\r
+ \r
+ case('D') : \r
+ SwapEndian_64p(&tagData[i]);\r
+ i+=8; // sizeof(uint64_t)\r
+ break;\r
+ \r
+ case('H') :\r
+ case('Z') : \r
+ while (tagData[i]) { ++i; }\r
+ ++i; // increment one more for null terminator\r
+ break;\r
+ \r
+ default : \r
+ printf("ERROR: Invalid tag value type\n"); // shouldn't get here\r
+ free(tagData);\r
+ exit(1); \r
+ }\r
+ }\r
+ \r
+ mBGZF.Write(tagData, tagDataLength);\r
+ free(tagData);\r
+ } \r
+ else \r
+ mBGZF.Write(al.TagData.data(), tagDataLength); \r
+ }\r
+}\r