X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=BamWriter.cpp;h=f83ff1c3046ed46876c655b5b11f0535d634bda0;hb=2d6d143e041a08c2083a6857898ca36753317dad;hp=4f6a37f3592d8426be5d52da0a4e6c3257b90295;hpb=ccc66728ab56d429c3b3e72c261a3cefc143d197;p=bamtools.git diff --git a/BamWriter.cpp b/BamWriter.cpp index 4f6a37f..f83ff1c 100644 --- a/BamWriter.cpp +++ b/BamWriter.cpp @@ -1,400 +1,432 @@ -// *************************************************************************** -// BamWriter (c) 2009 Michael Strömberg -// Marth Lab, Deptartment of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// The BGZF routines were adapted from the bgzf.c code developed at the Broad -// Institute. -// --------------------------------------------------------------------------- -// Provides the basic functionality for producing BAM files -// *************************************************************************** - -#include "BamWriter.h" - -// constructor -BamWriter::BamWriter(void) -{} - -// destructor -BamWriter::~BamWriter(void) { - if(mBGZF.IsOpen) BgzfClose(); -} - -// closes the BAM file -void BamWriter::BgzfClose(void) { - - mBGZF.IsOpen = false; - - // flush the BGZF block - BgzfFlushBlock(); - - // flush and close - fflush(mBGZF.Stream); - fclose(mBGZF.Stream); -} - -// compresses the current block -int BamWriter::BgzfDeflateBlock(void) { - - // initialize the gzip header - char* buffer = mBGZF.CompressedBlock; - unsigned int bufferSize = mBGZF.CompressedBlockSize; - - memset(buffer, 0, 18); - buffer[0] = GZIP_ID1; - buffer[1] = (char)GZIP_ID2; - buffer[2] = CM_DEFLATE; - buffer[3] = FLG_FEXTRA; - buffer[9] = (char)OS_UNKNOWN; - buffer[10] = BGZF_XLEN; - buffer[12] = BGZF_ID1; - buffer[13] = BGZF_ID2; - buffer[14] = BGZF_LEN; - - // loop to retry for blocks that do not compress enough - int inputLength = mBGZF.BlockOffset; - int compressedLength = 0; - - while(true) { - - z_stream zs; - zs.zalloc = NULL; - zs.zfree = NULL; - zs.next_in = (Bytef*)mBGZF.UncompressedBlock; - zs.avail_in = inputLength; - zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH]; - zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; - - // initialize the zlib compression algorithm - if(deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK) { - printf("ERROR: zlib deflate initialization failed.\n"); - exit(1); - } - - // compress the data - int status = deflate(&zs, Z_FINISH); - if(status != Z_STREAM_END) { - deflateEnd(&zs); - - // reduce the input length and try again - if(status == Z_OK) { - inputLength -= 1024; - if(inputLength < 0) { - printf("ERROR: input reduction failed.\n"); - exit(1); - } - continue; - } - - printf("ERROR: zlib deflate failed.\n"); - exit(1); - } - - // finalize the compression routine - if(deflateEnd(&zs) != Z_OK) { - printf("ERROR: deflate end failed.\n"); - exit(1); - } - - compressedLength = zs.total_out; - compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; - - if(compressedLength > MAX_BLOCK_SIZE) { - printf("ERROR: deflate overflow.\n"); - exit(1); - } - - break; - } - - // store the compressed length - BgzfPackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1)); - - // store the CRC32 checksum - unsigned int crc = crc32(0, NULL, 0); - crc = crc32(crc, (Bytef*)mBGZF.UncompressedBlock, inputLength); - BgzfPackUnsignedInt(&buffer[compressedLength - 8], crc); - BgzfPackUnsignedInt(&buffer[compressedLength - 4], inputLength); - - // ensure that we have less than a block of data left - int remaining = mBGZF.BlockOffset - inputLength; - if(remaining > 0) { - if(remaining > inputLength) { - printf("ERROR: remainder too large.\n"); - exit(1); - } - - memcpy(mBGZF.UncompressedBlock, mBGZF.UncompressedBlock + inputLength, remaining); - } - - mBGZF.BlockOffset = remaining; - return compressedLength; -} - -// flushes the data in the BGZF block -void BamWriter::BgzfFlushBlock(void) { - - // flush all of the remaining blocks - while(mBGZF.BlockOffset > 0) { - - // compress the data block - int blockLength = BgzfDeflateBlock(); - - // flush the data to our output stream - int numBytesWritten = fwrite(mBGZF.CompressedBlock, 1, blockLength, mBGZF.Stream); - - if(numBytesWritten != blockLength) { - printf("ERROR: Expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten); - exit(1); - } - - mBGZF.BlockAddress += blockLength; - } -} - -// opens the BAM file for writing -void BamWriter::BgzfOpen(const string& filename) { - - mBGZF.Stream = fopen(filename.c_str(), "wb"); - - if(!mBGZF.Stream) { - printf("ERROR: Unable to open the BAM file (%s) for writing.\n", filename.c_str()); - exit(1); - } - - mBGZF.IsOpen = true; -} - -// writes the supplied data into the BGZF buffer -unsigned int BamWriter::BgzfWrite(const char* data, const unsigned int dataLen) { - - // initialize - unsigned int numBytesWritten = 0; - const char* input = data; - unsigned int blockLength = mBGZF.UncompressedBlockSize; - - // copy the data to the buffer - while(numBytesWritten < dataLen) { - unsigned int copyLength = min(blockLength - mBGZF.BlockOffset, dataLen - numBytesWritten); - char* buffer = mBGZF.UncompressedBlock; - memcpy(buffer + mBGZF.BlockOffset, input, copyLength); - - mBGZF.BlockOffset += copyLength; - input += copyLength; - numBytesWritten += copyLength; - - if(mBGZF.BlockOffset == blockLength) BgzfFlushBlock(); - } - - return numBytesWritten; -} - -// closes the alignment archive -void BamWriter::Close(void) { - if(mBGZF.IsOpen) BgzfClose(); -} - -// creates a cigar string from the supplied alignment -void BamWriter::CreatePackedCigar(const vector& cigarOperations, string& packedCigar) { - - // initialize - const unsigned int numCigarOperations = cigarOperations.size(); - packedCigar.resize(numCigarOperations * SIZEOF_INT); - - // pack the cigar data into the string - unsigned int* pPackedCigar = (unsigned int*)packedCigar.data(); - - unsigned int cigarOp; - vector::const_iterator coIter; - for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); coIter++) { - - switch(coIter->Type) { - case 'M': - cigarOp = BAM_CMATCH; - break; - case 'I': - cigarOp = BAM_CINS; - break; - case 'D': - cigarOp = BAM_CDEL; - break; - case 'N': - cigarOp = BAM_CREF_SKIP; - break; - case 'S': - cigarOp = BAM_CSOFT_CLIP; - break; - case 'H': - cigarOp = BAM_CHARD_CLIP; - break; - case 'P': - cigarOp = BAM_CPAD; - break; - default: - printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type); - exit(1); - } - - *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp; - pPackedCigar++; - } -} - -// encodes the supplied query sequence into 4-bit notation -void BamWriter::EncodeQuerySequence(const string& query, string& encodedQuery) { - - // prepare the encoded query string - const unsigned int queryLen = query.size(); - const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5); - encodedQuery.resize(encodedQueryLen); - char* pEncodedQuery = (char*)encodedQuery.data(); - const char* pQuery = (const char*)query.data(); - - unsigned char nucleotideCode; - bool useHighWord = true; - - while(*pQuery) { - - switch(*pQuery) { - case '=': - nucleotideCode = 0; - break; - case 'A': - nucleotideCode = 1; - break; - case 'C': - nucleotideCode = 2; - break; - case 'G': - nucleotideCode = 4; - break; - case 'T': - nucleotideCode = 8; - break; - case 'N': - nucleotideCode = 15; - break; - default: - printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery); - exit(1); - } - - // pack the nucleotide code - if(useHighWord) { - *pEncodedQuery = nucleotideCode << 4; - useHighWord = false; - } else { - *pEncodedQuery |= nucleotideCode; - pEncodedQuery++; - useHighWord = true; - } - - // increment the query position - pQuery++; - } -} - -// opens the alignment archive -void BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) { - - // open the BGZF file for writing - BgzfOpen(filename); - - // ================ - // write the header - // ================ - - // write the BAM signature - const unsigned char SIGNATURE_LENGTH = 4; - const char* BAM_SIGNATURE = "BAM\1"; - BgzfWrite(BAM_SIGNATURE, SIGNATURE_LENGTH); - - // write the SAM header text length - const unsigned int samHeaderLen = samHeader.size(); - BgzfWrite((char*)&samHeaderLen, SIZEOF_INT); - - // write the SAM header text - if(samHeaderLen > 0) BgzfWrite(samHeader.data(), samHeaderLen); - - // write the number of reference sequences - const unsigned int numReferenceSequences = referenceSequences.size(); - BgzfWrite((char*)&numReferenceSequences, SIZEOF_INT); - - // ============================= - // write the sequence dictionary - // ============================= - - RefVector::const_iterator rsIter; - for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) { - - // write the reference sequence name length - const unsigned int referenceSequenceNameLen = rsIter->RefName.size() + 1; - BgzfWrite((char*)&referenceSequenceNameLen, SIZEOF_INT); - - // write the reference sequence name - BgzfWrite(rsIter->RefName.c_str(), referenceSequenceNameLen); - - // write the reference sequence length - BgzfWrite((char*)&rsIter->RefLength, SIZEOF_INT); - } -} - -// saves the alignment to the alignment archive -void BamWriter::SaveAlignment(const BamAlignment& al) { - - // initialize - const unsigned int nameLen = al.Name.size() + 1; - const unsigned int queryLen = al.QueryBases.size(); - const unsigned int numCigarOperations = al.CigarData.size(); - - // create our packed cigar string - string packedCigar; - CreatePackedCigar(al.CigarData, packedCigar); - const unsigned int packedCigarLen = packedCigar.size(); - - // encode the query - string encodedQuery; - EncodeQuerySequence(al.QueryBases, encodedQuery); - const unsigned int encodedQueryLen = encodedQuery.size(); - - // store the tag data length - const unsigned int tagDataLength = al.TagData.size(); - - // assign the BAM core data - unsigned int buffer[8]; - buffer[0] = al.RefID; - buffer[1] = al.Position; - buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | nameLen; - buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations; - buffer[4] = queryLen; - buffer[5] = al.MateRefID; - buffer[6] = al.MatePosition; - buffer[7] = al.InsertSize; - - // write the block size - const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen + tagDataLength; - const unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize; - - BgzfWrite((char*)&blockSize, SIZEOF_INT); - - // write the BAM core - BgzfWrite((char*)&buffer, BAM_CORE_SIZE); - - // write the query name - BgzfWrite(al.Name.c_str(), nameLen); - - // write the packed cigar - BgzfWrite(packedCigar.data(), packedCigarLen); - - // write the encoded query sequence - BgzfWrite(encodedQuery.data(), encodedQueryLen); - - // write the base qualities - string baseQualities = al.Qualities; - char* pBaseQualities = (char*)al.Qualities.data(); - for(unsigned int i = 0; i < queryLen; i++) pBaseQualities[i] -= 33; - BgzfWrite(pBaseQualities, queryLen); - - // write the read group tag - BgzfWrite(al.TagData.data(), tagDataLength); -} +// *************************************************************************** +// BamWriter.cpp (c) 2009 Michael Str�mberg, Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 17 August 2010 (DB) +// --------------------------------------------------------------------------- +// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad +// Institute. +// --------------------------------------------------------------------------- +// Provides the basic functionality for producing BAM files +// *************************************************************************** + +#include + +#include "BGZF.h" +#include "BamWriter.h" +using namespace BamTools; +using namespace std; + +struct BamWriter::BamWriterPrivate { + + // data members + BgzfData mBGZF; + bool IsBigEndian; + + // constructor / destructor + BamWriterPrivate(void) { + IsBigEndian = SystemIsBigEndian(); + } + + ~BamWriterPrivate(void) { + mBGZF.Close(); + } + + // "public" interface + void Close(void); + bool Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed); + void SaveAlignment(const BamAlignment& al); + + // internal methods + const unsigned int CalculateMinimumBin(const int begin, int end) const; + void CreatePackedCigar(const vector& cigarOperations, string& packedCigar); + void EncodeQuerySequence(const string& query, string& encodedQuery); +}; + +// ----------------------------------------------------- +// BamWriter implementation +// ----------------------------------------------------- + +// constructor +BamWriter::BamWriter(void) { + d = new BamWriterPrivate; +} + +// destructor +BamWriter::~BamWriter(void) { + delete d; + d = 0; +} + +// closes the alignment archive +void BamWriter::Close(void) { + d->Close(); +} + +// opens the alignment archive +bool BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed) { + return d->Open(filename, samHeader, referenceSequences, isWriteUncompressed); +} + +// saves the alignment to the alignment archive +void BamWriter::SaveAlignment(const BamAlignment& al) { + d->SaveAlignment(al); +} + +// ----------------------------------------------------- +// BamWriterPrivate implementation +// ----------------------------------------------------- + +// closes the alignment archive +void BamWriter::BamWriterPrivate::Close(void) { + mBGZF.Close(); +} + +// calculates minimum bin for a BAM alignment interval +const unsigned int BamWriter::BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const { + --end; + if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14); + if( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17); + if( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20); + if( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23); + if( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26); + return 0; +} + +// creates a cigar string from the supplied alignment +void BamWriter::BamWriterPrivate::CreatePackedCigar(const vector& cigarOperations, string& packedCigar) { + + // initialize + const unsigned int numCigarOperations = cigarOperations.size(); + packedCigar.resize(numCigarOperations * BT_SIZEOF_INT); + + // pack the cigar data into the string + unsigned int* pPackedCigar = (unsigned int*)packedCigar.data(); + + unsigned int cigarOp; + vector::const_iterator coIter; + for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) { + + switch(coIter->Type) { + case 'M': + cigarOp = BAM_CMATCH; + break; + case 'I': + cigarOp = BAM_CINS; + break; + case 'D': + cigarOp = BAM_CDEL; + break; + case 'N': + cigarOp = BAM_CREF_SKIP; + break; + case 'S': + cigarOp = BAM_CSOFT_CLIP; + break; + case 'H': + cigarOp = BAM_CHARD_CLIP; + break; + case 'P': + cigarOp = BAM_CPAD; + break; + default: + printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type); + exit(1); + } + + *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp; + pPackedCigar++; + } +} + +// encodes the supplied query sequence into 4-bit notation +void BamWriter::BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) { + + // prepare the encoded query string + const unsigned int queryLen = query.size(); + const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5); + encodedQuery.resize(encodedQueryLen); + char* pEncodedQuery = (char*)encodedQuery.data(); + const char* pQuery = (const char*)query.data(); + + unsigned char nucleotideCode; + bool useHighWord = true; + + while(*pQuery) { + + switch(*pQuery) { + + case '=': + nucleotideCode = 0; + break; + + case 'A': + nucleotideCode = 1; + break; + + case 'C': + nucleotideCode = 2; + break; + + case 'G': + nucleotideCode = 4; + break; + + case 'T': + nucleotideCode = 8; + break; + + case 'N': + nucleotideCode = 15; + break; + + default: + printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery); + exit(1); + } + + // pack the nucleotide code + if(useHighWord) { + *pEncodedQuery = nucleotideCode << 4; + useHighWord = false; + } else { + *pEncodedQuery |= nucleotideCode; + pEncodedQuery++; + useHighWord = true; + } + + // increment the query position + pQuery++; + } +} + +// opens the alignment archive +bool BamWriter::BamWriterPrivate::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed) { + + // open the BGZF file for writing, return failure if error + if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) ) + return false; + + // ================ + // write the header + // ================ + + // write the BAM signature + const unsigned char SIGNATURE_LENGTH = 4; + const char* BAM_SIGNATURE = "BAM\1"; + mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH); + + // write the SAM header text length + uint32_t samHeaderLen = samHeader.size(); + if (IsBigEndian) SwapEndian_32(samHeaderLen); + mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT); + + // write the SAM header text + if(samHeaderLen > 0) + mBGZF.Write(samHeader.data(), samHeaderLen); + + // write the number of reference sequences + uint32_t numReferenceSequences = referenceSequences.size(); + if (IsBigEndian) SwapEndian_32(numReferenceSequences); + mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT); + + // ============================= + // write the sequence dictionary + // ============================= + + RefVector::const_iterator rsIter; + for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) { + + // write the reference sequence name length + uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1; + if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen); + mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT); + + // write the reference sequence name + mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen); + + // write the reference sequence length + int32_t referenceLength = rsIter->RefLength; + if (IsBigEndian) SwapEndian_32(referenceLength); + mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT); + } + + // return success + return true; +} + +// saves the alignment to the alignment archive +void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) { + + // if BamAlignment contains only the core data and a raw char data buffer + // (as a result of BamReader::GetNextAlignmentCore()) + if ( al.SupportData.HasCoreOnly ) { + + // write the block size + unsigned int blockSize = al.SupportData.BlockLength; + if (IsBigEndian) SwapEndian_32(blockSize); + mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); + + // assign the BAM core data + uint32_t buffer[8]; + buffer[0] = al.RefID; + buffer[1] = al.Position; + buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength; + buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations; + buffer[4] = al.SupportData.QuerySequenceLength; + buffer[5] = al.MateRefID; + buffer[6] = al.MatePosition; + buffer[7] = al.InsertSize; + + // swap BAM core endian-ness, if necessary + if ( IsBigEndian ) { + for ( int i = 0; i < 8; ++i ) + SwapEndian_32(buffer[i]); + } + + // write the BAM core + mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); + + // write the raw char data + mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE); + } + + // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc + // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code ) + else { + + // calculate char lengths + const unsigned int nameLength = al.Name.size() + 1; + const unsigned int numCigarOperations = al.CigarData.size(); + const unsigned int queryLength = al.QueryBases.size(); + const unsigned int tagDataLength = al.TagData.size(); + + // no way to tell if BamAlignment.Bin is already defined (no default, invalid value) + // force calculation of Bin before storing + const int endPosition = al.GetEndPosition(); + const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition); + + // create our packed cigar string + string packedCigar; + CreatePackedCigar(al.CigarData, packedCigar); + const unsigned int packedCigarLength = packedCigar.size(); + + // encode the query + string encodedQuery; + EncodeQuerySequence(al.QueryBases, encodedQuery); + const unsigned int encodedQueryLength = encodedQuery.size(); + + // write the block size + const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength; + unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize; + if (IsBigEndian) SwapEndian_32(blockSize); + mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); + + // assign the BAM core data + uint32_t buffer[8]; + buffer[0] = al.RefID; + buffer[1] = al.Position; + buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength; + buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations; + buffer[4] = queryLength; + buffer[5] = al.MateRefID; + buffer[6] = al.MatePosition; + buffer[7] = al.InsertSize; + + // swap BAM core endian-ness, if necessary + if ( IsBigEndian ) { + for ( int i = 0; i < 8; ++i ) + SwapEndian_32(buffer[i]); + } + + // write the BAM core + mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); + + // write the query name + mBGZF.Write(al.Name.c_str(), nameLength); + + // write the packed cigar + if ( IsBigEndian ) { + + char* cigarData = (char*)calloc(sizeof(char), packedCigarLength); + memcpy(cigarData, packedCigar.data(), packedCigarLength); + + for (unsigned int i = 0; i < packedCigarLength; ++i) { + if ( IsBigEndian ) + SwapEndian_32p(&cigarData[i]); + } + + mBGZF.Write(cigarData, packedCigarLength); + free(cigarData); + } + else + mBGZF.Write(packedCigar.data(), packedCigarLength); + + // write the encoded query sequence + mBGZF.Write(encodedQuery.data(), encodedQueryLength); + + // write the base qualities + string baseQualities(al.Qualities); + char* pBaseQualities = (char*)al.Qualities.data(); + for(unsigned int i = 0; i < queryLength; i++) { + pBaseQualities[i] -= 33; + } + mBGZF.Write(pBaseQualities, queryLength); + + // write the read group tag + if ( IsBigEndian ) { + + char* tagData = (char*)calloc(sizeof(char), tagDataLength); + memcpy(tagData, al.TagData.data(), tagDataLength); + + int i = 0; + while ( (unsigned int)i < tagDataLength ) { + + i += 2; // skip tag type (e.g. "RG", "NM", etc) + uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning + ++i; // skip value type + + switch (type) { + + case('A') : + case('C') : + ++i; + break; + + case('S') : + SwapEndian_16p(&tagData[i]); + i+=2; // sizeof(uint16_t) + break; + + case('F') : + case('I') : + SwapEndian_32p(&tagData[i]); + i+=4; // sizeof(uint32_t) + break; + + case('D') : + SwapEndian_64p(&tagData[i]); + i+=8; // sizeof(uint64_t) + break; + + case('H') : + case('Z') : + while (tagData[i]) { ++i; } + ++i; // increment one more for null terminator + break; + + default : + printf("ERROR: Invalid tag value type\n"); // shouldn't get here + free(tagData); + exit(1); + } + } + + mBGZF.Write(tagData, tagDataLength); + free(tagData); + } + else + mBGZF.Write(al.TagData.data(), tagDataLength); + } +}