X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=src%2Fapi%2Finternal%2FBamWriter_p.cpp;h=ce5cfa9edf9e8438768dc4579710002da9fef697;hb=8a90b7aefffaf186053ef4da96c8663bf528274a;hp=90959b6a7882dae23878edd234b3f51f61fa09c5;hpb=90f57dc99f0af143f50a0afef447b50048a556f3;p=bamtools.git diff --git a/src/api/internal/BamWriter_p.cpp b/src/api/internal/BamWriter_p.cpp index 90959b6..ce5cfa9 100644 --- a/src/api/internal/BamWriter_p.cpp +++ b/src/api/internal/BamWriter_p.cpp @@ -1,73 +1,95 @@ // *************************************************************************** // BamWriter_p.cpp (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College -// All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 11 January 2011 (DB) +// Last modified: 6 October 2011 (DB) // --------------------------------------------------------------------------- // Provides the basic functionality for producing BAM files // *************************************************************************** #include +#include +<<<<<<< HEAD +#include +======= +#include +>>>>>>> iodevice #include using namespace BamTools; using namespace BamTools::Internal; + +#include +#include using namespace std; // ctor -BamWriterPrivate::BamWriterPrivate(void) { - IsBigEndian = SystemIsBigEndian(); -} +BamWriterPrivate::BamWriterPrivate(void) + : m_isBigEndian( BamTools::SystemIsBigEndian() ) +{ } // dtor BamWriterPrivate::~BamWriterPrivate(void) { - mBGZF.Close(); + Close(); } // calculates minimum bin for a BAM alignment interval -const unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const { +uint32_t BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const { --end; - if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14); - if( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17); - if( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20); - if( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23); - if( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26); + if ( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14); + if ( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17); + if ( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20); + if ( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23); + if ( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26); return 0; } // closes the alignment archive void BamWriterPrivate::Close(void) { - mBGZF.Close(); + + // skip if file not open + if ( !IsOpen() ) return; + + // close output stream + try { + m_stream.Close(); + } catch ( BamException& e ) { + m_errorString = e.what(); + } } // creates a cigar string from the supplied alignment void BamWriterPrivate::CreatePackedCigar(const vector& cigarOperations, string& packedCigar) { // initialize - const unsigned int numCigarOperations = cigarOperations.size(); - packedCigar.resize(numCigarOperations * BT_SIZEOF_INT); + const size_t numCigarOperations = cigarOperations.size(); + packedCigar.resize(numCigarOperations * Constants::BAM_SIZEOF_INT); // pack the cigar data into the string unsigned int* pPackedCigar = (unsigned int*)packedCigar.data(); - unsigned int cigarOp; - vector::const_iterator coIter; - for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) { - - switch(coIter->Type) { - case 'M': cigarOp = BAM_CMATCH; break; - case 'I': cigarOp = BAM_CINS; break; - case 'D': cigarOp = BAM_CDEL; break; - case 'N': cigarOp = BAM_CREF_SKIP; break; - case 'S': cigarOp = BAM_CSOFT_CLIP; break; - case 'H': cigarOp = BAM_CHARD_CLIP; break; - case 'P': cigarOp = BAM_CPAD; break; + // iterate over cigar operations + vector::const_iterator coIter = cigarOperations.begin(); + vector::const_iterator coEnd = cigarOperations.end(); + for ( ; coIter != coEnd; ++coIter ) { + + // store op in packedCigar + uint8_t cigarOp; + switch ( coIter->Type ) { + case (Constants::BAM_CIGAR_MATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_MATCH; break; + case (Constants::BAM_CIGAR_INS_CHAR) : cigarOp = Constants::BAM_CIGAR_INS; break; + case (Constants::BAM_CIGAR_DEL_CHAR) : cigarOp = Constants::BAM_CIGAR_DEL; break; + case (Constants::BAM_CIGAR_REFSKIP_CHAR) : cigarOp = Constants::BAM_CIGAR_REFSKIP; break; + case (Constants::BAM_CIGAR_SOFTCLIP_CHAR) : cigarOp = Constants::BAM_CIGAR_SOFTCLIP; break; + case (Constants::BAM_CIGAR_HARDCLIP_CHAR) : cigarOp = Constants::BAM_CIGAR_HARDCLIP; break; + case (Constants::BAM_CIGAR_PAD_CHAR) : cigarOp = Constants::BAM_CIGAR_PAD; break; + case (Constants::BAM_CIGAR_SEQMATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_SEQMATCH; break; + case (Constants::BAM_CIGAR_MISMATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_MISMATCH; break; default: - fprintf(stderr, "ERROR: Unknown cigar operation found: %c\n", coIter->Type); - exit(1); + const string message = string("invalid CIGAR operation type") + coIter->Type; + throw BamException("BamWriter::CreatePackedCigar", message); } - *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp; + *pPackedCigar = coIter->Length << Constants::BAM_CIGAR_SHIFT | cigarOp; pPackedCigar++; } } @@ -76,276 +98,375 @@ void BamWriterPrivate::CreatePackedCigar(const vector& cigarOperations, void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) { // prepare the encoded query string - const unsigned int queryLen = query.size(); - const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5); - encodedQuery.resize(encodedQueryLen); + const size_t queryLength = query.size(); + const size_t encodedQueryLength = static_cast((queryLength+1)/2); + encodedQuery.resize(encodedQueryLength); char* pEncodedQuery = (char*)encodedQuery.data(); const char* pQuery = (const char*)query.data(); + // walk through original query sequence, encoding its bases unsigned char nucleotideCode; bool useHighWord = true; - - while(*pQuery) { - - switch(*pQuery) { - case '=': nucleotideCode = 0; break; - case 'A': nucleotideCode = 1; break; - case 'C': nucleotideCode = 2; break; - case 'G': nucleotideCode = 4; break; - case 'T': nucleotideCode = 8; break; - case 'N': nucleotideCode = 15; break; + while ( *pQuery ) { + switch ( *pQuery ) { + case (Constants::BAM_DNA_EQUAL) : nucleotideCode = Constants::BAM_BASECODE_EQUAL; break; + case (Constants::BAM_DNA_A) : nucleotideCode = Constants::BAM_BASECODE_A; break; + case (Constants::BAM_DNA_C) : nucleotideCode = Constants::BAM_BASECODE_C; break; + case (Constants::BAM_DNA_M) : nucleotideCode = Constants::BAM_BASECODE_M; break; + case (Constants::BAM_DNA_G) : nucleotideCode = Constants::BAM_BASECODE_G; break; + case (Constants::BAM_DNA_R) : nucleotideCode = Constants::BAM_BASECODE_R; break; + case (Constants::BAM_DNA_S) : nucleotideCode = Constants::BAM_BASECODE_S; break; + case (Constants::BAM_DNA_V) : nucleotideCode = Constants::BAM_BASECODE_V; break; + case (Constants::BAM_DNA_T) : nucleotideCode = Constants::BAM_BASECODE_T; break; + case (Constants::BAM_DNA_W) : nucleotideCode = Constants::BAM_BASECODE_W; break; + case (Constants::BAM_DNA_Y) : nucleotideCode = Constants::BAM_BASECODE_Y; break; + case (Constants::BAM_DNA_H) : nucleotideCode = Constants::BAM_BASECODE_H; break; + case (Constants::BAM_DNA_K) : nucleotideCode = Constants::BAM_BASECODE_K; break; + case (Constants::BAM_DNA_D) : nucleotideCode = Constants::BAM_BASECODE_D; break; + case (Constants::BAM_DNA_B) : nucleotideCode = Constants::BAM_BASECODE_B; break; + case (Constants::BAM_DNA_N) : nucleotideCode = Constants::BAM_BASECODE_N; break; default: - fprintf(stderr, "ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery); - exit(1); + const string message = string("invalid base: ") + *pQuery; + throw BamException("BamWriter::EncodeQuerySequence", message); } // pack the nucleotide code - if(useHighWord) { + if ( useHighWord ) { *pEncodedQuery = nucleotideCode << 4; useHighWord = false; } else { *pEncodedQuery |= nucleotideCode; - pEncodedQuery++; + ++pEncodedQuery; useHighWord = true; } // increment the query position - pQuery++; + ++pQuery; } } +// returns a description of the last error that occurred +std::string BamWriterPrivate::GetErrorString(void) const { + return m_errorString; +} + +// returns whether BAM file is open for writing or not +bool BamWriterPrivate::IsOpen(void) const { + return m_stream.IsOpen(); +} + // opens the alignment archive bool BamWriterPrivate::Open(const string& filename, - const string& samHeader, - const RefVector& referenceSequences, - bool isWriteUncompressed) + const string& samHeaderText, + const RefVector& referenceSequences) { +<<<<<<< HEAD + try { +======= // open the BGZF file for writing, return failure if error - if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) ) + if ( !m_stream.Open(filename, IBamIODevice::WriteOnly) ) return false; +>>>>>>> iodevice - // ================ - // write the header - // ================ + // open the BGZF file for writing, return failure if error + m_stream.Open(filename, "wb"); - // write the BAM signature - const unsigned char SIGNATURE_LENGTH = 4; - const char* BAM_SIGNATURE = "BAM\1"; - mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH); + // write BAM file 'metadata' components + WriteMagicNumber(); + WriteSamHeaderText(samHeaderText); + WriteReferences(referenceSequences); - // write the SAM header text length - uint32_t samHeaderLen = samHeader.size(); - if (IsBigEndian) SwapEndian_32(samHeaderLen); - mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT); + // return success + return true; - // write the SAM header text - if(samHeaderLen > 0) - mBGZF.Write(samHeader.data(), samHeaderLen); + } catch ( BamException& e ) { + m_errorString = e.what(); + return false; + } +} - // write the number of reference sequences - uint32_t numReferenceSequences = referenceSequences.size(); - if (IsBigEndian) SwapEndian_32(numReferenceSequences); - mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT); +// saves the alignment to the alignment archive +bool BamWriterPrivate::SaveAlignment(const BamAlignment& al) { - // ============================= - // write the sequence dictionary - // ============================= + try { - RefVector::const_iterator rsIter = referenceSequences.begin(); - RefVector::const_iterator rsEnd = referenceSequences.end(); - for( ; rsIter != rsEnd; ++rsIter ) { + // if BamAlignment contains only the core data and a raw char data buffer + // (as a result of BamReader::GetNextAlignmentCore()) + if ( al.SupportData.HasCoreOnly ) + WriteCoreAlignment(al); - // write the reference sequence name length - uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1; - if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen); - mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT); + // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc + // (resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code) + else WriteAlignment(al); - // write the reference sequence name - mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen); + // if we get here, everything OK + return true; - // write the reference sequence length - int32_t referenceLength = rsIter->RefLength; - if (IsBigEndian) SwapEndian_32(referenceLength); - mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT); + } catch ( BamException& e ) { + m_errorString = e.what(); + return false; } +} - // return success - return true; +void BamWriterPrivate::SetWriteCompressed(bool ok) { + // modifying compression is not allowed if BAM file is open + if ( !IsOpen() ) + m_stream.SetWriteCompressed(ok); } -// saves the alignment to the alignment archive -void BamWriterPrivate::SaveAlignment(const BamAlignment& al) { - - // if BamAlignment contains only the core data and a raw char data buffer - // (as a result of BamReader::GetNextAlignmentCore()) - if ( al.SupportData.HasCoreOnly ) { - - // write the block size - unsigned int blockSize = al.SupportData.BlockLength; - if (IsBigEndian) SwapEndian_32(blockSize); - mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); - - // assign the BAM core data - uint32_t buffer[8]; - buffer[0] = al.RefID; - buffer[1] = al.Position; - buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength; - buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations; - buffer[4] = al.SupportData.QuerySequenceLength; - buffer[5] = al.MateRefID; - buffer[6] = al.MatePosition; - buffer[7] = al.InsertSize; - - // swap BAM core endian-ness, if necessary - if ( IsBigEndian ) { - for ( int i = 0; i < 8; ++i ) - SwapEndian_32(buffer[i]); - } +void BamWriterPrivate::WriteAlignment(const BamAlignment& al) { + + // calculate char lengths + const unsigned int nameLength = al.Name.size() + 1; + const unsigned int numCigarOperations = al.CigarData.size(); + const unsigned int queryLength = al.QueryBases.size(); + const unsigned int tagDataLength = al.TagData.size(); + + // no way to tell if BamAlignment.Bin is already defined (no default, invalid value) + // force calculation of Bin before storing + const int endPosition = al.GetEndPosition(); + const uint32_t alignmentBin = CalculateMinimumBin(al.Position, endPosition); + + // create our packed cigar string + string packedCigar; + CreatePackedCigar(al.CigarData, packedCigar); + const unsigned int packedCigarLength = packedCigar.size(); + + // encode the query + string encodedQuery; + EncodeQuerySequence(al.QueryBases, encodedQuery); + const unsigned int encodedQueryLength = encodedQuery.size(); + + // write the block size + const unsigned int dataBlockSize = nameLength + + packedCigarLength + + encodedQueryLength + + queryLength + + tagDataLength; + unsigned int blockSize = Constants::BAM_CORE_SIZE + dataBlockSize; + if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize); + m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT); + + // assign the BAM core data + uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE]; + buffer[0] = al.RefID; + buffer[1] = al.Position; + buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength; + buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations; + buffer[4] = queryLength; + buffer[5] = al.MateRefID; + buffer[6] = al.MatePosition; + buffer[7] = al.InsertSize; + + // swap BAM core endian-ness, if necessary + if ( m_isBigEndian ) { + for ( int i = 0; i < 8; ++i ) + BamTools::SwapEndian_32(buffer[i]); + } - // write the BAM core - mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); + // write the BAM core + m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE); - // write the raw char data - mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE); - } + // write the query name + m_stream.Write(al.Name.c_str(), nameLength); - // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc - // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code ) - else { - - // calculate char lengths - const unsigned int nameLength = al.Name.size() + 1; - const unsigned int numCigarOperations = al.CigarData.size(); - const unsigned int queryLength = al.QueryBases.size(); - const unsigned int tagDataLength = al.TagData.size(); - - // no way to tell if BamAlignment.Bin is already defined (no default, invalid value) - // force calculation of Bin before storing - const int endPosition = al.GetEndPosition(); - const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition); - - // create our packed cigar string - string packedCigar; - CreatePackedCigar(al.CigarData, packedCigar); - const unsigned int packedCigarLength = packedCigar.size(); - - // encode the query - string encodedQuery; - EncodeQuerySequence(al.QueryBases, encodedQuery); - const unsigned int encodedQueryLength = encodedQuery.size(); - - // write the block size - const unsigned int dataBlockSize = nameLength + - packedCigarLength + - encodedQueryLength + - queryLength + - tagDataLength; - unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize; - if (IsBigEndian) SwapEndian_32(blockSize); - mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); - - // assign the BAM core data - uint32_t buffer[8]; - buffer[0] = al.RefID; - buffer[1] = al.Position; - buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength; - buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations; - buffer[4] = queryLength; - buffer[5] = al.MateRefID; - buffer[6] = al.MatePosition; - buffer[7] = al.InsertSize; - - // swap BAM core endian-ness, if necessary - if ( IsBigEndian ) { - for ( int i = 0; i < 8; ++i ) - SwapEndian_32(buffer[i]); + // write the packed cigar + if ( m_isBigEndian ) { + char* cigarData = new char[packedCigarLength](); + memcpy(cigarData, packedCigar.data(), packedCigarLength); + if ( m_isBigEndian ) { + for ( size_t i = 0; i < packedCigarLength; ++i ) + BamTools::SwapEndian_32p(&cigarData[i]); } + m_stream.Write(cigarData, packedCigarLength); + delete[] cigarData; // TODO: cleanup on Write exception thrown? + } + else + m_stream.Write(packedCigar.data(), packedCigarLength); + + // write the encoded query sequence + m_stream.Write(encodedQuery.data(), encodedQueryLength); + + // write the base qualities + char* pBaseQualities = (char*)al.Qualities.data(); + for ( size_t i = 0; i < queryLength; ++i ) + pBaseQualities[i] -= 33; // FASTQ conversion + m_stream.Write(pBaseQualities, queryLength); + + // write the read group tag + if ( m_isBigEndian ) { + + char* tagData = new char[tagDataLength](); + memcpy(tagData, al.TagData.data(), tagDataLength); + + size_t i = 0; + while ( i < tagDataLength ) { + + i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.) + const char type = tagData[i]; // get tag type at position i + ++i; + + switch ( type ) { + + case(Constants::BAM_TAG_TYPE_ASCII) : + case(Constants::BAM_TAG_TYPE_INT8) : + case(Constants::BAM_TAG_TYPE_UINT8) : + ++i; + break; + + case(Constants::BAM_TAG_TYPE_INT16) : + case(Constants::BAM_TAG_TYPE_UINT16) : + BamTools::SwapEndian_16p(&tagData[i]); + i += sizeof(uint16_t); + break; + + case(Constants::BAM_TAG_TYPE_FLOAT) : + case(Constants::BAM_TAG_TYPE_INT32) : + case(Constants::BAM_TAG_TYPE_UINT32) : + BamTools::SwapEndian_32p(&tagData[i]); + i += sizeof(uint32_t); + break; + + case(Constants::BAM_TAG_TYPE_HEX) : + case(Constants::BAM_TAG_TYPE_STRING) : + // no endian swapping necessary for hex-string/string data + while ( tagData[i] ) + ++i; + // increment one more for null terminator + ++i; + break; + + case(Constants::BAM_TAG_TYPE_ARRAY) : + + { + // read array type + const char arrayType = tagData[i]; + ++i; + + // swap endian-ness of number of elements in place, then retrieve for loop + BamTools::SwapEndian_32p(&tagData[i]); + int32_t numElements; + memcpy(&numElements, &tagData[i], sizeof(uint32_t)); + i += sizeof(uint32_t); + + // swap endian-ness of array elements + for ( int j = 0; j < numElements; ++j ) { + switch (arrayType) { + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : + // no endian-swapping necessary + ++i; + break; + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : + BamTools::SwapEndian_16p(&tagData[i]); + i += sizeof(uint16_t); + break; + case (Constants::BAM_TAG_TYPE_FLOAT) : + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : + BamTools::SwapEndian_32p(&tagData[i]); + i += sizeof(uint32_t); + break; + default: + delete[] tagData; + const string message = string("invalid binary array type: ") + arrayType; + throw BamException("BamWriter::SaveAlignment", message); + } + } + + break; + } - // write the BAM core - mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); + default : + delete[] tagData; + const string message = string("invalid tag type: ") + type; + throw BamException("BamWriter::SaveAlignment", message); + } + } - // write the query name - mBGZF.Write(al.Name.c_str(), nameLength); + m_stream.Write(tagData, tagDataLength); + delete[] tagData; // TODO: cleanup on Write exception thrown? + } + else + m_stream.Write(al.TagData.data(), tagDataLength); +} - // write the packed cigar - if ( IsBigEndian ) { +void BamWriterPrivate::WriteCoreAlignment(const BamAlignment& al) { + + // write the block size + unsigned int blockSize = al.SupportData.BlockLength; + if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize); + m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT); + + // re-calculate bin (in case BamAlignment's position has been previously modified) + const uint32_t alignmentBin = CalculateMinimumBin(al.Position, al.GetEndPosition()); + + // assign the BAM core data + uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE]; + buffer[0] = al.RefID; + buffer[1] = al.Position; + buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength; + buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations; + buffer[4] = al.SupportData.QuerySequenceLength; + buffer[5] = al.MateRefID; + buffer[6] = al.MatePosition; + buffer[7] = al.InsertSize; + + // swap BAM core endian-ness, if necessary + if ( m_isBigEndian ) { + for ( int i = 0; i < 8; ++i ) + BamTools::SwapEndian_32(buffer[i]); + } - char* cigarData = (char*)calloc(sizeof(char), packedCigarLength); - memcpy(cigarData, packedCigar.data(), packedCigarLength); + // write the BAM core + m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE); - for (unsigned int i = 0; i < packedCigarLength; ++i) { - if ( IsBigEndian ) - SwapEndian_32p(&cigarData[i]); - } + // write the raw char data + m_stream.Write((char*)al.SupportData.AllCharData.data(), + al.SupportData.BlockLength-Constants::BAM_CORE_SIZE); +} - mBGZF.Write(cigarData, packedCigarLength); - free(cigarData); - } - else - mBGZF.Write(packedCigar.data(), packedCigarLength); +void BamWriterPrivate::WriteMagicNumber(void) { + // write BAM file 'magic number' + m_stream.Write(Constants::BAM_HEADER_MAGIC, Constants::BAM_HEADER_MAGIC_LENGTH); +} - // write the encoded query sequence - mBGZF.Write(encodedQuery.data(), encodedQueryLength); +void BamWriterPrivate::WriteReferences(const BamTools::RefVector& referenceSequences) { - // write the base qualities - char* pBaseQualities = (char*)al.Qualities.data(); - for(unsigned int i = 0; i < queryLength; i++) - pBaseQualities[i] -= 33; // FASTQ conversion - mBGZF.Write(pBaseQualities, queryLength); + // write the number of reference sequences + uint32_t numReferenceSequences = referenceSequences.size(); + if ( m_isBigEndian ) BamTools::SwapEndian_32(numReferenceSequences); + m_stream.Write((char*)&numReferenceSequences, Constants::BAM_SIZEOF_INT); - // write the read group tag - if ( IsBigEndian ) { + // foreach reference sequence + RefVector::const_iterator rsIter = referenceSequences.begin(); + RefVector::const_iterator rsEnd = referenceSequences.end(); + for ( ; rsIter != rsEnd; ++rsIter ) { - char* tagData = (char*)calloc(sizeof(char), tagDataLength); - memcpy(tagData, al.TagData.data(), tagDataLength); + // write the reference sequence name length + uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1; + if ( m_isBigEndian ) BamTools::SwapEndian_32(referenceSequenceNameLen); + m_stream.Write((char*)&referenceSequenceNameLen, Constants::BAM_SIZEOF_INT); - int i = 0; - while ( (unsigned int)i < tagDataLength ) { + // write the reference sequence name + m_stream.Write(rsIter->RefName.c_str(), referenceSequenceNameLen); - i += 2; // skip tag type (e.g. "RG", "NM", etc) - uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning - ++i; // skip value type + // write the reference sequence length + int32_t referenceLength = rsIter->RefLength; + if ( m_isBigEndian ) BamTools::SwapEndian_32(referenceLength); + m_stream.Write((char*)&referenceLength, Constants::BAM_SIZEOF_INT); + } +} - switch (type) { +void BamWriterPrivate::WriteSamHeaderText(const std::string& samHeaderText) { - case('A') : - case('C') : - ++i; - break; - - case('S') : - SwapEndian_16p(&tagData[i]); - i+=2; // sizeof(uint16_t) - break; - - case('F') : - case('I') : - SwapEndian_32p(&tagData[i]); - i+=4; // sizeof(uint32_t) - break; - - case('D') : - SwapEndian_64p(&tagData[i]); - i+=8; // sizeof(uint64_t) - break; - - case('H') : - case('Z') : - while (tagData[i]) { ++i; } - ++i; // increment one more for null terminator - break; - - default : - fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here - free(tagData); - exit(1); - } - } + // write the SAM header text length + uint32_t samHeaderLen = samHeaderText.size(); + if ( m_isBigEndian ) BamTools::SwapEndian_32(samHeaderLen); + m_stream.Write((char*)&samHeaderLen, Constants::BAM_SIZEOF_INT); - mBGZF.Write(tagData, tagDataLength); - free(tagData); - } - else - mBGZF.Write(al.TagData.data(), tagDataLength); - } + // write the SAM header text + if ( samHeaderLen > 0 ) + m_stream.Write(samHeaderText.data(), samHeaderLen); }