1 // ***************************************************************************
2 // BamWriter_p.cpp (c) 2010 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 18 November 2012 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides the basic functionality for producing BAM files
8 // ***************************************************************************
10 #include "api/BamAlignment.h"
11 #include "api/BamConstants.h"
12 #include "api/IBamIODevice.h"
13 #include "api/internal/bam/BamWriter_p.h"
14 #include "api/internal/utils/BamException_p.h"
15 using namespace BamTools;
16 using namespace BamTools::Internal;
23 BamWriterPrivate::BamWriterPrivate(void)
24 : m_isBigEndian( BamTools::SystemIsBigEndian() )
28 BamWriterPrivate::~BamWriterPrivate(void) {
32 // calculates minimum bin for a BAM alignment interval [begin, end)
33 uint32_t BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const {
35 if ( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14);
36 if ( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17);
37 if ( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20);
38 if ( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23);
39 if ( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26);
43 // closes the alignment archive
44 void BamWriterPrivate::Close(void) {
46 // skip if file not open
47 if ( !IsOpen() ) return;
49 // close output stream
52 } catch ( BamException& e ) {
53 m_errorString = e.what();
57 // creates a cigar string from the supplied alignment
58 void BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
61 const size_t numCigarOperations = cigarOperations.size();
62 packedCigar.resize(numCigarOperations * Constants::BAM_SIZEOF_INT);
64 // pack the cigar data into the string
65 unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
67 // iterate over cigar operations
68 vector<CigarOp>::const_iterator coIter = cigarOperations.begin();
69 vector<CigarOp>::const_iterator coEnd = cigarOperations.end();
70 for ( ; coIter != coEnd; ++coIter ) {
72 // store op in packedCigar
74 switch ( coIter->Type ) {
75 case (Constants::BAM_CIGAR_MATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_MATCH; break;
76 case (Constants::BAM_CIGAR_INS_CHAR) : cigarOp = Constants::BAM_CIGAR_INS; break;
77 case (Constants::BAM_CIGAR_DEL_CHAR) : cigarOp = Constants::BAM_CIGAR_DEL; break;
78 case (Constants::BAM_CIGAR_REFSKIP_CHAR) : cigarOp = Constants::BAM_CIGAR_REFSKIP; break;
79 case (Constants::BAM_CIGAR_SOFTCLIP_CHAR) : cigarOp = Constants::BAM_CIGAR_SOFTCLIP; break;
80 case (Constants::BAM_CIGAR_HARDCLIP_CHAR) : cigarOp = Constants::BAM_CIGAR_HARDCLIP; break;
81 case (Constants::BAM_CIGAR_PAD_CHAR) : cigarOp = Constants::BAM_CIGAR_PAD; break;
82 case (Constants::BAM_CIGAR_SEQMATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_SEQMATCH; break;
83 case (Constants::BAM_CIGAR_MISMATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_MISMATCH; break;
85 const string message = string("invalid CIGAR operation type") + coIter->Type;
86 throw BamException("BamWriter::CreatePackedCigar", message);
89 *pPackedCigar = coIter->Length << Constants::BAM_CIGAR_SHIFT | cigarOp;
94 // encodes the supplied query sequence into 4-bit notation
95 void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {
97 // prepare the encoded query string
98 const size_t queryLength = query.size();
99 const size_t encodedQueryLength = static_cast<size_t>((queryLength+1)/2);
100 encodedQuery.resize(encodedQueryLength);
101 char* pEncodedQuery = (char*)encodedQuery.data();
102 const char* pQuery = (const char*)query.data();
104 // walk through original query sequence, encoding its bases
105 unsigned char nucleotideCode;
106 bool useHighWord = true;
109 case (Constants::BAM_DNA_EQUAL) : nucleotideCode = Constants::BAM_BASECODE_EQUAL; break;
110 case (Constants::BAM_DNA_A) : nucleotideCode = Constants::BAM_BASECODE_A; break;
111 case (Constants::BAM_DNA_C) : nucleotideCode = Constants::BAM_BASECODE_C; break;
112 case (Constants::BAM_DNA_M) : nucleotideCode = Constants::BAM_BASECODE_M; break;
113 case (Constants::BAM_DNA_G) : nucleotideCode = Constants::BAM_BASECODE_G; break;
114 case (Constants::BAM_DNA_R) : nucleotideCode = Constants::BAM_BASECODE_R; break;
115 case (Constants::BAM_DNA_S) : nucleotideCode = Constants::BAM_BASECODE_S; break;
116 case (Constants::BAM_DNA_V) : nucleotideCode = Constants::BAM_BASECODE_V; break;
117 case (Constants::BAM_DNA_T) : nucleotideCode = Constants::BAM_BASECODE_T; break;
118 case (Constants::BAM_DNA_W) : nucleotideCode = Constants::BAM_BASECODE_W; break;
119 case (Constants::BAM_DNA_Y) : nucleotideCode = Constants::BAM_BASECODE_Y; break;
120 case (Constants::BAM_DNA_H) : nucleotideCode = Constants::BAM_BASECODE_H; break;
121 case (Constants::BAM_DNA_K) : nucleotideCode = Constants::BAM_BASECODE_K; break;
122 case (Constants::BAM_DNA_D) : nucleotideCode = Constants::BAM_BASECODE_D; break;
123 case (Constants::BAM_DNA_B) : nucleotideCode = Constants::BAM_BASECODE_B; break;
124 case (Constants::BAM_DNA_N) : nucleotideCode = Constants::BAM_BASECODE_N; break;
126 const string message = string("invalid base: ") + *pQuery;
127 throw BamException("BamWriter::EncodeQuerySequence", message);
130 // pack the nucleotide code
132 *pEncodedQuery = nucleotideCode << 4;
135 *pEncodedQuery |= nucleotideCode;
140 // increment the query position
145 // returns a description of the last error that occurred
146 std::string BamWriterPrivate::GetErrorString(void) const {
147 return m_errorString;
150 // returns whether BAM file is open for writing or not
151 bool BamWriterPrivate::IsOpen(void) const {
152 return m_stream.IsOpen();
155 // opens the alignment archive
156 bool BamWriterPrivate::Open(const string& filename,
157 const string& samHeaderText,
158 const RefVector& referenceSequences)
162 // open the BGZF file for writing
163 m_stream.Open(filename, IBamIODevice::WriteOnly);
165 // write BAM file 'metadata' components
167 WriteSamHeaderText(samHeaderText);
168 WriteReferences(referenceSequences);
173 } catch ( BamException& e ) {
174 m_errorString = e.what();
179 // saves the alignment to the alignment archive
180 bool BamWriterPrivate::SaveAlignment(const BamAlignment& al) {
184 // if BamAlignment contains only the core data and a raw char data buffer
185 // (as a result of BamReader::GetNextAlignmentCore())
186 if ( al.SupportData.HasCoreOnly )
187 WriteCoreAlignment(al);
189 // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc
190 // (resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code)
191 else WriteAlignment(al);
193 // if we get here, everything OK
196 } catch ( BamException& e ) {
197 m_errorString = e.what();
202 void BamWriterPrivate::SetWriteCompressed(bool ok) {
203 // modifying compression is not allowed if BAM file is open
205 m_stream.SetWriteCompressed(ok);
208 void BamWriterPrivate::WriteAlignment(const BamAlignment& al) {
210 // calculate char lengths
211 const unsigned int nameLength = al.Name.size() + 1;
212 const unsigned int numCigarOperations = al.CigarData.size();
213 const unsigned int queryLength = ( (al.QueryBases == "*") ? 0 : al.QueryBases.size() );
214 const unsigned int tagDataLength = al.TagData.size();
216 // no way to tell if alignment's bin is already defined (there is no default, invalid value)
217 // so we'll go ahead calculate its bin ID before storing
218 const uint32_t alignmentBin = CalculateMinimumBin(al.Position, al.GetEndPosition());
220 // create our packed cigar string
222 CreatePackedCigar(al.CigarData, packedCigar);
223 const unsigned int packedCigarLength = packedCigar.size();
226 unsigned int encodedQueryLength = 0;
228 if ( queryLength > 0 ) {
229 EncodeQuerySequence(al.QueryBases, encodedQuery);
230 encodedQueryLength = encodedQuery.size();
233 // write the block size
234 const unsigned int dataBlockSize = nameLength +
237 queryLength + // here referring to quality length
239 unsigned int blockSize = Constants::BAM_CORE_SIZE + dataBlockSize;
240 if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize);
241 m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT);
243 // assign the BAM core data
244 uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE];
245 buffer[0] = al.RefID;
246 buffer[1] = al.Position;
247 buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;
248 buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
249 buffer[4] = queryLength;
250 buffer[5] = al.MateRefID;
251 buffer[6] = al.MatePosition;
252 buffer[7] = al.InsertSize;
254 // swap BAM core endian-ness, if necessary
255 if ( m_isBigEndian ) {
256 for ( int i = 0; i < 8; ++i )
257 BamTools::SwapEndian_32(buffer[i]);
260 // write the BAM core
261 m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE);
263 // write the query name
264 m_stream.Write(al.Name.c_str(), nameLength);
266 // write the packed cigar
267 if ( m_isBigEndian ) {
268 char* cigarData = new char[packedCigarLength]();
269 memcpy(cigarData, packedCigar.data(), packedCigarLength);
270 if ( m_isBigEndian ) {
271 for ( size_t i = 0; i < packedCigarLength; ++i )
272 BamTools::SwapEndian_32p(&cigarData[i]);
274 m_stream.Write(cigarData, packedCigarLength);
275 delete[] cigarData; // TODO: cleanup on Write exception thrown?
278 m_stream.Write(packedCigar.data(), packedCigarLength);
280 if ( queryLength > 0 ) {
282 // write the encoded query sequence
283 m_stream.Write(encodedQuery.data(), encodedQueryLength);
285 // write the base qualities
286 char* pBaseQualities = new char[queryLength]();
287 if ( al.Qualities.empty() || ( al.Qualities.size() == 1 && al.Qualities[0] == '*' ) || al.Qualities[0] == (char)0xFF )
288 memset(pBaseQualities, 0xFF, queryLength); // if missing or '*', fill with invalid qual
290 for ( size_t i = 0; i < queryLength; ++i )
291 pBaseQualities[i] = al.Qualities.at(i) - 33; // FASTQ ASCII -> phred score conversion
293 m_stream.Write(pBaseQualities, queryLength);
294 delete[] pBaseQualities;
297 // write the tag data
298 if ( m_isBigEndian ) {
300 char* tagData = new char[tagDataLength]();
301 memcpy(tagData, al.TagData.data(), tagDataLength);
304 while ( i < tagDataLength ) {
306 i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.)
307 const char type = tagData[i]; // get tag type at position i
312 case(Constants::BAM_TAG_TYPE_ASCII) :
313 case(Constants::BAM_TAG_TYPE_INT8) :
314 case(Constants::BAM_TAG_TYPE_UINT8) :
318 case(Constants::BAM_TAG_TYPE_INT16) :
319 case(Constants::BAM_TAG_TYPE_UINT16) :
320 BamTools::SwapEndian_16p(&tagData[i]);
321 i += sizeof(uint16_t);
324 case(Constants::BAM_TAG_TYPE_FLOAT) :
325 case(Constants::BAM_TAG_TYPE_INT32) :
326 case(Constants::BAM_TAG_TYPE_UINT32) :
327 BamTools::SwapEndian_32p(&tagData[i]);
328 i += sizeof(uint32_t);
331 case(Constants::BAM_TAG_TYPE_HEX) :
332 case(Constants::BAM_TAG_TYPE_STRING) :
333 // no endian swapping necessary for hex-string/string data
336 // increment one more for null terminator
340 case(Constants::BAM_TAG_TYPE_ARRAY) :
344 const char arrayType = tagData[i];
347 // swap endian-ness of number of elements in place, then retrieve for loop
348 BamTools::SwapEndian_32p(&tagData[i]);
350 memcpy(&numElements, &tagData[i], sizeof(uint32_t));
351 i += sizeof(uint32_t);
353 // swap endian-ness of array elements
354 for ( int j = 0; j < numElements; ++j ) {
356 case (Constants::BAM_TAG_TYPE_INT8) :
357 case (Constants::BAM_TAG_TYPE_UINT8) :
358 // no endian-swapping necessary
361 case (Constants::BAM_TAG_TYPE_INT16) :
362 case (Constants::BAM_TAG_TYPE_UINT16) :
363 BamTools::SwapEndian_16p(&tagData[i]);
364 i += sizeof(uint16_t);
366 case (Constants::BAM_TAG_TYPE_FLOAT) :
367 case (Constants::BAM_TAG_TYPE_INT32) :
368 case (Constants::BAM_TAG_TYPE_UINT32) :
369 BamTools::SwapEndian_32p(&tagData[i]);
370 i += sizeof(uint32_t);
374 const string message = string("invalid binary array type: ") + arrayType;
375 throw BamException("BamWriter::SaveAlignment", message);
384 const string message = string("invalid tag type: ") + type;
385 throw BamException("BamWriter::SaveAlignment", message);
389 m_stream.Write(tagData, tagDataLength);
390 delete[] tagData; // TODO: cleanup on Write exception thrown?
393 m_stream.Write(al.TagData.data(), tagDataLength);
396 void BamWriterPrivate::WriteCoreAlignment(const BamAlignment& al) {
398 // write the block size
399 unsigned int blockSize = al.SupportData.BlockLength;
400 if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize);
401 m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT);
403 // re-calculate bin (in case BamAlignment's position has been previously modified)
404 const uint32_t alignmentBin = CalculateMinimumBin(al.Position, al.GetEndPosition());
406 // assign the BAM core data
407 uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE];
408 buffer[0] = al.RefID;
409 buffer[1] = al.Position;
410 buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;
411 buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;
412 buffer[4] = al.SupportData.QuerySequenceLength;
413 buffer[5] = al.MateRefID;
414 buffer[6] = al.MatePosition;
415 buffer[7] = al.InsertSize;
417 // swap BAM core endian-ness, if necessary
418 if ( m_isBigEndian ) {
419 for ( int i = 0; i < 8; ++i )
420 BamTools::SwapEndian_32(buffer[i]);
423 // write the BAM core
424 m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE);
426 // write the raw char data
427 m_stream.Write((char*)al.SupportData.AllCharData.data(),
428 al.SupportData.BlockLength-Constants::BAM_CORE_SIZE);
431 void BamWriterPrivate::WriteMagicNumber(void) {
432 // write BAM file 'magic number'
433 m_stream.Write(Constants::BAM_HEADER_MAGIC, Constants::BAM_HEADER_MAGIC_LENGTH);
436 void BamWriterPrivate::WriteReferences(const BamTools::RefVector& referenceSequences) {
438 // write the number of reference sequences
439 uint32_t numReferenceSequences = referenceSequences.size();
440 if ( m_isBigEndian ) BamTools::SwapEndian_32(numReferenceSequences);
441 m_stream.Write((char*)&numReferenceSequences, Constants::BAM_SIZEOF_INT);
443 // foreach reference sequence
444 RefVector::const_iterator rsIter = referenceSequences.begin();
445 RefVector::const_iterator rsEnd = referenceSequences.end();
446 for ( ; rsIter != rsEnd; ++rsIter ) {
448 // write the reference sequence name length (+1 for terminator)
449 const uint32_t actualNameLen = rsIter->RefName.size() + 1;
450 uint32_t maybeSwappedNameLen = actualNameLen;
451 if ( m_isBigEndian ) BamTools::SwapEndian_32(maybeSwappedNameLen);
452 m_stream.Write((char*)&maybeSwappedNameLen, Constants::BAM_SIZEOF_INT);
454 // write the reference sequence name
455 m_stream.Write(rsIter->RefName.c_str(), actualNameLen);
457 // write the reference sequence length
458 int32_t referenceLength = rsIter->RefLength;
459 if ( m_isBigEndian ) BamTools::SwapEndian_32(referenceLength);
460 m_stream.Write((char*)&referenceLength, Constants::BAM_SIZEOF_INT);
464 void BamWriterPrivate::WriteSamHeaderText(const std::string& samHeaderText) {
466 // write the SAM header text length
467 const uint32_t actualHeaderLen = samHeaderText.size();
468 uint32_t maybeSwappedHeaderLen = samHeaderText.size();
469 if ( m_isBigEndian ) BamTools::SwapEndian_32(maybeSwappedHeaderLen);
470 m_stream.Write((char*)&maybeSwappedHeaderLen, Constants::BAM_SIZEOF_INT);
472 // write the SAM header text
473 if ( actualHeaderLen > 0 )
474 m_stream.Write(samHeaderText.data(), actualHeaderLen);