1 // ***************************************************************************
2 // BamWriter_p.cpp (c) 2010 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 6 October 2011 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides the basic functionality for producing BAM files
8 // ***************************************************************************
10 #include <api/BamAlignment.h>
11 #include <api/BamConstants.h>
13 #include <api/internal/BamException_p.h>
15 #include <api/IBamIODevice.h>
17 #include <api/internal/BamWriter_p.h>
18 using namespace BamTools;
19 using namespace BamTools::Internal;
26 BamWriterPrivate::BamWriterPrivate(void)
27 : m_isBigEndian( BamTools::SystemIsBigEndian() )
31 BamWriterPrivate::~BamWriterPrivate(void) {
35 // calculates minimum bin for a BAM alignment interval
36 uint32_t BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const {
38 if ( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14);
39 if ( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17);
40 if ( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20);
41 if ( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23);
42 if ( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26);
46 // closes the alignment archive
47 void BamWriterPrivate::Close(void) {
49 // skip if file not open
50 if ( !IsOpen() ) return;
52 // close output stream
55 } catch ( BamException& e ) {
56 m_errorString = e.what();
60 // creates a cigar string from the supplied alignment
61 void BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
64 const size_t numCigarOperations = cigarOperations.size();
65 packedCigar.resize(numCigarOperations * Constants::BAM_SIZEOF_INT);
67 // pack the cigar data into the string
68 unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
70 // iterate over cigar operations
71 vector<CigarOp>::const_iterator coIter = cigarOperations.begin();
72 vector<CigarOp>::const_iterator coEnd = cigarOperations.end();
73 for ( ; coIter != coEnd; ++coIter ) {
75 // store op in packedCigar
77 switch ( coIter->Type ) {
78 case (Constants::BAM_CIGAR_MATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_MATCH; break;
79 case (Constants::BAM_CIGAR_INS_CHAR) : cigarOp = Constants::BAM_CIGAR_INS; break;
80 case (Constants::BAM_CIGAR_DEL_CHAR) : cigarOp = Constants::BAM_CIGAR_DEL; break;
81 case (Constants::BAM_CIGAR_REFSKIP_CHAR) : cigarOp = Constants::BAM_CIGAR_REFSKIP; break;
82 case (Constants::BAM_CIGAR_SOFTCLIP_CHAR) : cigarOp = Constants::BAM_CIGAR_SOFTCLIP; break;
83 case (Constants::BAM_CIGAR_HARDCLIP_CHAR) : cigarOp = Constants::BAM_CIGAR_HARDCLIP; break;
84 case (Constants::BAM_CIGAR_PAD_CHAR) : cigarOp = Constants::BAM_CIGAR_PAD; break;
85 case (Constants::BAM_CIGAR_SEQMATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_SEQMATCH; break;
86 case (Constants::BAM_CIGAR_MISMATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_MISMATCH; break;
88 const string message = string("invalid CIGAR operation type") + coIter->Type;
89 throw BamException("BamWriter::CreatePackedCigar", message);
92 *pPackedCigar = coIter->Length << Constants::BAM_CIGAR_SHIFT | cigarOp;
97 // encodes the supplied query sequence into 4-bit notation
98 void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {
100 // prepare the encoded query string
101 const size_t queryLength = query.size();
102 const size_t encodedQueryLength = static_cast<size_t>((queryLength+1)/2);
103 encodedQuery.resize(encodedQueryLength);
104 char* pEncodedQuery = (char*)encodedQuery.data();
105 const char* pQuery = (const char*)query.data();
107 // walk through original query sequence, encoding its bases
108 unsigned char nucleotideCode;
109 bool useHighWord = true;
112 case (Constants::BAM_DNA_EQUAL) : nucleotideCode = Constants::BAM_BASECODE_EQUAL; break;
113 case (Constants::BAM_DNA_A) : nucleotideCode = Constants::BAM_BASECODE_A; break;
114 case (Constants::BAM_DNA_C) : nucleotideCode = Constants::BAM_BASECODE_C; break;
115 case (Constants::BAM_DNA_M) : nucleotideCode = Constants::BAM_BASECODE_M; break;
116 case (Constants::BAM_DNA_G) : nucleotideCode = Constants::BAM_BASECODE_G; break;
117 case (Constants::BAM_DNA_R) : nucleotideCode = Constants::BAM_BASECODE_R; break;
118 case (Constants::BAM_DNA_S) : nucleotideCode = Constants::BAM_BASECODE_S; break;
119 case (Constants::BAM_DNA_V) : nucleotideCode = Constants::BAM_BASECODE_V; break;
120 case (Constants::BAM_DNA_T) : nucleotideCode = Constants::BAM_BASECODE_T; break;
121 case (Constants::BAM_DNA_W) : nucleotideCode = Constants::BAM_BASECODE_W; break;
122 case (Constants::BAM_DNA_Y) : nucleotideCode = Constants::BAM_BASECODE_Y; break;
123 case (Constants::BAM_DNA_H) : nucleotideCode = Constants::BAM_BASECODE_H; break;
124 case (Constants::BAM_DNA_K) : nucleotideCode = Constants::BAM_BASECODE_K; break;
125 case (Constants::BAM_DNA_D) : nucleotideCode = Constants::BAM_BASECODE_D; break;
126 case (Constants::BAM_DNA_B) : nucleotideCode = Constants::BAM_BASECODE_B; break;
127 case (Constants::BAM_DNA_N) : nucleotideCode = Constants::BAM_BASECODE_N; break;
129 const string message = string("invalid base: ") + *pQuery;
130 throw BamException("BamWriter::EncodeQuerySequence", message);
133 // pack the nucleotide code
135 *pEncodedQuery = nucleotideCode << 4;
138 *pEncodedQuery |= nucleotideCode;
143 // increment the query position
148 // returns a description of the last error that occurred
149 std::string BamWriterPrivate::GetErrorString(void) const {
150 return m_errorString;
153 // returns whether BAM file is open for writing or not
154 bool BamWriterPrivate::IsOpen(void) const {
155 return m_stream.IsOpen();
158 // opens the alignment archive
159 bool BamWriterPrivate::Open(const string& filename,
160 const string& samHeaderText,
161 const RefVector& referenceSequences)
166 // open the BGZF file for writing, return failure if error
167 if ( !m_stream.Open(filename, IBamIODevice::WriteOnly) )
171 // open the BGZF file for writing, return failure if error
172 m_stream.Open(filename, "wb");
174 // write BAM file 'metadata' components
176 WriteSamHeaderText(samHeaderText);
177 WriteReferences(referenceSequences);
182 } catch ( BamException& e ) {
183 m_errorString = e.what();
188 // saves the alignment to the alignment archive
189 bool BamWriterPrivate::SaveAlignment(const BamAlignment& al) {
193 // if BamAlignment contains only the core data and a raw char data buffer
194 // (as a result of BamReader::GetNextAlignmentCore())
195 if ( al.SupportData.HasCoreOnly )
196 WriteCoreAlignment(al);
198 // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc
199 // (resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code)
200 else WriteAlignment(al);
202 // if we get here, everything OK
205 } catch ( BamException& e ) {
206 m_errorString = e.what();
211 void BamWriterPrivate::SetWriteCompressed(bool ok) {
212 // modifying compression is not allowed if BAM file is open
214 m_stream.SetWriteCompressed(ok);
217 void BamWriterPrivate::WriteAlignment(const BamAlignment& al) {
219 // calculate char lengths
220 const unsigned int nameLength = al.Name.size() + 1;
221 const unsigned int numCigarOperations = al.CigarData.size();
222 const unsigned int queryLength = al.QueryBases.size();
223 const unsigned int tagDataLength = al.TagData.size();
225 // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)
226 // force calculation of Bin before storing
227 const int endPosition = al.GetEndPosition();
228 const uint32_t alignmentBin = CalculateMinimumBin(al.Position, endPosition);
230 // create our packed cigar string
232 CreatePackedCigar(al.CigarData, packedCigar);
233 const unsigned int packedCigarLength = packedCigar.size();
237 EncodeQuerySequence(al.QueryBases, encodedQuery);
238 const unsigned int encodedQueryLength = encodedQuery.size();
240 // write the block size
241 const unsigned int dataBlockSize = nameLength +
246 unsigned int blockSize = Constants::BAM_CORE_SIZE + dataBlockSize;
247 if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize);
248 m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT);
250 // assign the BAM core data
251 uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE];
252 buffer[0] = al.RefID;
253 buffer[1] = al.Position;
254 buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;
255 buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
256 buffer[4] = queryLength;
257 buffer[5] = al.MateRefID;
258 buffer[6] = al.MatePosition;
259 buffer[7] = al.InsertSize;
261 // swap BAM core endian-ness, if necessary
262 if ( m_isBigEndian ) {
263 for ( int i = 0; i < 8; ++i )
264 BamTools::SwapEndian_32(buffer[i]);
267 // write the BAM core
268 m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE);
270 // write the query name
271 m_stream.Write(al.Name.c_str(), nameLength);
273 // write the packed cigar
274 if ( m_isBigEndian ) {
275 char* cigarData = new char[packedCigarLength]();
276 memcpy(cigarData, packedCigar.data(), packedCigarLength);
277 if ( m_isBigEndian ) {
278 for ( size_t i = 0; i < packedCigarLength; ++i )
279 BamTools::SwapEndian_32p(&cigarData[i]);
281 m_stream.Write(cigarData, packedCigarLength);
282 delete[] cigarData; // TODO: cleanup on Write exception thrown?
285 m_stream.Write(packedCigar.data(), packedCigarLength);
287 // write the encoded query sequence
288 m_stream.Write(encodedQuery.data(), encodedQueryLength);
290 // write the base qualities
291 char* pBaseQualities = (char*)al.Qualities.data();
292 for ( size_t i = 0; i < queryLength; ++i )
293 pBaseQualities[i] -= 33; // FASTQ conversion
294 m_stream.Write(pBaseQualities, queryLength);
296 // write the read group tag
297 if ( m_isBigEndian ) {
299 char* tagData = new char[tagDataLength]();
300 memcpy(tagData, al.TagData.data(), tagDataLength);
303 while ( i < tagDataLength ) {
305 i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.)
306 const char type = tagData[i]; // get tag type at position i
311 case(Constants::BAM_TAG_TYPE_ASCII) :
312 case(Constants::BAM_TAG_TYPE_INT8) :
313 case(Constants::BAM_TAG_TYPE_UINT8) :
317 case(Constants::BAM_TAG_TYPE_INT16) :
318 case(Constants::BAM_TAG_TYPE_UINT16) :
319 BamTools::SwapEndian_16p(&tagData[i]);
320 i += sizeof(uint16_t);
323 case(Constants::BAM_TAG_TYPE_FLOAT) :
324 case(Constants::BAM_TAG_TYPE_INT32) :
325 case(Constants::BAM_TAG_TYPE_UINT32) :
326 BamTools::SwapEndian_32p(&tagData[i]);
327 i += sizeof(uint32_t);
330 case(Constants::BAM_TAG_TYPE_HEX) :
331 case(Constants::BAM_TAG_TYPE_STRING) :
332 // no endian swapping necessary for hex-string/string data
335 // increment one more for null terminator
339 case(Constants::BAM_TAG_TYPE_ARRAY) :
343 const char arrayType = tagData[i];
346 // swap endian-ness of number of elements in place, then retrieve for loop
347 BamTools::SwapEndian_32p(&tagData[i]);
349 memcpy(&numElements, &tagData[i], sizeof(uint32_t));
350 i += sizeof(uint32_t);
352 // swap endian-ness of array elements
353 for ( int j = 0; j < numElements; ++j ) {
355 case (Constants::BAM_TAG_TYPE_INT8) :
356 case (Constants::BAM_TAG_TYPE_UINT8) :
357 // no endian-swapping necessary
360 case (Constants::BAM_TAG_TYPE_INT16) :
361 case (Constants::BAM_TAG_TYPE_UINT16) :
362 BamTools::SwapEndian_16p(&tagData[i]);
363 i += sizeof(uint16_t);
365 case (Constants::BAM_TAG_TYPE_FLOAT) :
366 case (Constants::BAM_TAG_TYPE_INT32) :
367 case (Constants::BAM_TAG_TYPE_UINT32) :
368 BamTools::SwapEndian_32p(&tagData[i]);
369 i += sizeof(uint32_t);
373 const string message = string("invalid binary array type: ") + arrayType;
374 throw BamException("BamWriter::SaveAlignment", message);
383 const string message = string("invalid tag type: ") + type;
384 throw BamException("BamWriter::SaveAlignment", message);
388 m_stream.Write(tagData, tagDataLength);
389 delete[] tagData; // TODO: cleanup on Write exception thrown?
392 m_stream.Write(al.TagData.data(), tagDataLength);
395 void BamWriterPrivate::WriteCoreAlignment(const BamAlignment& al) {
397 // write the block size
398 unsigned int blockSize = al.SupportData.BlockLength;
399 if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize);
400 m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT);
402 // re-calculate bin (in case BamAlignment's position has been previously modified)
403 const uint32_t alignmentBin = CalculateMinimumBin(al.Position, al.GetEndPosition());
405 // assign the BAM core data
406 uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE];
407 buffer[0] = al.RefID;
408 buffer[1] = al.Position;
409 buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;
410 buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;
411 buffer[4] = al.SupportData.QuerySequenceLength;
412 buffer[5] = al.MateRefID;
413 buffer[6] = al.MatePosition;
414 buffer[7] = al.InsertSize;
416 // swap BAM core endian-ness, if necessary
417 if ( m_isBigEndian ) {
418 for ( int i = 0; i < 8; ++i )
419 BamTools::SwapEndian_32(buffer[i]);
422 // write the BAM core
423 m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE);
425 // write the raw char data
426 m_stream.Write((char*)al.SupportData.AllCharData.data(),
427 al.SupportData.BlockLength-Constants::BAM_CORE_SIZE);
430 void BamWriterPrivate::WriteMagicNumber(void) {
431 // write BAM file 'magic number'
432 m_stream.Write(Constants::BAM_HEADER_MAGIC, Constants::BAM_HEADER_MAGIC_LENGTH);
435 void BamWriterPrivate::WriteReferences(const BamTools::RefVector& referenceSequences) {
437 // write the number of reference sequences
438 uint32_t numReferenceSequences = referenceSequences.size();
439 if ( m_isBigEndian ) BamTools::SwapEndian_32(numReferenceSequences);
440 m_stream.Write((char*)&numReferenceSequences, Constants::BAM_SIZEOF_INT);
442 // foreach reference sequence
443 RefVector::const_iterator rsIter = referenceSequences.begin();
444 RefVector::const_iterator rsEnd = referenceSequences.end();
445 for ( ; rsIter != rsEnd; ++rsIter ) {
447 // write the reference sequence name length
448 uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;
449 if ( m_isBigEndian ) BamTools::SwapEndian_32(referenceSequenceNameLen);
450 m_stream.Write((char*)&referenceSequenceNameLen, Constants::BAM_SIZEOF_INT);
452 // write the reference sequence name
453 m_stream.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);
455 // write the reference sequence length
456 int32_t referenceLength = rsIter->RefLength;
457 if ( m_isBigEndian ) BamTools::SwapEndian_32(referenceLength);
458 m_stream.Write((char*)&referenceLength, Constants::BAM_SIZEOF_INT);
462 void BamWriterPrivate::WriteSamHeaderText(const std::string& samHeaderText) {
464 // write the SAM header text length
465 uint32_t samHeaderLen = samHeaderText.size();
466 if ( m_isBigEndian ) BamTools::SwapEndian_32(samHeaderLen);
467 m_stream.Write((char*)&samHeaderLen, Constants::BAM_SIZEOF_INT);
469 // write the SAM header text
470 if ( samHeaderLen > 0 )
471 m_stream.Write(samHeaderText.data(), samHeaderLen);