1 // ***************************************************************************
2 // BamWriter_p.cpp (c) 2010 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 16 June 2011 (DB)
7 // ---------------------------------------------------------------------------
8 // Provides the basic functionality for producing BAM files
9 // ***************************************************************************
11 #include <api/BamAlignment.h>
12 #include <api/BamConstants.h>
13 #include <api/IBamIODevice.h>
14 #include <api/internal/BamWriter_p.h>
15 using namespace BamTools;
16 using namespace BamTools::Internal;
24 BamWriterPrivate::BamWriterPrivate(void)
25 : m_isBigEndian( BamTools::SystemIsBigEndian() )
29 BamWriterPrivate::~BamWriterPrivate(void) {
33 // calculates minimum bin for a BAM alignment interval
34 unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const {
36 if ( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14);
37 if ( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17);
38 if ( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20);
39 if ( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23);
40 if ( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26);
44 // closes the alignment archive
45 void BamWriterPrivate::Close(void) {
49 // creates a cigar string from the supplied alignment
50 void BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
53 const unsigned int numCigarOperations = cigarOperations.size();
54 packedCigar.resize(numCigarOperations * Constants::BAM_SIZEOF_INT);
56 // pack the cigar data into the string
57 unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
59 // iterate over cigar operations
60 vector<CigarOp>::const_iterator coIter = cigarOperations.begin();
61 vector<CigarOp>::const_iterator coEnd = cigarOperations.end();
62 for ( ; coIter != coEnd; ++coIter ) {
64 // store op in packedCigar
66 switch ( coIter->Type ) {
67 case (Constants::BAM_CIGAR_MATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_MATCH; break;
68 case (Constants::BAM_CIGAR_INS_CHAR) : cigarOp = Constants::BAM_CIGAR_INS; break;
69 case (Constants::BAM_CIGAR_DEL_CHAR) : cigarOp = Constants::BAM_CIGAR_DEL; break;
70 case (Constants::BAM_CIGAR_REFSKIP_CHAR) : cigarOp = Constants::BAM_CIGAR_REFSKIP; break;
71 case (Constants::BAM_CIGAR_SOFTCLIP_CHAR) : cigarOp = Constants::BAM_CIGAR_SOFTCLIP; break;
72 case (Constants::BAM_CIGAR_HARDCLIP_CHAR) : cigarOp = Constants::BAM_CIGAR_HARDCLIP; break;
73 case (Constants::BAM_CIGAR_PAD_CHAR) : cigarOp = Constants::BAM_CIGAR_PAD; break;
74 case (Constants::BAM_CIGAR_SEQMATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_SEQMATCH; break;
75 case (Constants::BAM_CIGAR_MISMATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_MISMATCH; break;
77 fprintf(stderr, "BamWriter ERROR: unknown cigar operation found: %c\n", coIter->Type);
81 *pPackedCigar = coIter->Length << Constants::BAM_CIGAR_SHIFT | cigarOp;
86 // encodes the supplied query sequence into 4-bit notation
87 void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {
89 // prepare the encoded query string
90 const unsigned int queryLen = query.size();
91 const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
92 encodedQuery.resize(encodedQueryLen);
93 char* pEncodedQuery = (char*)encodedQuery.data();
94 const char* pQuery = (const char*)query.data();
96 unsigned char nucleotideCode;
97 bool useHighWord = true;
101 case (Constants::BAM_DNA_EQUAL) : nucleotideCode = Constants::BAM_BASECODE_EQUAL; break;
102 case (Constants::BAM_DNA_A) : nucleotideCode = Constants::BAM_BASECODE_A; break;
103 case (Constants::BAM_DNA_C) : nucleotideCode = Constants::BAM_BASECODE_C; break;
104 case (Constants::BAM_DNA_G) : nucleotideCode = Constants::BAM_BASECODE_G; break;
105 case (Constants::BAM_DNA_T) : nucleotideCode = Constants::BAM_BASECODE_T; break;
106 case (Constants::BAM_DNA_N) : nucleotideCode = Constants::BAM_BASECODE_N; break;
108 fprintf(stderr, "BamWriter ERROR: only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
112 // pack the nucleotide code
114 *pEncodedQuery = nucleotideCode << 4;
117 *pEncodedQuery |= nucleotideCode;
122 // increment the query position
127 // returns whether BAM file is open for writing or not
128 bool BamWriterPrivate::IsOpen(void) const {
129 return m_stream.IsOpen();
132 // opens the alignment archive
133 bool BamWriterPrivate::Open(const string& filename,
134 const string& samHeaderText,
135 const RefVector& referenceSequences)
137 // open the BGZF file for writing, return failure if error
138 if ( !m_stream.Open(filename, IBamIODevice::WriteOnly) )
141 // write BAM file 'metadata' components
143 WriteSamHeaderText(samHeaderText);
144 WriteReferences(referenceSequences);
148 // saves the alignment to the alignment archive
149 void BamWriterPrivate::SaveAlignment(const BamAlignment& al) {
151 // if BamAlignment contains only the core data and a raw char data buffer
152 // (as a result of BamReader::GetNextAlignmentCore())
153 if ( al.SupportData.HasCoreOnly ) {
155 // write the block size
156 unsigned int blockSize = al.SupportData.BlockLength;
157 if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize);
158 m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT);
160 // re-calculate bin (in case BamAlignment's position has been previously modified)
161 const uint32_t alignmentBin = CalculateMinimumBin(al.Position, al.GetEndPosition());
163 // assign the BAM core data
164 uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE];
165 buffer[0] = al.RefID;
166 buffer[1] = al.Position;
167 buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;
168 buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;
169 buffer[4] = al.SupportData.QuerySequenceLength;
170 buffer[5] = al.MateRefID;
171 buffer[6] = al.MatePosition;
172 buffer[7] = al.InsertSize;
174 // swap BAM core endian-ness, if necessary
175 if ( m_isBigEndian ) {
176 for ( int i = 0; i < 8; ++i )
177 BamTools::SwapEndian_32(buffer[i]);
180 // write the BAM core
181 m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE);
183 // write the raw char data
184 m_stream.Write((char*)al.SupportData.AllCharData.data(),
185 al.SupportData.BlockLength-Constants::BAM_CORE_SIZE);
188 // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc
189 // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code )
192 // calculate char lengths
193 const unsigned int nameLength = al.Name.size() + 1;
194 const unsigned int numCigarOperations = al.CigarData.size();
195 const unsigned int queryLength = al.QueryBases.size();
196 const unsigned int tagDataLength = al.TagData.size();
198 // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)
199 // force calculation of Bin before storing
200 const int endPosition = al.GetEndPosition();
201 const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);
203 // create our packed cigar string
205 CreatePackedCigar(al.CigarData, packedCigar);
206 const unsigned int packedCigarLength = packedCigar.size();
210 EncodeQuerySequence(al.QueryBases, encodedQuery);
211 const unsigned int encodedQueryLength = encodedQuery.size();
213 // write the block size
214 const unsigned int dataBlockSize = nameLength +
219 unsigned int blockSize = Constants::BAM_CORE_SIZE + dataBlockSize;
220 if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize);
221 m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT);
223 // assign the BAM core data
224 uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE];
225 buffer[0] = al.RefID;
226 buffer[1] = al.Position;
227 buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;
228 buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
229 buffer[4] = queryLength;
230 buffer[5] = al.MateRefID;
231 buffer[6] = al.MatePosition;
232 buffer[7] = al.InsertSize;
234 // swap BAM core endian-ness, if necessary
235 if ( m_isBigEndian ) {
236 for ( int i = 0; i < 8; ++i )
237 BamTools::SwapEndian_32(buffer[i]);
240 // write the BAM core
241 m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE);
243 // write the query name
244 m_stream.Write(al.Name.c_str(), nameLength);
246 // write the packed cigar
247 if ( m_isBigEndian ) {
248 char* cigarData = (char*)calloc(sizeof(char), packedCigarLength);
249 memcpy(cigarData, packedCigar.data(), packedCigarLength);
250 if ( m_isBigEndian ) {
251 for ( unsigned int i = 0; i < packedCigarLength; ++i )
252 BamTools::SwapEndian_32p(&cigarData[i]);
254 m_stream.Write(cigarData, packedCigarLength);
258 m_stream.Write(packedCigar.data(), packedCigarLength);
260 // write the encoded query sequence
261 m_stream.Write(encodedQuery.data(), encodedQueryLength);
263 // write the base qualities
264 char* pBaseQualities = (char*)al.Qualities.data();
265 for ( unsigned int i = 0; i < queryLength; ++i )
266 pBaseQualities[i] -= 33; // FASTQ conversion
267 m_stream.Write(pBaseQualities, queryLength);
269 // write the read group tag
270 if ( m_isBigEndian ) {
272 char* tagData = (char*)calloc(sizeof(char), tagDataLength);
273 memcpy(tagData, al.TagData.data(), tagDataLength);
276 while ( (unsigned int)i < tagDataLength ) {
278 i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.)
279 const char type = tagData[i]; // get tag type at position i
284 case(Constants::BAM_TAG_TYPE_ASCII) :
285 case(Constants::BAM_TAG_TYPE_INT8) :
286 case(Constants::BAM_TAG_TYPE_UINT8) :
290 case(Constants::BAM_TAG_TYPE_INT16) :
291 case(Constants::BAM_TAG_TYPE_UINT16) :
292 BamTools::SwapEndian_16p(&tagData[i]);
293 i += sizeof(uint16_t);
296 case(Constants::BAM_TAG_TYPE_FLOAT) :
297 case(Constants::BAM_TAG_TYPE_INT32) :
298 case(Constants::BAM_TAG_TYPE_UINT32) :
299 BamTools::SwapEndian_32p(&tagData[i]);
300 i += sizeof(uint32_t);
303 case(Constants::BAM_TAG_TYPE_HEX) :
304 case(Constants::BAM_TAG_TYPE_STRING) :
305 // no endian swapping necessary for hex-string/string data
308 // increment one more for null terminator
312 case(Constants::BAM_TAG_TYPE_ARRAY) :
316 const char arrayType = tagData[i];
319 // swap endian-ness of number of elements in place, then retrieve for loop
320 BamTools::SwapEndian_32p(&tagData[i]);
322 memcpy(&numElements, &tagData[i], sizeof(uint32_t));
323 i += sizeof(uint32_t);
325 // swap endian-ness of array elements
326 for ( int j = 0; j < numElements; ++j ) {
328 case (Constants::BAM_TAG_TYPE_INT8) :
329 case (Constants::BAM_TAG_TYPE_UINT8) :
330 // no endian-swapping necessary
333 case (Constants::BAM_TAG_TYPE_INT16) :
334 case (Constants::BAM_TAG_TYPE_UINT16) :
335 BamTools::SwapEndian_16p(&tagData[i]);
336 i += sizeof(uint16_t);
338 case (Constants::BAM_TAG_TYPE_FLOAT) :
339 case (Constants::BAM_TAG_TYPE_INT32) :
340 case (Constants::BAM_TAG_TYPE_UINT32) :
341 BamTools::SwapEndian_32p(&tagData[i]);
342 i += sizeof(uint32_t);
347 "BamWriter ERROR: unknown binary array type encountered: [%c]\n",
357 fprintf(stderr, "BamWriter ERROR: invalid tag value type\n"); // shouldn't get here
362 m_stream.Write(tagData, tagDataLength);
366 m_stream.Write(al.TagData.data(), tagDataLength);
370 void BamWriterPrivate::SetWriteCompressed(bool ok) {
372 // warn if BAM file is already open
373 // modifying compression is not allowed in this case
375 cerr << "BamWriter WARNING: attempting to change compression mode on an open BAM file is not allowed. "
376 << "Ignoring request." << endl;
380 // set BgzfStream compression mode
381 m_stream.SetWriteCompressed(ok);
384 void BamWriterPrivate::WriteMagicNumber(void) {
385 // write BAM file 'magic number'
386 m_stream.Write(Constants::BAM_HEADER_MAGIC, Constants::BAM_HEADER_MAGIC_LENGTH);
389 void BamWriterPrivate::WriteReferences(const BamTools::RefVector& referenceSequences) {
391 // write the number of reference sequences
392 uint32_t numReferenceSequences = referenceSequences.size();
393 if ( m_isBigEndian ) BamTools::SwapEndian_32(numReferenceSequences);
394 m_stream.Write((char*)&numReferenceSequences, Constants::BAM_SIZEOF_INT);
396 // foreach reference sequence
397 RefVector::const_iterator rsIter = referenceSequences.begin();
398 RefVector::const_iterator rsEnd = referenceSequences.end();
399 for ( ; rsIter != rsEnd; ++rsIter ) {
401 // write the reference sequence name length
402 uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;
403 if ( m_isBigEndian ) BamTools::SwapEndian_32(referenceSequenceNameLen);
404 m_stream.Write((char*)&referenceSequenceNameLen, Constants::BAM_SIZEOF_INT);
406 // write the reference sequence name
407 m_stream.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);
409 // write the reference sequence length
410 int32_t referenceLength = rsIter->RefLength;
411 if ( m_isBigEndian ) BamTools::SwapEndian_32(referenceLength);
412 m_stream.Write((char*)&referenceLength, Constants::BAM_SIZEOF_INT);
416 void BamWriterPrivate::WriteSamHeaderText(const std::string& samHeaderText) {
418 // write the SAM header text length
419 uint32_t samHeaderLen = samHeaderText.size();
420 if ( m_isBigEndian ) BamTools::SwapEndian_32(samHeaderLen);
421 m_stream.Write((char*)&samHeaderLen, Constants::BAM_SIZEOF_INT);
423 // write the SAM header text
424 if ( samHeaderLen > 0 )
425 m_stream.Write(samHeaderText.data(), samHeaderLen);