1 // ***************************************************************************
2 // BamWriter_p.cpp (c) 2010 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 19 November 2010 (DB)
7 // ---------------------------------------------------------------------------
8 // Provides the basic functionality for producing BAM files
9 // ***************************************************************************
11 #include <api/BamAlignment.h>
12 #include <api/BamWriter_p.h>
13 using namespace BamTools;
14 using namespace BamTools::Internal;
17 BamWriterPrivate::BamWriterPrivate(void) {
18 IsBigEndian = SystemIsBigEndian();
21 BamWriterPrivate::~BamWriterPrivate(void) {
25 // closes the alignment archive
26 void BamWriterPrivate::Close(void) {
30 // calculates minimum bin for a BAM alignment interval
31 const unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const {
33 if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14);
34 if( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17);
35 if( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20);
36 if( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23);
37 if( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26);
41 // creates a cigar string from the supplied alignment
42 void BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
45 const unsigned int numCigarOperations = cigarOperations.size();
46 packedCigar.resize(numCigarOperations * BT_SIZEOF_INT);
48 // pack the cigar data into the string
49 unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
52 vector<CigarOp>::const_iterator coIter;
53 for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) {
55 switch(coIter->Type) {
66 cigarOp = BAM_CREF_SKIP;
69 cigarOp = BAM_CSOFT_CLIP;
72 cigarOp = BAM_CHARD_CLIP;
78 fprintf(stderr, "ERROR: Unknown cigar operation found: %c\n", coIter->Type);
82 *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;
87 // encodes the supplied query sequence into 4-bit notation
88 void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {
90 // prepare the encoded query string
91 const unsigned int queryLen = query.size();
92 const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
93 encodedQuery.resize(encodedQueryLen);
94 char* pEncodedQuery = (char*)encodedQuery.data();
95 const char* pQuery = (const char*)query.data();
97 unsigned char nucleotideCode;
98 bool useHighWord = true;
129 fprintf(stderr, "ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
133 // pack the nucleotide code
135 *pEncodedQuery = nucleotideCode << 4;
138 *pEncodedQuery |= nucleotideCode;
143 // increment the query position
148 // opens the alignment archive
149 bool BamWriterPrivate::Open(const string& filename,
150 const string& samHeader,
151 const RefVector& referenceSequences,
152 bool isWriteUncompressed)
154 // open the BGZF file for writing, return failure if error
155 if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) )
162 // write the BAM signature
163 const unsigned char SIGNATURE_LENGTH = 4;
164 const char* BAM_SIGNATURE = "BAM\1";
165 mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH);
167 // write the SAM header text length
168 uint32_t samHeaderLen = samHeader.size();
169 if (IsBigEndian) SwapEndian_32(samHeaderLen);
170 mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT);
172 // write the SAM header text
174 mBGZF.Write(samHeader.data(), samHeaderLen);
176 // write the number of reference sequences
177 uint32_t numReferenceSequences = referenceSequences.size();
178 if (IsBigEndian) SwapEndian_32(numReferenceSequences);
179 mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);
181 // =============================
182 // write the sequence dictionary
183 // =============================
185 RefVector::const_iterator rsIter;
186 for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {
188 // write the reference sequence name length
189 uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;
190 if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen);
191 mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);
193 // write the reference sequence name
194 mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);
196 // write the reference sequence length
197 int32_t referenceLength = rsIter->RefLength;
198 if (IsBigEndian) SwapEndian_32(referenceLength);
199 mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);
206 // saves the alignment to the alignment archive
207 void BamWriterPrivate::SaveAlignment(const BamAlignment& al) {
209 // if BamAlignment contains only the core data and a raw char data buffer
210 // (as a result of BamReader::GetNextAlignmentCore())
211 if ( al.SupportData.HasCoreOnly ) {
213 // write the block size
214 unsigned int blockSize = al.SupportData.BlockLength;
215 if (IsBigEndian) SwapEndian_32(blockSize);
216 mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
218 // assign the BAM core data
220 buffer[0] = al.RefID;
221 buffer[1] = al.Position;
222 buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;
223 buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;
224 buffer[4] = al.SupportData.QuerySequenceLength;
225 buffer[5] = al.MateRefID;
226 buffer[6] = al.MatePosition;
227 buffer[7] = al.InsertSize;
229 // swap BAM core endian-ness, if necessary
231 for ( int i = 0; i < 8; ++i )
232 SwapEndian_32(buffer[i]);
235 // write the BAM core
236 mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
238 // write the raw char data
239 mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE);
242 // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc
243 // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code )
246 // calculate char lengths
247 const unsigned int nameLength = al.Name.size() + 1;
248 const unsigned int numCigarOperations = al.CigarData.size();
249 const unsigned int queryLength = al.QueryBases.size();
250 const unsigned int tagDataLength = al.TagData.size();
252 // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)
253 // force calculation of Bin before storing
254 const int endPosition = al.GetEndPosition();
255 const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);
257 // create our packed cigar string
259 CreatePackedCigar(al.CigarData, packedCigar);
260 const unsigned int packedCigarLength = packedCigar.size();
264 EncodeQuerySequence(al.QueryBases, encodedQuery);
265 const unsigned int encodedQueryLength = encodedQuery.size();
267 // write the block size
268 const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength;
269 unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
270 if (IsBigEndian) SwapEndian_32(blockSize);
271 mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
273 // assign the BAM core data
275 buffer[0] = al.RefID;
276 buffer[1] = al.Position;
277 buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;
278 buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
279 buffer[4] = queryLength;
280 buffer[5] = al.MateRefID;
281 buffer[6] = al.MatePosition;
282 buffer[7] = al.InsertSize;
284 // swap BAM core endian-ness, if necessary
286 for ( int i = 0; i < 8; ++i )
287 SwapEndian_32(buffer[i]);
290 // write the BAM core
291 mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
293 // write the query name
294 mBGZF.Write(al.Name.c_str(), nameLength);
296 // write the packed cigar
299 char* cigarData = (char*)calloc(sizeof(char), packedCigarLength);
300 memcpy(cigarData, packedCigar.data(), packedCigarLength);
302 for (unsigned int i = 0; i < packedCigarLength; ++i) {
304 SwapEndian_32p(&cigarData[i]);
307 mBGZF.Write(cigarData, packedCigarLength);
311 mBGZF.Write(packedCigar.data(), packedCigarLength);
313 // write the encoded query sequence
314 mBGZF.Write(encodedQuery.data(), encodedQueryLength);
316 // write the base qualities
317 string baseQualities(al.Qualities);
318 char* pBaseQualities = (char*)al.Qualities.data();
319 for(unsigned int i = 0; i < queryLength; i++) {
320 pBaseQualities[i] -= 33;
322 mBGZF.Write(pBaseQualities, queryLength);
324 // write the read group tag
327 char* tagData = (char*)calloc(sizeof(char), tagDataLength);
328 memcpy(tagData, al.TagData.data(), tagDataLength);
331 while ( (unsigned int)i < tagDataLength ) {
333 i += 2; // skip tag type (e.g. "RG", "NM", etc)
334 uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning
335 ++i; // skip value type
345 SwapEndian_16p(&tagData[i]);
346 i+=2; // sizeof(uint16_t)
351 SwapEndian_32p(&tagData[i]);
352 i+=4; // sizeof(uint32_t)
356 SwapEndian_64p(&tagData[i]);
357 i+=8; // sizeof(uint64_t)
362 while (tagData[i]) { ++i; }
363 ++i; // increment one more for null terminator
367 fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here
373 mBGZF.Write(tagData, tagDataLength);
377 mBGZF.Write(al.TagData.data(), tagDataLength);