1 // ***************************************************************************
\r
2 // BamWriter.cpp (c) 2009 Michael Str�mberg, Derek Barnett
\r
3 // Marth Lab, Department of Biology, Boston College
\r
4 // All rights reserved.
\r
5 // ---------------------------------------------------------------------------
\r
6 // Last modified: 16 September 2010 (DB)
\r
7 // ---------------------------------------------------------------------------
\r
8 // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad
\r
10 // ---------------------------------------------------------------------------
\r
11 // Provides the basic functionality for producing BAM files
\r
12 // ***************************************************************************
\r
17 #include "BamWriter.h"
\r
18 using namespace BamTools;
\r
19 using namespace std;
\r
21 namespace BamTools {
\r
24 const int BT_SIZEOF_INT = 4;
\r
25 const int BAM_CMATCH = 0;
\r
26 const int BAM_CINS = 1;
\r
27 const int BAM_CDEL = 2;
\r
28 const int BAM_CREF_SKIP = 3;
\r
29 const int BAM_CSOFT_CLIP = 4;
\r
30 const int BAM_CHARD_CLIP = 5;
\r
31 const int BAM_CPAD = 6;
\r
33 } // namespace BamTools
\r
35 struct BamWriter::BamWriterPrivate {
\r
41 // constructor / destructor
\r
42 BamWriterPrivate(void) {
\r
43 IsBigEndian = SystemIsBigEndian();
\r
46 ~BamWriterPrivate(void) {
\r
50 // "public" interface
\r
52 bool Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed);
\r
53 void SaveAlignment(const BamAlignment& al);
\r
56 const unsigned int CalculateMinimumBin(const int begin, int end) const;
\r
57 void CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar);
\r
58 void EncodeQuerySequence(const string& query, string& encodedQuery);
\r
61 // -----------------------------------------------------
\r
62 // BamWriter implementation
\r
63 // -----------------------------------------------------
\r
66 BamWriter::BamWriter(void) {
\r
67 d = new BamWriterPrivate;
\r
71 BamWriter::~BamWriter(void) {
\r
76 // closes the alignment archive
\r
77 void BamWriter::Close(void) {
\r
81 // opens the alignment archive
\r
82 bool BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed) {
\r
83 return d->Open(filename, samHeader, referenceSequences, isWriteUncompressed);
\r
86 // saves the alignment to the alignment archive
\r
87 void BamWriter::SaveAlignment(const BamAlignment& al) {
\r
88 d->SaveAlignment(al);
\r
91 // -----------------------------------------------------
\r
92 // BamWriterPrivate implementation
\r
93 // -----------------------------------------------------
\r
95 // closes the alignment archive
\r
96 void BamWriter::BamWriterPrivate::Close(void) {
\r
100 // calculates minimum bin for a BAM alignment interval
\r
101 const unsigned int BamWriter::BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const {
\r
103 if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14);
\r
104 if( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17);
\r
105 if( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20);
\r
106 if( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23);
\r
107 if( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26);
\r
111 // creates a cigar string from the supplied alignment
\r
112 void BamWriter::BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
\r
115 const unsigned int numCigarOperations = cigarOperations.size();
\r
116 packedCigar.resize(numCigarOperations * BT_SIZEOF_INT);
\r
118 // pack the cigar data into the string
\r
119 unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
\r
121 unsigned int cigarOp;
\r
122 vector<CigarOp>::const_iterator coIter;
\r
123 for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) {
\r
125 switch(coIter->Type) {
\r
127 cigarOp = BAM_CMATCH;
\r
130 cigarOp = BAM_CINS;
\r
133 cigarOp = BAM_CDEL;
\r
136 cigarOp = BAM_CREF_SKIP;
\r
139 cigarOp = BAM_CSOFT_CLIP;
\r
142 cigarOp = BAM_CHARD_CLIP;
\r
145 cigarOp = BAM_CPAD;
\r
148 fprintf(stderr, "ERROR: Unknown cigar operation found: %c\n", coIter->Type);
\r
152 *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;
\r
157 // encodes the supplied query sequence into 4-bit notation
\r
158 void BamWriter::BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {
\r
160 // prepare the encoded query string
\r
161 const unsigned int queryLen = query.size();
\r
162 const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
\r
163 encodedQuery.resize(encodedQueryLen);
\r
164 char* pEncodedQuery = (char*)encodedQuery.data();
\r
165 const char* pQuery = (const char*)query.data();
\r
167 unsigned char nucleotideCode;
\r
168 bool useHighWord = true;
\r
175 nucleotideCode = 0;
\r
179 nucleotideCode = 1;
\r
183 nucleotideCode = 2;
\r
187 nucleotideCode = 4;
\r
191 nucleotideCode = 8;
\r
195 nucleotideCode = 15;
\r
199 fprintf(stderr, "ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
\r
203 // pack the nucleotide code
\r
205 *pEncodedQuery = nucleotideCode << 4;
\r
206 useHighWord = false;
\r
208 *pEncodedQuery |= nucleotideCode;
\r
210 useHighWord = true;
\r
213 // increment the query position
\r
218 // opens the alignment archive
\r
219 bool BamWriter::BamWriterPrivate::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed) {
\r
221 // open the BGZF file for writing, return failure if error
\r
222 if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) )
\r
225 // ================
\r
226 // write the header
\r
227 // ================
\r
229 // write the BAM signature
\r
230 const unsigned char SIGNATURE_LENGTH = 4;
\r
231 const char* BAM_SIGNATURE = "BAM\1";
\r
232 mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH);
\r
234 // write the SAM header text length
\r
235 uint32_t samHeaderLen = samHeader.size();
\r
236 if (IsBigEndian) SwapEndian_32(samHeaderLen);
\r
237 mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT);
\r
239 // write the SAM header text
\r
240 if(samHeaderLen > 0)
\r
241 mBGZF.Write(samHeader.data(), samHeaderLen);
\r
243 // write the number of reference sequences
\r
244 uint32_t numReferenceSequences = referenceSequences.size();
\r
245 if (IsBigEndian) SwapEndian_32(numReferenceSequences);
\r
246 mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);
\r
248 // =============================
\r
249 // write the sequence dictionary
\r
250 // =============================
\r
252 RefVector::const_iterator rsIter;
\r
253 for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {
\r
255 // write the reference sequence name length
\r
256 uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;
\r
257 if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen);
\r
258 mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);
\r
260 // write the reference sequence name
\r
261 mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);
\r
263 // write the reference sequence length
\r
264 int32_t referenceLength = rsIter->RefLength;
\r
265 if (IsBigEndian) SwapEndian_32(referenceLength);
\r
266 mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);
\r
273 // saves the alignment to the alignment archive
\r
274 void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) {
\r
276 // if BamAlignment contains only the core data and a raw char data buffer
\r
277 // (as a result of BamReader::GetNextAlignmentCore())
\r
278 if ( al.SupportData.HasCoreOnly ) {
\r
280 // write the block size
\r
281 unsigned int blockSize = al.SupportData.BlockLength;
\r
282 if (IsBigEndian) SwapEndian_32(blockSize);
\r
283 mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
\r
285 // assign the BAM core data
\r
286 uint32_t buffer[8];
\r
287 buffer[0] = al.RefID;
\r
288 buffer[1] = al.Position;
\r
289 buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;
\r
290 buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;
\r
291 buffer[4] = al.SupportData.QuerySequenceLength;
\r
292 buffer[5] = al.MateRefID;
\r
293 buffer[6] = al.MatePosition;
\r
294 buffer[7] = al.InsertSize;
\r
296 // swap BAM core endian-ness, if necessary
\r
297 if ( IsBigEndian ) {
\r
298 for ( int i = 0; i < 8; ++i )
\r
299 SwapEndian_32(buffer[i]);
\r
302 // write the BAM core
\r
303 mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
\r
305 // write the raw char data
\r
306 mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE);
\r
309 // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc
\r
310 // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code )
\r
313 // calculate char lengths
\r
314 const unsigned int nameLength = al.Name.size() + 1;
\r
315 const unsigned int numCigarOperations = al.CigarData.size();
\r
316 const unsigned int queryLength = al.QueryBases.size();
\r
317 const unsigned int tagDataLength = al.TagData.size();
\r
319 // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)
\r
320 // force calculation of Bin before storing
\r
321 const int endPosition = al.GetEndPosition();
\r
322 const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);
\r
324 // create our packed cigar string
\r
325 string packedCigar;
\r
326 CreatePackedCigar(al.CigarData, packedCigar);
\r
327 const unsigned int packedCigarLength = packedCigar.size();
\r
329 // encode the query
\r
330 string encodedQuery;
\r
331 EncodeQuerySequence(al.QueryBases, encodedQuery);
\r
332 const unsigned int encodedQueryLength = encodedQuery.size();
\r
334 // write the block size
\r
335 const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength;
\r
336 unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
\r
337 if (IsBigEndian) SwapEndian_32(blockSize);
\r
338 mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
\r
340 // assign the BAM core data
\r
341 uint32_t buffer[8];
\r
342 buffer[0] = al.RefID;
\r
343 buffer[1] = al.Position;
\r
344 buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;
\r
345 buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
\r
346 buffer[4] = queryLength;
\r
347 buffer[5] = al.MateRefID;
\r
348 buffer[6] = al.MatePosition;
\r
349 buffer[7] = al.InsertSize;
\r
351 // swap BAM core endian-ness, if necessary
\r
352 if ( IsBigEndian ) {
\r
353 for ( int i = 0; i < 8; ++i )
\r
354 SwapEndian_32(buffer[i]);
\r
357 // write the BAM core
\r
358 mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
\r
360 // write the query name
\r
361 mBGZF.Write(al.Name.c_str(), nameLength);
\r
363 // write the packed cigar
\r
364 if ( IsBigEndian ) {
\r
366 char* cigarData = (char*)calloc(sizeof(char), packedCigarLength);
\r
367 memcpy(cigarData, packedCigar.data(), packedCigarLength);
\r
369 for (unsigned int i = 0; i < packedCigarLength; ++i) {
\r
371 SwapEndian_32p(&cigarData[i]);
\r
374 mBGZF.Write(cigarData, packedCigarLength);
\r
378 mBGZF.Write(packedCigar.data(), packedCigarLength);
\r
380 // write the encoded query sequence
\r
381 mBGZF.Write(encodedQuery.data(), encodedQueryLength);
\r
383 // write the base qualities
\r
384 string baseQualities(al.Qualities);
\r
385 char* pBaseQualities = (char*)al.Qualities.data();
\r
386 for(unsigned int i = 0; i < queryLength; i++) {
\r
387 pBaseQualities[i] -= 33;
\r
389 mBGZF.Write(pBaseQualities, queryLength);
\r
391 // write the read group tag
\r
392 if ( IsBigEndian ) {
\r
394 char* tagData = (char*)calloc(sizeof(char), tagDataLength);
\r
395 memcpy(tagData, al.TagData.data(), tagDataLength);
\r
398 while ( (unsigned int)i < tagDataLength ) {
\r
400 i += 2; // skip tag type (e.g. "RG", "NM", etc)
\r
401 uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning
\r
402 ++i; // skip value type
\r
412 SwapEndian_16p(&tagData[i]);
\r
413 i+=2; // sizeof(uint16_t)
\r
418 SwapEndian_32p(&tagData[i]);
\r
419 i+=4; // sizeof(uint32_t)
\r
423 SwapEndian_64p(&tagData[i]);
\r
424 i+=8; // sizeof(uint64_t)
\r
429 while (tagData[i]) { ++i; }
\r
430 ++i; // increment one more for null terminator
\r
434 fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here
\r
440 mBGZF.Write(tagData, tagDataLength);
\r
444 mBGZF.Write(al.TagData.data(), tagDataLength);
\r