1 // ***************************************************************************
\r
2 // BamWriter.cpp (c) 2009 Michael Str�mberg, Derek Barnett
\r
3 // Marth Lab, Department of Biology, Boston College
\r
4 // All rights reserved.
\r
5 // ---------------------------------------------------------------------------
\r
6 // Last modified: 30 March 2010 (DB)
\r
7 // ---------------------------------------------------------------------------
\r
8 // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad
\r
10 // ---------------------------------------------------------------------------
\r
11 // Provides the basic functionality for producing BAM files
\r
12 // ***************************************************************************
\r
16 #include "BamWriter.h"
\r
17 using namespace BamTools;
\r
18 using namespace std;
\r
20 struct BamWriter::BamWriterPrivate {
\r
27 // constructor / destructor
\r
28 BamWriterPrivate(void) {
\r
29 IsBigEndian = SystemIsBigEndian();
\r
32 ~BamWriterPrivate(void) {
\r
36 // "public" interface
\r
38 void Open(const string& filename, const string& samHeader, const RefVector& referenceSequences);
\r
39 void SaveAlignment(const BamAlignment& al);
\r
40 void SaveAlignment(const BamAlignment& al, const BamAlignmentSupportData& supportData);
\r
43 void CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar);
\r
44 void EncodeQuerySequence(const string& query, string& encodedQuery);
\r
47 // -----------------------------------------------------
\r
48 // BamWriter implementation
\r
49 // -----------------------------------------------------
\r
52 BamWriter::BamWriter(void) {
\r
53 d = new BamWriterPrivate;
\r
57 BamWriter::~BamWriter(void) {
\r
62 // closes the alignment archive
\r
63 void BamWriter::Close(void) {
\r
67 // opens the alignment archive
\r
68 void BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) {
\r
69 d->Open(filename, samHeader, referenceSequences);
\r
72 // saves the alignment to the alignment archive
\r
73 void BamWriter::SaveAlignment(const BamAlignment& al) {
\r
74 d->SaveAlignment(al);
\r
77 void BamWriter::SaveAlignment(const BamAlignment& al, const BamAlignmentSupportData& supportData) {
\r
78 d->SaveAlignment(al, supportData);
\r
81 // -----------------------------------------------------
\r
82 // BamWriterPrivate implementation
\r
83 // -----------------------------------------------------
\r
85 // closes the alignment archive
\r
86 void BamWriter::BamWriterPrivate::Close(void) {
\r
90 // creates a cigar string from the supplied alignment
\r
91 void BamWriter::BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
\r
94 const unsigned int numCigarOperations = cigarOperations.size();
\r
95 packedCigar.resize(numCigarOperations * BT_SIZEOF_INT);
\r
97 // pack the cigar data into the string
\r
98 unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
\r
100 unsigned int cigarOp;
\r
101 vector<CigarOp>::const_iterator coIter;
\r
102 for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); coIter++) {
\r
104 switch(coIter->Type) {
\r
106 cigarOp = BAM_CMATCH;
\r
109 cigarOp = BAM_CINS;
\r
112 cigarOp = BAM_CDEL;
\r
115 cigarOp = BAM_CREF_SKIP;
\r
118 cigarOp = BAM_CSOFT_CLIP;
\r
121 cigarOp = BAM_CHARD_CLIP;
\r
124 cigarOp = BAM_CPAD;
\r
127 printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type);
\r
131 *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;
\r
136 // encodes the supplied query sequence into 4-bit notation
\r
137 void BamWriter::BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {
\r
139 // prepare the encoded query string
\r
140 const unsigned int queryLen = query.size();
\r
141 const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
\r
142 encodedQuery.resize(encodedQueryLen);
\r
143 char* pEncodedQuery = (char*)encodedQuery.data();
\r
144 const char* pQuery = (const char*)query.data();
\r
146 unsigned char nucleotideCode;
\r
147 bool useHighWord = true;
\r
154 nucleotideCode = 0;
\r
158 nucleotideCode = 1;
\r
162 nucleotideCode = 2;
\r
166 nucleotideCode = 4;
\r
170 nucleotideCode = 8;
\r
174 nucleotideCode = 15;
\r
178 printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
\r
182 // pack the nucleotide code
\r
184 *pEncodedQuery = nucleotideCode << 4;
\r
185 useHighWord = false;
\r
187 *pEncodedQuery |= nucleotideCode;
\r
189 useHighWord = true;
\r
192 // increment the query position
\r
197 // opens the alignment archive
\r
198 void BamWriter::BamWriterPrivate::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) {
\r
200 // open the BGZF file for writing
\r
201 mBGZF.Open(filename, "wb");
\r
203 // ================
\r
204 // write the header
\r
205 // ================
\r
207 // write the BAM signature
\r
208 const unsigned char SIGNATURE_LENGTH = 4;
\r
209 const char* BAM_SIGNATURE = "BAM\1";
\r
210 mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH);
\r
212 // write the SAM header text length
\r
213 uint32_t samHeaderLen = samHeader.size();
\r
214 if ( IsBigEndian ) { SwapEndian_32(samHeaderLen); }
\r
215 mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT);
\r
217 // write the SAM header text
\r
218 if(samHeaderLen > 0) {
\r
219 mBGZF.Write(samHeader.data(), samHeaderLen);
\r
222 // write the number of reference sequences
\r
223 uint32_t numReferenceSequences = referenceSequences.size();
\r
224 if ( IsBigEndian ) { SwapEndian_32(numReferenceSequences); }
\r
225 mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);
\r
227 // =============================
\r
228 // write the sequence dictionary
\r
229 // =============================
\r
231 RefVector::const_iterator rsIter;
\r
232 for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {
\r
234 // write the reference sequence name length
\r
235 uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;
\r
236 if ( IsBigEndian ) { SwapEndian_32(referenceSequenceNameLen); }
\r
237 mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);
\r
239 // write the reference sequence name
\r
240 mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);
\r
242 // write the reference sequence length
\r
243 int32_t referenceLength = rsIter->RefLength;
\r
244 if ( IsBigEndian ) { SwapEndian_32(referenceLength); }
\r
245 mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);
\r
249 // saves the alignment to the alignment archive
\r
250 void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) {
\r
253 const unsigned int nameLen = al.Name.size() + 1;
\r
254 const unsigned int queryLen = al.QueryBases.size();
\r
255 const unsigned int numCigarOperations = al.CigarData.size();
\r
257 // create our packed cigar string
\r
258 string packedCigar;
\r
259 CreatePackedCigar(al.CigarData, packedCigar);
\r
260 const unsigned int packedCigarLen = packedCigar.size();
\r
262 // encode the query
\r
263 string encodedQuery;
\r
264 EncodeQuerySequence(al.QueryBases, encodedQuery);
\r
265 const unsigned int encodedQueryLen = encodedQuery.size();
\r
267 // store the tag data length
\r
268 // -------------------------------------------
\r
269 // Modified: 3-25-2010 DWB
\r
270 // Contributed: ARQ
\r
271 // Fixed: "off by one" error when parsing tags in files produced by BamWriter
\r
272 const unsigned int tagDataLength = al.TagData.size();
\r
274 // const unsigned int tagDataLength = al.TagData.size() + 1;
\r
275 // -------------------------------------------
\r
277 // assign the BAM core data
\r
278 uint32_t buffer[8];
\r
279 buffer[0] = al.RefID;
\r
280 buffer[1] = al.Position;
\r
281 buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | nameLen;
\r
282 buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
\r
283 buffer[4] = queryLen;
\r
284 buffer[5] = al.MateRefID;
\r
285 buffer[6] = al.MatePosition;
\r
286 buffer[7] = al.InsertSize;
\r
288 // write the block size
\r
289 unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen + tagDataLength;
\r
290 unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
\r
291 if ( IsBigEndian ) { SwapEndian_32(blockSize); }
\r
292 mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
\r
294 // write the BAM core
\r
295 if ( IsBigEndian ) {
\r
296 for ( int i = 0; i < 8; ++i ) {
\r
297 SwapEndian_32(buffer[i]);
\r
300 mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
\r
302 // write the query name
\r
303 mBGZF.Write(al.Name.c_str(), nameLen);
\r
305 // write the packed cigar
\r
306 if ( IsBigEndian ) {
\r
308 char* cigarData = (char*)calloc(sizeof(char), packedCigarLen);
\r
309 memcpy(cigarData, packedCigar.data(), packedCigarLen);
\r
311 for (unsigned int i = 0; i < packedCigarLen; ++i) {
\r
312 if ( IsBigEndian ) {
\r
313 SwapEndian_32p(&cigarData[i]);
\r
317 mBGZF.Write(cigarData, packedCigarLen);
\r
321 mBGZF.Write(packedCigar.data(), packedCigarLen);
\r
324 // write the encoded query sequence
\r
325 mBGZF.Write(encodedQuery.data(), encodedQueryLen);
\r
327 // write the base qualities
\r
328 string baseQualities = al.Qualities;
\r
329 char* pBaseQualities = (char*)al.Qualities.data();
\r
330 for(unsigned int i = 0; i < queryLen; i++) { pBaseQualities[i] -= 33; }
\r
331 mBGZF.Write(pBaseQualities, queryLen);
\r
333 // write the read group tag
\r
334 if ( IsBigEndian ) {
\r
336 char* tagData = (char*)calloc(sizeof(char), tagDataLength);
\r
337 memcpy(tagData, al.TagData.data(), tagDataLength);
\r
340 while ( (unsigned int)i < tagDataLength ) {
\r
342 i += 2; // skip tag type (e.g. "RG", "NM", etc)
\r
343 uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning
\r
344 ++i; // skip value type
\r
354 SwapEndian_16p(&tagData[i]);
\r
355 i+=2; // sizeof(uint16_t)
\r
360 SwapEndian_32p(&tagData[i]);
\r
361 i+=4; // sizeof(uint32_t)
\r
365 SwapEndian_64p(&tagData[i]);
\r
366 i+=8; // sizeof(uint64_t)
\r
371 while (tagData[i]) { ++i; }
\r
372 ++i; // increment one more for null terminator
\r
376 printf("ERROR: Invalid tag value type\n"); // shouldn't get here
\r
382 mBGZF.Write(tagData, tagDataLength);
\r
385 mBGZF.Write(al.TagData.data(), tagDataLength);
\r
389 void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al, const BamAlignmentSupportData& supportData) {
\r
391 // assign the BAM core data
\r
392 uint32_t buffer[8];
\r
393 buffer[0] = al.RefID;
\r
394 buffer[1] = al.Position;
\r
395 buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | supportData.QueryNameLength;
\r
396 buffer[3] = (al.AlignmentFlag << 16) | supportData.NumCigarOperations;
\r
397 buffer[4] = supportData.QuerySequenceLength;
\r
398 buffer[5] = al.MateRefID;
\r
399 buffer[6] = al.MatePosition;
\r
400 buffer[7] = al.InsertSize;
\r
402 // write the block size
\r
403 unsigned int blockSize = supportData.BlockLength;
\r
404 if ( IsBigEndian ) { SwapEndian_32(blockSize); }
\r
405 mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
\r
407 // write the BAM core
\r
408 if ( IsBigEndian ) {
\r
409 for ( int i = 0; i < 8; ++i ) {
\r
410 SwapEndian_32(buffer[i]);
\r
413 mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
\r
415 // write the raw char data
\r
416 mBGZF.Write((char*)supportData.AllCharData.data(), supportData.BlockLength-BAM_CORE_SIZE);
\r