1 // ***************************************************************************
2 // BamWriter_p.cpp (c) 2010 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 11 January 2011 (DB)
7 // ---------------------------------------------------------------------------
8 // Provides the basic functionality for producing BAM files
9 // ***************************************************************************
11 #include <api/BamAlignment.h>
12 #include <api/internal/BamWriter_p.h>
13 using namespace BamTools;
14 using namespace BamTools::Internal;
18 BamWriterPrivate::BamWriterPrivate(void) {
19 IsBigEndian = SystemIsBigEndian();
23 BamWriterPrivate::~BamWriterPrivate(void) {
27 // calculates minimum bin for a BAM alignment interval
28 const unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const {
30 if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14);
31 if( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17);
32 if( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20);
33 if( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23);
34 if( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26);
38 // closes the alignment archive
39 void BamWriterPrivate::Close(void) {
43 // creates a cigar string from the supplied alignment
44 void BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
47 const unsigned int numCigarOperations = cigarOperations.size();
48 packedCigar.resize(numCigarOperations * BT_SIZEOF_INT);
50 // pack the cigar data into the string
51 unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
54 vector<CigarOp>::const_iterator coIter;
55 for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) {
57 switch(coIter->Type) {
58 case 'M': cigarOp = BAM_CMATCH; break;
59 case 'I': cigarOp = BAM_CINS; break;
60 case 'D': cigarOp = BAM_CDEL; break;
61 case 'N': cigarOp = BAM_CREF_SKIP; break;
62 case 'S': cigarOp = BAM_CSOFT_CLIP; break;
63 case 'H': cigarOp = BAM_CHARD_CLIP; break;
64 case 'P': cigarOp = BAM_CPAD; break;
66 fprintf(stderr, "ERROR: Unknown cigar operation found: %c\n", coIter->Type);
70 *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;
75 // encodes the supplied query sequence into 4-bit notation
76 void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {
78 // prepare the encoded query string
79 const unsigned int queryLen = query.size();
80 const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
81 encodedQuery.resize(encodedQueryLen);
82 char* pEncodedQuery = (char*)encodedQuery.data();
83 const char* pQuery = (const char*)query.data();
85 unsigned char nucleotideCode;
86 bool useHighWord = true;
91 case '=': nucleotideCode = 0; break;
92 case 'A': nucleotideCode = 1; break;
93 case 'C': nucleotideCode = 2; break;
94 case 'G': nucleotideCode = 4; break;
95 case 'T': nucleotideCode = 8; break;
96 case 'N': nucleotideCode = 15; break;
98 fprintf(stderr, "ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
102 // pack the nucleotide code
104 *pEncodedQuery = nucleotideCode << 4;
107 *pEncodedQuery |= nucleotideCode;
112 // increment the query position
117 // opens the alignment archive
118 bool BamWriterPrivate::Open(const string& filename,
119 const string& samHeader,
120 const RefVector& referenceSequences,
121 bool isWriteUncompressed)
123 // open the BGZF file for writing, return failure if error
124 if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) )
131 // write the BAM signature
132 const unsigned char SIGNATURE_LENGTH = 4;
133 const char* BAM_SIGNATURE = "BAM\1";
134 mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH);
136 // write the SAM header text length
137 uint32_t samHeaderLen = samHeader.size();
138 if (IsBigEndian) SwapEndian_32(samHeaderLen);
139 mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT);
141 // write the SAM header text
143 mBGZF.Write(samHeader.data(), samHeaderLen);
145 // write the number of reference sequences
146 uint32_t numReferenceSequences = referenceSequences.size();
147 if (IsBigEndian) SwapEndian_32(numReferenceSequences);
148 mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);
150 // =============================
151 // write the sequence dictionary
152 // =============================
154 RefVector::const_iterator rsIter = referenceSequences.begin();
155 RefVector::const_iterator rsEnd = referenceSequences.end();
156 for( ; rsIter != rsEnd; ++rsIter ) {
158 // write the reference sequence name length
159 uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;
160 if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen);
161 mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);
163 // write the reference sequence name
164 mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);
166 // write the reference sequence length
167 int32_t referenceLength = rsIter->RefLength;
168 if (IsBigEndian) SwapEndian_32(referenceLength);
169 mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);
176 // saves the alignment to the alignment archive
177 void BamWriterPrivate::SaveAlignment(const BamAlignment& al) {
179 // if BamAlignment contains only the core data and a raw char data buffer
180 // (as a result of BamReader::GetNextAlignmentCore())
181 if ( al.SupportData.HasCoreOnly ) {
183 // write the block size
184 unsigned int blockSize = al.SupportData.BlockLength;
185 if (IsBigEndian) SwapEndian_32(blockSize);
186 mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
188 // assign the BAM core data
190 buffer[0] = al.RefID;
191 buffer[1] = al.Position;
192 buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;
193 buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;
194 buffer[4] = al.SupportData.QuerySequenceLength;
195 buffer[5] = al.MateRefID;
196 buffer[6] = al.MatePosition;
197 buffer[7] = al.InsertSize;
199 // swap BAM core endian-ness, if necessary
201 for ( int i = 0; i < 8; ++i )
202 SwapEndian_32(buffer[i]);
205 // write the BAM core
206 mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
208 // write the raw char data
209 mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE);
212 // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc
213 // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code )
216 // calculate char lengths
217 const unsigned int nameLength = al.Name.size() + 1;
218 const unsigned int numCigarOperations = al.CigarData.size();
219 const unsigned int queryLength = al.QueryBases.size();
220 const unsigned int tagDataLength = al.TagData.size();
222 // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)
223 // force calculation of Bin before storing
224 const int endPosition = al.GetEndPosition();
225 const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);
227 // create our packed cigar string
229 CreatePackedCigar(al.CigarData, packedCigar);
230 const unsigned int packedCigarLength = packedCigar.size();
234 EncodeQuerySequence(al.QueryBases, encodedQuery);
235 const unsigned int encodedQueryLength = encodedQuery.size();
237 // write the block size
238 const unsigned int dataBlockSize = nameLength +
243 unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
244 if (IsBigEndian) SwapEndian_32(blockSize);
245 mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
247 // assign the BAM core data
249 buffer[0] = al.RefID;
250 buffer[1] = al.Position;
251 buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;
252 buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
253 buffer[4] = queryLength;
254 buffer[5] = al.MateRefID;
255 buffer[6] = al.MatePosition;
256 buffer[7] = al.InsertSize;
258 // swap BAM core endian-ness, if necessary
260 for ( int i = 0; i < 8; ++i )
261 SwapEndian_32(buffer[i]);
264 // write the BAM core
265 mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
267 // write the query name
268 mBGZF.Write(al.Name.c_str(), nameLength);
270 // write the packed cigar
273 char* cigarData = (char*)calloc(sizeof(char), packedCigarLength);
274 memcpy(cigarData, packedCigar.data(), packedCigarLength);
276 for (unsigned int i = 0; i < packedCigarLength; ++i) {
278 SwapEndian_32p(&cigarData[i]);
281 mBGZF.Write(cigarData, packedCigarLength);
285 mBGZF.Write(packedCigar.data(), packedCigarLength);
287 // write the encoded query sequence
288 mBGZF.Write(encodedQuery.data(), encodedQueryLength);
290 // write the base qualities
291 char* pBaseQualities = (char*)al.Qualities.data();
292 for(unsigned int i = 0; i < queryLength; i++)
293 pBaseQualities[i] -= 33; // FASTQ conversion
294 mBGZF.Write(pBaseQualities, queryLength);
296 // write the read group tag
299 char* tagData = (char*)calloc(sizeof(char), tagDataLength);
300 memcpy(tagData, al.TagData.data(), tagDataLength);
303 while ( (unsigned int)i < tagDataLength ) {
305 i += 2; // skip tag type (e.g. "RG", "NM", etc)
306 uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning
307 ++i; // skip value type
317 SwapEndian_16p(&tagData[i]);
318 i+=2; // sizeof(uint16_t)
323 SwapEndian_32p(&tagData[i]);
324 i+=4; // sizeof(uint32_t)
328 SwapEndian_64p(&tagData[i]);
329 i+=8; // sizeof(uint64_t)
334 while (tagData[i]) { ++i; }
335 ++i; // increment one more for null terminator
339 fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here
345 mBGZF.Write(tagData, tagDataLength);
349 mBGZF.Write(al.TagData.data(), tagDataLength);