]> git.donarmstrong.com Git - bamtools.git/blob - BamWriter.cpp
Added tag data support.
[bamtools.git] / BamWriter.cpp
1 // ***************************************************************************
2 // BamWriter (c) 2009 Michael Strömberg
3 // Marth Lab, Deptartment of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // The BGZF routines were adapted from the bgzf.c code developed at the Broad
7 // Institute.
8 // ---------------------------------------------------------------------------
9 // Provides the basic functionality for producing BAM files
10 // ***************************************************************************
11
12 #include "BamWriter.h"
13
14 // constructor
15 BamWriter::BamWriter(void)
16 {}
17
18 // destructor
19 BamWriter::~BamWriter(void) {
20         if(mBGZF.IsOpen) BgzfClose();
21 }
22
23 // closes the BAM file
24 void BamWriter::BgzfClose(void) {
25
26         mBGZF.IsOpen = false;
27
28         // flush the BGZF block
29         BgzfFlushBlock();
30
31         // flush and close
32         fflush(mBGZF.Stream);
33         fclose(mBGZF.Stream);
34 }
35
36 // compresses the current block
37 int BamWriter::BgzfDeflateBlock(void) {
38
39         // initialize the gzip header
40         char* buffer = mBGZF.CompressedBlock;
41         unsigned int bufferSize = mBGZF.CompressedBlockSize;
42
43         memset(buffer, 0, 18);
44         buffer[0]  = GZIP_ID1;
45         buffer[1]  = (char)GZIP_ID2;
46         buffer[2]  = CM_DEFLATE;
47         buffer[3]  = FLG_FEXTRA;
48         buffer[9]  = (char)OS_UNKNOWN;
49         buffer[10] = BGZF_XLEN;
50         buffer[12] = BGZF_ID1;
51         buffer[13] = BGZF_ID2;
52         buffer[14] = BGZF_LEN;
53
54         // loop to retry for blocks that do not compress enough
55         int inputLength = mBGZF.BlockOffset;
56         int compressedLength = 0;
57
58         while(true) {
59
60                 z_stream zs;
61                 zs.zalloc    = NULL;
62                 zs.zfree     = NULL;
63                 zs.next_in   = (Bytef*)mBGZF.UncompressedBlock;
64                 zs.avail_in  = inputLength;
65                 zs.next_out  = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];
66                 zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
67
68                 // initialize the zlib compression algorithm
69                 if(deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK) {
70                         printf("ERROR: zlib deflate initialization failed.\n");
71                         exit(1);
72                 }
73
74                 // compress the data
75                 int status = deflate(&zs, Z_FINISH);
76                 if(status != Z_STREAM_END) {
77                         deflateEnd(&zs);
78
79                         // reduce the input length and try again
80                         if(status == Z_OK) {
81                                 inputLength -= 1024;
82                                 if(inputLength < 0) {
83                                         printf("ERROR: input reduction failed.\n");
84                                         exit(1);
85                                 }
86                                 continue;
87                         }
88
89                         printf("ERROR: zlib deflate failed.\n");
90                         exit(1);
91                 }
92
93                 // finalize the compression routine
94                 if(deflateEnd(&zs) != Z_OK) {
95                         printf("ERROR: deflate end failed.\n");
96                         exit(1);
97                 }
98
99                 compressedLength = zs.total_out;
100                 compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
101
102                 if(compressedLength > MAX_BLOCK_SIZE) {
103                         printf("ERROR: deflate overflow.\n");
104                         exit(1);
105                 }
106
107                 break;
108         }
109
110         // store the compressed length
111         BgzfPackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));
112
113         // store the CRC32 checksum
114         unsigned int crc = crc32(0, NULL, 0);
115         crc = crc32(crc, (Bytef*)mBGZF.UncompressedBlock, inputLength);
116         BgzfPackUnsignedInt(&buffer[compressedLength - 8], crc);
117         BgzfPackUnsignedInt(&buffer[compressedLength - 4], inputLength);
118
119         // ensure that we have less than a block of data left
120         int remaining = mBGZF.BlockOffset - inputLength;
121         if(remaining > 0) {
122                 if(remaining > inputLength) {
123                         printf("ERROR: remainder too large.\n");
124                         exit(1);
125                 }
126
127                 memcpy(mBGZF.UncompressedBlock, mBGZF.UncompressedBlock + inputLength, remaining);
128         }
129
130         mBGZF.BlockOffset = remaining;
131         return compressedLength;
132 }
133
134 // flushes the data in the BGZF block
135 void BamWriter::BgzfFlushBlock(void) {
136
137         // flush all of the remaining blocks
138         while(mBGZF.BlockOffset > 0) {
139
140                 // compress the data block
141                 int blockLength = BgzfDeflateBlock();
142
143                 // flush the data to our output stream
144                 int numBytesWritten = fwrite(mBGZF.CompressedBlock, 1, blockLength, mBGZF.Stream);
145
146                 if(numBytesWritten != blockLength) {
147                         printf("ERROR: Expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten);
148                         exit(1);
149                 }
150
151                 mBGZF.BlockAddress += blockLength;
152         }
153 }
154
155 // opens the BAM file for writing
156 void BamWriter::BgzfOpen(const string& filename) {
157
158         mBGZF.Stream = fopen(filename.c_str(), "wb");
159
160         if(!mBGZF.Stream) {
161                 printf("ERROR: Unable to open the BAM file (%s) for writing.\n", filename.c_str());
162                 exit(1);
163         }
164
165         mBGZF.IsOpen = true;
166 }
167
168 // writes the supplied data into the BGZF buffer
169 unsigned int BamWriter::BgzfWrite(const char* data, const unsigned int dataLen) {
170
171         // initialize
172         unsigned int numBytesWritten = 0;
173         const char* input = data;
174         unsigned int blockLength = mBGZF.UncompressedBlockSize;
175
176         // copy the data to the buffer
177         while(numBytesWritten < dataLen) {
178                 unsigned int copyLength = min(blockLength - mBGZF.BlockOffset, dataLen - numBytesWritten);
179                 char* buffer = mBGZF.UncompressedBlock;
180                 memcpy(buffer + mBGZF.BlockOffset, input, copyLength);
181
182                 mBGZF.BlockOffset += copyLength;
183                 input             += copyLength;
184                 numBytesWritten   += copyLength;
185
186                 if(mBGZF.BlockOffset == blockLength) BgzfFlushBlock();
187         }
188
189         return numBytesWritten;
190 }
191
192 // closes the alignment archive
193 void BamWriter::Close(void) {
194         if(mBGZF.IsOpen) BgzfClose();
195 }
196
197 // creates a cigar string from the supplied alignment
198 void BamWriter::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
199
200         // initialize
201         const unsigned int numCigarOperations = cigarOperations.size();
202         packedCigar.resize(numCigarOperations * SIZEOF_INT);
203
204         // pack the cigar data into the string
205         unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
206
207         unsigned int cigarOp;
208         vector<CigarOp>::const_iterator coIter;
209         for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); coIter++) {
210
211                 switch(coIter->Type) {
212           case 'M':
213                   cigarOp = BAM_CMATCH;
214                   break;
215           case 'I':
216                   cigarOp = BAM_CINS;
217                   break;
218           case 'D':
219                   cigarOp = BAM_CDEL;
220                   break;
221           case 'N':
222                   cigarOp = BAM_CREF_SKIP;
223                   break;
224           case 'S':
225                   cigarOp = BAM_CSOFT_CLIP;
226                   break;
227           case 'H':
228                   cigarOp = BAM_CHARD_CLIP;
229                   break;
230           case 'P':
231                   cigarOp = BAM_CPAD;
232                   break;
233           default:
234                   printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type);
235                   exit(1);
236                 }
237
238                 *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;
239                 pPackedCigar++;
240         }
241 }
242
243 // encodes the supplied query sequence into 4-bit notation
244 void BamWriter::EncodeQuerySequence(const string& query, string& encodedQuery) {
245
246         // prepare the encoded query string
247         const unsigned int queryLen = query.size();
248         const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
249         encodedQuery.resize(encodedQueryLen);
250         char* pEncodedQuery = (char*)encodedQuery.data();
251         const char* pQuery = (const char*)query.data();
252
253         unsigned char nucleotideCode;
254         bool useHighWord = true;
255
256         while(*pQuery) {
257
258                 switch(*pQuery) {
259                         case '=':
260                                 nucleotideCode = 0;
261                                 break;
262                         case 'A':
263                                 nucleotideCode = 1;
264                                 break;
265                         case 'C':
266                                 nucleotideCode = 2;
267                                 break;
268                         case 'G':
269                                 nucleotideCode = 4;
270                                 break;
271                         case 'T':
272                                 nucleotideCode = 8;
273                                 break;
274                         case 'N':
275                                 nucleotideCode = 15;
276                                 break;
277                         default:
278                                 printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
279                                 exit(1);
280                 }
281
282                 // pack the nucleotide code
283                 if(useHighWord) {
284                         *pEncodedQuery = nucleotideCode << 4;
285                         useHighWord = false;
286                 } else {
287                         *pEncodedQuery |= nucleotideCode;
288                         pEncodedQuery++;
289                         useHighWord = true;
290                 }
291
292                 // increment the query position
293                 pQuery++;
294         }
295 }
296
297 // opens the alignment archive
298 void BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) {
299
300         // open the BGZF file for writing
301         BgzfOpen(filename);
302
303         // ================
304         // write the header
305         // ================
306
307         // write the BAM signature
308         const unsigned char SIGNATURE_LENGTH = 4;
309         const char* BAM_SIGNATURE = "BAM\1";
310         BgzfWrite(BAM_SIGNATURE, SIGNATURE_LENGTH);
311
312         // write the SAM header text length
313         const unsigned int samHeaderLen = samHeader.size();
314         BgzfWrite((char*)&samHeaderLen, SIZEOF_INT);
315
316         // write the SAM header text
317         if(samHeaderLen > 0) BgzfWrite(samHeader.data(), samHeaderLen);
318
319         // write the number of reference sequences
320         const unsigned int numReferenceSequences = referenceSequences.size();
321         BgzfWrite((char*)&numReferenceSequences, SIZEOF_INT);
322
323         // =============================
324         // write the sequence dictionary
325         // =============================
326
327         RefVector::const_iterator rsIter;
328         for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {
329
330                 // write the reference sequence name length
331                 const unsigned int referenceSequenceNameLen = rsIter->RefName.size() + 1;
332                 BgzfWrite((char*)&referenceSequenceNameLen, SIZEOF_INT);
333
334                 // write the reference sequence name
335                 BgzfWrite(rsIter->RefName.c_str(), referenceSequenceNameLen);
336
337                 // write the reference sequence length
338                 BgzfWrite((char*)&rsIter->RefLength, SIZEOF_INT);
339         }
340 }
341
342 // saves the alignment to the alignment archive
343 void BamWriter::SaveAlignment(const BamAlignment& al) {
344
345         // initialize
346         const unsigned int nameLen            = al.Name.size() + 1;
347         const unsigned int queryLen           = al.QueryBases.size();
348         const unsigned int numCigarOperations = al.CigarData.size();
349
350         // create our packed cigar string
351         string packedCigar;
352         CreatePackedCigar(al.CigarData, packedCigar);
353         const unsigned int packedCigarLen = packedCigar.size();
354
355         // encode the query
356         string encodedQuery;
357         EncodeQuerySequence(al.QueryBases, encodedQuery);
358         const unsigned int encodedQueryLen = encodedQuery.size();
359
360         // store the tag data length
361         const unsigned int tagDataLength = al.TagData.size();
362
363         // assign the BAM core data
364         unsigned int buffer[8];
365         buffer[0] = al.RefID;
366         buffer[1] = al.Position;
367         buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | nameLen;
368         buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
369         buffer[4] = queryLen;
370         buffer[5] = al.MateRefID;
371         buffer[6] = al.MatePosition;
372         buffer[7] = al.InsertSize;
373
374         // write the block size
375         const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen + tagDataLength;
376         const unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
377
378         BgzfWrite((char*)&blockSize, SIZEOF_INT);
379
380         // write the BAM core
381         BgzfWrite((char*)&buffer, BAM_CORE_SIZE);
382
383         // write the query name
384         BgzfWrite(al.Name.c_str(), nameLen);
385
386         // write the packed cigar
387         BgzfWrite(packedCigar.data(), packedCigarLen);
388
389         // write the encoded query sequence
390         BgzfWrite(encodedQuery.data(), encodedQueryLen);
391
392         // write the base qualities
393         string baseQualities = al.Qualities;
394         char* pBaseQualities = (char*)al.Qualities.data();
395         for(unsigned int i = 0; i < queryLen; i++) pBaseQualities[i] -= 33;
396         BgzfWrite(pBaseQualities, queryLen);
397
398         // write the read group tag
399         BgzfWrite(al.TagData.data(), tagDataLength);
400 }