]> git.donarmstrong.com Git - bamtools.git/blob - BamWriter.cpp
fixed a bug where the terminal null wasn't being saved in SaveAlignment.
[bamtools.git] / BamWriter.cpp
1 // ***************************************************************************
2 // BamWriter.cpp (c) 2009 Michael Strömberg, Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 24 June 2009 (DB)
7 // ---------------------------------------------------------------------------
8 // The BGZF routines were adapted from the bgzf.c code developed at the Broad
9 // Institute.
10 // ---------------------------------------------------------------------------
11 // Provides the basic functionality for producing BAM files
12 // ***************************************************************************
13
14 #include "BamWriter.h"
15 using namespace BamTools;
16 using namespace std;
17
18 // constructor
19 BamWriter::BamWriter(void)
20 {}
21
22 // destructor
23 BamWriter::~BamWriter(void) {
24         if(mBGZF.IsOpen) BgzfClose();
25 }
26
27 // closes the BAM file
28 void BamWriter::BgzfClose(void) {
29
30         mBGZF.IsOpen = false;
31
32         // flush the BGZF block
33         BgzfFlushBlock();
34
35         // flush and close
36         fflush(mBGZF.Stream);
37         fclose(mBGZF.Stream);
38 }
39
40 // compresses the current block
41 int BamWriter::BgzfDeflateBlock(void) {
42
43         // initialize the gzip header
44         char* buffer = mBGZF.CompressedBlock;
45         unsigned int bufferSize = mBGZF.CompressedBlockSize;
46
47         memset(buffer, 0, 18);
48         buffer[0]  = GZIP_ID1;
49         buffer[1]  = (char)GZIP_ID2;
50         buffer[2]  = CM_DEFLATE;
51         buffer[3]  = FLG_FEXTRA;
52         buffer[9]  = (char)OS_UNKNOWN;
53         buffer[10] = BGZF_XLEN;
54         buffer[12] = BGZF_ID1;
55         buffer[13] = BGZF_ID2;
56         buffer[14] = BGZF_LEN;
57
58         // loop to retry for blocks that do not compress enough
59         int inputLength = mBGZF.BlockOffset;
60         int compressedLength = 0;
61
62         while(true) {
63
64                 z_stream zs;
65                 zs.zalloc    = NULL;
66                 zs.zfree     = NULL;
67                 zs.next_in   = (Bytef*)mBGZF.UncompressedBlock;
68                 zs.avail_in  = inputLength;
69                 zs.next_out  = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];
70                 zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
71
72                 // initialize the zlib compression algorithm
73                 if(deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK) {
74                         printf("ERROR: zlib deflate initialization failed.\n");
75                         exit(1);
76                 }
77
78                 // compress the data
79                 int status = deflate(&zs, Z_FINISH);
80                 if(status != Z_STREAM_END) {
81                         deflateEnd(&zs);
82
83                         // reduce the input length and try again
84                         if(status == Z_OK) {
85                                 inputLength -= 1024;
86                                 if(inputLength < 0) {
87                                         printf("ERROR: input reduction failed.\n");
88                                         exit(1);
89                                 }
90                                 continue;
91                         }
92
93                         printf("ERROR: zlib deflate failed.\n");
94                         exit(1);
95                 }
96
97                 // finalize the compression routine
98                 if(deflateEnd(&zs) != Z_OK) {
99                         printf("ERROR: deflate end failed.\n");
100                         exit(1);
101                 }
102
103                 compressedLength = zs.total_out;
104                 compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
105
106                 if(compressedLength > MAX_BLOCK_SIZE) {
107                         printf("ERROR: deflate overflow.\n");
108                         exit(1);
109                 }
110
111                 break;
112         }
113
114         // store the compressed length
115         BgzfPackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));
116
117         // store the CRC32 checksum
118         unsigned int crc = crc32(0, NULL, 0);
119         crc = crc32(crc, (Bytef*)mBGZF.UncompressedBlock, inputLength);
120         BgzfPackUnsignedInt(&buffer[compressedLength - 8], crc);
121         BgzfPackUnsignedInt(&buffer[compressedLength - 4], inputLength);
122
123         // ensure that we have less than a block of data left
124         int remaining = mBGZF.BlockOffset - inputLength;
125         if(remaining > 0) {
126                 if(remaining > inputLength) {
127                         printf("ERROR: remainder too large.\n");
128                         exit(1);
129                 }
130
131                 memcpy(mBGZF.UncompressedBlock, mBGZF.UncompressedBlock + inputLength, remaining);
132         }
133
134         mBGZF.BlockOffset = remaining;
135         return compressedLength;
136 }
137
138 // flushes the data in the BGZF block
139 void BamWriter::BgzfFlushBlock(void) {
140
141         // flush all of the remaining blocks
142         while(mBGZF.BlockOffset > 0) {
143
144                 // compress the data block
145                 int blockLength = BgzfDeflateBlock();
146
147                 // flush the data to our output stream
148                 int numBytesWritten = fwrite(mBGZF.CompressedBlock, 1, blockLength, mBGZF.Stream);
149
150                 if(numBytesWritten != blockLength) {
151                         printf("ERROR: Expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten);
152                         exit(1);
153                 }
154
155                 mBGZF.BlockAddress += blockLength;
156         }
157 }
158
159 // opens the BAM file for writing
160 void BamWriter::BgzfOpen(const string& filename) {
161
162         mBGZF.Stream = fopen(filename.c_str(), "wb");
163
164         if(!mBGZF.Stream) {
165                 printf("ERROR: Unable to open the BAM file (%s) for writing.\n", filename.c_str());
166                 exit(1);
167         }
168
169         mBGZF.IsOpen = true;
170 }
171
172 // writes the supplied data into the BGZF buffer
173 unsigned int BamWriter::BgzfWrite(const char* data, const unsigned int dataLen) {
174
175         // initialize
176         unsigned int numBytesWritten = 0;
177         const char* input = data;
178         unsigned int blockLength = mBGZF.UncompressedBlockSize;
179
180         // copy the data to the buffer
181         while(numBytesWritten < dataLen) {
182                 unsigned int copyLength = min(blockLength - mBGZF.BlockOffset, dataLen - numBytesWritten);
183                 char* buffer = mBGZF.UncompressedBlock;
184                 memcpy(buffer + mBGZF.BlockOffset, input, copyLength);
185
186                 mBGZF.BlockOffset += copyLength;
187                 input             += copyLength;
188                 numBytesWritten   += copyLength;
189
190                 if(mBGZF.BlockOffset == blockLength) BgzfFlushBlock();
191         }
192
193         return numBytesWritten;
194 }
195
196 // closes the alignment archive
197 void BamWriter::Close(void) {
198         if(mBGZF.IsOpen) BgzfClose();
199 }
200
201 // creates a cigar string from the supplied alignment
202 void BamWriter::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
203
204         // initialize
205         const unsigned int numCigarOperations = cigarOperations.size();
206         packedCigar.resize(numCigarOperations * SIZEOF_INT);
207
208         // pack the cigar data into the string
209         unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
210
211         unsigned int cigarOp;
212         vector<CigarOp>::const_iterator coIter;
213         for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); coIter++) {
214
215                 switch(coIter->Type) {
216           case 'M':
217                   cigarOp = BAM_CMATCH;
218                   break;
219           case 'I':
220                   cigarOp = BAM_CINS;
221                   break;
222           case 'D':
223                   cigarOp = BAM_CDEL;
224                   break;
225           case 'N':
226                   cigarOp = BAM_CREF_SKIP;
227                   break;
228           case 'S':
229                   cigarOp = BAM_CSOFT_CLIP;
230                   break;
231           case 'H':
232                   cigarOp = BAM_CHARD_CLIP;
233                   break;
234           case 'P':
235                   cigarOp = BAM_CPAD;
236                   break;
237           default:
238                   printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type);
239                   exit(1);
240                 }
241
242                 *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;
243                 pPackedCigar++;
244         }
245 }
246
247 // encodes the supplied query sequence into 4-bit notation
248 void BamWriter::EncodeQuerySequence(const string& query, string& encodedQuery) {
249
250         // prepare the encoded query string
251         const unsigned int queryLen = query.size();
252         const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
253         encodedQuery.resize(encodedQueryLen);
254         char* pEncodedQuery = (char*)encodedQuery.data();
255         const char* pQuery = (const char*)query.data();
256
257         unsigned char nucleotideCode;
258         bool useHighWord = true;
259
260         while(*pQuery) {
261
262                 switch(*pQuery) {
263                         case '=':
264                                 nucleotideCode = 0;
265                                 break;
266                         case 'A':
267                                 nucleotideCode = 1;
268                                 break;
269                         case 'C':
270                                 nucleotideCode = 2;
271                                 break;
272                         case 'G':
273                                 nucleotideCode = 4;
274                                 break;
275                         case 'T':
276                                 nucleotideCode = 8;
277                                 break;
278                         case 'N':
279                                 nucleotideCode = 15;
280                                 break;
281                         default:
282                                 printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
283                                 exit(1);
284                 }
285
286                 // pack the nucleotide code
287                 if(useHighWord) {
288                         *pEncodedQuery = nucleotideCode << 4;
289                         useHighWord = false;
290                 } else {
291                         *pEncodedQuery |= nucleotideCode;
292                         pEncodedQuery++;
293                         useHighWord = true;
294                 }
295
296                 // increment the query position
297                 pQuery++;
298         }
299 }
300
301 // opens the alignment archive
302 void BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) {
303
304         // open the BGZF file for writing
305         BgzfOpen(filename);
306
307         // ================
308         // write the header
309         // ================
310
311         // write the BAM signature
312         const unsigned char SIGNATURE_LENGTH = 4;
313         const char* BAM_SIGNATURE = "BAM\1";
314         BgzfWrite(BAM_SIGNATURE, SIGNATURE_LENGTH);
315
316         // write the SAM header text length
317         const unsigned int samHeaderLen = samHeader.size();
318         BgzfWrite((char*)&samHeaderLen, SIZEOF_INT);
319
320         // write the SAM header text
321         if(samHeaderLen > 0) BgzfWrite(samHeader.data(), samHeaderLen);
322
323         // write the number of reference sequences
324         const unsigned int numReferenceSequences = referenceSequences.size();
325         BgzfWrite((char*)&numReferenceSequences, SIZEOF_INT);
326
327         // =============================
328         // write the sequence dictionary
329         // =============================
330
331         RefVector::const_iterator rsIter;
332         for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {
333
334                 // write the reference sequence name length
335                 const unsigned int referenceSequenceNameLen = rsIter->RefName.size() + 1;
336                 BgzfWrite((char*)&referenceSequenceNameLen, SIZEOF_INT);
337
338                 // write the reference sequence name
339                 BgzfWrite(rsIter->RefName.c_str(), referenceSequenceNameLen);
340
341                 // write the reference sequence length
342                 BgzfWrite((char*)&rsIter->RefLength, SIZEOF_INT);
343         }
344 }
345
346 // saves the alignment to the alignment archive
347 void BamWriter::SaveAlignment(const BamAlignment& al) {
348
349         // initialize
350         const unsigned int nameLen            = al.Name.size() + 1;
351         const unsigned int queryLen           = al.QueryBases.size();
352         const unsigned int numCigarOperations = al.CigarData.size();
353
354         // create our packed cigar string
355         string packedCigar;
356         CreatePackedCigar(al.CigarData, packedCigar);
357         const unsigned int packedCigarLen = packedCigar.size();
358
359         // encode the query
360         string encodedQuery;
361         EncodeQuerySequence(al.QueryBases, encodedQuery);
362         const unsigned int encodedQueryLen = encodedQuery.size();
363
364         // store the tag data length
365         const unsigned int tagDataLength = al.TagData.size() + 1;
366
367         // assign the BAM core data
368         unsigned int buffer[8];
369         buffer[0] = al.RefID;
370         buffer[1] = al.Position;
371         buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | nameLen;
372         buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
373         buffer[4] = queryLen;
374         buffer[5] = al.MateRefID;
375         buffer[6] = al.MatePosition;
376         buffer[7] = al.InsertSize;
377
378         // write the block size
379         const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen + tagDataLength;
380         const unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
381
382         BgzfWrite((char*)&blockSize, SIZEOF_INT);
383
384         // write the BAM core
385         BgzfWrite((char*)&buffer, BAM_CORE_SIZE);
386
387         // write the query name
388         BgzfWrite(al.Name.c_str(), nameLen);
389
390         // write the packed cigar
391         BgzfWrite(packedCigar.data(), packedCigarLen);
392
393         // write the encoded query sequence
394         BgzfWrite(encodedQuery.data(), encodedQueryLen);
395
396         // write the base qualities
397         string baseQualities = al.Qualities;
398         char* pBaseQualities = (char*)al.Qualities.data();
399         for(unsigned int i = 0; i < queryLen; i++) pBaseQualities[i] -= 33;
400         BgzfWrite(pBaseQualities, queryLen);
401
402         // write the read group tag
403         BgzfWrite(al.TagData.data(), tagDataLength);
404 }