]> git.donarmstrong.com Git - bamtools.git/blob - BamWriter.cpp
Initial import.
[bamtools.git] / BamWriter.cpp
1 // ***************************************************************************
2 // BamWriter (c) 2009 Michael Strömberg
3 // Marth Lab, Deptartment of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Provides the basic functionality for producing BAM files
7 // ***************************************************************************
8
9 #include "BamWriter.h"
10
11 // constructor
12 BamWriter::BamWriter(void)
13 {}
14
15 // destructor
16 BamWriter::~BamWriter(void) {
17         if(mBGZF.IsOpen) BgzfClose();
18 }
19
20 // closes the BAM file
21 void BamWriter::BgzfClose(void) {
22
23         mBGZF.IsOpen = false;
24
25         // flush the BGZF block
26         BgzfFlushBlock();
27
28         // flush and close
29         fflush(mBGZF.Stream);
30         fclose(mBGZF.Stream);
31 }
32
33 // compresses the current block
34 int BamWriter::BgzfDeflateBlock(void) {
35
36         // initialize the gzip header
37         char* buffer = mBGZF.CompressedBlock;
38         unsigned int bufferSize = mBGZF.CompressedBlockSize;
39
40         memset(buffer, 0, 18);
41         buffer[0]  = GZIP_ID1;
42         buffer[1]  = (char)GZIP_ID2;
43         buffer[2]  = CM_DEFLATE;
44         buffer[3]  = FLG_FEXTRA;
45         buffer[9]  = (char)OS_UNKNOWN;
46         buffer[10] = BGZF_XLEN;
47         buffer[12] = BGZF_ID1;
48         buffer[13] = BGZF_ID2;
49         buffer[14] = BGZF_LEN;
50
51         // loop to retry for blocks that do not compress enough
52         int inputLength = mBGZF.BlockOffset;
53         int compressedLength = 0;
54
55         while(true) {
56
57                 z_stream zs;
58                 zs.zalloc    = NULL;
59                 zs.zfree     = NULL;
60                 zs.next_in   = (Bytef*)mBGZF.UncompressedBlock;
61                 zs.avail_in  = inputLength;
62                 zs.next_out  = (Bytef*)&buffer[BLOCK_HEADER_LENGTH];
63                 zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
64
65                 // initialize the zlib compression algorithm
66                 if(deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK) {
67                         printf("ERROR: zlib deflate initialization failed.\n");
68                         exit(1);
69                 }
70
71                 // compress the data
72                 int status = deflate(&zs, Z_FINISH);
73                 if(status != Z_STREAM_END) {
74                         deflateEnd(&zs);
75
76                         // reduce the input length and try again
77                         if(status == Z_OK) {
78                                 inputLength -= 1024;
79                                 if(inputLength < 0) {
80                                         printf("ERROR: input reduction failed.\n");
81                                         exit(1);
82                                 }
83                                 continue;
84                         }
85
86                         printf("ERROR: zlib deflate failed.\n");
87                         exit(1);
88                 }
89
90                 // finalize the compression routine
91                 if(deflateEnd(&zs) != Z_OK) {
92                         printf("ERROR: deflate end failed.\n");
93                         exit(1);
94                 }
95
96                 compressedLength = zs.total_out;
97                 compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
98
99                 if(compressedLength > MAX_BLOCK_SIZE) {
100                         printf("ERROR: deflate overflow.\n");
101                         exit(1);
102                 }
103
104                 break;
105         }
106
107         // store the compressed length
108         BgzfPackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1));
109
110         // store the CRC32 checksum
111         unsigned int crc = crc32(0, NULL, 0);
112         crc = crc32(crc, (Bytef*)mBGZF.UncompressedBlock, inputLength);
113         BgzfPackUnsignedInt(&buffer[compressedLength - 8], crc);
114         BgzfPackUnsignedInt(&buffer[compressedLength - 4], inputLength);
115
116         // ensure that we have less than a block of data left
117         int remaining = mBGZF.BlockOffset - inputLength;
118         if(remaining > 0) {
119                 if(remaining > inputLength) {
120                         printf("ERROR: remainder too large.\n");
121                         exit(1);
122                 }
123
124                 memcpy(mBGZF.UncompressedBlock, mBGZF.UncompressedBlock + inputLength, remaining);
125         }
126
127         mBGZF.BlockOffset = remaining;
128         return compressedLength;
129 }
130
131 // flushes the data in the BGZF block
132 void BamWriter::BgzfFlushBlock(void) {
133
134         // flush all of the remaining blocks
135         while(mBGZF.BlockOffset > 0) {
136
137                 // compress the data block
138                 int blockLength = BgzfDeflateBlock();
139
140                 // flush the data to our output stream
141                 int numBytesWritten = fwrite(mBGZF.CompressedBlock, 1, blockLength, mBGZF.Stream);
142
143                 if(numBytesWritten != blockLength) {
144                         printf("ERROR: Expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten);
145                         exit(1);
146                 }
147
148                 mBGZF.BlockAddress += blockLength;
149         }
150 }
151
152 // opens the BAM file for writing
153 void BamWriter::BgzfOpen(const string& filename) {
154
155         mBGZF.Stream = fopen(filename.c_str(), "wb");
156
157         if(!mBGZF.Stream) {
158                 printf("ERROR: Unable to open the BAM file (%s) for writing.\n", filename.c_str());
159                 exit(1);
160         }
161
162         mBGZF.IsOpen = true;
163 }
164
165 // writes the supplied data into the BGZF buffer
166 unsigned int BamWriter::BgzfWrite(const char* data, const unsigned int dataLen) {
167
168         // initialize
169         unsigned int numBytesWritten = 0;
170         const char* input = data;
171         unsigned int blockLength = mBGZF.UncompressedBlockSize;
172
173         // copy the data to the buffer
174         while(numBytesWritten < dataLen) {
175                 unsigned int copyLength = min(blockLength - mBGZF.BlockOffset, dataLen - numBytesWritten);
176                 char* buffer = mBGZF.UncompressedBlock;
177                 memcpy(buffer + mBGZF.BlockOffset, input, copyLength);
178
179                 mBGZF.BlockOffset += copyLength;
180                 input             += copyLength;
181                 numBytesWritten   += copyLength;
182
183                 if(mBGZF.BlockOffset == blockLength) BgzfFlushBlock();
184         }
185
186         return numBytesWritten;
187 }
188
189 // closes the alignment archive
190 void BamWriter::Close(void) {
191         if(mBGZF.IsOpen) BgzfClose();
192 }
193
194 // creates a cigar string from the supplied alignment
195 void BamWriter::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
196
197         // initialize
198         const unsigned int numCigarOperations = cigarOperations.size();
199         packedCigar.resize(numCigarOperations * SIZEOF_INT);
200
201         // pack the cigar data into the string
202         unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
203
204         unsigned int cigarOp;
205         vector<CigarOp>::const_iterator coIter;
206         for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); coIter++) {
207
208                 switch(coIter->Type) {
209           case 'M':
210                   cigarOp = BAM_CMATCH;
211                   break;
212           case 'I':
213                   cigarOp = BAM_CINS;
214                   break;
215           case 'D':
216                   cigarOp = BAM_CDEL;
217                   break;
218           case 'N':
219                   cigarOp = BAM_CREF_SKIP;
220                   break;
221           case 'S':
222                   cigarOp = BAM_CSOFT_CLIP;
223                   break;
224           case 'H':
225                   cigarOp = BAM_CHARD_CLIP;
226                   break;
227           case 'P':
228                   cigarOp = BAM_CPAD;
229                   break;
230           default:
231                   printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type);
232                   exit(1);
233                 }
234
235                 *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;
236                 pPackedCigar++;
237         }
238 }
239
240 // encodes the supplied query sequence into 4-bit notation
241 void BamWriter::EncodeQuerySequence(const string& query, string& encodedQuery) {
242
243         // prepare the encoded query string
244         const unsigned int queryLen = query.size();
245         const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
246         encodedQuery.resize(encodedQueryLen);
247         char* pEncodedQuery = (char*)encodedQuery.data();
248         const char* pQuery = (const char*)query.data();
249
250         unsigned char nucleotideCode;
251         bool useHighWord = true;
252
253         while(*pQuery) {
254
255                 switch(*pQuery) {
256                         case '=':
257                                 nucleotideCode = 0;
258                                 break;
259                         case 'A':
260                                 nucleotideCode = 1;
261                                 break;
262                         case 'C':
263                                 nucleotideCode = 2;
264                                 break;
265                         case 'G':
266                                 nucleotideCode = 4;
267                                 break;
268                         case 'T':
269                                 nucleotideCode = 8;
270                                 break;
271                         case 'N':
272                                 nucleotideCode = 15;
273                                 break;
274                         default:
275                                 printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
276                                 exit(1);
277                 }
278
279                 // pack the nucleotide code
280                 if(useHighWord) {
281                         *pEncodedQuery = nucleotideCode << 4;
282                         useHighWord = false;
283                 } else {
284                         *pEncodedQuery |= nucleotideCode;
285                         pEncodedQuery++;
286                         useHighWord = true;
287                 }
288
289                 // increment the query position
290                 pQuery++;
291         }
292 }
293
294 // opens the alignment archive
295 void BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) {
296
297         // open the BGZF file for writing
298         BgzfOpen(filename);
299
300         // ================
301         // write the header
302         // ================
303
304         // write the BAM signature
305         const unsigned char SIGNATURE_LENGTH = 4;
306         const char* BAM_SIGNATURE = "BAM\1";
307         BgzfWrite(BAM_SIGNATURE, SIGNATURE_LENGTH);
308
309         // write the SAM header text length
310         const unsigned int samHeaderLen = samHeader.size();
311         BgzfWrite((char*)&samHeaderLen, SIZEOF_INT);
312
313         // write the SAM header text
314         if(samHeaderLen > 0) BgzfWrite(samHeader.data(), samHeaderLen);
315
316         // write the number of reference sequences
317         const unsigned int numReferenceSequences = referenceSequences.size();
318         BgzfWrite((char*)&numReferenceSequences, SIZEOF_INT);
319
320         // =============================
321         // write the sequence dictionary
322         // =============================
323
324         RefVector::const_iterator rsIter;
325         for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {
326
327                 // write the reference sequence name length
328                 const unsigned int referenceSequenceNameLen = rsIter->RefName.size() + 1;
329                 BgzfWrite((char*)&referenceSequenceNameLen, SIZEOF_INT);
330
331                 // write the reference sequence name
332                 BgzfWrite(rsIter->RefName.c_str(), referenceSequenceNameLen);
333
334                 // write the reference sequence length
335                 BgzfWrite((char*)&rsIter->RefLength, SIZEOF_INT);
336         }
337 }
338
339 // saves the alignment to the alignment archive
340 void BamWriter::SaveAlignment(const BamAlignment& al) {
341
342         // initialize
343         const unsigned int nameLen            = al.Name.size() + 1;
344         const unsigned int queryLen           = al.QueryBases.size();
345         const unsigned int numCigarOperations = al.CigarData.size();
346
347         // create our packed cigar string
348         string packedCigar;
349         CreatePackedCigar(al.CigarData, packedCigar);
350         const unsigned int packedCigarLen = packedCigar.size();
351
352         // encode the query
353         string encodedQuery;
354         EncodeQuerySequence(al.QueryBases, encodedQuery);
355         const unsigned int encodedQueryLen = encodedQuery.size();
356
357         // create our read group tag
358         // TODO: add read group tag support when it shows up in the BamAlignment struct
359         //string readGroupTag;
360         //const unsigned int readGroupTagLen = 3 + mReadGroupLUT[al.ReadGroupCode].NameLen + 1;
361         //readGroupTag.resize(readGroupTagLen);
362         //char* pReadGroupTag = (char*)readGroupTag.data();
363         //sprintf(pReadGroupTag, "RGZ%s", mReadGroupLUT[al.ReadGroupCode].Name);
364
365         // assign the BAM core data
366         unsigned int buffer[8];
367         buffer[0] = al.RefID;
368         buffer[1] = al.Position;
369         buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | nameLen;
370         buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
371         buffer[4] = queryLen;
372         buffer[5] = al.MateRefID;
373         buffer[6] = al.MatePosition;
374         buffer[7] = al.InsertSize;
375
376         // write the block size
377         const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen /* + readGroupTagLen*/;
378         const unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
379
380         BgzfWrite((char*)&blockSize, SIZEOF_INT);
381
382         // write the BAM core
383         BgzfWrite((char*)&buffer, BAM_CORE_SIZE);
384
385         // write the query name
386         BgzfWrite(al.Name.c_str(), nameLen);
387
388         // write the packed cigar
389         BgzfWrite(packedCigar.data(), packedCigarLen);
390
391         // write the encoded query sequence
392         BgzfWrite(encodedQuery.data(), encodedQueryLen);
393
394         // write the base qualities
395         string baseQualities = al.Qualities;
396         char* pBaseQualities = (char*)al.Qualities.data();
397         for(unsigned int i = 0; i < queryLen; i++) pBaseQualities[i] -= 33;
398         BgzfWrite(pBaseQualities, queryLen);
399
400         // write the read group tag
401         // TODO: add read group tag support when it shows up in the BamAlignment struct
402         //BgzfWrite(readGroupTag.c_str(), readGroupTagLen);
403 }