]> git.donarmstrong.com Git - bamtools.git/blob - src/api/internal/BamWriter_p.cpp
Refactored shared pipe/file behavior into ILocalIODevice
[bamtools.git] / src / api / internal / BamWriter_p.cpp
1 // ***************************************************************************
2 // BamWriter_p.cpp (c) 2010 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 16 June 2011 (DB)
7 // ---------------------------------------------------------------------------
8 // Provides the basic functionality for producing BAM files
9 // ***************************************************************************
10
11 #include <api/BamAlignment.h>
12 #include <api/BamConstants.h>
13 #include <api/IBamIODevice.h>
14 #include <api/internal/BamWriter_p.h>
15 using namespace BamTools;
16 using namespace BamTools::Internal;
17
18 #include <cstdio>
19 #include <cstdlib>
20 #include <cstring>
21 using namespace std;
22
23 // ctor
24 BamWriterPrivate::BamWriterPrivate(void)
25     : m_isBigEndian( BamTools::SystemIsBigEndian() )
26 { }
27
28 // dtor
29 BamWriterPrivate::~BamWriterPrivate(void) {
30     m_stream.Close();
31 }
32
33 // calculates minimum bin for a BAM alignment interval
34 unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const {
35     --end;
36     if ( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14);
37     if ( (begin >> 17) == (end >> 17) ) return  585 + (begin >> 17);
38     if ( (begin >> 20) == (end >> 20) ) return   73 + (begin >> 20);
39     if ( (begin >> 23) == (end >> 23) ) return    9 + (begin >> 23);
40     if ( (begin >> 26) == (end >> 26) ) return    1 + (begin >> 26);
41     return 0;
42 }
43
44 // closes the alignment archive
45 void BamWriterPrivate::Close(void) {
46     m_stream.Close();
47 }
48
49 // creates a cigar string from the supplied alignment
50 void BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
51
52     // initialize
53     const unsigned int numCigarOperations = cigarOperations.size();
54     packedCigar.resize(numCigarOperations * Constants::BAM_SIZEOF_INT);
55
56     // pack the cigar data into the string
57     unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
58
59     // iterate over cigar operations
60     vector<CigarOp>::const_iterator coIter = cigarOperations.begin();
61     vector<CigarOp>::const_iterator coEnd  = cigarOperations.end();
62     for ( ; coIter != coEnd; ++coIter ) {
63
64         // store op in packedCigar
65         unsigned int cigarOp;
66         switch ( coIter->Type ) {
67             case (Constants::BAM_CIGAR_MATCH_CHAR)    : cigarOp = Constants::BAM_CIGAR_MATCH;    break;
68             case (Constants::BAM_CIGAR_INS_CHAR)      : cigarOp = Constants::BAM_CIGAR_INS;      break;
69             case (Constants::BAM_CIGAR_DEL_CHAR)      : cigarOp = Constants::BAM_CIGAR_DEL;      break;
70             case (Constants::BAM_CIGAR_REFSKIP_CHAR)  : cigarOp = Constants::BAM_CIGAR_REFSKIP;  break;
71             case (Constants::BAM_CIGAR_SOFTCLIP_CHAR) : cigarOp = Constants::BAM_CIGAR_SOFTCLIP; break;
72             case (Constants::BAM_CIGAR_HARDCLIP_CHAR) : cigarOp = Constants::BAM_CIGAR_HARDCLIP; break;
73             case (Constants::BAM_CIGAR_PAD_CHAR)      : cigarOp = Constants::BAM_CIGAR_PAD;      break;
74             case (Constants::BAM_CIGAR_SEQMATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_SEQMATCH; break;
75             case (Constants::BAM_CIGAR_MISMATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_MISMATCH; break;
76             default:
77               fprintf(stderr, "BamWriter ERROR: unknown cigar operation found: %c\n", coIter->Type);
78               exit(1);
79         }
80
81         *pPackedCigar = coIter->Length << Constants::BAM_CIGAR_SHIFT | cigarOp;
82         pPackedCigar++;
83     }
84 }
85
86 // encodes the supplied query sequence into 4-bit notation
87 void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {
88
89     // prepare the encoded query string
90     const unsigned int queryLen = query.size();
91     const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
92     encodedQuery.resize(encodedQueryLen);
93     char* pEncodedQuery = (char*)encodedQuery.data();
94     const char* pQuery = (const char*)query.data();
95
96     unsigned char nucleotideCode;
97     bool useHighWord = true;
98
99     while ( *pQuery ) {
100         switch ( *pQuery ) {
101             case (Constants::BAM_DNA_EQUAL) : nucleotideCode = Constants::BAM_BASECODE_EQUAL; break;
102             case (Constants::BAM_DNA_A)     : nucleotideCode = Constants::BAM_BASECODE_A;     break;
103             case (Constants::BAM_DNA_C)     : nucleotideCode = Constants::BAM_BASECODE_C;     break;
104             case (Constants::BAM_DNA_G)     : nucleotideCode = Constants::BAM_BASECODE_G;     break;
105             case (Constants::BAM_DNA_T)     : nucleotideCode = Constants::BAM_BASECODE_T;     break;
106             case (Constants::BAM_DNA_N)     : nucleotideCode = Constants::BAM_BASECODE_N;     break;
107             default:
108                 fprintf(stderr, "BamWriter ERROR: only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
109                 exit(1);
110         }
111
112         // pack the nucleotide code
113         if ( useHighWord ) {
114             *pEncodedQuery = nucleotideCode << 4;
115             useHighWord = false;
116         } else {
117             *pEncodedQuery |= nucleotideCode;
118             ++pEncodedQuery;
119             useHighWord = true;
120         }
121
122         // increment the query position
123         ++pQuery;
124     }
125 }
126
127 // returns whether BAM file is open for writing or not
128 bool BamWriterPrivate::IsOpen(void) const {
129     return m_stream.IsOpen();
130 }
131
132 // opens the alignment archive
133 bool BamWriterPrivate::Open(const string& filename,
134                             const string& samHeaderText,
135                             const RefVector& referenceSequences)
136 {
137     // open the BGZF file for writing, return failure if error
138     if ( !m_stream.Open(filename, IBamIODevice::WriteOnly) )
139         return false;
140
141     // write BAM file 'metadata' components
142     WriteMagicNumber();
143     WriteSamHeaderText(samHeaderText);
144     WriteReferences(referenceSequences);
145     return true;
146 }
147
148 // saves the alignment to the alignment archive
149 void BamWriterPrivate::SaveAlignment(const BamAlignment& al) {
150
151     // if BamAlignment contains only the core data and a raw char data buffer
152     // (as a result of BamReader::GetNextAlignmentCore())
153     if ( al.SupportData.HasCoreOnly ) {
154
155         // write the block size
156         unsigned int blockSize = al.SupportData.BlockLength;
157         if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize);
158         m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT);
159
160         // re-calculate bin (in case BamAlignment's position has been previously modified)
161         const uint32_t alignmentBin = CalculateMinimumBin(al.Position, al.GetEndPosition());
162
163         // assign the BAM core data
164         uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE];
165         buffer[0] = al.RefID;
166         buffer[1] = al.Position;
167         buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;
168         buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;
169         buffer[4] = al.SupportData.QuerySequenceLength;
170         buffer[5] = al.MateRefID;
171         buffer[6] = al.MatePosition;
172         buffer[7] = al.InsertSize;
173
174         // swap BAM core endian-ness, if necessary
175         if ( m_isBigEndian ) {
176             for ( int i = 0; i < 8; ++i )
177                 BamTools::SwapEndian_32(buffer[i]);
178         }
179
180         // write the BAM core
181         m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE);
182
183         // write the raw char data
184         m_stream.Write((char*)al.SupportData.AllCharData.data(),
185                        al.SupportData.BlockLength-Constants::BAM_CORE_SIZE);
186     }
187
188     // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc
189     // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code )
190     else {
191
192         // calculate char lengths
193         const unsigned int nameLength         = al.Name.size() + 1;
194         const unsigned int numCigarOperations = al.CigarData.size();
195         const unsigned int queryLength        = al.QueryBases.size();
196         const unsigned int tagDataLength      = al.TagData.size();
197
198         // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)
199         // force calculation of Bin before storing
200         const int endPosition = al.GetEndPosition();
201         const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);
202
203         // create our packed cigar string
204         string packedCigar;
205         CreatePackedCigar(al.CigarData, packedCigar);
206         const unsigned int packedCigarLength = packedCigar.size();
207
208         // encode the query
209         string encodedQuery;
210         EncodeQuerySequence(al.QueryBases, encodedQuery);
211         const unsigned int encodedQueryLength = encodedQuery.size();
212
213         // write the block size
214         const unsigned int dataBlockSize = nameLength +
215                                            packedCigarLength +
216                                            encodedQueryLength +
217                                            queryLength +
218                                            tagDataLength;
219         unsigned int blockSize = Constants::BAM_CORE_SIZE + dataBlockSize;
220         if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize);
221         m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT);
222
223         // assign the BAM core data
224         uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE];
225         buffer[0] = al.RefID;
226         buffer[1] = al.Position;
227         buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;
228         buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
229         buffer[4] = queryLength;
230         buffer[5] = al.MateRefID;
231         buffer[6] = al.MatePosition;
232         buffer[7] = al.InsertSize;
233
234         // swap BAM core endian-ness, if necessary
235         if ( m_isBigEndian ) {
236             for ( int i = 0; i < 8; ++i )
237                 BamTools::SwapEndian_32(buffer[i]);
238         }
239
240         // write the BAM core
241         m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE);
242
243         // write the query name
244         m_stream.Write(al.Name.c_str(), nameLength);
245
246         // write the packed cigar
247         if ( m_isBigEndian ) {
248             char* cigarData = (char*)calloc(sizeof(char), packedCigarLength);
249             memcpy(cigarData, packedCigar.data(), packedCigarLength);
250             if ( m_isBigEndian ) {
251                 for ( unsigned int i = 0; i < packedCigarLength; ++i )
252                     BamTools::SwapEndian_32p(&cigarData[i]);
253             }
254             m_stream.Write(cigarData, packedCigarLength);
255             free(cigarData);
256         }
257         else
258             m_stream.Write(packedCigar.data(), packedCigarLength);
259
260         // write the encoded query sequence
261         m_stream.Write(encodedQuery.data(), encodedQueryLength);
262
263         // write the base qualities
264         char* pBaseQualities = (char*)al.Qualities.data();
265         for ( unsigned int i = 0; i < queryLength; ++i )
266             pBaseQualities[i] -= 33; // FASTQ conversion
267         m_stream.Write(pBaseQualities, queryLength);
268
269         // write the read group tag
270         if ( m_isBigEndian ) {
271
272             char* tagData = (char*)calloc(sizeof(char), tagDataLength);
273             memcpy(tagData, al.TagData.data(), tagDataLength);
274
275             int i = 0;
276             while ( (unsigned int)i < tagDataLength ) {
277
278                 i += Constants::BAM_TAG_TAGSIZE;  // skip tag chars (e.g. "RG", "NM", etc.)
279                 const char type = tagData[i];     // get tag type at position i
280                 ++i;
281
282                 switch ( type ) {
283
284                     case(Constants::BAM_TAG_TYPE_ASCII) :
285                     case(Constants::BAM_TAG_TYPE_INT8)  :
286                     case(Constants::BAM_TAG_TYPE_UINT8) :
287                         ++i;
288                         break;
289
290                     case(Constants::BAM_TAG_TYPE_INT16)  :
291                     case(Constants::BAM_TAG_TYPE_UINT16) :
292                         BamTools::SwapEndian_16p(&tagData[i]);
293                         i += sizeof(uint16_t);
294                         break;
295
296                     case(Constants::BAM_TAG_TYPE_FLOAT)  :
297                     case(Constants::BAM_TAG_TYPE_INT32)  :
298                     case(Constants::BAM_TAG_TYPE_UINT32) :
299                         BamTools::SwapEndian_32p(&tagData[i]);
300                         i += sizeof(uint32_t);
301                         break;
302
303                     case(Constants::BAM_TAG_TYPE_HEX) :
304                     case(Constants::BAM_TAG_TYPE_STRING) :
305                         // no endian swapping necessary for hex-string/string data
306                         while ( tagData[i] )
307                             ++i;
308                         // increment one more for null terminator
309                         ++i;
310                         break;
311
312                     case(Constants::BAM_TAG_TYPE_ARRAY) :
313
314                     {
315                         // read array type
316                         const char arrayType = tagData[i];
317                         ++i;
318
319                         // swap endian-ness of number of elements in place, then retrieve for loop
320                         BamTools::SwapEndian_32p(&tagData[i]);
321                         int32_t numElements;
322                         memcpy(&numElements, &tagData[i], sizeof(uint32_t));
323                         i += sizeof(uint32_t);
324
325                         // swap endian-ness of array elements
326                         for ( int j = 0; j < numElements; ++j ) {
327                             switch (arrayType) {
328                                 case (Constants::BAM_TAG_TYPE_INT8)  :
329                                 case (Constants::BAM_TAG_TYPE_UINT8) :
330                                     // no endian-swapping necessary
331                                     ++i;
332                                     break;
333                                 case (Constants::BAM_TAG_TYPE_INT16)  :
334                                 case (Constants::BAM_TAG_TYPE_UINT16) :
335                                     BamTools::SwapEndian_16p(&tagData[i]);
336                                     i += sizeof(uint16_t);
337                                     break;
338                                 case (Constants::BAM_TAG_TYPE_FLOAT)  :
339                                 case (Constants::BAM_TAG_TYPE_INT32)  :
340                                 case (Constants::BAM_TAG_TYPE_UINT32) :
341                                     BamTools::SwapEndian_32p(&tagData[i]);
342                                     i += sizeof(uint32_t);
343                                     break;
344                                 default:
345                                     // error case
346                                     fprintf(stderr,
347                                             "BamWriter ERROR: unknown binary array type encountered: [%c]\n",
348                                             arrayType);
349                                     exit(1);
350                             }
351                         }
352
353                         break;
354                     }
355
356                     default :
357                         fprintf(stderr, "BamWriter ERROR: invalid tag value type\n"); // shouldn't get here
358                         free(tagData);
359                         exit(1);
360                 }
361             }
362             m_stream.Write(tagData, tagDataLength);
363             free(tagData);
364         }
365         else
366             m_stream.Write(al.TagData.data(), tagDataLength);
367     }
368 }
369
370 void BamWriterPrivate::SetWriteCompressed(bool ok) {
371
372     // warn if BAM file is already open
373     // modifying compression is not allowed in this case
374     if ( IsOpen() ) {
375         cerr << "BamWriter WARNING: attempting to change compression mode on an open BAM file is not allowed. "
376              << "Ignoring request." << endl;
377         return;
378     }
379
380     // set BgzfStream compression mode
381     m_stream.SetWriteCompressed(ok);
382 }
383
384 void BamWriterPrivate::WriteMagicNumber(void) {
385     // write BAM file 'magic number'
386     m_stream.Write(Constants::BAM_HEADER_MAGIC, Constants::BAM_HEADER_MAGIC_LENGTH);
387 }
388
389 void BamWriterPrivate::WriteReferences(const BamTools::RefVector& referenceSequences) {
390
391     // write the number of reference sequences
392     uint32_t numReferenceSequences = referenceSequences.size();
393     if ( m_isBigEndian ) BamTools::SwapEndian_32(numReferenceSequences);
394     m_stream.Write((char*)&numReferenceSequences, Constants::BAM_SIZEOF_INT);
395
396     // foreach reference sequence
397     RefVector::const_iterator rsIter = referenceSequences.begin();
398     RefVector::const_iterator rsEnd  = referenceSequences.end();
399     for ( ; rsIter != rsEnd; ++rsIter ) {
400
401         // write the reference sequence name length
402         uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;
403         if ( m_isBigEndian ) BamTools::SwapEndian_32(referenceSequenceNameLen);
404         m_stream.Write((char*)&referenceSequenceNameLen, Constants::BAM_SIZEOF_INT);
405
406         // write the reference sequence name
407         m_stream.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);
408
409         // write the reference sequence length
410         int32_t referenceLength = rsIter->RefLength;
411         if ( m_isBigEndian ) BamTools::SwapEndian_32(referenceLength);
412         m_stream.Write((char*)&referenceLength, Constants::BAM_SIZEOF_INT);
413     }
414 }
415
416 void BamWriterPrivate::WriteSamHeaderText(const std::string& samHeaderText) {
417
418     // write the SAM header  text length
419     uint32_t samHeaderLen = samHeaderText.size();
420     if ( m_isBigEndian ) BamTools::SwapEndian_32(samHeaderLen);
421     m_stream.Write((char*)&samHeaderLen, Constants::BAM_SIZEOF_INT);
422
423     // write the SAM header text
424     if ( samHeaderLen > 0 )
425         m_stream.Write(samHeaderText.data(), samHeaderLen);
426 }