]> git.donarmstrong.com Git - bamtools.git/blob - src/api/internal/BamWriter_p.cpp
Added re-calculation of BamAlignment's BinID during
[bamtools.git] / src / api / internal / BamWriter_p.cpp
1 // ***************************************************************************
2 // BamWriter_p.cpp (c) 2010 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 16 June 2011 (DB)
7 // ---------------------------------------------------------------------------
8 // Provides the basic functionality for producing BAM files
9 // ***************************************************************************
10
11 #include <api/BamAlignment.h>
12 #include <api/BamConstants.h>
13 #include <api/internal/BamWriter_p.h>
14 using namespace BamTools;
15 using namespace BamTools::Internal;
16
17 #include <cstdio>
18 #include <cstdlib>
19 #include <cstring>
20 using namespace std;
21
22 // ctor
23 BamWriterPrivate::BamWriterPrivate(void)
24     : m_isBigEndian( BamTools::SystemIsBigEndian() )
25 { }
26
27 // dtor
28 BamWriterPrivate::~BamWriterPrivate(void) {
29     m_stream.Close();
30 }
31
32 // calculates minimum bin for a BAM alignment interval
33 unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const {
34     --end;
35     if ( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14);
36     if ( (begin >> 17) == (end >> 17) ) return  585 + (begin >> 17);
37     if ( (begin >> 20) == (end >> 20) ) return   73 + (begin >> 20);
38     if ( (begin >> 23) == (end >> 23) ) return    9 + (begin >> 23);
39     if ( (begin >> 26) == (end >> 26) ) return    1 + (begin >> 26);
40     return 0;
41 }
42
43 // closes the alignment archive
44 void BamWriterPrivate::Close(void) {
45     m_stream.Close();
46 }
47
48 // creates a cigar string from the supplied alignment
49 void BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
50
51     // initialize
52     const unsigned int numCigarOperations = cigarOperations.size();
53     packedCigar.resize(numCigarOperations * Constants::BAM_SIZEOF_INT);
54
55     // pack the cigar data into the string
56     unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
57
58     // iterate over cigar operations
59     vector<CigarOp>::const_iterator coIter = cigarOperations.begin();
60     vector<CigarOp>::const_iterator coEnd  = cigarOperations.end();
61     for ( ; coIter != coEnd; ++coIter ) {
62
63         // store op in packedCigar
64         unsigned int cigarOp;
65         switch ( coIter->Type ) {
66             case (Constants::BAM_CIGAR_MATCH_CHAR)    : cigarOp = Constants::BAM_CIGAR_MATCH;    break;
67             case (Constants::BAM_CIGAR_INS_CHAR)      : cigarOp = Constants::BAM_CIGAR_INS;      break;
68             case (Constants::BAM_CIGAR_DEL_CHAR)      : cigarOp = Constants::BAM_CIGAR_DEL;      break;
69             case (Constants::BAM_CIGAR_REFSKIP_CHAR)  : cigarOp = Constants::BAM_CIGAR_REFSKIP;  break;
70             case (Constants::BAM_CIGAR_SOFTCLIP_CHAR) : cigarOp = Constants::BAM_CIGAR_SOFTCLIP; break;
71             case (Constants::BAM_CIGAR_HARDCLIP_CHAR) : cigarOp = Constants::BAM_CIGAR_HARDCLIP; break;
72             case (Constants::BAM_CIGAR_PAD_CHAR)      : cigarOp = Constants::BAM_CIGAR_PAD;      break;
73             case (Constants::BAM_CIGAR_SEQMATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_SEQMATCH; break;
74             case (Constants::BAM_CIGAR_MISMATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_MISMATCH; break;
75             default:
76               fprintf(stderr, "BamWriter ERROR: unknown cigar operation found: %c\n", coIter->Type);
77               exit(1);
78         }
79
80         *pPackedCigar = coIter->Length << Constants::BAM_CIGAR_SHIFT | cigarOp;
81         pPackedCigar++;
82     }
83 }
84
85 // encodes the supplied query sequence into 4-bit notation
86 void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {
87
88     // prepare the encoded query string
89     const unsigned int queryLen = query.size();
90     const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
91     encodedQuery.resize(encodedQueryLen);
92     char* pEncodedQuery = (char*)encodedQuery.data();
93     const char* pQuery = (const char*)query.data();
94
95     unsigned char nucleotideCode;
96     bool useHighWord = true;
97
98     while ( *pQuery ) {
99         switch ( *pQuery ) {
100             case (Constants::BAM_DNA_EQUAL) : nucleotideCode = Constants::BAM_BASECODE_EQUAL; break;
101             case (Constants::BAM_DNA_A)     : nucleotideCode = Constants::BAM_BASECODE_A;     break;
102             case (Constants::BAM_DNA_C)     : nucleotideCode = Constants::BAM_BASECODE_C;     break;
103             case (Constants::BAM_DNA_G)     : nucleotideCode = Constants::BAM_BASECODE_G;     break;
104             case (Constants::BAM_DNA_T)     : nucleotideCode = Constants::BAM_BASECODE_T;     break;
105             case (Constants::BAM_DNA_N)     : nucleotideCode = Constants::BAM_BASECODE_N;     break;
106             default:
107                 fprintf(stderr, "BamWriter ERROR: only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
108                 exit(1);
109         }
110
111         // pack the nucleotide code
112         if ( useHighWord ) {
113             *pEncodedQuery = nucleotideCode << 4;
114             useHighWord = false;
115         } else {
116             *pEncodedQuery |= nucleotideCode;
117             ++pEncodedQuery;
118             useHighWord = true;
119         }
120
121         // increment the query position
122         ++pQuery;
123     }
124 }
125
126 // returns whether BAM file is open for writing or not
127 bool BamWriterPrivate::IsOpen(void) const {
128     return m_stream.IsOpen;
129 }
130
131 // opens the alignment archive
132 bool BamWriterPrivate::Open(const string& filename,
133                             const string& samHeaderText,
134                             const RefVector& referenceSequences)
135 {
136     // open the BGZF file for writing, return failure if error
137     if ( !m_stream.Open(filename, "wb") )
138         return false;
139
140     // write BAM file 'metadata' components
141     WriteMagicNumber();
142     WriteSamHeaderText(samHeaderText);
143     WriteReferences(referenceSequences);
144     return true;
145 }
146
147 // saves the alignment to the alignment archive
148 void BamWriterPrivate::SaveAlignment(const BamAlignment& al) {
149
150     // if BamAlignment contains only the core data and a raw char data buffer
151     // (as a result of BamReader::GetNextAlignmentCore())
152     if ( al.SupportData.HasCoreOnly ) {
153
154         // write the block size
155         unsigned int blockSize = al.SupportData.BlockLength;
156         if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize);
157         m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT);
158
159         // re-calculate bin (in case BamAlignment's position has been previously modified)
160         const uint32_t alignmentBin = CalculateMinimumBin(al.Position, al.GetEndPosition());
161
162         // assign the BAM core data
163         uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE];
164         buffer[0] = al.RefID;
165         buffer[1] = al.Position;
166         buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;
167         buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;
168         buffer[4] = al.SupportData.QuerySequenceLength;
169         buffer[5] = al.MateRefID;
170         buffer[6] = al.MatePosition;
171         buffer[7] = al.InsertSize;
172
173         // swap BAM core endian-ness, if necessary
174         if ( m_isBigEndian ) {
175             for ( int i = 0; i < 8; ++i )
176                 BamTools::SwapEndian_32(buffer[i]);
177         }
178
179         // write the BAM core
180         m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE);
181
182         // write the raw char data
183         m_stream.Write((char*)al.SupportData.AllCharData.data(),
184                        al.SupportData.BlockLength-Constants::BAM_CORE_SIZE);
185     }
186
187     // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc
188     // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code )
189     else {
190
191         // calculate char lengths
192         const unsigned int nameLength         = al.Name.size() + 1;
193         const unsigned int numCigarOperations = al.CigarData.size();
194         const unsigned int queryLength        = al.QueryBases.size();
195         const unsigned int tagDataLength      = al.TagData.size();
196
197         // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)
198         // force calculation of Bin before storing
199         const int endPosition = al.GetEndPosition();
200         const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);
201
202         // create our packed cigar string
203         string packedCigar;
204         CreatePackedCigar(al.CigarData, packedCigar);
205         const unsigned int packedCigarLength = packedCigar.size();
206
207         // encode the query
208         string encodedQuery;
209         EncodeQuerySequence(al.QueryBases, encodedQuery);
210         const unsigned int encodedQueryLength = encodedQuery.size();
211
212         // write the block size
213         const unsigned int dataBlockSize = nameLength +
214                                            packedCigarLength +
215                                            encodedQueryLength +
216                                            queryLength +
217                                            tagDataLength;
218         unsigned int blockSize = Constants::BAM_CORE_SIZE + dataBlockSize;
219         if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize);
220         m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT);
221
222         // assign the BAM core data
223         uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE];
224         buffer[0] = al.RefID;
225         buffer[1] = al.Position;
226         buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;
227         buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
228         buffer[4] = queryLength;
229         buffer[5] = al.MateRefID;
230         buffer[6] = al.MatePosition;
231         buffer[7] = al.InsertSize;
232
233         // swap BAM core endian-ness, if necessary
234         if ( m_isBigEndian ) {
235             for ( int i = 0; i < 8; ++i )
236                 BamTools::SwapEndian_32(buffer[i]);
237         }
238
239         // write the BAM core
240         m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE);
241
242         // write the query name
243         m_stream.Write(al.Name.c_str(), nameLength);
244
245         // write the packed cigar
246         if ( m_isBigEndian ) {
247             char* cigarData = (char*)calloc(sizeof(char), packedCigarLength);
248             memcpy(cigarData, packedCigar.data(), packedCigarLength);
249             if ( m_isBigEndian ) {
250                 for ( unsigned int i = 0; i < packedCigarLength; ++i )
251                     BamTools::SwapEndian_32p(&cigarData[i]);
252             }
253             m_stream.Write(cigarData, packedCigarLength);
254             free(cigarData);
255         }
256         else
257             m_stream.Write(packedCigar.data(), packedCigarLength);
258
259         // write the encoded query sequence
260         m_stream.Write(encodedQuery.data(), encodedQueryLength);
261
262         // write the base qualities
263         char* pBaseQualities = (char*)al.Qualities.data();
264         for ( unsigned int i = 0; i < queryLength; ++i )
265             pBaseQualities[i] -= 33; // FASTQ conversion
266         m_stream.Write(pBaseQualities, queryLength);
267
268         // write the read group tag
269         if ( m_isBigEndian ) {
270
271             char* tagData = (char*)calloc(sizeof(char), tagDataLength);
272             memcpy(tagData, al.TagData.data(), tagDataLength);
273
274             int i = 0;
275             while ( (unsigned int)i < tagDataLength ) {
276
277                 i += Constants::BAM_TAG_TAGSIZE;  // skip tag chars (e.g. "RG", "NM", etc.)
278                 const char type = tagData[i];     // get tag type at position i
279                 ++i;
280
281                 switch ( type ) {
282
283                     case(Constants::BAM_TAG_TYPE_ASCII) :
284                     case(Constants::BAM_TAG_TYPE_INT8)  :
285                     case(Constants::BAM_TAG_TYPE_UINT8) :
286                         ++i;
287                         break;
288
289                     case(Constants::BAM_TAG_TYPE_INT16)  :
290                     case(Constants::BAM_TAG_TYPE_UINT16) :
291                         BamTools::SwapEndian_16p(&tagData[i]);
292                         i += sizeof(uint16_t);
293                         break;
294
295                     case(Constants::BAM_TAG_TYPE_FLOAT)  :
296                     case(Constants::BAM_TAG_TYPE_INT32)  :
297                     case(Constants::BAM_TAG_TYPE_UINT32) :
298                         BamTools::SwapEndian_32p(&tagData[i]);
299                         i += sizeof(uint32_t);
300                         break;
301
302                     case(Constants::BAM_TAG_TYPE_HEX) :
303                     case(Constants::BAM_TAG_TYPE_STRING) :
304                         // no endian swapping necessary for hex-string/string data
305                         while ( tagData[i] )
306                             ++i;
307                         // increment one more for null terminator
308                         ++i;
309                         break;
310
311                     case(Constants::BAM_TAG_TYPE_ARRAY) :
312
313                     {
314                         // read array type
315                         const char arrayType = tagData[i];
316                         ++i;
317
318                         // swap endian-ness of number of elements in place, then retrieve for loop
319                         BamTools::SwapEndian_32p(&tagData[i]);
320                         int32_t numElements;
321                         memcpy(&numElements, &tagData[i], sizeof(uint32_t));
322                         i += sizeof(uint32_t);
323
324                         // swap endian-ness of array elements
325                         for ( int j = 0; j < numElements; ++j ) {
326                             switch (arrayType) {
327                                 case (Constants::BAM_TAG_TYPE_INT8)  :
328                                 case (Constants::BAM_TAG_TYPE_UINT8) :
329                                     // no endian-swapping necessary
330                                     ++i;
331                                     break;
332                                 case (Constants::BAM_TAG_TYPE_INT16)  :
333                                 case (Constants::BAM_TAG_TYPE_UINT16) :
334                                     BamTools::SwapEndian_16p(&tagData[i]);
335                                     i += sizeof(uint16_t);
336                                     break;
337                                 case (Constants::BAM_TAG_TYPE_FLOAT)  :
338                                 case (Constants::BAM_TAG_TYPE_INT32)  :
339                                 case (Constants::BAM_TAG_TYPE_UINT32) :
340                                     BamTools::SwapEndian_32p(&tagData[i]);
341                                     i += sizeof(uint32_t);
342                                     break;
343                                 default:
344                                     // error case
345                                     fprintf(stderr,
346                                             "BamWriter ERROR: unknown binary array type encountered: [%c]\n",
347                                             arrayType);
348                                     exit(1);
349                             }
350                         }
351
352                         break;
353                     }
354
355                     default :
356                         fprintf(stderr, "BamWriter ERROR: invalid tag value type\n"); // shouldn't get here
357                         free(tagData);
358                         exit(1);
359                 }
360             }
361             m_stream.Write(tagData, tagDataLength);
362             free(tagData);
363         }
364         else
365             m_stream.Write(al.TagData.data(), tagDataLength);
366     }
367 }
368
369 void BamWriterPrivate::SetWriteCompressed(bool ok) {
370
371     // warn if BAM file is already open
372     // modifying compression is not allowed in this case
373     if ( IsOpen() ) {
374         cerr << "BamWriter WARNING: attempting to change compression mode on an open BAM file is not allowed. "
375              << "Ignoring request." << endl;
376         return;
377     }
378
379     // set BgzfStream compression mode
380     m_stream.SetWriteCompressed(ok);
381 }
382
383 void BamWriterPrivate::WriteMagicNumber(void) {
384     // write BAM file 'magic number'
385     m_stream.Write(Constants::BAM_HEADER_MAGIC, Constants::BAM_HEADER_MAGIC_LENGTH);
386 }
387
388 void BamWriterPrivate::WriteReferences(const BamTools::RefVector& referenceSequences) {
389
390     // write the number of reference sequences
391     uint32_t numReferenceSequences = referenceSequences.size();
392     if ( m_isBigEndian ) BamTools::SwapEndian_32(numReferenceSequences);
393     m_stream.Write((char*)&numReferenceSequences, Constants::BAM_SIZEOF_INT);
394
395     // foreach reference sequence
396     RefVector::const_iterator rsIter = referenceSequences.begin();
397     RefVector::const_iterator rsEnd  = referenceSequences.end();
398     for ( ; rsIter != rsEnd; ++rsIter ) {
399
400         // write the reference sequence name length
401         uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;
402         if ( m_isBigEndian ) BamTools::SwapEndian_32(referenceSequenceNameLen);
403         m_stream.Write((char*)&referenceSequenceNameLen, Constants::BAM_SIZEOF_INT);
404
405         // write the reference sequence name
406         m_stream.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);
407
408         // write the reference sequence length
409         int32_t referenceLength = rsIter->RefLength;
410         if ( m_isBigEndian ) BamTools::SwapEndian_32(referenceLength);
411         m_stream.Write((char*)&referenceLength, Constants::BAM_SIZEOF_INT);
412     }
413 }
414
415 void BamWriterPrivate::WriteSamHeaderText(const std::string& samHeaderText) {
416
417     // write the SAM header  text length
418     uint32_t samHeaderLen = samHeaderText.size();
419     if ( m_isBigEndian ) BamTools::SwapEndian_32(samHeaderLen);
420     m_stream.Write((char*)&samHeaderLen, Constants::BAM_SIZEOF_INT);
421
422     // write the SAM header text
423     if ( samHeaderLen > 0 )
424         m_stream.Write(samHeaderText.data(), samHeaderLen);
425 }