]> git.donarmstrong.com Git - bamtools.git/blob - src/api/internal/BamWriter_p.cpp
e77c099f8a53b6488e39c6c2e3a630c3e1884570
[bamtools.git] / src / api / internal / BamWriter_p.cpp
1 // ***************************************************************************
2 // BamWriter_p.cpp (c) 2010 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 16 June 2011 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides the basic functionality for producing BAM files
8 // ***************************************************************************
9
10 #include <api/BamAlignment.h>
11 #include <api/BamConstants.h>
12 #include <api/internal/BamWriter_p.h>
13 using namespace BamTools;
14 using namespace BamTools::Internal;
15
16 #include <cstdio>
17 #include <cstdlib>
18 #include <cstring>
19 using namespace std;
20
21 // ctor
22 BamWriterPrivate::BamWriterPrivate(void)
23     : m_isBigEndian( BamTools::SystemIsBigEndian() )
24 { }
25
26 // dtor
27 BamWriterPrivate::~BamWriterPrivate(void) {
28     m_stream.Close();
29 }
30
31 // calculates minimum bin for a BAM alignment interval
32 unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const {
33     --end;
34     if ( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14);
35     if ( (begin >> 17) == (end >> 17) ) return  585 + (begin >> 17);
36     if ( (begin >> 20) == (end >> 20) ) return   73 + (begin >> 20);
37     if ( (begin >> 23) == (end >> 23) ) return    9 + (begin >> 23);
38     if ( (begin >> 26) == (end >> 26) ) return    1 + (begin >> 26);
39     return 0;
40 }
41
42 // closes the alignment archive
43 void BamWriterPrivate::Close(void) {
44     m_stream.Close();
45 }
46
47 // creates a cigar string from the supplied alignment
48 void BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
49
50     // initialize
51     const unsigned int numCigarOperations = cigarOperations.size();
52     packedCigar.resize(numCigarOperations * Constants::BAM_SIZEOF_INT);
53
54     // pack the cigar data into the string
55     unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
56
57     // iterate over cigar operations
58     vector<CigarOp>::const_iterator coIter = cigarOperations.begin();
59     vector<CigarOp>::const_iterator coEnd  = cigarOperations.end();
60     for ( ; coIter != coEnd; ++coIter ) {
61
62         // store op in packedCigar
63         unsigned int cigarOp;
64         switch ( coIter->Type ) {
65             case (Constants::BAM_CIGAR_MATCH_CHAR)    : cigarOp = Constants::BAM_CIGAR_MATCH;    break;
66             case (Constants::BAM_CIGAR_INS_CHAR)      : cigarOp = Constants::BAM_CIGAR_INS;      break;
67             case (Constants::BAM_CIGAR_DEL_CHAR)      : cigarOp = Constants::BAM_CIGAR_DEL;      break;
68             case (Constants::BAM_CIGAR_REFSKIP_CHAR)  : cigarOp = Constants::BAM_CIGAR_REFSKIP;  break;
69             case (Constants::BAM_CIGAR_SOFTCLIP_CHAR) : cigarOp = Constants::BAM_CIGAR_SOFTCLIP; break;
70             case (Constants::BAM_CIGAR_HARDCLIP_CHAR) : cigarOp = Constants::BAM_CIGAR_HARDCLIP; break;
71             case (Constants::BAM_CIGAR_PAD_CHAR)      : cigarOp = Constants::BAM_CIGAR_PAD;      break;
72             case (Constants::BAM_CIGAR_SEQMATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_SEQMATCH; break;
73             case (Constants::BAM_CIGAR_MISMATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_MISMATCH; break;
74             default:
75               fprintf(stderr, "BamWriter ERROR: unknown cigar operation found: %c\n", coIter->Type);
76               exit(1);
77         }
78
79         *pPackedCigar = coIter->Length << Constants::BAM_CIGAR_SHIFT | cigarOp;
80         pPackedCigar++;
81     }
82 }
83
84 // encodes the supplied query sequence into 4-bit notation
85 void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {
86
87     // prepare the encoded query string
88     const unsigned int queryLen = query.size();
89     const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
90     encodedQuery.resize(encodedQueryLen);
91     char* pEncodedQuery = (char*)encodedQuery.data();
92     const char* pQuery = (const char*)query.data();
93
94     unsigned char nucleotideCode;
95     bool useHighWord = true;
96
97     while ( *pQuery ) {
98         switch ( *pQuery ) {
99             case (Constants::BAM_DNA_EQUAL) : nucleotideCode = Constants::BAM_BASECODE_EQUAL; break;
100             case (Constants::BAM_DNA_A)     : nucleotideCode = Constants::BAM_BASECODE_A;     break;
101             case (Constants::BAM_DNA_C)     : nucleotideCode = Constants::BAM_BASECODE_C;     break;
102             case (Constants::BAM_DNA_G)     : nucleotideCode = Constants::BAM_BASECODE_G;     break;
103             case (Constants::BAM_DNA_T)     : nucleotideCode = Constants::BAM_BASECODE_T;     break;
104             case (Constants::BAM_DNA_N)     : nucleotideCode = Constants::BAM_BASECODE_N;     break;
105             default:
106                 fprintf(stderr, "BamWriter ERROR: only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
107                 exit(1);
108         }
109
110         // pack the nucleotide code
111         if ( useHighWord ) {
112             *pEncodedQuery = nucleotideCode << 4;
113             useHighWord = false;
114         } else {
115             *pEncodedQuery |= nucleotideCode;
116             ++pEncodedQuery;
117             useHighWord = true;
118         }
119
120         // increment the query position
121         ++pQuery;
122     }
123 }
124
125 // returns whether BAM file is open for writing or not
126 bool BamWriterPrivate::IsOpen(void) const {
127     return m_stream.IsOpen;
128 }
129
130 // opens the alignment archive
131 bool BamWriterPrivate::Open(const string& filename,
132                             const string& samHeaderText,
133                             const RefVector& referenceSequences)
134 {
135     // open the BGZF file for writing, return failure if error
136     if ( !m_stream.Open(filename, "wb") )
137         return false;
138
139     // write BAM file 'metadata' components
140     WriteMagicNumber();
141     WriteSamHeaderText(samHeaderText);
142     WriteReferences(referenceSequences);
143     return true;
144 }
145
146 // saves the alignment to the alignment archive
147 void BamWriterPrivate::SaveAlignment(const BamAlignment& al) {
148
149     // if BamAlignment contains only the core data and a raw char data buffer
150     // (as a result of BamReader::GetNextAlignmentCore())
151     if ( al.SupportData.HasCoreOnly ) {
152
153         // write the block size
154         unsigned int blockSize = al.SupportData.BlockLength;
155         if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize);
156         m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT);
157
158         // re-calculate bin (in case BamAlignment's position has been previously modified)
159         const uint32_t alignmentBin = CalculateMinimumBin(al.Position, al.GetEndPosition());
160
161         // assign the BAM core data
162         uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE];
163         buffer[0] = al.RefID;
164         buffer[1] = al.Position;
165         buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;
166         buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;
167         buffer[4] = al.SupportData.QuerySequenceLength;
168         buffer[5] = al.MateRefID;
169         buffer[6] = al.MatePosition;
170         buffer[7] = al.InsertSize;
171
172         // swap BAM core endian-ness, if necessary
173         if ( m_isBigEndian ) {
174             for ( int i = 0; i < 8; ++i )
175                 BamTools::SwapEndian_32(buffer[i]);
176         }
177
178         // write the BAM core
179         m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE);
180
181         // write the raw char data
182         m_stream.Write((char*)al.SupportData.AllCharData.data(),
183                        al.SupportData.BlockLength-Constants::BAM_CORE_SIZE);
184     }
185
186     // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc
187     // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code )
188     else {
189
190         // calculate char lengths
191         const unsigned int nameLength         = al.Name.size() + 1;
192         const unsigned int numCigarOperations = al.CigarData.size();
193         const unsigned int queryLength        = al.QueryBases.size();
194         const unsigned int tagDataLength      = al.TagData.size();
195
196         // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)
197         // force calculation of Bin before storing
198         const int endPosition = al.GetEndPosition();
199         const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);
200
201         // create our packed cigar string
202         string packedCigar;
203         CreatePackedCigar(al.CigarData, packedCigar);
204         const unsigned int packedCigarLength = packedCigar.size();
205
206         // encode the query
207         string encodedQuery;
208         EncodeQuerySequence(al.QueryBases, encodedQuery);
209         const unsigned int encodedQueryLength = encodedQuery.size();
210
211         // write the block size
212         const unsigned int dataBlockSize = nameLength +
213                                            packedCigarLength +
214                                            encodedQueryLength +
215                                            queryLength +
216                                            tagDataLength;
217         unsigned int blockSize = Constants::BAM_CORE_SIZE + dataBlockSize;
218         if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize);
219         m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT);
220
221         // assign the BAM core data
222         uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE];
223         buffer[0] = al.RefID;
224         buffer[1] = al.Position;
225         buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;
226         buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
227         buffer[4] = queryLength;
228         buffer[5] = al.MateRefID;
229         buffer[6] = al.MatePosition;
230         buffer[7] = al.InsertSize;
231
232         // swap BAM core endian-ness, if necessary
233         if ( m_isBigEndian ) {
234             for ( int i = 0; i < 8; ++i )
235                 BamTools::SwapEndian_32(buffer[i]);
236         }
237
238         // write the BAM core
239         m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE);
240
241         // write the query name
242         m_stream.Write(al.Name.c_str(), nameLength);
243
244         // write the packed cigar
245         if ( m_isBigEndian ) {
246             char* cigarData = (char*)calloc(sizeof(char), packedCigarLength);
247             memcpy(cigarData, packedCigar.data(), packedCigarLength);
248             if ( m_isBigEndian ) {
249                 for ( unsigned int i = 0; i < packedCigarLength; ++i )
250                     BamTools::SwapEndian_32p(&cigarData[i]);
251             }
252             m_stream.Write(cigarData, packedCigarLength);
253             free(cigarData);
254         }
255         else
256             m_stream.Write(packedCigar.data(), packedCigarLength);
257
258         // write the encoded query sequence
259         m_stream.Write(encodedQuery.data(), encodedQueryLength);
260
261         // write the base qualities
262         char* pBaseQualities = (char*)al.Qualities.data();
263         for ( unsigned int i = 0; i < queryLength; ++i )
264             pBaseQualities[i] -= 33; // FASTQ conversion
265         m_stream.Write(pBaseQualities, queryLength);
266
267         // write the read group tag
268         if ( m_isBigEndian ) {
269
270             char* tagData = (char*)calloc(sizeof(char), tagDataLength);
271             memcpy(tagData, al.TagData.data(), tagDataLength);
272
273             int i = 0;
274             while ( (unsigned int)i < tagDataLength ) {
275
276                 i += Constants::BAM_TAG_TAGSIZE;  // skip tag chars (e.g. "RG", "NM", etc.)
277                 const char type = tagData[i];     // get tag type at position i
278                 ++i;
279
280                 switch ( type ) {
281
282                     case(Constants::BAM_TAG_TYPE_ASCII) :
283                     case(Constants::BAM_TAG_TYPE_INT8)  :
284                     case(Constants::BAM_TAG_TYPE_UINT8) :
285                         ++i;
286                         break;
287
288                     case(Constants::BAM_TAG_TYPE_INT16)  :
289                     case(Constants::BAM_TAG_TYPE_UINT16) :
290                         BamTools::SwapEndian_16p(&tagData[i]);
291                         i += sizeof(uint16_t);
292                         break;
293
294                     case(Constants::BAM_TAG_TYPE_FLOAT)  :
295                     case(Constants::BAM_TAG_TYPE_INT32)  :
296                     case(Constants::BAM_TAG_TYPE_UINT32) :
297                         BamTools::SwapEndian_32p(&tagData[i]);
298                         i += sizeof(uint32_t);
299                         break;
300
301                     case(Constants::BAM_TAG_TYPE_HEX) :
302                     case(Constants::BAM_TAG_TYPE_STRING) :
303                         // no endian swapping necessary for hex-string/string data
304                         while ( tagData[i] )
305                             ++i;
306                         // increment one more for null terminator
307                         ++i;
308                         break;
309
310                     case(Constants::BAM_TAG_TYPE_ARRAY) :
311
312                     {
313                         // read array type
314                         const char arrayType = tagData[i];
315                         ++i;
316
317                         // swap endian-ness of number of elements in place, then retrieve for loop
318                         BamTools::SwapEndian_32p(&tagData[i]);
319                         int32_t numElements;
320                         memcpy(&numElements, &tagData[i], sizeof(uint32_t));
321                         i += sizeof(uint32_t);
322
323                         // swap endian-ness of array elements
324                         for ( int j = 0; j < numElements; ++j ) {
325                             switch (arrayType) {
326                                 case (Constants::BAM_TAG_TYPE_INT8)  :
327                                 case (Constants::BAM_TAG_TYPE_UINT8) :
328                                     // no endian-swapping necessary
329                                     ++i;
330                                     break;
331                                 case (Constants::BAM_TAG_TYPE_INT16)  :
332                                 case (Constants::BAM_TAG_TYPE_UINT16) :
333                                     BamTools::SwapEndian_16p(&tagData[i]);
334                                     i += sizeof(uint16_t);
335                                     break;
336                                 case (Constants::BAM_TAG_TYPE_FLOAT)  :
337                                 case (Constants::BAM_TAG_TYPE_INT32)  :
338                                 case (Constants::BAM_TAG_TYPE_UINT32) :
339                                     BamTools::SwapEndian_32p(&tagData[i]);
340                                     i += sizeof(uint32_t);
341                                     break;
342                                 default:
343                                     // error case
344                                     fprintf(stderr,
345                                             "BamWriter ERROR: unknown binary array type encountered: [%c]\n",
346                                             arrayType);
347                                     exit(1);
348                             }
349                         }
350
351                         break;
352                     }
353
354                     default :
355                         fprintf(stderr, "BamWriter ERROR: invalid tag value type\n"); // shouldn't get here
356                         free(tagData);
357                         exit(1);
358                 }
359             }
360             m_stream.Write(tagData, tagDataLength);
361             free(tagData);
362         }
363         else
364             m_stream.Write(al.TagData.data(), tagDataLength);
365     }
366 }
367
368 void BamWriterPrivate::SetWriteCompressed(bool ok) {
369
370     // warn if BAM file is already open
371     // modifying compression is not allowed in this case
372     if ( IsOpen() ) {
373         cerr << "BamWriter WARNING: attempting to change compression mode on an open BAM file is not allowed. "
374              << "Ignoring request." << endl;
375         return;
376     }
377
378     // set BgzfStream compression mode
379     m_stream.SetWriteCompressed(ok);
380 }
381
382 void BamWriterPrivate::WriteMagicNumber(void) {
383     // write BAM file 'magic number'
384     m_stream.Write(Constants::BAM_HEADER_MAGIC, Constants::BAM_HEADER_MAGIC_LENGTH);
385 }
386
387 void BamWriterPrivate::WriteReferences(const BamTools::RefVector& referenceSequences) {
388
389     // write the number of reference sequences
390     uint32_t numReferenceSequences = referenceSequences.size();
391     if ( m_isBigEndian ) BamTools::SwapEndian_32(numReferenceSequences);
392     m_stream.Write((char*)&numReferenceSequences, Constants::BAM_SIZEOF_INT);
393
394     // foreach reference sequence
395     RefVector::const_iterator rsIter = referenceSequences.begin();
396     RefVector::const_iterator rsEnd  = referenceSequences.end();
397     for ( ; rsIter != rsEnd; ++rsIter ) {
398
399         // write the reference sequence name length
400         uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;
401         if ( m_isBigEndian ) BamTools::SwapEndian_32(referenceSequenceNameLen);
402         m_stream.Write((char*)&referenceSequenceNameLen, Constants::BAM_SIZEOF_INT);
403
404         // write the reference sequence name
405         m_stream.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);
406
407         // write the reference sequence length
408         int32_t referenceLength = rsIter->RefLength;
409         if ( m_isBigEndian ) BamTools::SwapEndian_32(referenceLength);
410         m_stream.Write((char*)&referenceLength, Constants::BAM_SIZEOF_INT);
411     }
412 }
413
414 void BamWriterPrivate::WriteSamHeaderText(const std::string& samHeaderText) {
415
416     // write the SAM header  text length
417     uint32_t samHeaderLen = samHeaderText.size();
418     if ( m_isBigEndian ) BamTools::SwapEndian_32(samHeaderLen);
419     m_stream.Write((char*)&samHeaderLen, Constants::BAM_SIZEOF_INT);
420
421     // write the SAM header text
422     if ( samHeaderLen > 0 )
423         m_stream.Write(samHeaderText.data(), samHeaderLen);
424 }