]> git.donarmstrong.com Git - bamtools.git/blob - src/api/internal/BamWriter_p.cpp
7147a33ff654918e720cd4ea9acf0036083a5213
[bamtools.git] / src / api / internal / BamWriter_p.cpp
1 // ***************************************************************************
2 // BamWriter_p.cpp (c) 2010 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 21 March 2011 (DB)
7 // ---------------------------------------------------------------------------
8 // Provides the basic functionality for producing BAM files
9 // ***************************************************************************
10
11 #include <api/BamAlignment.h>
12 #include <api/BamConstants.h>
13 #include <api/internal/BamWriter_p.h>
14 using namespace BamTools;
15 using namespace BamTools::Internal;
16
17 #include <cstdio>
18 #include <cstdlib>
19 #include <cstring>
20 using namespace std;
21
22 // ctor
23 BamWriterPrivate::BamWriterPrivate(void)
24     : m_isBigEndian( BamTools::SystemIsBigEndian() )
25 { }
26
27 // dtor
28 BamWriterPrivate::~BamWriterPrivate(void) {
29     m_stream.Close();
30 }
31
32 // calculates minimum bin for a BAM alignment interval
33 unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const {
34     --end;
35     if ( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14);
36     if ( (begin >> 17) == (end >> 17) ) return  585 + (begin >> 17);
37     if ( (begin >> 20) == (end >> 20) ) return   73 + (begin >> 20);
38     if ( (begin >> 23) == (end >> 23) ) return    9 + (begin >> 23);
39     if ( (begin >> 26) == (end >> 26) ) return    1 + (begin >> 26);
40     return 0;
41 }
42
43 // closes the alignment archive
44 void BamWriterPrivate::Close(void) {
45     m_stream.Close();
46 }
47
48 // creates a cigar string from the supplied alignment
49 void BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
50
51     // initialize
52     const unsigned int numCigarOperations = cigarOperations.size();
53     packedCigar.resize(numCigarOperations * Constants::BAM_SIZEOF_INT);
54
55     // pack the cigar data into the string
56     unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
57
58     // iterate over cigar operations
59     vector<CigarOp>::const_iterator coIter = cigarOperations.begin();
60     vector<CigarOp>::const_iterator coEnd  = cigarOperations.end();
61     for ( ; coIter != coEnd; ++coIter ) {
62
63         // store op in packedCigar
64         unsigned int cigarOp;
65         switch ( coIter->Type ) {
66             case (Constants::BAM_CIGAR_MATCH_CHAR)    : cigarOp = Constants::BAM_CIGAR_MATCH;    break;
67             case (Constants::BAM_CIGAR_INS_CHAR)      : cigarOp = Constants::BAM_CIGAR_INS;      break;
68             case (Constants::BAM_CIGAR_DEL_CHAR)      : cigarOp = Constants::BAM_CIGAR_DEL;      break;
69             case (Constants::BAM_CIGAR_REFSKIP_CHAR)  : cigarOp = Constants::BAM_CIGAR_REFSKIP;  break;
70             case (Constants::BAM_CIGAR_SOFTCLIP_CHAR) : cigarOp = Constants::BAM_CIGAR_SOFTCLIP; break;
71             case (Constants::BAM_CIGAR_HARDCLIP_CHAR) : cigarOp = Constants::BAM_CIGAR_HARDCLIP; break;
72             case (Constants::BAM_CIGAR_PAD_CHAR)      : cigarOp = Constants::BAM_CIGAR_PAD;      break;
73             default:
74               fprintf(stderr, "BamWriter ERROR: unknown cigar operation found: %c\n", coIter->Type);
75               exit(1);
76         }
77
78         *pPackedCigar = coIter->Length << Constants::BAM_CIGAR_SHIFT | cigarOp;
79         pPackedCigar++;
80     }
81 }
82
83 // encodes the supplied query sequence into 4-bit notation
84 void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {
85
86     // prepare the encoded query string
87     const unsigned int queryLen = query.size();
88     const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
89     encodedQuery.resize(encodedQueryLen);
90     char* pEncodedQuery = (char*)encodedQuery.data();
91     const char* pQuery = (const char*)query.data();
92
93     unsigned char nucleotideCode;
94     bool useHighWord = true;
95
96     while ( *pQuery ) {
97         switch ( *pQuery ) {
98             case (Constants::BAM_DNA_EQUAL) : nucleotideCode = Constants::BAM_BASECODE_EQUAL; break;
99             case (Constants::BAM_DNA_A)     : nucleotideCode = Constants::BAM_BASECODE_A;     break;
100             case (Constants::BAM_DNA_C)     : nucleotideCode = Constants::BAM_BASECODE_C;     break;
101             case (Constants::BAM_DNA_G)     : nucleotideCode = Constants::BAM_BASECODE_G;     break;
102             case (Constants::BAM_DNA_T)     : nucleotideCode = Constants::BAM_BASECODE_T;     break;
103             case (Constants::BAM_DNA_N)     : nucleotideCode = Constants::BAM_BASECODE_N;     break;
104             default:
105                 fprintf(stderr, "BamWriter ERROR: only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
106                 exit(1);
107         }
108
109         // pack the nucleotide code
110         if ( useHighWord ) {
111             *pEncodedQuery = nucleotideCode << 4;
112             useHighWord = false;
113         } else {
114             *pEncodedQuery |= nucleotideCode;
115             ++pEncodedQuery;
116             useHighWord = true;
117         }
118
119         // increment the query position
120         ++pQuery;
121     }
122 }
123
124 // returns whether BAM file is open for writing or not
125 bool BamWriterPrivate::IsOpen(void) const {
126     return m_stream.IsOpen;
127 }
128
129 // opens the alignment archive
130 bool BamWriterPrivate::Open(const string& filename,
131                             const string& samHeaderText,
132                             const RefVector& referenceSequences)
133 {
134     // open the BGZF file for writing, return failure if error
135     if ( !m_stream.Open(filename, "wb") )
136         return false;
137
138     // write BAM file 'metadata' components
139     WriteMagicNumber();
140     WriteSamHeaderText(samHeaderText);
141     WriteReferences(referenceSequences);
142     return true;
143 }
144
145 // saves the alignment to the alignment archive
146 void BamWriterPrivate::SaveAlignment(const BamAlignment& al) {
147
148     // if BamAlignment contains only the core data and a raw char data buffer
149     // (as a result of BamReader::GetNextAlignmentCore())
150     if ( al.SupportData.HasCoreOnly ) {
151
152         // write the block size
153         unsigned int blockSize = al.SupportData.BlockLength;
154         if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize);
155         m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT);
156
157         // assign the BAM core data
158         uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE];
159         buffer[0] = al.RefID;
160         buffer[1] = al.Position;
161         buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;
162         buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;
163         buffer[4] = al.SupportData.QuerySequenceLength;
164         buffer[5] = al.MateRefID;
165         buffer[6] = al.MatePosition;
166         buffer[7] = al.InsertSize;
167
168         // swap BAM core endian-ness, if necessary
169         if ( m_isBigEndian ) {
170             for ( int i = 0; i < 8; ++i )
171                 BamTools::SwapEndian_32(buffer[i]);
172         }
173
174         // write the BAM core
175         m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE);
176
177         // write the raw char data
178         m_stream.Write((char*)al.SupportData.AllCharData.data(),
179                        al.SupportData.BlockLength-Constants::BAM_CORE_SIZE);
180     }
181
182     // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc
183     // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code )
184     else {
185
186         // calculate char lengths
187         const unsigned int nameLength         = al.Name.size() + 1;
188         const unsigned int numCigarOperations = al.CigarData.size();
189         const unsigned int queryLength        = al.QueryBases.size();
190         const unsigned int tagDataLength      = al.TagData.size();
191
192         // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)
193         // force calculation of Bin before storing
194         const int endPosition = al.GetEndPosition();
195         const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);
196
197         // create our packed cigar string
198         string packedCigar;
199         CreatePackedCigar(al.CigarData, packedCigar);
200         const unsigned int packedCigarLength = packedCigar.size();
201
202         // encode the query
203         string encodedQuery;
204         EncodeQuerySequence(al.QueryBases, encodedQuery);
205         const unsigned int encodedQueryLength = encodedQuery.size();
206
207         // write the block size
208         const unsigned int dataBlockSize = nameLength +
209                                            packedCigarLength +
210                                            encodedQueryLength +
211                                            queryLength +
212                                            tagDataLength;
213         unsigned int blockSize = Constants::BAM_CORE_SIZE + dataBlockSize;
214         if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize);
215         m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT);
216
217         // assign the BAM core data
218         uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE];
219         buffer[0] = al.RefID;
220         buffer[1] = al.Position;
221         buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;
222         buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
223         buffer[4] = queryLength;
224         buffer[5] = al.MateRefID;
225         buffer[6] = al.MatePosition;
226         buffer[7] = al.InsertSize;
227
228         // swap BAM core endian-ness, if necessary
229         if ( m_isBigEndian ) {
230             for ( int i = 0; i < 8; ++i )
231                 BamTools::SwapEndian_32(buffer[i]);
232         }
233
234         // write the BAM core
235         m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE);
236
237         // write the query name
238         m_stream.Write(al.Name.c_str(), nameLength);
239
240         // write the packed cigar
241         if ( m_isBigEndian ) {
242             char* cigarData = (char*)calloc(sizeof(char), packedCigarLength);
243             memcpy(cigarData, packedCigar.data(), packedCigarLength);
244             for (unsigned int i = 0; i < packedCigarLength; ++i)
245                 if (m_isBigEndian) BamTools::SwapEndian_32p(&cigarData[i]);
246             m_stream.Write(cigarData, packedCigarLength);
247             free(cigarData);
248         }
249         else
250             m_stream.Write(packedCigar.data(), packedCigarLength);
251
252         // write the encoded query sequence
253         m_stream.Write(encodedQuery.data(), encodedQueryLength);
254
255         // write the base qualities
256         char* pBaseQualities = (char*)al.Qualities.data();
257         for(unsigned int i = 0; i < queryLength; i++)
258             pBaseQualities[i] -= 33; // FASTQ conversion
259         m_stream.Write(pBaseQualities, queryLength);
260
261         // write the read group tag
262         if ( m_isBigEndian ) {
263
264             char* tagData = (char*)calloc(sizeof(char), tagDataLength);
265             memcpy(tagData, al.TagData.data(), tagDataLength);
266
267             int i = 0;
268             while ( (unsigned int)i < tagDataLength ) {
269
270                 i += Constants::BAM_TAG_TAGSIZE;  // skip tag chars (e.g. "RG", "NM", etc.)
271                 const char type = tagData[i];     // get tag type at position i
272                 ++i;
273
274                 switch ( type ) {
275
276                     case(Constants::BAM_TAG_TYPE_ASCII) :
277                     case(Constants::BAM_TAG_TYPE_INT8)  :
278                     case(Constants::BAM_TAG_TYPE_UINT8) :
279                         ++i;
280                         break;
281
282                     case(Constants::BAM_TAG_TYPE_INT16)  :
283                     case(Constants::BAM_TAG_TYPE_UINT16) :
284                         BamTools::SwapEndian_16p(&tagData[i]);
285                         i += sizeof(uint16_t);
286                         break;
287
288                     case(Constants::BAM_TAG_TYPE_FLOAT)  :
289                     case(Constants::BAM_TAG_TYPE_INT32)  :
290                     case(Constants::BAM_TAG_TYPE_UINT32) :
291                         BamTools::SwapEndian_32p(&tagData[i]);
292                         i += sizeof(uint32_t);
293                         break;
294
295                     case(Constants::BAM_TAG_TYPE_HEX) :
296                     case(Constants::BAM_TAG_TYPE_STRING) :
297                         // no endian swapping necessary for hex-string/string data
298                         while (tagData[i]) { ++i; }
299                         // increment one more for null terminator
300                         ++i;
301                         break;
302
303                     default :
304                         fprintf(stderr, "BamWriter ERROR: invalid tag value type\n"); // shouldn't get here
305                         free(tagData);
306                         exit(1);
307                 }
308             }
309             m_stream.Write(tagData, tagDataLength);
310             free(tagData);
311         }
312         else
313             m_stream.Write(al.TagData.data(), tagDataLength);
314     }
315 }
316
317 void BamWriterPrivate::SetWriteCompressed(bool ok) {
318
319     // warn if BAM file is already open
320     // modifying compression is not allowed in this case
321     if ( IsOpen() ) {
322         cerr << "BamWriter WARNING: attempting to change compression mode on an open BAM file is not allowed. "
323              << "Ignoring request." << endl;
324         return;
325     }
326
327     // set BgzfStream compression mode
328     m_stream.SetWriteCompressed(ok);
329 }
330
331 void BamWriterPrivate::WriteMagicNumber(void) {
332     // write BAM file 'magic number'
333     m_stream.Write(Constants::BAM_HEADER_MAGIC, Constants::BAM_HEADER_MAGIC_LENGTH);
334 }
335
336 void BamWriterPrivate::WriteReferences(const BamTools::RefVector& referenceSequences) {
337
338     // write the number of reference sequences
339     uint32_t numReferenceSequences = referenceSequences.size();
340     if ( m_isBigEndian ) BamTools::SwapEndian_32(numReferenceSequences);
341     m_stream.Write((char*)&numReferenceSequences, Constants::BAM_SIZEOF_INT);
342
343     // foreach reference sequence
344     RefVector::const_iterator rsIter = referenceSequences.begin();
345     RefVector::const_iterator rsEnd  = referenceSequences.end();
346     for ( ; rsIter != rsEnd; ++rsIter ) {
347
348         // write the reference sequence name length
349         uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;
350         if ( m_isBigEndian ) BamTools::SwapEndian_32(referenceSequenceNameLen);
351         m_stream.Write((char*)&referenceSequenceNameLen, Constants::BAM_SIZEOF_INT);
352
353         // write the reference sequence name
354         m_stream.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);
355
356         // write the reference sequence length
357         int32_t referenceLength = rsIter->RefLength;
358         if ( m_isBigEndian ) BamTools::SwapEndian_32(referenceLength);
359         m_stream.Write((char*)&referenceLength, Constants::BAM_SIZEOF_INT);
360     }
361 }
362
363 void BamWriterPrivate::WriteSamHeaderText(const std::string& samHeaderText) {
364
365     // write the SAM header  text length
366     uint32_t samHeaderLen = samHeaderText.size();
367     if ( m_isBigEndian ) BamTools::SwapEndian_32(samHeaderLen);
368     m_stream.Write((char*)&samHeaderLen, Constants::BAM_SIZEOF_INT);
369
370     // write the SAM header text
371     if (samHeaderLen > 0)
372         m_stream.Write(samHeaderText.data(), samHeaderLen);
373 }