]> git.donarmstrong.com Git - bamtools.git/blob - src/api/internal/BamWriter_p.cpp
Update to BamTools API 0.9.2 with exposure of SamHeader object from BamReader and...
[bamtools.git] / src / api / internal / BamWriter_p.cpp
1 // ***************************************************************************
2 // BamWriter_p.cpp (c) 2010 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 11 January 2011 (DB)
7 // ---------------------------------------------------------------------------
8 // Provides the basic functionality for producing BAM files
9 // ***************************************************************************
10
11 #include <api/BamAlignment.h>
12 #include <api/internal/BamWriter_p.h>
13 using namespace BamTools;
14 using namespace BamTools::Internal;
15 using namespace std;
16
17 // ctor
18 BamWriterPrivate::BamWriterPrivate(void) {
19     IsBigEndian = SystemIsBigEndian();
20 }
21
22 // dtor
23 BamWriterPrivate::~BamWriterPrivate(void) {
24     mBGZF.Close();
25 }
26
27 // calculates minimum bin for a BAM alignment interval
28 const unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const {
29     --end;
30     if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14);
31     if( (begin >> 17) == (end >> 17) ) return  585 + (begin >> 17);
32     if( (begin >> 20) == (end >> 20) ) return   73 + (begin >> 20);
33     if( (begin >> 23) == (end >> 23) ) return    9 + (begin >> 23);
34     if( (begin >> 26) == (end >> 26) ) return    1 + (begin >> 26);
35     return 0;
36 }
37
38 // closes the alignment archive
39 void BamWriterPrivate::Close(void) {
40     mBGZF.Close();
41 }
42
43 // creates a cigar string from the supplied alignment
44 void BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
45
46     // initialize
47     const unsigned int numCigarOperations = cigarOperations.size();
48     packedCigar.resize(numCigarOperations * BT_SIZEOF_INT);
49
50     // pack the cigar data into the string
51     unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
52
53     unsigned int cigarOp;
54     vector<CigarOp>::const_iterator coIter;
55     for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) {
56
57         switch(coIter->Type) {
58             case 'M': cigarOp = BAM_CMATCH; break;
59             case 'I': cigarOp = BAM_CINS; break;
60             case 'D': cigarOp = BAM_CDEL; break;
61             case 'N': cigarOp = BAM_CREF_SKIP; break;
62             case 'S': cigarOp = BAM_CSOFT_CLIP; break;
63             case 'H': cigarOp = BAM_CHARD_CLIP; break;
64             case 'P': cigarOp = BAM_CPAD; break;
65             default:
66               fprintf(stderr, "ERROR: Unknown cigar operation found: %c\n", coIter->Type);
67               exit(1);
68         }
69
70         *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;
71         pPackedCigar++;
72     }
73 }
74
75 // encodes the supplied query sequence into 4-bit notation
76 void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {
77
78     // prepare the encoded query string
79     const unsigned int queryLen = query.size();
80     const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
81     encodedQuery.resize(encodedQueryLen);
82     char* pEncodedQuery = (char*)encodedQuery.data();
83     const char* pQuery = (const char*)query.data();
84
85     unsigned char nucleotideCode;
86     bool useHighWord = true;
87
88     while(*pQuery) {
89
90         switch(*pQuery) {
91             case '=': nucleotideCode = 0; break;
92             case 'A': nucleotideCode = 1; break;
93             case 'C': nucleotideCode = 2; break;
94             case 'G': nucleotideCode = 4; break;
95             case 'T': nucleotideCode = 8; break;
96             case 'N': nucleotideCode = 15; break;
97             default:
98                 fprintf(stderr, "ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
99                 exit(1);
100         }
101
102         // pack the nucleotide code
103         if(useHighWord) {
104             *pEncodedQuery = nucleotideCode << 4;
105             useHighWord = false;
106         } else {
107             *pEncodedQuery |= nucleotideCode;
108             pEncodedQuery++;
109             useHighWord = true;
110         }
111
112         // increment the query position
113         pQuery++;
114     }
115 }
116
117 // opens the alignment archive
118 bool BamWriterPrivate::Open(const string& filename,
119                             const string& samHeader,
120                             const RefVector& referenceSequences,
121                             bool isWriteUncompressed)
122 {
123     // open the BGZF file for writing, return failure if error
124     if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) )
125         return false;
126
127     // ================
128     // write the header
129     // ================
130
131     // write the BAM signature
132     const unsigned char SIGNATURE_LENGTH = 4;
133     const char* BAM_SIGNATURE = "BAM\1";
134     mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH);
135
136     // write the SAM header text length
137     uint32_t samHeaderLen = samHeader.size();
138     if (IsBigEndian) SwapEndian_32(samHeaderLen);
139     mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT);
140
141     // write the SAM header text
142     if(samHeaderLen > 0)
143         mBGZF.Write(samHeader.data(), samHeaderLen);
144
145     // write the number of reference sequences
146     uint32_t numReferenceSequences = referenceSequences.size();
147     if (IsBigEndian) SwapEndian_32(numReferenceSequences);
148     mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);
149
150     // =============================
151     // write the sequence dictionary
152     // =============================
153
154     RefVector::const_iterator rsIter = referenceSequences.begin();
155     RefVector::const_iterator rsEnd  = referenceSequences.end();
156     for( ; rsIter != rsEnd; ++rsIter ) {
157
158         // write the reference sequence name length
159         uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;
160         if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen);
161         mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);
162
163         // write the reference sequence name
164         mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);
165
166         // write the reference sequence length
167         int32_t referenceLength = rsIter->RefLength;
168         if (IsBigEndian) SwapEndian_32(referenceLength);
169         mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);
170     }
171
172     // return success
173     return true;
174 }
175
176 // saves the alignment to the alignment archive
177 void BamWriterPrivate::SaveAlignment(const BamAlignment& al) {
178
179     // if BamAlignment contains only the core data and a raw char data buffer
180     // (as a result of BamReader::GetNextAlignmentCore())
181     if ( al.SupportData.HasCoreOnly ) {
182
183         // write the block size
184         unsigned int blockSize = al.SupportData.BlockLength;
185         if (IsBigEndian) SwapEndian_32(blockSize);
186         mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
187
188         // assign the BAM core data
189         uint32_t buffer[8];
190         buffer[0] = al.RefID;
191         buffer[1] = al.Position;
192         buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;
193         buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;
194         buffer[4] = al.SupportData.QuerySequenceLength;
195         buffer[5] = al.MateRefID;
196         buffer[6] = al.MatePosition;
197         buffer[7] = al.InsertSize;
198
199         // swap BAM core endian-ness, if necessary
200         if ( IsBigEndian ) {
201             for ( int i = 0; i < 8; ++i )
202             SwapEndian_32(buffer[i]);
203         }
204
205         // write the BAM core
206         mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
207
208         // write the raw char data
209         mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE);
210     }
211
212     // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc
213     // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code )
214     else {
215
216         // calculate char lengths
217         const unsigned int nameLength         = al.Name.size() + 1;
218         const unsigned int numCigarOperations = al.CigarData.size();
219         const unsigned int queryLength        = al.QueryBases.size();
220         const unsigned int tagDataLength      = al.TagData.size();
221
222         // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)
223         // force calculation of Bin before storing
224         const int endPosition = al.GetEndPosition();
225         const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);
226
227         // create our packed cigar string
228         string packedCigar;
229         CreatePackedCigar(al.CigarData, packedCigar);
230         const unsigned int packedCigarLength = packedCigar.size();
231
232         // encode the query
233         string encodedQuery;
234         EncodeQuerySequence(al.QueryBases, encodedQuery);
235         const unsigned int encodedQueryLength = encodedQuery.size();
236
237         // write the block size
238         const unsigned int dataBlockSize = nameLength +
239                                            packedCigarLength +
240                                            encodedQueryLength +
241                                            queryLength +
242                                            tagDataLength;
243         unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
244         if (IsBigEndian) SwapEndian_32(blockSize);
245         mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
246
247         // assign the BAM core data
248         uint32_t buffer[8];
249         buffer[0] = al.RefID;
250         buffer[1] = al.Position;
251         buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;
252         buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
253         buffer[4] = queryLength;
254         buffer[5] = al.MateRefID;
255         buffer[6] = al.MatePosition;
256         buffer[7] = al.InsertSize;
257
258         // swap BAM core endian-ness, if necessary
259         if ( IsBigEndian ) {
260             for ( int i = 0; i < 8; ++i )
261             SwapEndian_32(buffer[i]);
262         }
263
264         // write the BAM core
265         mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
266
267         // write the query name
268         mBGZF.Write(al.Name.c_str(), nameLength);
269
270         // write the packed cigar
271         if ( IsBigEndian ) {
272
273             char* cigarData = (char*)calloc(sizeof(char), packedCigarLength);
274             memcpy(cigarData, packedCigar.data(), packedCigarLength);
275
276             for (unsigned int i = 0; i < packedCigarLength; ++i) {
277                 if ( IsBigEndian )
278                     SwapEndian_32p(&cigarData[i]);
279             }
280
281             mBGZF.Write(cigarData, packedCigarLength);
282             free(cigarData);
283         }
284         else
285             mBGZF.Write(packedCigar.data(), packedCigarLength);
286
287         // write the encoded query sequence
288         mBGZF.Write(encodedQuery.data(), encodedQueryLength);
289
290         // write the base qualities
291         char* pBaseQualities = (char*)al.Qualities.data();
292         for(unsigned int i = 0; i < queryLength; i++)
293             pBaseQualities[i] -= 33; // FASTQ conversion
294         mBGZF.Write(pBaseQualities, queryLength);
295
296         // write the read group tag
297         if ( IsBigEndian ) {
298
299             char* tagData = (char*)calloc(sizeof(char), tagDataLength);
300             memcpy(tagData, al.TagData.data(), tagDataLength);
301
302             int i = 0;
303             while ( (unsigned int)i < tagDataLength ) {
304
305                 i += 2;                                 // skip tag type (e.g. "RG", "NM", etc)
306                 uint8_t type = toupper(tagData[i]);     // lower & upper case letters have same meaning
307                 ++i;                                    // skip value type
308
309                 switch (type) {
310
311                     case('A') :
312                     case('C') :
313                         ++i;
314                         break;
315
316                     case('S') :
317                         SwapEndian_16p(&tagData[i]);
318                         i+=2; // sizeof(uint16_t)
319                         break;
320
321                     case('F') :
322                     case('I') :
323                         SwapEndian_32p(&tagData[i]);
324                         i+=4; // sizeof(uint32_t)
325                         break;
326
327                     case('D') :
328                         SwapEndian_64p(&tagData[i]);
329                         i+=8; // sizeof(uint64_t)
330                         break;
331
332                     case('H') :
333                     case('Z') :
334                         while (tagData[i]) { ++i; }
335                         ++i; // increment one more for null terminator
336                         break;
337
338                     default :
339                         fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here
340                         free(tagData);
341                         exit(1);
342                 }
343             }
344
345             mBGZF.Write(tagData, tagDataLength);
346             free(tagData);
347         }
348         else
349             mBGZF.Write(al.TagData.data(), tagDataLength);
350     }
351 }