]> git.donarmstrong.com Git - bamtools.git/blob - src/api/BamWriter.cpp
On second thought, moved the (non-indexing) constants back to BamAux.h, since they...
[bamtools.git] / src / api / BamWriter.cpp
1 // ***************************************************************************\r
2 // BamWriter.cpp (c) 2009 Michael Str�mberg, Derek Barnett\r
3 // Marth Lab, Department of Biology, Boston College\r
4 // All rights reserved.\r
5 // ---------------------------------------------------------------------------\r
6 // Last modified: 16 September 2010 (DB)\r
7 // ---------------------------------------------------------------------------\r
8 // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
9 // Institute.\r
10 // ---------------------------------------------------------------------------\r
11 // Provides the basic functionality for producing BAM files\r
12 // ***************************************************************************\r
13 \r
14 #include <iostream>\r
15 \r
16 #include "BGZF.h"\r
17 #include "BamWriter.h"\r
18 using namespace BamTools;\r
19 using namespace std;\r
20 \r
21 struct BamWriter::BamWriterPrivate {\r
22 \r
23     // data members\r
24     BgzfData mBGZF;\r
25     bool IsBigEndian;\r
26     \r
27     // constructor / destructor\r
28     BamWriterPrivate(void) { \r
29       IsBigEndian = SystemIsBigEndian();  \r
30     }\r
31     \r
32     ~BamWriterPrivate(void) {\r
33         mBGZF.Close();\r
34     }\r
35 \r
36     // "public" interface\r
37     void Close(void);\r
38     bool Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed);\r
39     void SaveAlignment(const BamAlignment& al);\r
40 \r
41     // internal methods\r
42     const unsigned int CalculateMinimumBin(const int begin, int end) const;\r
43     void CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar);\r
44     void EncodeQuerySequence(const string& query, string& encodedQuery);\r
45 };\r
46 \r
47 // -----------------------------------------------------\r
48 // BamWriter implementation\r
49 // -----------------------------------------------------\r
50 \r
51 // constructor\r
52 BamWriter::BamWriter(void) {\r
53     d = new BamWriterPrivate;\r
54 }\r
55 \r
56 // destructor\r
57 BamWriter::~BamWriter(void) {\r
58     delete d;\r
59     d = 0;\r
60 }\r
61 \r
62 // closes the alignment archive\r
63 void BamWriter::Close(void) { \r
64   d->Close(); \r
65 }\r
66 \r
67 // opens the alignment archive\r
68 bool BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed) {\r
69     return d->Open(filename, samHeader, referenceSequences, isWriteUncompressed);\r
70 }\r
71 \r
72 // saves the alignment to the alignment archive\r
73 void BamWriter::SaveAlignment(const BamAlignment& al) { \r
74     d->SaveAlignment(al);\r
75 }\r
76 \r
77 // -----------------------------------------------------\r
78 // BamWriterPrivate implementation\r
79 // -----------------------------------------------------\r
80 \r
81 // closes the alignment archive\r
82 void BamWriter::BamWriterPrivate::Close(void) {\r
83     mBGZF.Close();\r
84 }\r
85 \r
86 // calculates minimum bin for a BAM alignment interval\r
87 const unsigned int BamWriter::BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const {  \r
88     --end;\r
89     if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14);\r
90     if( (begin >> 17) == (end >> 17) ) return  585 + (begin >> 17);\r
91     if( (begin >> 20) == (end >> 20) ) return   73 + (begin >> 20);\r
92     if( (begin >> 23) == (end >> 23) ) return    9 + (begin >> 23);\r
93     if( (begin >> 26) == (end >> 26) ) return    1 + (begin >> 26);\r
94     return 0;\r
95 }\r
96 \r
97 // creates a cigar string from the supplied alignment\r
98 void BamWriter::BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {\r
99 \r
100     // initialize\r
101     const unsigned int numCigarOperations = cigarOperations.size();\r
102     packedCigar.resize(numCigarOperations * BT_SIZEOF_INT);\r
103 \r
104     // pack the cigar data into the string\r
105     unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();\r
106 \r
107     unsigned int cigarOp;\r
108     vector<CigarOp>::const_iterator coIter;\r
109     for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) {\r
110 \r
111         switch(coIter->Type) {\r
112             case 'M':\r
113                   cigarOp = BAM_CMATCH;\r
114                   break;\r
115             case 'I':\r
116                   cigarOp = BAM_CINS;\r
117                   break;\r
118             case 'D':\r
119                   cigarOp = BAM_CDEL;\r
120                   break;\r
121             case 'N':\r
122                   cigarOp = BAM_CREF_SKIP;\r
123                   break;\r
124             case 'S':\r
125                   cigarOp = BAM_CSOFT_CLIP;\r
126                   break;\r
127             case 'H':\r
128                   cigarOp = BAM_CHARD_CLIP;\r
129                   break;\r
130             case 'P':\r
131                   cigarOp = BAM_CPAD;\r
132                   break;\r
133             default:\r
134                   fprintf(stderr, "ERROR: Unknown cigar operation found: %c\n", coIter->Type);\r
135                   exit(1);\r
136         }\r
137 \r
138         *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;\r
139         pPackedCigar++;\r
140     }\r
141 }\r
142 \r
143 // encodes the supplied query sequence into 4-bit notation\r
144 void BamWriter::BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {\r
145 \r
146     // prepare the encoded query string\r
147     const unsigned int queryLen = query.size();\r
148     const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);\r
149     encodedQuery.resize(encodedQueryLen);\r
150     char* pEncodedQuery = (char*)encodedQuery.data();\r
151     const char* pQuery = (const char*)query.data();\r
152 \r
153     unsigned char nucleotideCode;\r
154     bool useHighWord = true;\r
155 \r
156     while(*pQuery) {\r
157 \r
158         switch(*pQuery) {\r
159             \r
160             case '=':\r
161                 nucleotideCode = 0;\r
162                 break;\r
163                 \r
164             case 'A':\r
165                 nucleotideCode = 1;\r
166                 break;\r
167             \r
168             case 'C':\r
169                 nucleotideCode = 2;\r
170                 break;\r
171             \r
172             case 'G':\r
173                 nucleotideCode = 4;\r
174                 break;\r
175             \r
176             case 'T':\r
177                 nucleotideCode = 8;\r
178                 break;\r
179             \r
180             case 'N':\r
181                 nucleotideCode = 15;\r
182                 break;\r
183             \r
184             default:\r
185                 fprintf(stderr, "ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);\r
186                 exit(1);\r
187         }\r
188 \r
189         // pack the nucleotide code\r
190         if(useHighWord) {\r
191             *pEncodedQuery = nucleotideCode << 4;\r
192             useHighWord = false;\r
193         } else {\r
194             *pEncodedQuery |= nucleotideCode;\r
195             pEncodedQuery++;\r
196             useHighWord = true;\r
197         }\r
198 \r
199         // increment the query position\r
200         pQuery++;\r
201     }\r
202 }\r
203 \r
204 // opens the alignment archive\r
205 bool BamWriter::BamWriterPrivate::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed) {\r
206 \r
207     // open the BGZF file for writing, return failure if error\r
208     if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) )\r
209         return false;\r
210 \r
211     // ================\r
212     // write the header\r
213     // ================\r
214 \r
215     // write the BAM signature\r
216     const unsigned char SIGNATURE_LENGTH = 4;\r
217     const char* BAM_SIGNATURE = "BAM\1";\r
218     mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH);\r
219 \r
220     // write the SAM header text length\r
221     uint32_t samHeaderLen = samHeader.size();\r
222     if (IsBigEndian) SwapEndian_32(samHeaderLen);\r
223     mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT);\r
224 \r
225     // write the SAM header text\r
226     if(samHeaderLen > 0) \r
227         mBGZF.Write(samHeader.data(), samHeaderLen);\r
228 \r
229     // write the number of reference sequences\r
230     uint32_t numReferenceSequences = referenceSequences.size();\r
231     if (IsBigEndian) SwapEndian_32(numReferenceSequences);\r
232     mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);\r
233 \r
234     // =============================\r
235     // write the sequence dictionary\r
236     // =============================\r
237 \r
238     RefVector::const_iterator rsIter;\r
239     for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {\r
240 \r
241         // write the reference sequence name length\r
242         uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;\r
243         if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen);\r
244         mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);\r
245 \r
246         // write the reference sequence name\r
247         mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);\r
248 \r
249         // write the reference sequence length\r
250         int32_t referenceLength = rsIter->RefLength;\r
251         if (IsBigEndian) SwapEndian_32(referenceLength);\r
252         mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);\r
253     }\r
254     \r
255     // return success\r
256     return true;\r
257 }\r
258 \r
259 // saves the alignment to the alignment archive\r
260 void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) {\r
261 \r
262     // if BamAlignment contains only the core data and a raw char data buffer\r
263     // (as a result of BamReader::GetNextAlignmentCore())\r
264     if ( al.SupportData.HasCoreOnly ) {\r
265       \r
266         // write the block size\r
267         unsigned int blockSize = al.SupportData.BlockLength;\r
268         if (IsBigEndian) SwapEndian_32(blockSize);\r
269         mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
270 \r
271         // assign the BAM core data\r
272         uint32_t buffer[8];\r
273         buffer[0] = al.RefID;\r
274         buffer[1] = al.Position;\r
275         buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;\r
276         buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;\r
277         buffer[4] = al.SupportData.QuerySequenceLength;\r
278         buffer[5] = al.MateRefID;\r
279         buffer[6] = al.MatePosition;\r
280         buffer[7] = al.InsertSize;\r
281         \r
282         // swap BAM core endian-ness, if necessary\r
283         if ( IsBigEndian ) { \r
284             for ( int i = 0; i < 8; ++i )\r
285                 SwapEndian_32(buffer[i]); \r
286         }\r
287         \r
288         // write the BAM core\r
289         mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
290         \r
291         // write the raw char data\r
292         mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE); \r
293     }\r
294     \r
295     // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc\r
296     // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code )\r
297     else {\r
298         \r
299         // calculate char lengths\r
300         const unsigned int nameLength         = al.Name.size() + 1;\r
301         const unsigned int numCigarOperations = al.CigarData.size();\r
302         const unsigned int queryLength        = al.QueryBases.size();\r
303         const unsigned int tagDataLength      = al.TagData.size();\r
304         \r
305         // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)\r
306         // force calculation of Bin before storing\r
307         const int endPosition = al.GetEndPosition();\r
308         const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);\r
309       \r
310         // create our packed cigar string\r
311         string packedCigar;\r
312         CreatePackedCigar(al.CigarData, packedCigar);\r
313         const unsigned int packedCigarLength = packedCigar.size();\r
314 \r
315         // encode the query\r
316         string encodedQuery;\r
317         EncodeQuerySequence(al.QueryBases, encodedQuery);\r
318         const unsigned int encodedQueryLength = encodedQuery.size(); \r
319       \r
320         // write the block size\r
321         const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength;\r
322         unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;\r
323         if (IsBigEndian) SwapEndian_32(blockSize);\r
324         mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
325 \r
326         // assign the BAM core data\r
327         uint32_t buffer[8];\r
328         buffer[0] = al.RefID;\r
329         buffer[1] = al.Position;\r
330         buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;\r
331         buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;\r
332         buffer[4] = queryLength;\r
333         buffer[5] = al.MateRefID;\r
334         buffer[6] = al.MatePosition;\r
335         buffer[7] = al.InsertSize;\r
336         \r
337         // swap BAM core endian-ness, if necessary\r
338         if ( IsBigEndian ) { \r
339             for ( int i = 0; i < 8; ++i )\r
340                 SwapEndian_32(buffer[i]); \r
341         }\r
342         \r
343         // write the BAM core\r
344         mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
345         \r
346         // write the query name\r
347         mBGZF.Write(al.Name.c_str(), nameLength);\r
348 \r
349         // write the packed cigar\r
350         if ( IsBigEndian ) {\r
351           \r
352             char* cigarData = (char*)calloc(sizeof(char), packedCigarLength);\r
353             memcpy(cigarData, packedCigar.data(), packedCigarLength);\r
354             \r
355             for (unsigned int i = 0; i < packedCigarLength; ++i) {\r
356                 if ( IsBigEndian )\r
357                   SwapEndian_32p(&cigarData[i]); \r
358             }\r
359             \r
360             mBGZF.Write(cigarData, packedCigarLength);\r
361             free(cigarData);    \r
362         } \r
363         else \r
364             mBGZF.Write(packedCigar.data(), packedCigarLength);\r
365 \r
366         // write the encoded query sequence\r
367         mBGZF.Write(encodedQuery.data(), encodedQueryLength);\r
368 \r
369         // write the base qualities\r
370         string baseQualities(al.Qualities);\r
371         char* pBaseQualities = (char*)al.Qualities.data();\r
372         for(unsigned int i = 0; i < queryLength; i++) { \r
373             pBaseQualities[i] -= 33; \r
374         }\r
375         mBGZF.Write(pBaseQualities, queryLength);\r
376 \r
377         // write the read group tag\r
378         if ( IsBigEndian ) {\r
379           \r
380             char* tagData = (char*)calloc(sizeof(char), tagDataLength);\r
381             memcpy(tagData, al.TagData.data(), tagDataLength);\r
382           \r
383             int i = 0;\r
384             while ( (unsigned int)i < tagDataLength ) {\r
385                 \r
386                 i += 2;                                 // skip tag type (e.g. "RG", "NM", etc)\r
387                 uint8_t type = toupper(tagData[i]);     // lower & upper case letters have same meaning \r
388                 ++i;                                    // skip value type\r
389                 \r
390                 switch (type) {\r
391                   \r
392                     case('A') :\r
393                     case('C') : \r
394                         ++i;\r
395                         break;\r
396                                 \r
397                     case('S') : \r
398                         SwapEndian_16p(&tagData[i]); \r
399                         i+=2; // sizeof(uint16_t)\r
400                         break;\r
401                                 \r
402                     case('F') :\r
403                     case('I') : \r
404                         SwapEndian_32p(&tagData[i]);\r
405                         i+=4; // sizeof(uint32_t)\r
406                         break;\r
407                                 \r
408                     case('D') : \r
409                         SwapEndian_64p(&tagData[i]);\r
410                         i+=8; // sizeof(uint64_t)\r
411                         break;\r
412                                 \r
413                     case('H') :\r
414                     case('Z') : \r
415                         while (tagData[i]) { ++i; }\r
416                         ++i; // increment one more for null terminator\r
417                         break;\r
418                                 \r
419                     default : \r
420                         fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here\r
421                         free(tagData);\r
422                         exit(1); \r
423                 }\r
424             }\r
425             \r
426             mBGZF.Write(tagData, tagDataLength);\r
427             free(tagData);\r
428         } \r
429         else \r
430             mBGZF.Write(al.TagData.data(), tagDataLength);      \r
431     }\r
432 }\r