]> git.donarmstrong.com Git - bamtools.git/blob - src/api/BamWriter.cpp
Did some housekeeping in BamAux.h - moved some constants to the files where they...
[bamtools.git] / src / api / BamWriter.cpp
1 // ***************************************************************************\r
2 // BamWriter.cpp (c) 2009 Michael Str�mberg, Derek Barnett\r
3 // Marth Lab, Department of Biology, Boston College\r
4 // All rights reserved.\r
5 // ---------------------------------------------------------------------------\r
6 // Last modified: 16 September 2010 (DB)\r
7 // ---------------------------------------------------------------------------\r
8 // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
9 // Institute.\r
10 // ---------------------------------------------------------------------------\r
11 // Provides the basic functionality for producing BAM files\r
12 // ***************************************************************************\r
13 \r
14 #include <iostream>\r
15 \r
16 #include "BGZF.h"\r
17 #include "BamWriter.h"\r
18 using namespace BamTools;\r
19 using namespace std;\r
20 \r
21 namespace BamTools {\r
22 \r
23 // BAM constants\r
24 const int BT_SIZEOF_INT   = 4;\r
25 const int BAM_CMATCH      = 0;\r
26 const int BAM_CINS        = 1;\r
27 const int BAM_CDEL        = 2;\r
28 const int BAM_CREF_SKIP   = 3;\r
29 const int BAM_CSOFT_CLIP  = 4;\r
30 const int BAM_CHARD_CLIP  = 5;\r
31 const int BAM_CPAD        = 6;\r
32 \r
33 } // namespace BamTools\r
34 \r
35 struct BamWriter::BamWriterPrivate {\r
36 \r
37     // data members\r
38     BgzfData mBGZF;\r
39     bool IsBigEndian;\r
40     \r
41     // constructor / destructor\r
42     BamWriterPrivate(void) { \r
43       IsBigEndian = SystemIsBigEndian();  \r
44     }\r
45     \r
46     ~BamWriterPrivate(void) {\r
47         mBGZF.Close();\r
48     }\r
49 \r
50     // "public" interface\r
51     void Close(void);\r
52     bool Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed);\r
53     void SaveAlignment(const BamAlignment& al);\r
54 \r
55     // internal methods\r
56     const unsigned int CalculateMinimumBin(const int begin, int end) const;\r
57     void CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar);\r
58     void EncodeQuerySequence(const string& query, string& encodedQuery);\r
59 };\r
60 \r
61 // -----------------------------------------------------\r
62 // BamWriter implementation\r
63 // -----------------------------------------------------\r
64 \r
65 // constructor\r
66 BamWriter::BamWriter(void) {\r
67     d = new BamWriterPrivate;\r
68 }\r
69 \r
70 // destructor\r
71 BamWriter::~BamWriter(void) {\r
72     delete d;\r
73     d = 0;\r
74 }\r
75 \r
76 // closes the alignment archive\r
77 void BamWriter::Close(void) { \r
78   d->Close(); \r
79 }\r
80 \r
81 // opens the alignment archive\r
82 bool BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed) {\r
83     return d->Open(filename, samHeader, referenceSequences, isWriteUncompressed);\r
84 }\r
85 \r
86 // saves the alignment to the alignment archive\r
87 void BamWriter::SaveAlignment(const BamAlignment& al) { \r
88     d->SaveAlignment(al);\r
89 }\r
90 \r
91 // -----------------------------------------------------\r
92 // BamWriterPrivate implementation\r
93 // -----------------------------------------------------\r
94 \r
95 // closes the alignment archive\r
96 void BamWriter::BamWriterPrivate::Close(void) {\r
97     mBGZF.Close();\r
98 }\r
99 \r
100 // calculates minimum bin for a BAM alignment interval\r
101 const unsigned int BamWriter::BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const {  \r
102     --end;\r
103     if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14);\r
104     if( (begin >> 17) == (end >> 17) ) return  585 + (begin >> 17);\r
105     if( (begin >> 20) == (end >> 20) ) return   73 + (begin >> 20);\r
106     if( (begin >> 23) == (end >> 23) ) return    9 + (begin >> 23);\r
107     if( (begin >> 26) == (end >> 26) ) return    1 + (begin >> 26);\r
108     return 0;\r
109 }\r
110 \r
111 // creates a cigar string from the supplied alignment\r
112 void BamWriter::BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {\r
113 \r
114     // initialize\r
115     const unsigned int numCigarOperations = cigarOperations.size();\r
116     packedCigar.resize(numCigarOperations * BT_SIZEOF_INT);\r
117 \r
118     // pack the cigar data into the string\r
119     unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();\r
120 \r
121     unsigned int cigarOp;\r
122     vector<CigarOp>::const_iterator coIter;\r
123     for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) {\r
124 \r
125         switch(coIter->Type) {\r
126             case 'M':\r
127                   cigarOp = BAM_CMATCH;\r
128                   break;\r
129             case 'I':\r
130                   cigarOp = BAM_CINS;\r
131                   break;\r
132             case 'D':\r
133                   cigarOp = BAM_CDEL;\r
134                   break;\r
135             case 'N':\r
136                   cigarOp = BAM_CREF_SKIP;\r
137                   break;\r
138             case 'S':\r
139                   cigarOp = BAM_CSOFT_CLIP;\r
140                   break;\r
141             case 'H':\r
142                   cigarOp = BAM_CHARD_CLIP;\r
143                   break;\r
144             case 'P':\r
145                   cigarOp = BAM_CPAD;\r
146                   break;\r
147             default:\r
148                   fprintf(stderr, "ERROR: Unknown cigar operation found: %c\n", coIter->Type);\r
149                   exit(1);\r
150         }\r
151 \r
152         *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;\r
153         pPackedCigar++;\r
154     }\r
155 }\r
156 \r
157 // encodes the supplied query sequence into 4-bit notation\r
158 void BamWriter::BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {\r
159 \r
160     // prepare the encoded query string\r
161     const unsigned int queryLen = query.size();\r
162     const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);\r
163     encodedQuery.resize(encodedQueryLen);\r
164     char* pEncodedQuery = (char*)encodedQuery.data();\r
165     const char* pQuery = (const char*)query.data();\r
166 \r
167     unsigned char nucleotideCode;\r
168     bool useHighWord = true;\r
169 \r
170     while(*pQuery) {\r
171 \r
172         switch(*pQuery) {\r
173             \r
174             case '=':\r
175                 nucleotideCode = 0;\r
176                 break;\r
177                 \r
178             case 'A':\r
179                 nucleotideCode = 1;\r
180                 break;\r
181             \r
182             case 'C':\r
183                 nucleotideCode = 2;\r
184                 break;\r
185             \r
186             case 'G':\r
187                 nucleotideCode = 4;\r
188                 break;\r
189             \r
190             case 'T':\r
191                 nucleotideCode = 8;\r
192                 break;\r
193             \r
194             case 'N':\r
195                 nucleotideCode = 15;\r
196                 break;\r
197             \r
198             default:\r
199                 fprintf(stderr, "ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);\r
200                 exit(1);\r
201         }\r
202 \r
203         // pack the nucleotide code\r
204         if(useHighWord) {\r
205             *pEncodedQuery = nucleotideCode << 4;\r
206             useHighWord = false;\r
207         } else {\r
208             *pEncodedQuery |= nucleotideCode;\r
209             pEncodedQuery++;\r
210             useHighWord = true;\r
211         }\r
212 \r
213         // increment the query position\r
214         pQuery++;\r
215     }\r
216 }\r
217 \r
218 // opens the alignment archive\r
219 bool BamWriter::BamWriterPrivate::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed) {\r
220 \r
221     // open the BGZF file for writing, return failure if error\r
222     if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) )\r
223         return false;\r
224 \r
225     // ================\r
226     // write the header\r
227     // ================\r
228 \r
229     // write the BAM signature\r
230     const unsigned char SIGNATURE_LENGTH = 4;\r
231     const char* BAM_SIGNATURE = "BAM\1";\r
232     mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH);\r
233 \r
234     // write the SAM header text length\r
235     uint32_t samHeaderLen = samHeader.size();\r
236     if (IsBigEndian) SwapEndian_32(samHeaderLen);\r
237     mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT);\r
238 \r
239     // write the SAM header text\r
240     if(samHeaderLen > 0) \r
241         mBGZF.Write(samHeader.data(), samHeaderLen);\r
242 \r
243     // write the number of reference sequences\r
244     uint32_t numReferenceSequences = referenceSequences.size();\r
245     if (IsBigEndian) SwapEndian_32(numReferenceSequences);\r
246     mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);\r
247 \r
248     // =============================\r
249     // write the sequence dictionary\r
250     // =============================\r
251 \r
252     RefVector::const_iterator rsIter;\r
253     for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {\r
254 \r
255         // write the reference sequence name length\r
256         uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;\r
257         if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen);\r
258         mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);\r
259 \r
260         // write the reference sequence name\r
261         mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);\r
262 \r
263         // write the reference sequence length\r
264         int32_t referenceLength = rsIter->RefLength;\r
265         if (IsBigEndian) SwapEndian_32(referenceLength);\r
266         mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);\r
267     }\r
268     \r
269     // return success\r
270     return true;\r
271 }\r
272 \r
273 // saves the alignment to the alignment archive\r
274 void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) {\r
275 \r
276     // if BamAlignment contains only the core data and a raw char data buffer\r
277     // (as a result of BamReader::GetNextAlignmentCore())\r
278     if ( al.SupportData.HasCoreOnly ) {\r
279       \r
280         // write the block size\r
281         unsigned int blockSize = al.SupportData.BlockLength;\r
282         if (IsBigEndian) SwapEndian_32(blockSize);\r
283         mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
284 \r
285         // assign the BAM core data\r
286         uint32_t buffer[8];\r
287         buffer[0] = al.RefID;\r
288         buffer[1] = al.Position;\r
289         buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;\r
290         buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;\r
291         buffer[4] = al.SupportData.QuerySequenceLength;\r
292         buffer[5] = al.MateRefID;\r
293         buffer[6] = al.MatePosition;\r
294         buffer[7] = al.InsertSize;\r
295         \r
296         // swap BAM core endian-ness, if necessary\r
297         if ( IsBigEndian ) { \r
298             for ( int i = 0; i < 8; ++i )\r
299                 SwapEndian_32(buffer[i]); \r
300         }\r
301         \r
302         // write the BAM core\r
303         mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
304         \r
305         // write the raw char data\r
306         mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE); \r
307     }\r
308     \r
309     // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc\r
310     // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code )\r
311     else {\r
312         \r
313         // calculate char lengths\r
314         const unsigned int nameLength         = al.Name.size() + 1;\r
315         const unsigned int numCigarOperations = al.CigarData.size();\r
316         const unsigned int queryLength        = al.QueryBases.size();\r
317         const unsigned int tagDataLength      = al.TagData.size();\r
318         \r
319         // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)\r
320         // force calculation of Bin before storing\r
321         const int endPosition = al.GetEndPosition();\r
322         const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);\r
323       \r
324         // create our packed cigar string\r
325         string packedCigar;\r
326         CreatePackedCigar(al.CigarData, packedCigar);\r
327         const unsigned int packedCigarLength = packedCigar.size();\r
328 \r
329         // encode the query\r
330         string encodedQuery;\r
331         EncodeQuerySequence(al.QueryBases, encodedQuery);\r
332         const unsigned int encodedQueryLength = encodedQuery.size(); \r
333       \r
334         // write the block size\r
335         const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength;\r
336         unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;\r
337         if (IsBigEndian) SwapEndian_32(blockSize);\r
338         mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
339 \r
340         // assign the BAM core data\r
341         uint32_t buffer[8];\r
342         buffer[0] = al.RefID;\r
343         buffer[1] = al.Position;\r
344         buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;\r
345         buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;\r
346         buffer[4] = queryLength;\r
347         buffer[5] = al.MateRefID;\r
348         buffer[6] = al.MatePosition;\r
349         buffer[7] = al.InsertSize;\r
350         \r
351         // swap BAM core endian-ness, if necessary\r
352         if ( IsBigEndian ) { \r
353             for ( int i = 0; i < 8; ++i )\r
354                 SwapEndian_32(buffer[i]); \r
355         }\r
356         \r
357         // write the BAM core\r
358         mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
359         \r
360         // write the query name\r
361         mBGZF.Write(al.Name.c_str(), nameLength);\r
362 \r
363         // write the packed cigar\r
364         if ( IsBigEndian ) {\r
365           \r
366             char* cigarData = (char*)calloc(sizeof(char), packedCigarLength);\r
367             memcpy(cigarData, packedCigar.data(), packedCigarLength);\r
368             \r
369             for (unsigned int i = 0; i < packedCigarLength; ++i) {\r
370                 if ( IsBigEndian )\r
371                   SwapEndian_32p(&cigarData[i]); \r
372             }\r
373             \r
374             mBGZF.Write(cigarData, packedCigarLength);\r
375             free(cigarData);    \r
376         } \r
377         else \r
378             mBGZF.Write(packedCigar.data(), packedCigarLength);\r
379 \r
380         // write the encoded query sequence\r
381         mBGZF.Write(encodedQuery.data(), encodedQueryLength);\r
382 \r
383         // write the base qualities\r
384         string baseQualities(al.Qualities);\r
385         char* pBaseQualities = (char*)al.Qualities.data();\r
386         for(unsigned int i = 0; i < queryLength; i++) { \r
387             pBaseQualities[i] -= 33; \r
388         }\r
389         mBGZF.Write(pBaseQualities, queryLength);\r
390 \r
391         // write the read group tag\r
392         if ( IsBigEndian ) {\r
393           \r
394             char* tagData = (char*)calloc(sizeof(char), tagDataLength);\r
395             memcpy(tagData, al.TagData.data(), tagDataLength);\r
396           \r
397             int i = 0;\r
398             while ( (unsigned int)i < tagDataLength ) {\r
399                 \r
400                 i += 2;                                 // skip tag type (e.g. "RG", "NM", etc)\r
401                 uint8_t type = toupper(tagData[i]);     // lower & upper case letters have same meaning \r
402                 ++i;                                    // skip value type\r
403                 \r
404                 switch (type) {\r
405                   \r
406                     case('A') :\r
407                     case('C') : \r
408                         ++i;\r
409                         break;\r
410                                 \r
411                     case('S') : \r
412                         SwapEndian_16p(&tagData[i]); \r
413                         i+=2; // sizeof(uint16_t)\r
414                         break;\r
415                                 \r
416                     case('F') :\r
417                     case('I') : \r
418                         SwapEndian_32p(&tagData[i]);\r
419                         i+=4; // sizeof(uint32_t)\r
420                         break;\r
421                                 \r
422                     case('D') : \r
423                         SwapEndian_64p(&tagData[i]);\r
424                         i+=8; // sizeof(uint64_t)\r
425                         break;\r
426                                 \r
427                     case('H') :\r
428                     case('Z') : \r
429                         while (tagData[i]) { ++i; }\r
430                         ++i; // increment one more for null terminator\r
431                         break;\r
432                                 \r
433                     default : \r
434                         fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here\r
435                         free(tagData);\r
436                         exit(1); \r
437                 }\r
438             }\r
439             \r
440             mBGZF.Write(tagData, tagDataLength);\r
441             free(tagData);\r
442         } \r
443         else \r
444             mBGZF.Write(al.TagData.data(), tagDataLength);      \r
445     }\r
446 }\r