]> git.donarmstrong.com Git - bamtools.git/blob - BamWriter.cpp
json output
[bamtools.git] / BamWriter.cpp
1 // ***************************************************************************\r
2 // BamWriter.cpp (c) 2009 Michael Str�mberg, Derek Barnett\r
3 // Marth Lab, Department of Biology, Boston College\r
4 // All rights reserved.\r
5 // ---------------------------------------------------------------------------\r
6 // Last modified: 30 March 2010 (DB)\r
7 // ---------------------------------------------------------------------------\r
8 // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
9 // Institute.\r
10 // ---------------------------------------------------------------------------\r
11 // Provides the basic functionality for producing BAM files\r
12 // ***************************************************************************\r
13 \r
14 // BGZF includes\r
15 #include "BGZF.h"\r
16 #include "BamWriter.h"\r
17 using namespace BamTools;\r
18 using namespace std;\r
19 \r
20 struct BamWriter::BamWriterPrivate {\r
21 \r
22     // data members\r
23     BgzfData mBGZF;\r
24     bool IsBigEndian;\r
25     \r
26     \r
27     // constructor / destructor\r
28     BamWriterPrivate(void) { \r
29       IsBigEndian = SystemIsBigEndian();  \r
30     }\r
31     \r
32     ~BamWriterPrivate(void) {\r
33         mBGZF.Close();\r
34     }\r
35 \r
36     // "public" interface\r
37     void Close(void);\r
38     void Open(const std::string& filename, const std::string& samHeader, const BamTools::RefVector& referenceSequences);\r
39     void SaveAlignment(const BamTools::BamAlignment& al);\r
40 \r
41     // internal methods\r
42     void CreatePackedCigar(const std::vector<CigarOp>& cigarOperations, std::string& packedCigar);\r
43     void EncodeQuerySequence(const std::string& query, std::string& encodedQuery);\r
44 };\r
45 \r
46 // -----------------------------------------------------\r
47 // BamWriter implementation\r
48 // -----------------------------------------------------\r
49 \r
50 // constructor\r
51 BamWriter::BamWriter(void) {\r
52     d = new BamWriterPrivate;\r
53 }\r
54 \r
55 // destructor\r
56 BamWriter::~BamWriter(void) {\r
57     delete d;\r
58     d = 0;\r
59 }\r
60 \r
61 // closes the alignment archive\r
62 void BamWriter::Close(void) {\r
63     d->Close();\r
64 }\r
65 \r
66 // opens the alignment archive\r
67 void BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) {\r
68     d->Open(filename, samHeader, referenceSequences);\r
69 }\r
70 \r
71 // saves the alignment to the alignment archive\r
72 void BamWriter::SaveAlignment(const BamAlignment& al) {\r
73     d->SaveAlignment(al);\r
74 }\r
75 \r
76 // -----------------------------------------------------\r
77 // BamWriterPrivate implementation\r
78 // -----------------------------------------------------\r
79 \r
80 // closes the alignment archive\r
81 void BamWriter::BamWriterPrivate::Close(void) {\r
82     mBGZF.Close();\r
83 }\r
84 \r
85 // creates a cigar string from the supplied alignment\r
86 void BamWriter::BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {\r
87 \r
88     // initialize\r
89     const unsigned int numCigarOperations = cigarOperations.size();\r
90     packedCigar.resize(numCigarOperations * BT_SIZEOF_INT);\r
91 \r
92     // pack the cigar data into the string\r
93     unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();\r
94 \r
95     unsigned int cigarOp;\r
96     vector<CigarOp>::const_iterator coIter;\r
97     for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); coIter++) {\r
98 \r
99         switch(coIter->Type) {\r
100             case 'M':\r
101                   cigarOp = BAM_CMATCH;\r
102                   break;\r
103             case 'I':\r
104                   cigarOp = BAM_CINS;\r
105                   break;\r
106             case 'D':\r
107                   cigarOp = BAM_CDEL;\r
108                   break;\r
109             case 'N':\r
110                   cigarOp = BAM_CREF_SKIP;\r
111                   break;\r
112             case 'S':\r
113                   cigarOp = BAM_CSOFT_CLIP;\r
114                   break;\r
115             case 'H':\r
116                   cigarOp = BAM_CHARD_CLIP;\r
117                   break;\r
118             case 'P':\r
119                   cigarOp = BAM_CPAD;\r
120                   break;\r
121             default:\r
122                   printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type);\r
123                   exit(1);\r
124         }\r
125 \r
126         *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;\r
127         pPackedCigar++;\r
128     }\r
129 }\r
130 \r
131 // encodes the supplied query sequence into 4-bit notation\r
132 void BamWriter::BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {\r
133 \r
134     // prepare the encoded query string\r
135     const unsigned int queryLen = query.size();\r
136     const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);\r
137     encodedQuery.resize(encodedQueryLen);\r
138     char* pEncodedQuery = (char*)encodedQuery.data();\r
139     const char* pQuery = (const char*)query.data();\r
140 \r
141     unsigned char nucleotideCode;\r
142     bool useHighWord = true;\r
143 \r
144     while(*pQuery) {\r
145 \r
146         switch(*pQuery) {\r
147             \r
148             case '=':\r
149                 nucleotideCode = 0;\r
150                 break;\r
151                 \r
152             case 'A':\r
153                 nucleotideCode = 1;\r
154                 break;\r
155             \r
156             case 'C':\r
157                 nucleotideCode = 2;\r
158                 break;\r
159             \r
160             case 'G':\r
161                 nucleotideCode = 4;\r
162                 break;\r
163             \r
164             case 'T':\r
165                 nucleotideCode = 8;\r
166                 break;\r
167             \r
168             case 'N':\r
169                 nucleotideCode = 15;\r
170                 break;\r
171             \r
172             default:\r
173                 printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);\r
174                 exit(1);\r
175         }\r
176 \r
177         // pack the nucleotide code\r
178         if(useHighWord) {\r
179             *pEncodedQuery = nucleotideCode << 4;\r
180             useHighWord = false;\r
181         } else {\r
182             *pEncodedQuery |= nucleotideCode;\r
183             pEncodedQuery++;\r
184             useHighWord = true;\r
185         }\r
186 \r
187         // increment the query position\r
188         pQuery++;\r
189     }\r
190 }\r
191 \r
192 // opens the alignment archive\r
193 void BamWriter::BamWriterPrivate::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) {\r
194 \r
195     // open the BGZF file for writing\r
196     mBGZF.Open(filename, "wb");\r
197 \r
198     // ================\r
199     // write the header\r
200     // ================\r
201 \r
202     // write the BAM signature\r
203     const unsigned char SIGNATURE_LENGTH = 4;\r
204     const char* BAM_SIGNATURE = "BAM\1";\r
205     mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH);\r
206 \r
207     // write the SAM header text length\r
208     uint32_t samHeaderLen = samHeader.size();\r
209     if ( IsBigEndian ) { SwapEndian_32(samHeaderLen); }\r
210     mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT);\r
211 \r
212     // write the SAM header text\r
213     if(samHeaderLen > 0) {\r
214         mBGZF.Write(samHeader.data(), samHeaderLen);\r
215     }\r
216 \r
217     // write the number of reference sequences\r
218     uint32_t numReferenceSequences = referenceSequences.size();\r
219     if ( IsBigEndian ) { SwapEndian_32(numReferenceSequences); }\r
220     mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);\r
221 \r
222     // =============================\r
223     // write the sequence dictionary\r
224     // =============================\r
225 \r
226     RefVector::const_iterator rsIter;\r
227     for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {\r
228 \r
229         // write the reference sequence name length\r
230         uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;\r
231         if ( IsBigEndian ) { SwapEndian_32(referenceSequenceNameLen); }\r
232         mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);\r
233 \r
234         // write the reference sequence name\r
235         mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);\r
236 \r
237         // write the reference sequence length\r
238         int32_t referenceLength = rsIter->RefLength;\r
239         if ( IsBigEndian ) { SwapEndian_32(referenceLength); }\r
240         mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);\r
241     }\r
242 }\r
243 \r
244 // saves the alignment to the alignment archive\r
245 void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) {\r
246 \r
247     // initialize\r
248     const unsigned int nameLen            = al.Name.size() + 1;\r
249     const unsigned int queryLen           = al.QueryBases.size();\r
250     const unsigned int numCigarOperations = al.CigarData.size();\r
251 \r
252     // create our packed cigar string\r
253     string packedCigar;\r
254     CreatePackedCigar(al.CigarData, packedCigar);\r
255     const unsigned int packedCigarLen = packedCigar.size();\r
256 \r
257     // encode the query\r
258     string encodedQuery;\r
259     EncodeQuerySequence(al.QueryBases, encodedQuery);\r
260     const unsigned int encodedQueryLen = encodedQuery.size();\r
261 \r
262     // store the tag data length\r
263     // -------------------------------------------\r
264     // Modified: 3-25-2010 DWB\r
265     // Contributed: ARQ\r
266     // Fixed: "off by one" error when parsing tags in files produced by BamWriter\r
267     const unsigned int tagDataLength = al.TagData.size();\r
268     // original line: \r
269     // const unsigned int tagDataLength = al.TagData.size() + 1;     \r
270     // -------------------------------------------\r
271     \r
272     // assign the BAM core data\r
273     uint32_t buffer[8];\r
274     buffer[0] = al.RefID;\r
275     buffer[1] = al.Position;\r
276     buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | nameLen;\r
277     buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;\r
278     buffer[4] = queryLen;\r
279     buffer[5] = al.MateRefID;\r
280     buffer[6] = al.MatePosition;\r
281     buffer[7] = al.InsertSize;\r
282 \r
283     // write the block size\r
284     unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen + tagDataLength;\r
285     unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;\r
286     if ( IsBigEndian ) { SwapEndian_32(blockSize); }\r
287     mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
288 \r
289     // write the BAM core\r
290     if ( IsBigEndian ) { \r
291         for ( int i = 0; i < 8; ++i ) { \r
292             SwapEndian_32(buffer[i]); \r
293         } \r
294     }\r
295     mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
296 \r
297     // write the query name\r
298     mBGZF.Write(al.Name.c_str(), nameLen);\r
299 \r
300     // write the packed cigar\r
301     if ( IsBigEndian ) {\r
302       \r
303         char* cigarData = (char*)calloc(sizeof(char), packedCigarLen);\r
304         memcpy(cigarData, packedCigar.data(), packedCigarLen);\r
305         \r
306         for (unsigned int i = 0; i < packedCigarLen; ++i) {\r
307             if ( IsBigEndian ) { \r
308               SwapEndian_32p(&cigarData[i]); \r
309             }\r
310         }\r
311         \r
312         mBGZF.Write(cigarData, packedCigarLen);\r
313         free(cigarData);\r
314         \r
315     } else { \r
316         mBGZF.Write(packedCigar.data(), packedCigarLen);\r
317     }\r
318 \r
319     // write the encoded query sequence\r
320     mBGZF.Write(encodedQuery.data(), encodedQueryLen);\r
321 \r
322     // write the base qualities\r
323     string baseQualities = al.Qualities;\r
324     char* pBaseQualities = (char*)al.Qualities.data();\r
325     for(unsigned int i = 0; i < queryLen; i++) { pBaseQualities[i] -= 33; }\r
326     mBGZF.Write(pBaseQualities, queryLen);\r
327 \r
328     // write the read group tag\r
329     if ( IsBigEndian ) {\r
330       \r
331         char* tagData = (char*)calloc(sizeof(char), tagDataLength);\r
332         memcpy(tagData, al.TagData.data(), tagDataLength);\r
333       \r
334         int i = 0;\r
335         while ( (unsigned int)i < tagDataLength ) {\r
336             \r
337             i += 2;                                 // skip tag type (e.g. "RG", "NM", etc)\r
338             uint8_t type = toupper(tagData[i]);     // lower & upper case letters have same meaning \r
339             ++i;                                    // skip value type\r
340             \r
341             switch (type) {\r
342               \r
343                 case('A') :\r
344                 case('C') : \r
345                     ++i;\r
346                     break;\r
347                             \r
348                 case('S') : \r
349                     SwapEndian_16p(&tagData[i]); \r
350                     i+=2; // sizeof(uint16_t)\r
351                     break;\r
352                             \r
353                 case('F') :\r
354                 case('I') : \r
355                     SwapEndian_32p(&tagData[i]);\r
356                     i+=4; // sizeof(uint32_t)\r
357                     break;\r
358                             \r
359                 case('D') : \r
360                     SwapEndian_64p(&tagData[i]);\r
361                     i+=8; // sizeof(uint64_t)\r
362                     break;\r
363                             \r
364                 case('H') :\r
365                 case('Z') : \r
366                     while (tagData[i]) { ++i; }\r
367                     ++i; // increment one more for null terminator\r
368                     break;\r
369                             \r
370                 default : \r
371                     printf("ERROR: Invalid tag value type\n"); // shouldn't get here\r
372                     free(tagData);\r
373                     exit(1); \r
374             }\r
375         }\r
376         \r
377         mBGZF.Write(tagData, tagDataLength);\r
378         free(tagData);\r
379     } else {\r
380         mBGZF.Write(al.TagData.data(), tagDataLength);\r
381     }\r
382 }\r