]> git.donarmstrong.com Git - bamtools.git/blob - BamWriter.cpp
further cleanup of duplicate @RG tag warning reporting
[bamtools.git] / BamWriter.cpp
1 // ***************************************************************************\r
2 // BamWriter.cpp (c) 2009 Michael Str�mberg, Derek Barnett\r
3 // Marth Lab, Department of Biology, Boston College\r
4 // All rights reserved.\r
5 // ---------------------------------------------------------------------------\r
6 // Last modified: 30 March 2010 (DB)\r
7 // ---------------------------------------------------------------------------\r
8 // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
9 // Institute.\r
10 // ---------------------------------------------------------------------------\r
11 // Provides the basic functionality for producing BAM files\r
12 // ***************************************************************************\r
13 \r
14 // BGZF includes\r
15 #include "BGZF.h"\r
16 #include "BamWriter.h"\r
17 using namespace BamTools;\r
18 using namespace std;\r
19 \r
20 struct BamWriter::BamWriterPrivate {\r
21 \r
22     // data members\r
23     BgzfData mBGZF;\r
24     bool IsBigEndian;\r
25     \r
26     \r
27     // constructor / destructor\r
28     BamWriterPrivate(void) { \r
29       IsBigEndian = SystemIsBigEndian();  \r
30     }\r
31     \r
32     ~BamWriterPrivate(void) {\r
33         mBGZF.Close();\r
34     }\r
35 \r
36     // "public" interface\r
37     void Close(void);\r
38     void Open(const string& filename, const string& samHeader, const RefVector& referenceSequences);\r
39     void SaveAlignment(const BamAlignment& al);\r
40     void SaveAlignment(const BamAlignment& al, const BamAlignmentSupportData& supportData);\r
41 \r
42     // internal methods\r
43     void CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar);\r
44     void EncodeQuerySequence(const string& query, string& encodedQuery);\r
45 };\r
46 \r
47 // -----------------------------------------------------\r
48 // BamWriter implementation\r
49 // -----------------------------------------------------\r
50 \r
51 // constructor\r
52 BamWriter::BamWriter(void) {\r
53     d = new BamWriterPrivate;\r
54 }\r
55 \r
56 // destructor\r
57 BamWriter::~BamWriter(void) {\r
58     delete d;\r
59     d = 0;\r
60 }\r
61 \r
62 // closes the alignment archive\r
63 void BamWriter::Close(void) { \r
64   d->Close(); \r
65 }\r
66 \r
67 // opens the alignment archive\r
68 void BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) {\r
69     d->Open(filename, samHeader, referenceSequences);\r
70 }\r
71 \r
72 // saves the alignment to the alignment archive\r
73 void BamWriter::SaveAlignment(const BamAlignment& al) { \r
74     d->SaveAlignment(al);\r
75 }\r
76 \r
77 void BamWriter::SaveAlignment(const BamAlignment& al, const BamAlignmentSupportData& supportData) {\r
78     d->SaveAlignment(al, supportData);\r
79 }\r
80 \r
81 // -----------------------------------------------------\r
82 // BamWriterPrivate implementation\r
83 // -----------------------------------------------------\r
84 \r
85 // closes the alignment archive\r
86 void BamWriter::BamWriterPrivate::Close(void) {\r
87     mBGZF.Close();\r
88 }\r
89 \r
90 // creates a cigar string from the supplied alignment\r
91 void BamWriter::BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {\r
92 \r
93     // initialize\r
94     const unsigned int numCigarOperations = cigarOperations.size();\r
95     packedCigar.resize(numCigarOperations * BT_SIZEOF_INT);\r
96 \r
97     // pack the cigar data into the string\r
98     unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();\r
99 \r
100     unsigned int cigarOp;\r
101     vector<CigarOp>::const_iterator coIter;\r
102     for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); coIter++) {\r
103 \r
104         switch(coIter->Type) {\r
105             case 'M':\r
106                   cigarOp = BAM_CMATCH;\r
107                   break;\r
108             case 'I':\r
109                   cigarOp = BAM_CINS;\r
110                   break;\r
111             case 'D':\r
112                   cigarOp = BAM_CDEL;\r
113                   break;\r
114             case 'N':\r
115                   cigarOp = BAM_CREF_SKIP;\r
116                   break;\r
117             case 'S':\r
118                   cigarOp = BAM_CSOFT_CLIP;\r
119                   break;\r
120             case 'H':\r
121                   cigarOp = BAM_CHARD_CLIP;\r
122                   break;\r
123             case 'P':\r
124                   cigarOp = BAM_CPAD;\r
125                   break;\r
126             default:\r
127                   printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type);\r
128                   exit(1);\r
129         }\r
130 \r
131         *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;\r
132         pPackedCigar++;\r
133     }\r
134 }\r
135 \r
136 // encodes the supplied query sequence into 4-bit notation\r
137 void BamWriter::BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {\r
138 \r
139     // prepare the encoded query string\r
140     const unsigned int queryLen = query.size();\r
141     const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);\r
142     encodedQuery.resize(encodedQueryLen);\r
143     char* pEncodedQuery = (char*)encodedQuery.data();\r
144     const char* pQuery = (const char*)query.data();\r
145 \r
146     unsigned char nucleotideCode;\r
147     bool useHighWord = true;\r
148 \r
149     while(*pQuery) {\r
150 \r
151         switch(*pQuery) {\r
152             \r
153             case '=':\r
154                 nucleotideCode = 0;\r
155                 break;\r
156                 \r
157             case 'A':\r
158                 nucleotideCode = 1;\r
159                 break;\r
160             \r
161             case 'C':\r
162                 nucleotideCode = 2;\r
163                 break;\r
164             \r
165             case 'G':\r
166                 nucleotideCode = 4;\r
167                 break;\r
168             \r
169             case 'T':\r
170                 nucleotideCode = 8;\r
171                 break;\r
172             \r
173             case 'N':\r
174                 nucleotideCode = 15;\r
175                 break;\r
176             \r
177             default:\r
178                 printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);\r
179                 exit(1);\r
180         }\r
181 \r
182         // pack the nucleotide code\r
183         if(useHighWord) {\r
184             *pEncodedQuery = nucleotideCode << 4;\r
185             useHighWord = false;\r
186         } else {\r
187             *pEncodedQuery |= nucleotideCode;\r
188             pEncodedQuery++;\r
189             useHighWord = true;\r
190         }\r
191 \r
192         // increment the query position\r
193         pQuery++;\r
194     }\r
195 }\r
196 \r
197 // opens the alignment archive\r
198 void BamWriter::BamWriterPrivate::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) {\r
199 \r
200     // open the BGZF file for writing\r
201     mBGZF.Open(filename, "wb");\r
202 \r
203     // ================\r
204     // write the header\r
205     // ================\r
206 \r
207     // write the BAM signature\r
208     const unsigned char SIGNATURE_LENGTH = 4;\r
209     const char* BAM_SIGNATURE = "BAM\1";\r
210     mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH);\r
211 \r
212     // write the SAM header text length\r
213     uint32_t samHeaderLen = samHeader.size();\r
214     if ( IsBigEndian ) { SwapEndian_32(samHeaderLen); }\r
215     mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT);\r
216 \r
217     // write the SAM header text\r
218     if(samHeaderLen > 0) {\r
219         mBGZF.Write(samHeader.data(), samHeaderLen);\r
220     }\r
221 \r
222     // write the number of reference sequences\r
223     uint32_t numReferenceSequences = referenceSequences.size();\r
224     if ( IsBigEndian ) { SwapEndian_32(numReferenceSequences); }\r
225     mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);\r
226 \r
227     // =============================\r
228     // write the sequence dictionary\r
229     // =============================\r
230 \r
231     RefVector::const_iterator rsIter;\r
232     for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {\r
233 \r
234         // write the reference sequence name length\r
235         uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;\r
236         if ( IsBigEndian ) { SwapEndian_32(referenceSequenceNameLen); }\r
237         mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);\r
238 \r
239         // write the reference sequence name\r
240         mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);\r
241 \r
242         // write the reference sequence length\r
243         int32_t referenceLength = rsIter->RefLength;\r
244         if ( IsBigEndian ) { SwapEndian_32(referenceLength); }\r
245         mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);\r
246     }\r
247 }\r
248 \r
249 // saves the alignment to the alignment archive\r
250 void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) {\r
251 \r
252     // initialize\r
253     const unsigned int nameLen            = al.Name.size() + 1;\r
254     const unsigned int queryLen           = al.QueryBases.size();\r
255     const unsigned int numCigarOperations = al.CigarData.size();\r
256 \r
257     // create our packed cigar string\r
258     string packedCigar;\r
259     CreatePackedCigar(al.CigarData, packedCigar);\r
260     const unsigned int packedCigarLen = packedCigar.size();\r
261 \r
262     // encode the query\r
263     string encodedQuery;\r
264     EncodeQuerySequence(al.QueryBases, encodedQuery);\r
265     const unsigned int encodedQueryLen = encodedQuery.size();\r
266 \r
267     // store the tag data length\r
268     // -------------------------------------------\r
269     // Modified: 3-25-2010 DWB\r
270     // Contributed: ARQ\r
271     // Fixed: "off by one" error when parsing tags in files produced by BamWriter\r
272     const unsigned int tagDataLength = al.TagData.size();\r
273     // original line: \r
274     // const unsigned int tagDataLength = al.TagData.size() + 1;     \r
275     // -------------------------------------------\r
276     \r
277     // assign the BAM core data\r
278     uint32_t buffer[8];\r
279     buffer[0] = al.RefID;\r
280     buffer[1] = al.Position;\r
281     buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | nameLen;\r
282     buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;\r
283     buffer[4] = queryLen;\r
284     buffer[5] = al.MateRefID;\r
285     buffer[6] = al.MatePosition;\r
286     buffer[7] = al.InsertSize;\r
287 \r
288     // write the block size\r
289     unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen + tagDataLength;\r
290     unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;\r
291     if ( IsBigEndian ) { SwapEndian_32(blockSize); }\r
292     mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
293 \r
294     // write the BAM core\r
295     if ( IsBigEndian ) { \r
296         for ( int i = 0; i < 8; ++i ) { \r
297             SwapEndian_32(buffer[i]); \r
298         } \r
299     }\r
300     mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
301 \r
302     // write the query name\r
303     mBGZF.Write(al.Name.c_str(), nameLen);\r
304 \r
305     // write the packed cigar\r
306     if ( IsBigEndian ) {\r
307       \r
308         char* cigarData = (char*)calloc(sizeof(char), packedCigarLen);\r
309         memcpy(cigarData, packedCigar.data(), packedCigarLen);\r
310         \r
311         for (unsigned int i = 0; i < packedCigarLen; ++i) {\r
312             if ( IsBigEndian ) { \r
313               SwapEndian_32p(&cigarData[i]); \r
314             }\r
315         }\r
316         \r
317         mBGZF.Write(cigarData, packedCigarLen);\r
318         free(cigarData);\r
319         \r
320     } else { \r
321         mBGZF.Write(packedCigar.data(), packedCigarLen);\r
322     }\r
323 \r
324     // write the encoded query sequence\r
325     mBGZF.Write(encodedQuery.data(), encodedQueryLen);\r
326 \r
327     // write the base qualities\r
328     string baseQualities = al.Qualities;\r
329     char* pBaseQualities = (char*)al.Qualities.data();\r
330     for(unsigned int i = 0; i < queryLen; i++) { pBaseQualities[i] -= 33; }\r
331     mBGZF.Write(pBaseQualities, queryLen);\r
332 \r
333     // write the read group tag\r
334     if ( IsBigEndian ) {\r
335       \r
336         char* tagData = (char*)calloc(sizeof(char), tagDataLength);\r
337         memcpy(tagData, al.TagData.data(), tagDataLength);\r
338       \r
339         int i = 0;\r
340         while ( (unsigned int)i < tagDataLength ) {\r
341             \r
342             i += 2;                                 // skip tag type (e.g. "RG", "NM", etc)\r
343             uint8_t type = toupper(tagData[i]);     // lower & upper case letters have same meaning \r
344             ++i;                                    // skip value type\r
345             \r
346             switch (type) {\r
347               \r
348                 case('A') :\r
349                 case('C') : \r
350                     ++i;\r
351                     break;\r
352                             \r
353                 case('S') : \r
354                     SwapEndian_16p(&tagData[i]); \r
355                     i+=2; // sizeof(uint16_t)\r
356                     break;\r
357                             \r
358                 case('F') :\r
359                 case('I') : \r
360                     SwapEndian_32p(&tagData[i]);\r
361                     i+=4; // sizeof(uint32_t)\r
362                     break;\r
363                             \r
364                 case('D') : \r
365                     SwapEndian_64p(&tagData[i]);\r
366                     i+=8; // sizeof(uint64_t)\r
367                     break;\r
368                             \r
369                 case('H') :\r
370                 case('Z') : \r
371                     while (tagData[i]) { ++i; }\r
372                     ++i; // increment one more for null terminator\r
373                     break;\r
374                             \r
375                 default : \r
376                     printf("ERROR: Invalid tag value type\n"); // shouldn't get here\r
377                     free(tagData);\r
378                     exit(1); \r
379             }\r
380         }\r
381         \r
382         mBGZF.Write(tagData, tagDataLength);\r
383         free(tagData);\r
384     } else {\r
385         mBGZF.Write(al.TagData.data(), tagDataLength);\r
386     }\r
387 }\r
388 \r
389 void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al, const BamAlignmentSupportData& supportData) {\r
390   \r
391     // assign the BAM core data\r
392     uint32_t buffer[8];\r
393     buffer[0] = al.RefID;\r
394     buffer[1] = al.Position;\r
395     buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | supportData.QueryNameLength;\r
396     buffer[3] = (al.AlignmentFlag << 16) | supportData.NumCigarOperations;\r
397     buffer[4] = supportData.QuerySequenceLength;\r
398     buffer[5] = al.MateRefID;\r
399     buffer[6] = al.MatePosition;\r
400     buffer[7] = al.InsertSize;\r
401 \r
402     // write the block size\r
403     unsigned int blockSize = supportData.BlockLength;\r
404     if ( IsBigEndian ) { SwapEndian_32(blockSize); }\r
405     mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
406 \r
407     // write the BAM core\r
408     if ( IsBigEndian ) { \r
409         for ( int i = 0; i < 8; ++i ) { \r
410             SwapEndian_32(buffer[i]); \r
411         } \r
412     }\r
413     mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
414 \r
415     // write the raw char data\r
416     mBGZF.Write((char*)supportData.AllCharData.data(), supportData.BlockLength-BAM_CORE_SIZE);\r
417 }\r
418 \r