]> git.donarmstrong.com Git - bamtools.git/blob - BamWriter.cpp
Added support for reading FASTA sequences, as well as generating FASTA index (.fai...
[bamtools.git] / BamWriter.cpp
1 // ***************************************************************************\r
2 // BamWriter.cpp (c) 2009 Michael Str�mberg, Derek Barnett\r
3 // Marth Lab, Department of Biology, Boston College\r
4 // All rights reserved.\r
5 // ---------------------------------------------------------------------------\r
6 // Last modified: 30 March 2010 (DB)\r
7 // ---------------------------------------------------------------------------\r
8 // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
9 // Institute.\r
10 // ---------------------------------------------------------------------------\r
11 // Provides the basic functionality for producing BAM files\r
12 // ***************************************************************************\r
13 \r
14 // BGZF includes\r
15 #include "BGZF.h"\r
16 #include "BamWriter.h"\r
17 using namespace BamTools;\r
18 using namespace std;\r
19 \r
20 struct BamWriter::BamWriterPrivate {\r
21 \r
22     // data members\r
23     BgzfData mBGZF;\r
24     bool IsBigEndian;\r
25     \r
26     // constructor / destructor\r
27     BamWriterPrivate(void) { \r
28       IsBigEndian = SystemIsBigEndian();  \r
29     }\r
30     \r
31     ~BamWriterPrivate(void) {\r
32         mBGZF.Close();\r
33     }\r
34 \r
35     // "public" interface\r
36     void Close(void);\r
37     void Open(const string& filename, const string& samHeader, const RefVector& referenceSequences);\r
38     void SaveAlignment(const BamAlignment& al);\r
39 \r
40     // internal methods\r
41     void CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar);\r
42     void EncodeQuerySequence(const string& query, string& encodedQuery);\r
43 };\r
44 \r
45 // -----------------------------------------------------\r
46 // BamWriter implementation\r
47 // -----------------------------------------------------\r
48 \r
49 // constructor\r
50 BamWriter::BamWriter(void) {\r
51     d = new BamWriterPrivate;\r
52 }\r
53 \r
54 // destructor\r
55 BamWriter::~BamWriter(void) {\r
56     delete d;\r
57     d = 0;\r
58 }\r
59 \r
60 // closes the alignment archive\r
61 void BamWriter::Close(void) { \r
62   d->Close(); \r
63 }\r
64 \r
65 // opens the alignment archive\r
66 void BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) {\r
67     d->Open(filename, samHeader, referenceSequences);\r
68 }\r
69 \r
70 // saves the alignment to the alignment archive\r
71 void BamWriter::SaveAlignment(const BamAlignment& al) { \r
72     d->SaveAlignment(al);\r
73 }\r
74 \r
75 // -----------------------------------------------------\r
76 // BamWriterPrivate implementation\r
77 // -----------------------------------------------------\r
78 \r
79 // closes the alignment archive\r
80 void BamWriter::BamWriterPrivate::Close(void) {\r
81     mBGZF.Close();\r
82 }\r
83 \r
84 // creates a cigar string from the supplied alignment\r
85 void BamWriter::BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {\r
86 \r
87     // initialize\r
88     const unsigned int numCigarOperations = cigarOperations.size();\r
89     packedCigar.resize(numCigarOperations * BT_SIZEOF_INT);\r
90 \r
91     // pack the cigar data into the string\r
92     unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();\r
93 \r
94     unsigned int cigarOp;\r
95     vector<CigarOp>::const_iterator coIter;\r
96     for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) {\r
97 \r
98         switch(coIter->Type) {\r
99             case 'M':\r
100                   cigarOp = BAM_CMATCH;\r
101                   break;\r
102             case 'I':\r
103                   cigarOp = BAM_CINS;\r
104                   break;\r
105             case 'D':\r
106                   cigarOp = BAM_CDEL;\r
107                   break;\r
108             case 'N':\r
109                   cigarOp = BAM_CREF_SKIP;\r
110                   break;\r
111             case 'S':\r
112                   cigarOp = BAM_CSOFT_CLIP;\r
113                   break;\r
114             case 'H':\r
115                   cigarOp = BAM_CHARD_CLIP;\r
116                   break;\r
117             case 'P':\r
118                   cigarOp = BAM_CPAD;\r
119                   break;\r
120             default:\r
121                   printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type);\r
122                   exit(1);\r
123         }\r
124 \r
125         *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;\r
126         pPackedCigar++;\r
127     }\r
128 }\r
129 \r
130 // encodes the supplied query sequence into 4-bit notation\r
131 void BamWriter::BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {\r
132 \r
133     // prepare the encoded query string\r
134     const unsigned int queryLen = query.size();\r
135     const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);\r
136     encodedQuery.resize(encodedQueryLen);\r
137     char* pEncodedQuery = (char*)encodedQuery.data();\r
138     const char* pQuery = (const char*)query.data();\r
139 \r
140     unsigned char nucleotideCode;\r
141     bool useHighWord = true;\r
142 \r
143     while(*pQuery) {\r
144 \r
145         switch(*pQuery) {\r
146             \r
147             case '=':\r
148                 nucleotideCode = 0;\r
149                 break;\r
150                 \r
151             case 'A':\r
152                 nucleotideCode = 1;\r
153                 break;\r
154             \r
155             case 'C':\r
156                 nucleotideCode = 2;\r
157                 break;\r
158             \r
159             case 'G':\r
160                 nucleotideCode = 4;\r
161                 break;\r
162             \r
163             case 'T':\r
164                 nucleotideCode = 8;\r
165                 break;\r
166             \r
167             case 'N':\r
168                 nucleotideCode = 15;\r
169                 break;\r
170             \r
171             default:\r
172                 printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);\r
173                 exit(1);\r
174         }\r
175 \r
176         // pack the nucleotide code\r
177         if(useHighWord) {\r
178             *pEncodedQuery = nucleotideCode << 4;\r
179             useHighWord = false;\r
180         } else {\r
181             *pEncodedQuery |= nucleotideCode;\r
182             pEncodedQuery++;\r
183             useHighWord = true;\r
184         }\r
185 \r
186         // increment the query position\r
187         pQuery++;\r
188     }\r
189 }\r
190 \r
191 // opens the alignment archive\r
192 void BamWriter::BamWriterPrivate::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) {\r
193 \r
194     // open the BGZF file for writing\r
195     mBGZF.Open(filename, "wb");\r
196 \r
197     // ================\r
198     // write the header\r
199     // ================\r
200 \r
201     // write the BAM signature\r
202     const unsigned char SIGNATURE_LENGTH = 4;\r
203     const char* BAM_SIGNATURE = "BAM\1";\r
204     mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH);\r
205 \r
206     // write the SAM header text length\r
207     uint32_t samHeaderLen = samHeader.size();\r
208     if (IsBigEndian) SwapEndian_32(samHeaderLen);\r
209     mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT);\r
210 \r
211     // write the SAM header text\r
212     if(samHeaderLen > 0) \r
213         mBGZF.Write(samHeader.data(), samHeaderLen);\r
214 \r
215     // write the number of reference sequences\r
216     uint32_t numReferenceSequences = referenceSequences.size();\r
217     if (IsBigEndian) SwapEndian_32(numReferenceSequences);\r
218     mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);\r
219 \r
220     // =============================\r
221     // write the sequence dictionary\r
222     // =============================\r
223 \r
224     RefVector::const_iterator rsIter;\r
225     for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {\r
226 \r
227         // write the reference sequence name length\r
228         uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;\r
229         if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen);\r
230         mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);\r
231 \r
232         // write the reference sequence name\r
233         mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);\r
234 \r
235         // write the reference sequence length\r
236         int32_t referenceLength = rsIter->RefLength;\r
237         if (IsBigEndian) SwapEndian_32(referenceLength);\r
238         mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);\r
239     }\r
240 }\r
241 \r
242 // saves the alignment to the alignment archive\r
243 void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) {\r
244 \r
245      // assign the BAM core data\r
246     uint32_t buffer[8];\r
247     buffer[0] = al.RefID;\r
248     buffer[1] = al.Position;\r
249     buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;\r
250     buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;\r
251     buffer[4] = al.SupportData.QuerySequenceLength;\r
252     buffer[5] = al.MateRefID;\r
253     buffer[6] = al.MatePosition;\r
254     buffer[7] = al.InsertSize;\r
255 \r
256     // write the block size\r
257     unsigned int blockSize = al.SupportData.BlockLength;\r
258     if (IsBigEndian) SwapEndian_32(blockSize);\r
259     mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
260 \r
261     // swap BAM core endian-ness, if necessary\r
262     if ( IsBigEndian ) { \r
263         for ( int i = 0; i < 8; ++i )\r
264             SwapEndian_32(buffer[i]); \r
265     }\r
266     \r
267     // write the BAM core\r
268     mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
269 \r
270     // if support data, not parsed out (resulted from BamReader::GetNextAlignmentCore()\r
271     // write the raw char data\r
272     if ( !al.SupportData.IsParsed )\r
273         mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE);  \r
274     \r
275     // re-pack (as needed) & write the parsed char data\r
276     else {\r
277       \r
278         // initialize\r
279         const unsigned int nameLen       = al.Name.size() + 1;\r
280         const unsigned int queryLen      = al.QueryBases.size();\r
281         const unsigned int tagDataLength = al.TagData.size();\r
282         \r
283         // create our packed cigar string\r
284         string packedCigar;\r
285         CreatePackedCigar(al.CigarData, packedCigar);\r
286         const unsigned int packedCigarLen = packedCigar.size();\r
287 \r
288         // encode the query\r
289         string encodedQuery;\r
290         EncodeQuerySequence(al.QueryBases, encodedQuery);\r
291         const unsigned int encodedQueryLen = encodedQuery.size(); \r
292       \r
293         // write the query name\r
294         mBGZF.Write(al.Name.c_str(), nameLen);\r
295 \r
296         // write the packed cigar\r
297         if ( IsBigEndian ) {\r
298           \r
299             char* cigarData = (char*)calloc(sizeof(char), packedCigarLen);\r
300             memcpy(cigarData, packedCigar.data(), packedCigarLen);\r
301             \r
302             for (unsigned int i = 0; i < packedCigarLen; ++i) {\r
303                 if ( IsBigEndian )\r
304                   SwapEndian_32p(&cigarData[i]); \r
305             }\r
306             \r
307             mBGZF.Write(cigarData, packedCigarLen);\r
308             free(cigarData);    \r
309         } \r
310         else \r
311             mBGZF.Write(packedCigar.data(), packedCigarLen);\r
312 \r
313         // write the encoded query sequence\r
314         mBGZF.Write(encodedQuery.data(), encodedQueryLen);\r
315 \r
316         // write the base qualities\r
317         string baseQualities(al.Qualities);\r
318         char* pBaseQualities = (char*)al.Qualities.data();\r
319         for(unsigned int i = 0; i < queryLen; i++) { \r
320             pBaseQualities[i] -= 33; \r
321         }\r
322         mBGZF.Write(pBaseQualities, queryLen);\r
323 \r
324         // write the read group tag\r
325         if ( IsBigEndian ) {\r
326           \r
327             char* tagData = (char*)calloc(sizeof(char), tagDataLength);\r
328             memcpy(tagData, al.TagData.data(), tagDataLength);\r
329           \r
330             int i = 0;\r
331             while ( (unsigned int)i < tagDataLength ) {\r
332                 \r
333                 i += 2;                                 // skip tag type (e.g. "RG", "NM", etc)\r
334                 uint8_t type = toupper(tagData[i]);     // lower & upper case letters have same meaning \r
335                 ++i;                                    // skip value type\r
336                 \r
337                 switch (type) {\r
338                   \r
339                     case('A') :\r
340                     case('C') : \r
341                         ++i;\r
342                         break;\r
343                                 \r
344                     case('S') : \r
345                         SwapEndian_16p(&tagData[i]); \r
346                         i+=2; // sizeof(uint16_t)\r
347                         break;\r
348                                 \r
349                     case('F') :\r
350                     case('I') : \r
351                         SwapEndian_32p(&tagData[i]);\r
352                         i+=4; // sizeof(uint32_t)\r
353                         break;\r
354                                 \r
355                     case('D') : \r
356                         SwapEndian_64p(&tagData[i]);\r
357                         i+=8; // sizeof(uint64_t)\r
358                         break;\r
359                                 \r
360                     case('H') :\r
361                     case('Z') : \r
362                         while (tagData[i]) { ++i; }\r
363                         ++i; // increment one more for null terminator\r
364                         break;\r
365                                 \r
366                     default : \r
367                         printf("ERROR: Invalid tag value type\n"); // shouldn't get here\r
368                         free(tagData);\r
369                         exit(1); \r
370                 }\r
371             }\r
372             \r
373             mBGZF.Write(tagData, tagDataLength);\r
374             free(tagData);\r
375         } \r
376         else \r
377             mBGZF.Write(al.TagData.data(), tagDataLength);      \r
378     }\r
379 }\r