]> git.donarmstrong.com Git - bamtools.git/blob - src/api/internal/BamWriter_p.cpp
Moved private implementation API files to internal directory for clearer separation...
[bamtools.git] / src / api / internal / BamWriter_p.cpp
1 // ***************************************************************************
2 // BamWriter_p.cpp (c) 2010 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 22 November 2010 (DB)
7 // ---------------------------------------------------------------------------
8 // Provides the basic functionality for producing BAM files
9 // ***************************************************************************
10
11 #include <api/BamAlignment.h>
12 #include <api/internal/BamWriter_p.h>
13 using namespace BamTools;
14 using namespace BamTools::Internal;
15 using namespace std;
16
17 BamWriterPrivate::BamWriterPrivate(void) {
18     IsBigEndian = SystemIsBigEndian();
19 }
20
21 BamWriterPrivate::~BamWriterPrivate(void) {
22     mBGZF.Close();
23 }
24
25 // closes the alignment archive
26 void BamWriterPrivate::Close(void) {
27     mBGZF.Close();
28 }
29
30 // calculates minimum bin for a BAM alignment interval
31 const unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const {
32     --end;
33     if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14);
34     if( (begin >> 17) == (end >> 17) ) return  585 + (begin >> 17);
35     if( (begin >> 20) == (end >> 20) ) return   73 + (begin >> 20);
36     if( (begin >> 23) == (end >> 23) ) return    9 + (begin >> 23);
37     if( (begin >> 26) == (end >> 26) ) return    1 + (begin >> 26);
38     return 0;
39 }
40
41 // creates a cigar string from the supplied alignment
42 void BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
43
44     // initialize
45     const unsigned int numCigarOperations = cigarOperations.size();
46     packedCigar.resize(numCigarOperations * BT_SIZEOF_INT);
47
48     // pack the cigar data into the string
49     unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
50
51     unsigned int cigarOp;
52     vector<CigarOp>::const_iterator coIter;
53     for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) {
54
55         switch(coIter->Type) {
56             case 'M':
57                   cigarOp = BAM_CMATCH;
58                   break;
59             case 'I':
60                   cigarOp = BAM_CINS;
61                   break;
62             case 'D':
63                   cigarOp = BAM_CDEL;
64                   break;
65             case 'N':
66                   cigarOp = BAM_CREF_SKIP;
67                   break;
68             case 'S':
69                   cigarOp = BAM_CSOFT_CLIP;
70                   break;
71             case 'H':
72                   cigarOp = BAM_CHARD_CLIP;
73                   break;
74             case 'P':
75                   cigarOp = BAM_CPAD;
76                   break;
77             default:
78                   fprintf(stderr, "ERROR: Unknown cigar operation found: %c\n", coIter->Type);
79                   exit(1);
80         }
81
82         *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;
83         pPackedCigar++;
84     }
85 }
86
87 // encodes the supplied query sequence into 4-bit notation
88 void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {
89
90     // prepare the encoded query string
91     const unsigned int queryLen = query.size();
92     const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
93     encodedQuery.resize(encodedQueryLen);
94     char* pEncodedQuery = (char*)encodedQuery.data();
95     const char* pQuery = (const char*)query.data();
96
97     unsigned char nucleotideCode;
98     bool useHighWord = true;
99
100     while(*pQuery) {
101
102         switch(*pQuery) {
103
104             case '=':
105                 nucleotideCode = 0;
106                 break;
107
108             case 'A':
109                 nucleotideCode = 1;
110                 break;
111
112             case 'C':
113                 nucleotideCode = 2;
114                 break;
115
116             case 'G':
117                 nucleotideCode = 4;
118                 break;
119
120             case 'T':
121                 nucleotideCode = 8;
122                 break;
123
124             case 'N':
125                 nucleotideCode = 15;
126                 break;
127
128             default:
129                 fprintf(stderr, "ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
130                 exit(1);
131         }
132
133         // pack the nucleotide code
134         if(useHighWord) {
135             *pEncodedQuery = nucleotideCode << 4;
136             useHighWord = false;
137         } else {
138             *pEncodedQuery |= nucleotideCode;
139             pEncodedQuery++;
140             useHighWord = true;
141         }
142
143         // increment the query position
144         pQuery++;
145     }
146 }
147
148 // opens the alignment archive
149 bool BamWriterPrivate::Open(const string& filename,
150                             const string& samHeader,
151                             const RefVector& referenceSequences,
152                             bool isWriteUncompressed)
153 {
154     // open the BGZF file for writing, return failure if error
155     if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) )
156         return false;
157
158     // ================
159     // write the header
160     // ================
161
162     // write the BAM signature
163     const unsigned char SIGNATURE_LENGTH = 4;
164     const char* BAM_SIGNATURE = "BAM\1";
165     mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH);
166
167     // write the SAM header text length
168     uint32_t samHeaderLen = samHeader.size();
169     if (IsBigEndian) SwapEndian_32(samHeaderLen);
170     mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT);
171
172     // write the SAM header text
173     if(samHeaderLen > 0)
174         mBGZF.Write(samHeader.data(), samHeaderLen);
175
176     // write the number of reference sequences
177     uint32_t numReferenceSequences = referenceSequences.size();
178     if (IsBigEndian) SwapEndian_32(numReferenceSequences);
179     mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);
180
181     // =============================
182     // write the sequence dictionary
183     // =============================
184
185     RefVector::const_iterator rsIter;
186     for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {
187
188         // write the reference sequence name length
189         uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;
190         if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen);
191         mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);
192
193         // write the reference sequence name
194         mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);
195
196         // write the reference sequence length
197         int32_t referenceLength = rsIter->RefLength;
198         if (IsBigEndian) SwapEndian_32(referenceLength);
199         mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);
200     }
201
202     // return success
203     return true;
204 }
205
206 // saves the alignment to the alignment archive
207 void BamWriterPrivate::SaveAlignment(const BamAlignment& al) {
208
209     // if BamAlignment contains only the core data and a raw char data buffer
210     // (as a result of BamReader::GetNextAlignmentCore())
211     if ( al.SupportData.HasCoreOnly ) {
212
213         // write the block size
214         unsigned int blockSize = al.SupportData.BlockLength;
215         if (IsBigEndian) SwapEndian_32(blockSize);
216         mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
217
218         // assign the BAM core data
219         uint32_t buffer[8];
220         buffer[0] = al.RefID;
221         buffer[1] = al.Position;
222         buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;
223         buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;
224         buffer[4] = al.SupportData.QuerySequenceLength;
225         buffer[5] = al.MateRefID;
226         buffer[6] = al.MatePosition;
227         buffer[7] = al.InsertSize;
228
229         // swap BAM core endian-ness, if necessary
230         if ( IsBigEndian ) {
231             for ( int i = 0; i < 8; ++i )
232                 SwapEndian_32(buffer[i]);
233         }
234
235         // write the BAM core
236         mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
237
238         // write the raw char data
239         mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE);
240     }
241
242     // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc
243     // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code )
244     else {
245
246         // calculate char lengths
247         const unsigned int nameLength         = al.Name.size() + 1;
248         const unsigned int numCigarOperations = al.CigarData.size();
249         const unsigned int queryLength        = al.QueryBases.size();
250         const unsigned int tagDataLength      = al.TagData.size();
251
252         // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)
253         // force calculation of Bin before storing
254         const int endPosition = al.GetEndPosition();
255         const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);
256
257         // create our packed cigar string
258         string packedCigar;
259         CreatePackedCigar(al.CigarData, packedCigar);
260         const unsigned int packedCigarLength = packedCigar.size();
261
262         // encode the query
263         string encodedQuery;
264         EncodeQuerySequence(al.QueryBases, encodedQuery);
265         const unsigned int encodedQueryLength = encodedQuery.size();
266
267         // write the block size
268         const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength;
269         unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
270         if (IsBigEndian) SwapEndian_32(blockSize);
271         mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
272
273         // assign the BAM core data
274         uint32_t buffer[8];
275         buffer[0] = al.RefID;
276         buffer[1] = al.Position;
277         buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;
278         buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
279         buffer[4] = queryLength;
280         buffer[5] = al.MateRefID;
281         buffer[6] = al.MatePosition;
282         buffer[7] = al.InsertSize;
283
284         // swap BAM core endian-ness, if necessary
285         if ( IsBigEndian ) {
286             for ( int i = 0; i < 8; ++i )
287                 SwapEndian_32(buffer[i]);
288         }
289
290         // write the BAM core
291         mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
292
293         // write the query name
294         mBGZF.Write(al.Name.c_str(), nameLength);
295
296         // write the packed cigar
297         if ( IsBigEndian ) {
298
299             char* cigarData = (char*)calloc(sizeof(char), packedCigarLength);
300             memcpy(cigarData, packedCigar.data(), packedCigarLength);
301
302             for (unsigned int i = 0; i < packedCigarLength; ++i) {
303                 if ( IsBigEndian )
304                   SwapEndian_32p(&cigarData[i]);
305             }
306
307             mBGZF.Write(cigarData, packedCigarLength);
308             free(cigarData);
309         }
310         else
311             mBGZF.Write(packedCigar.data(), packedCigarLength);
312
313         // write the encoded query sequence
314         mBGZF.Write(encodedQuery.data(), encodedQueryLength);
315
316         // write the base qualities
317         string baseQualities(al.Qualities);
318         char* pBaseQualities = (char*)al.Qualities.data();
319         for(unsigned int i = 0; i < queryLength; i++) {
320             pBaseQualities[i] -= 33;
321         }
322         mBGZF.Write(pBaseQualities, queryLength);
323
324         // write the read group tag
325         if ( IsBigEndian ) {
326
327             char* tagData = (char*)calloc(sizeof(char), tagDataLength);
328             memcpy(tagData, al.TagData.data(), tagDataLength);
329
330             int i = 0;
331             while ( (unsigned int)i < tagDataLength ) {
332
333                 i += 2;                                 // skip tag type (e.g. "RG", "NM", etc)
334                 uint8_t type = toupper(tagData[i]);     // lower & upper case letters have same meaning
335                 ++i;                                    // skip value type
336
337                 switch (type) {
338
339                     case('A') :
340                     case('C') :
341                         ++i;
342                         break;
343
344                     case('S') :
345                         SwapEndian_16p(&tagData[i]);
346                         i+=2; // sizeof(uint16_t)
347                         break;
348
349                     case('F') :
350                     case('I') :
351                         SwapEndian_32p(&tagData[i]);
352                         i+=4; // sizeof(uint32_t)
353                         break;
354
355                     case('D') :
356                         SwapEndian_64p(&tagData[i]);
357                         i+=8; // sizeof(uint64_t)
358                         break;
359
360                     case('H') :
361                     case('Z') :
362                         while (tagData[i]) { ++i; }
363                         ++i; // increment one more for null terminator
364                         break;
365
366                     default :
367                         fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here
368                         free(tagData);
369                         exit(1);
370                 }
371             }
372
373             mBGZF.Write(tagData, tagDataLength);
374             free(tagData);
375         }
376         else
377             mBGZF.Write(al.TagData.data(), tagDataLength);
378     }
379 }