]> git.donarmstrong.com Git - bamtools.git/blob - BamWriter.h
Initial import.
[bamtools.git] / BamWriter.h
1 // ***************************************************************************
2 // BamWriter (c) 2009 Michael Strömberg
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Provides the basic functionality for producing BAM files
7 // ***************************************************************************
8
9 #pragma once
10
11 #include <string>
12 #include <vector>
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>
16 #include <zlib.h>
17 #include "BamAlignment.h"
18
19 using namespace std;
20
21 // our zlib constants: gzip member-header byte values and deflate parameters.
21 // None of these are referenced inside this header; presumably they are consumed
21 // by the block-compression code in the implementation file — confirm there.
22 #define GZIP_ID1             31  // 0x1f
23 #define GZIP_ID2            139  // 0x8b
24 #define CM_DEFLATE            8
25 #define FLG_FEXTRA            4  // single bit: 1 << 2
26 #define OS_UNKNOWN          255
27 #define BGZF_XLEN             6
28 #define BGZF_ID1             66  // ASCII 'B'
29 #define BGZF_ID2             67  // ASCII 'C'
30 #define BGZF_LEN              2
31 #define GZIP_WINDOW_BITS    -15  // NOTE(review): zlib treats a negative windowBits as "raw deflate, no zlib wrapper" — confirm this is what gets passed to deflateInit2
32 #define Z_DEFAULT_MEM_LEVEL   8
33
34 // our BGZF constants
35 #define BLOCK_HEADER_LENGTH    18  // presumably bytes written before the compressed payload of each block — confirm
36 #define BLOCK_FOOTER_LENGTH     8  // presumably bytes written after the compressed payload of each block — confirm
37 #define MAX_BLOCK_SIZE      65536  // initial BgzfData::CompressedBlockSize (see the BgzfData constructor)
38 #define DEFAULT_BLOCK_SIZE  65536  // initial BgzfData::UncompressedBlockSize (see the BgzfData constructor)
39
40 // our BAM constants (BAM_C* are numeric CIGAR operation codes)
41 #define BAM_CORE_SIZE  32
42 #define BAM_CMATCH      0
43 #define BAM_CINS        1
44 #define BAM_CDEL        2
45 #define BAM_CREF_SKIP   3
46 #define BAM_CSOFT_CLIP  4
47 #define BAM_CHARD_CLIP  5
48 #define BAM_CPAD        6
49 #define BAM_CIGAR_SHIFT 4
50
51 #define BAM_CIGAR_MASK  ((1 << BAM_CIGAR_SHIFT) - 1)  // evaluates to 15 (low four bits set) while BAM_CIGAR_SHIFT is 4
52
53 // our variable sizes
54 #define SIZEOF_INT 4  // hard-coded byte count; nothing visible here ties it to sizeof(int)
55
56 // define our BZGF structure
57 struct BgzfData {
58         unsigned int UncompressedBlockSize;
59         unsigned int CompressedBlockSize;
60         unsigned int BlockLength;
61         unsigned int BlockOffset;
62         uint64_t BlockAddress;
63         bool IsOpen;
64         FILE* Stream;
65         char* UncompressedBlock;
66         char* CompressedBlock;
67
68         // constructor
69         BgzfData(void)
70                 : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)
71                 , CompressedBlockSize(MAX_BLOCK_SIZE)
72                 , BlockLength(0)
73                 , BlockOffset(0)
74                 , BlockAddress(0)
75                 , IsOpen(false)
76                 , Stream(NULL)
77                 , UncompressedBlock(NULL)
78                 , CompressedBlock(NULL)
79         {
80                 try {
81                         CompressedBlock   = new char[CompressedBlockSize];
82                         UncompressedBlock = new char[UncompressedBlockSize];
83                 } catch(bad_alloc&) {
84                         printf("ERROR: Unable to allocate memory for our BGZF object.\n");
85                         exit(1);
86                 }
87         }
88
89         // destructor
90         ~BgzfData(void) {
91                 if(CompressedBlock)   delete [] CompressedBlock;
92                 if(UncompressedBlock) delete [] UncompressedBlock;
93         }
94 };
95
96 class BamWriter {
97 public:
98         // constructor
99         BamWriter(void);
100         // destructor
101         ~BamWriter(void);
102         // closes the alignment archive
103         void Close(void);
104         // opens the alignment archive
105         void Open(const string& filename, const string& samHeader, const RefVector& referenceSequences);
106         // saves the alignment to the alignment archive
107         void SaveAlignment(const BamAlignment& al);
108 private:
109         // closes the BAM file
110         void BgzfClose(void);
111         // compresses the current block
112         int BgzfDeflateBlock(void);
113         // flushes the data in the BGZF block
114         void BgzfFlushBlock(void);
115         // opens the BAM file for writing
116         void BgzfOpen(const string& filename);
117         // packs an unsigned integer into the specified buffer
118         static inline void BgzfPackUnsignedInt(char* buffer, unsigned int value);
119         // packs an unsigned short into the specified buffer
120         static inline void BgzfPackUnsignedShort(char* buffer, unsigned short value);
121         // writes the supplied data into the BGZF buffer
122         unsigned int BgzfWrite(const char* data, const unsigned int dataLen);
123         // calculates the minimum bin that contains a region [begin, end)
124         static inline unsigned int CalculateMinimumBin(unsigned int begin, unsigned int end);
125         // creates a packed cigar string from the supplied alignment
126         static void CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar);
127         // encodes the supplied query sequence into 4-bit notation
128         static void EncodeQuerySequence(const string& query, string& encodedQuery);
129         // our BGZF output object
130         BgzfData mBGZF;
131 };
132
133 // packs an unsigned integer into the specified buffer
134 inline void BamWriter::BgzfPackUnsignedInt(char* buffer, unsigned int value) {
135         buffer[0] = (char)value;
136         buffer[1] = (char)(value >> 8);
137         buffer[2] = (char)(value >> 16);
138         buffer[3] = (char)(value >> 24);
139 }
140
141 // packs an unsigned short into the specified buffer
142 inline void BamWriter::BgzfPackUnsignedShort(char* buffer, unsigned short value) {
143         buffer[0] = (char)value;
144         buffer[1] = (char)(value >> 8);
145 }