]> git.donarmstrong.com Git - bamtools.git/blob - BamWriter.h
Major overhaul of BamReader. No longer relying on bgzf.* API. Sped up random-access...
[bamtools.git] / BamWriter.h
1 // ***************************************************************************
2 // BamWriter (c) 2009 Michael Strömberg
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // The BGZF routines were adapted from the bgzf.c code developed at the Broad
7 // Institute.
8 // ---------------------------------------------------------------------------
9 // Provides the basic functionality for producing BAM files
10 // ***************************************************************************
11
12 #pragma once
13
14 #include <string>
15 #include <vector>
16 #include <stdio.h>
17 #include <stdlib.h>
18 #include <string.h>
19 #include <zlib.h>
20 #include "BamAlignment.h"
21
22 using namespace std;
23
24 // our zlib constants: byte values for a gzip member header and its BGZF extra subfield. NOTE(review): the meanings below are inferred from the names, RFC 1952 and the SAM/BAM specification - confirm against the code that uses them
25 #define GZIP_ID1             31  // 0x1f; presumably gzip header byte ID1
26 #define GZIP_ID2            139  // 0x8b; presumably gzip header byte ID2
27 #define CM_DEFLATE            8  // presumably the gzip compression-method value for deflate
28 #define FLG_FEXTRA            4  // presumably the gzip flag bit announcing an extra field
29 #define OS_UNKNOWN          255  // presumably the gzip OS value meaning "unknown"
30 #define BGZF_XLEN             6  // presumably the total length of the gzip extra field
31 #define BGZF_ID1             66  // ASCII 'B'; presumably extra-subfield identifier byte 1
32 #define BGZF_ID2             67  // ASCII 'C'; presumably extra-subfield identifier byte 2
33 #define BGZF_LEN              2  // presumably the length of the extra-subfield payload
34 #define GZIP_WINDOW_BITS    -15  // presumably a zlib windowBits argument; negative requests raw deflate - TODO confirm
35 #define Z_DEFAULT_MEM_LEVEL   8  // presumably a zlib memLevel argument - TODO confirm
36
37 // our BGZF constants
38 #define BLOCK_HEADER_LENGTH    18  // NOTE(review): presumably 12 gzip header bytes + BGZF_XLEN extra bytes - confirm
39 #define BLOCK_FOOTER_LENGTH     8  // NOTE(review): presumably a 4-byte CRC32 followed by a 4-byte input size - confirm
40 #define MAX_BLOCK_SIZE      65536  // number of chars BgzfData allocates for CompressedBlock
41 #define DEFAULT_BLOCK_SIZE  65536  // number of chars BgzfData allocates for UncompressedBlock
42
43 // our BAM constants. NOTE(review): the meanings below are inferred from the names and the SAM/BAM specification - confirm against the code that uses them
44 #define BAM_CORE_SIZE  32  // presumably the byte size of the fixed-length part of an alignment record
45 #define BAM_CMATCH      0  // CIGAR operation codes start here; presumably 'M'
46 #define BAM_CINS        1  // presumably 'I'
47 #define BAM_CDEL        2  // presumably 'D'
48 #define BAM_CREF_SKIP   3  // presumably 'N'
49 #define BAM_CSOFT_CLIP  4  // presumably 'S'
50 #define BAM_CHARD_CLIP  5  // presumably 'H'
51 #define BAM_CPAD        6  // presumably 'P'
52 #define BAM_CIGAR_SHIFT 4  // presumably a packed CIGAR entry is (length << BAM_CIGAR_SHIFT) | operation
53
54 #define BAM_CIGAR_MASK  ((1 << BAM_CIGAR_SHIFT) - 1)  // evaluates to 0xF: the low BAM_CIGAR_SHIFT bits
55
56 // our variable sizes
57 #define SIZEOF_INT 4  // hard-coded constant, not sizeof(int); NOTE(review): presumably the byte width of an integer in the BAM file - confirm
58
59 // define our BGZF structure
60 #ifndef BGZF_DATA
61 #define BGZF_DATA
62 struct BgzfData {
63         unsigned int UncompressedBlockSize;
64         unsigned int CompressedBlockSize;
65         unsigned int BlockLength;
66         unsigned int BlockOffset;
67         uint64_t BlockAddress;
68         bool IsOpen;
69         FILE* Stream;
70         char* UncompressedBlock;
71         char* CompressedBlock;
72
73         // constructor
74         BgzfData(void)
75                 : UncompressedBlockSize(DEFAULT_BLOCK_SIZE)
76                 , CompressedBlockSize(MAX_BLOCK_SIZE)
77                 , BlockLength(0)
78                 , BlockOffset(0)
79                 , BlockAddress(0)
80                 , IsOpen(false)
81                 , Stream(NULL)
82                 , UncompressedBlock(NULL)
83                 , CompressedBlock(NULL)
84         {
85                 try {
86                         CompressedBlock   = new char[CompressedBlockSize];
87                         UncompressedBlock = new char[UncompressedBlockSize];
88                 } catch(bad_alloc&) {
89                         printf("ERROR: Unable to allocate memory for our BGZF object.\n");
90                         exit(1);
91                 }
92         }
93
94         // destructor
95         ~BgzfData(void) {
96                 if(CompressedBlock)   delete [] CompressedBlock;
97                 if(UncompressedBlock) delete [] UncompressedBlock;
98         }
99 };
100 #endif  // BGZF_DATA
101
102 class BamWriter {
103 public:
104         // constructor
105         BamWriter(void);
106         // destructor
107         ~BamWriter(void);
108         // closes the alignment archive
109         void Close(void);
110         // opens the alignment archive
111         void Open(const string& filename, const string& samHeader, const RefVector& referenceSequences);
112         // saves the alignment to the alignment archive
113         void SaveAlignment(const BamAlignment& al);
114 private:
115         // closes the BAM file
116         void BgzfClose(void);
117         // compresses the current block
118         int BgzfDeflateBlock(void);
119         // flushes the data in the BGZF block
120         void BgzfFlushBlock(void);
121         // opens the BAM file for writing
122         void BgzfOpen(const string& filename);
123         // packs an unsigned integer into the specified buffer
124         static inline void BgzfPackUnsignedInt(char* buffer, unsigned int value);
125         // packs an unsigned short into the specified buffer
126         static inline void BgzfPackUnsignedShort(char* buffer, unsigned short value);
127         // writes the supplied data into the BGZF buffer
128         unsigned int BgzfWrite(const char* data, const unsigned int dataLen);
129         // calculates the minimum bin that contains a region [begin, end)
130         static inline unsigned int CalculateMinimumBin(unsigned int begin, unsigned int end);
131         // creates a packed cigar string from the supplied alignment
132         static void CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar);
133         // encodes the supplied query sequence into 4-bit notation
134         static void EncodeQuerySequence(const string& query, string& encodedQuery);
135         // our BGZF output object
136         BgzfData mBGZF;
137 };
138
139 // packs an unsigned integer into the specified buffer
140 inline void BamWriter::BgzfPackUnsignedInt(char* buffer, unsigned int value) {
141         buffer[0] = (char)value;
142         buffer[1] = (char)(value >> 8);
143         buffer[2] = (char)(value >> 16);
144         buffer[3] = (char)(value >> 24);
145 }
146
147 // packs an unsigned short into the specified buffer
148 inline void BamWriter::BgzfPackUnsignedShort(char* buffer, unsigned short value) {
149         buffer[0] = (char)value;
150         buffer[1] = (char)(value >> 8);
151 }