]> git.donarmstrong.com Git - bamtools.git/blobdiff - src/api/BamWriter.cpp
Reorganized source tree & build system
[bamtools.git] / src / api / BamWriter.cpp
diff --git a/src/api/BamWriter.cpp b/src/api/BamWriter.cpp
new file mode 100644 (file)
index 0000000..f83ff1c
--- /dev/null
@@ -0,0 +1,432 @@
+// ***************************************************************************\r
+// BamWriter.cpp (c) 2009 Michael Str�mberg, Derek Barnett\r
+// Marth Lab, Department of Biology, Boston College\r
+// All rights reserved.\r
+// ---------------------------------------------------------------------------\r
+// Last modified: 17 August 2010 (DB)\r
+// ---------------------------------------------------------------------------\r
+// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
+// Institute.\r
+// ---------------------------------------------------------------------------\r
+// Provides the basic functionality for producing BAM files\r
+// ***************************************************************************\r
+\r
+#include <iostream>\r
+\r
+#include "BGZF.h"\r
+#include "BamWriter.h"\r
+using namespace BamTools;\r
+using namespace std;\r
+\r
+struct BamWriter::BamWriterPrivate {\r
+\r
+    // data members\r
+    BgzfData mBGZF;\r
+    bool IsBigEndian;\r
+    \r
+    // constructor / destructor\r
+    BamWriterPrivate(void) { \r
+      IsBigEndian = SystemIsBigEndian();  \r
+    }\r
+    \r
+    ~BamWriterPrivate(void) {\r
+        mBGZF.Close();\r
+    }\r
+\r
+    // "public" interface\r
+    void Close(void);\r
+    bool Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed);\r
+    void SaveAlignment(const BamAlignment& al);\r
+\r
+    // internal methods\r
+    const unsigned int CalculateMinimumBin(const int begin, int end) const;\r
+    void CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar);\r
+    void EncodeQuerySequence(const string& query, string& encodedQuery);\r
+};\r
+\r
+// -----------------------------------------------------\r
+// BamWriter implementation\r
+// -----------------------------------------------------\r
+\r
+// constructor\r
+BamWriter::BamWriter(void) {\r
+    d = new BamWriterPrivate;\r
+}\r
+\r
+// destructor\r
+BamWriter::~BamWriter(void) {\r
+    delete d;\r
+    d = 0;\r
+}\r
+\r
+// closes the alignment archive\r
+void BamWriter::Close(void) { \r
+  d->Close(); \r
+}\r
+\r
+// opens the alignment archive\r
+bool BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed) {\r
+    return d->Open(filename, samHeader, referenceSequences, isWriteUncompressed);\r
+}\r
+\r
+// saves the alignment to the alignment archive\r
+void BamWriter::SaveAlignment(const BamAlignment& al) { \r
+    d->SaveAlignment(al);\r
+}\r
+\r
+// -----------------------------------------------------\r
+// BamWriterPrivate implementation\r
+// -----------------------------------------------------\r
+\r
+// closes the alignment archive\r
+void BamWriter::BamWriterPrivate::Close(void) {\r
+    mBGZF.Close();\r
+}\r
+\r
+// calculates minimum bin for a BAM alignment interval\r
+const unsigned int BamWriter::BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const {  \r
+    --end;\r
+    if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14);\r
+    if( (begin >> 17) == (end >> 17) ) return  585 + (begin >> 17);\r
+    if( (begin >> 20) == (end >> 20) ) return   73 + (begin >> 20);\r
+    if( (begin >> 23) == (end >> 23) ) return    9 + (begin >> 23);\r
+    if( (begin >> 26) == (end >> 26) ) return    1 + (begin >> 26);\r
+    return 0;\r
+}\r
+\r
+// creates a cigar string from the supplied alignment\r
+void BamWriter::BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {\r
+\r
+    // initialize\r
+    const unsigned int numCigarOperations = cigarOperations.size();\r
+    packedCigar.resize(numCigarOperations * BT_SIZEOF_INT);\r
+\r
+    // pack the cigar data into the string\r
+    unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();\r
+\r
+    unsigned int cigarOp;\r
+    vector<CigarOp>::const_iterator coIter;\r
+    for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) {\r
+\r
+        switch(coIter->Type) {\r
+            case 'M':\r
+                  cigarOp = BAM_CMATCH;\r
+                  break;\r
+            case 'I':\r
+                  cigarOp = BAM_CINS;\r
+                  break;\r
+            case 'D':\r
+                  cigarOp = BAM_CDEL;\r
+                  break;\r
+            case 'N':\r
+                  cigarOp = BAM_CREF_SKIP;\r
+                  break;\r
+            case 'S':\r
+                  cigarOp = BAM_CSOFT_CLIP;\r
+                  break;\r
+            case 'H':\r
+                  cigarOp = BAM_CHARD_CLIP;\r
+                  break;\r
+            case 'P':\r
+                  cigarOp = BAM_CPAD;\r
+                  break;\r
+            default:\r
+                  printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type);\r
+                  exit(1);\r
+        }\r
+\r
+        *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;\r
+        pPackedCigar++;\r
+    }\r
+}\r
+\r
+// encodes the supplied query sequence into 4-bit notation\r
+void BamWriter::BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {\r
+\r
+    // prepare the encoded query string\r
+    const unsigned int queryLen = query.size();\r
+    const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);\r
+    encodedQuery.resize(encodedQueryLen);\r
+    char* pEncodedQuery = (char*)encodedQuery.data();\r
+    const char* pQuery = (const char*)query.data();\r
+\r
+    unsigned char nucleotideCode;\r
+    bool useHighWord = true;\r
+\r
+    while(*pQuery) {\r
+\r
+        switch(*pQuery) {\r
+            \r
+            case '=':\r
+                nucleotideCode = 0;\r
+                break;\r
+                \r
+            case 'A':\r
+                nucleotideCode = 1;\r
+                break;\r
+            \r
+            case 'C':\r
+                nucleotideCode = 2;\r
+                break;\r
+            \r
+            case 'G':\r
+                nucleotideCode = 4;\r
+                break;\r
+            \r
+            case 'T':\r
+                nucleotideCode = 8;\r
+                break;\r
+            \r
+            case 'N':\r
+                nucleotideCode = 15;\r
+                break;\r
+            \r
+            default:\r
+                printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);\r
+                exit(1);\r
+        }\r
+\r
+        // pack the nucleotide code\r
+        if(useHighWord) {\r
+            *pEncodedQuery = nucleotideCode << 4;\r
+            useHighWord = false;\r
+        } else {\r
+            *pEncodedQuery |= nucleotideCode;\r
+            pEncodedQuery++;\r
+            useHighWord = true;\r
+        }\r
+\r
+        // increment the query position\r
+        pQuery++;\r
+    }\r
+}\r
+\r
+// opens the alignment archive\r
+bool BamWriter::BamWriterPrivate::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed) {\r
+\r
+    // open the BGZF file for writing, return failure if error\r
+    if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) )\r
+        return false;\r
+\r
+    // ================\r
+    // write the header\r
+    // ================\r
+\r
+    // write the BAM signature\r
+    const unsigned char SIGNATURE_LENGTH = 4;\r
+    const char* BAM_SIGNATURE = "BAM\1";\r
+    mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH);\r
+\r
+    // write the SAM header text length\r
+    uint32_t samHeaderLen = samHeader.size();\r
+    if (IsBigEndian) SwapEndian_32(samHeaderLen);\r
+    mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT);\r
+\r
+    // write the SAM header text\r
+    if(samHeaderLen > 0) \r
+        mBGZF.Write(samHeader.data(), samHeaderLen);\r
+\r
+    // write the number of reference sequences\r
+    uint32_t numReferenceSequences = referenceSequences.size();\r
+    if (IsBigEndian) SwapEndian_32(numReferenceSequences);\r
+    mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);\r
+\r
+    // =============================\r
+    // write the sequence dictionary\r
+    // =============================\r
+\r
+    RefVector::const_iterator rsIter;\r
+    for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {\r
+\r
+        // write the reference sequence name length\r
+        uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;\r
+        if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen);\r
+        mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);\r
+\r
+        // write the reference sequence name\r
+        mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);\r
+\r
+        // write the reference sequence length\r
+        int32_t referenceLength = rsIter->RefLength;\r
+        if (IsBigEndian) SwapEndian_32(referenceLength);\r
+        mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);\r
+    }\r
+    \r
+    // return success\r
+    return true;\r
+}\r
+\r
+// saves the alignment to the alignment archive\r
+void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) {\r
+\r
+    // if BamAlignment contains only the core data and a raw char data buffer\r
+    // (as a result of BamReader::GetNextAlignmentCore())\r
+    if ( al.SupportData.HasCoreOnly ) {\r
+      \r
+        // write the block size\r
+        unsigned int blockSize = al.SupportData.BlockLength;\r
+        if (IsBigEndian) SwapEndian_32(blockSize);\r
+        mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
+\r
+        // assign the BAM core data\r
+        uint32_t buffer[8];\r
+        buffer[0] = al.RefID;\r
+        buffer[1] = al.Position;\r
+        buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;\r
+        buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;\r
+        buffer[4] = al.SupportData.QuerySequenceLength;\r
+        buffer[5] = al.MateRefID;\r
+        buffer[6] = al.MatePosition;\r
+        buffer[7] = al.InsertSize;\r
+        \r
+        // swap BAM core endian-ness, if necessary\r
+        if ( IsBigEndian ) { \r
+            for ( int i = 0; i < 8; ++i )\r
+                SwapEndian_32(buffer[i]); \r
+        }\r
+        \r
+        // write the BAM core\r
+        mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
+        \r
+        // write the raw char data\r
+        mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE); \r
+    }\r
+    \r
+    // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc\r
+    // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code )\r
+    else {\r
+        \r
+        // calculate char lengths\r
+        const unsigned int nameLength         = al.Name.size() + 1;\r
+        const unsigned int numCigarOperations = al.CigarData.size();\r
+        const unsigned int queryLength        = al.QueryBases.size();\r
+        const unsigned int tagDataLength      = al.TagData.size();\r
+        \r
+        // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)\r
+        // force calculation of Bin before storing\r
+        const int endPosition = al.GetEndPosition();\r
+        const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);\r
+      \r
+        // create our packed cigar string\r
+        string packedCigar;\r
+        CreatePackedCigar(al.CigarData, packedCigar);\r
+        const unsigned int packedCigarLength = packedCigar.size();\r
+\r
+        // encode the query\r
+        string encodedQuery;\r
+        EncodeQuerySequence(al.QueryBases, encodedQuery);\r
+        const unsigned int encodedQueryLength = encodedQuery.size(); \r
+      \r
+        // write the block size\r
+        const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength;\r
+        unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;\r
+        if (IsBigEndian) SwapEndian_32(blockSize);\r
+        mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
+\r
+        // assign the BAM core data\r
+        uint32_t buffer[8];\r
+        buffer[0] = al.RefID;\r
+        buffer[1] = al.Position;\r
+        buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;\r
+        buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;\r
+        buffer[4] = queryLength;\r
+        buffer[5] = al.MateRefID;\r
+        buffer[6] = al.MatePosition;\r
+        buffer[7] = al.InsertSize;\r
+        \r
+        // swap BAM core endian-ness, if necessary\r
+        if ( IsBigEndian ) { \r
+            for ( int i = 0; i < 8; ++i )\r
+                SwapEndian_32(buffer[i]); \r
+        }\r
+        \r
+        // write the BAM core\r
+        mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
+        \r
+        // write the query name\r
+        mBGZF.Write(al.Name.c_str(), nameLength);\r
+\r
+        // write the packed cigar\r
+        if ( IsBigEndian ) {\r
+          \r
+            char* cigarData = (char*)calloc(sizeof(char), packedCigarLength);\r
+            memcpy(cigarData, packedCigar.data(), packedCigarLength);\r
+            \r
+            for (unsigned int i = 0; i < packedCigarLength; ++i) {\r
+                if ( IsBigEndian )\r
+                  SwapEndian_32p(&cigarData[i]); \r
+            }\r
+            \r
+            mBGZF.Write(cigarData, packedCigarLength);\r
+            free(cigarData);    \r
+        } \r
+        else \r
+            mBGZF.Write(packedCigar.data(), packedCigarLength);\r
+\r
+        // write the encoded query sequence\r
+        mBGZF.Write(encodedQuery.data(), encodedQueryLength);\r
+\r
+        // write the base qualities\r
+        string baseQualities(al.Qualities);\r
+        char* pBaseQualities = (char*)al.Qualities.data();\r
+        for(unsigned int i = 0; i < queryLength; i++) { \r
+            pBaseQualities[i] -= 33; \r
+        }\r
+        mBGZF.Write(pBaseQualities, queryLength);\r
+\r
+        // write the read group tag\r
+        if ( IsBigEndian ) {\r
+          \r
+            char* tagData = (char*)calloc(sizeof(char), tagDataLength);\r
+            memcpy(tagData, al.TagData.data(), tagDataLength);\r
+          \r
+            int i = 0;\r
+            while ( (unsigned int)i < tagDataLength ) {\r
+                \r
+                i += 2;                                 // skip tag type (e.g. "RG", "NM", etc)\r
+                uint8_t type = toupper(tagData[i]);     // lower & upper case letters have same meaning \r
+                ++i;                                    // skip value type\r
+                \r
+                switch (type) {\r
+                  \r
+                    case('A') :\r
+                    case('C') : \r
+                        ++i;\r
+                        break;\r
+                                \r
+                    case('S') : \r
+                        SwapEndian_16p(&tagData[i]); \r
+                        i+=2; // sizeof(uint16_t)\r
+                        break;\r
+                                \r
+                    case('F') :\r
+                    case('I') : \r
+                        SwapEndian_32p(&tagData[i]);\r
+                        i+=4; // sizeof(uint32_t)\r
+                        break;\r
+                                \r
+                    case('D') : \r
+                        SwapEndian_64p(&tagData[i]);\r
+                        i+=8; // sizeof(uint64_t)\r
+                        break;\r
+                                \r
+                    case('H') :\r
+                    case('Z') : \r
+                        while (tagData[i]) { ++i; }\r
+                        ++i; // increment one more for null terminator\r
+                        break;\r
+                                \r
+                    default : \r
+                        printf("ERROR: Invalid tag value type\n"); // shouldn't get here\r
+                        free(tagData);\r
+                        exit(1); \r
+                }\r
+            }\r
+            \r
+            mBGZF.Write(tagData, tagDataLength);\r
+            free(tagData);\r
+        } \r
+        else \r
+            mBGZF.Write(al.TagData.data(), tagDataLength);      \r
+    }\r
+}\r