]> git.donarmstrong.com Git - bamtools.git/blobdiff - BamWriter.cpp
Modified handling of BamAlignmentSupportData. This fix should allow BamWriter::SaveA...
[bamtools.git] / BamWriter.cpp
index 9d18fae6ba616b1d243913e7e07c4a55c70a88b4..a794dff577adc2d17ada9f94807413b156bdefcc 100644 (file)
@@ -3,7 +3,7 @@
 // Marth Lab, Department of Biology, Boston College\r
 // All rights reserved.\r
 // ---------------------------------------------------------------------------\r
-// Last modified: 30 March 2010 (DB)\r
+// Last modified: 15 July 2010 (DB)\r
 // ---------------------------------------------------------------------------\r
 // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
 // Institute.\r
@@ -11,7 +11,8 @@
 // Provides the basic functionality for producing BAM files\r
 // ***************************************************************************\r
 \r
-// BGZF includes\r
+#include <iostream>\r
+\r
 #include "BGZF.h"\r
 #include "BamWriter.h"\r
 using namespace BamTools;\r
@@ -23,7 +24,6 @@ struct BamWriter::BamWriterPrivate {
     BgzfData mBGZF;\r
     bool IsBigEndian;\r
     \r
-    \r
     // constructor / destructor\r
     BamWriterPrivate(void) { \r
       IsBigEndian = SystemIsBigEndian();  \r
@@ -37,9 +37,9 @@ struct BamWriter::BamWriterPrivate {
     void Close(void);\r
     void Open(const string& filename, const string& samHeader, const RefVector& referenceSequences);\r
     void SaveAlignment(const BamAlignment& al);\r
-    void SaveAlignment(const BamAlignment& al, const BamAlignmentSupportData& supportData);\r
 \r
     // internal methods\r
+    const unsigned int CalculateMinimumBin(const int begin, int end) const;\r
     void CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar);\r
     void EncodeQuerySequence(const string& query, string& encodedQuery);\r
 };\r
@@ -74,10 +74,6 @@ void BamWriter::SaveAlignment(const BamAlignment& al) {
     d->SaveAlignment(al);\r
 }\r
 \r
-void BamWriter::SaveAlignment(const BamAlignment& al, const BamAlignmentSupportData& supportData) {\r
-    d->SaveAlignment(al, supportData);\r
-}\r
-\r
 // -----------------------------------------------------\r
 // BamWriterPrivate implementation\r
 // -----------------------------------------------------\r
@@ -87,6 +83,17 @@ void BamWriter::BamWriterPrivate::Close(void) {
     mBGZF.Close();\r
 }\r
 \r
+// calculates minimum bin for a BAM alignment interval\r
+const unsigned int BamWriter::BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const {  \r
+    --end;\r
+    if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14);\r
+    if( (begin >> 17) == (end >> 17) ) return  585 + (begin >> 17);\r
+    if( (begin >> 20) == (end >> 20) ) return   73 + (begin >> 20);\r
+    if( (begin >> 23) == (end >> 23) ) return    9 + (begin >> 23);\r
+    if( (begin >> 26) == (end >> 26) ) return    1 + (begin >> 26);\r
+    return 0;\r
+}\r
+\r
 // creates a cigar string from the supplied alignment\r
 void BamWriter::BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {\r
 \r
@@ -99,7 +106,7 @@ void BamWriter::BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigar
 \r
     unsigned int cigarOp;\r
     vector<CigarOp>::const_iterator coIter;\r
-    for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); coIter++) {\r
+    for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) {\r
 \r
         switch(coIter->Type) {\r
             case 'M':\r
@@ -211,17 +218,16 @@ void BamWriter::BamWriterPrivate::Open(const string& filename, const string& sam
 \r
     // write the SAM header text length\r
     uint32_t samHeaderLen = samHeader.size();\r
-    if ( IsBigEndian ) { SwapEndian_32(samHeaderLen); }\r
+    if (IsBigEndian) SwapEndian_32(samHeaderLen);\r
     mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT);\r
 \r
     // write the SAM header text\r
-    if(samHeaderLen > 0) {\r
+    if(samHeaderLen > 0) \r
         mBGZF.Write(samHeader.data(), samHeaderLen);\r
-    }\r
 \r
     // write the number of reference sequences\r
     uint32_t numReferenceSequences = referenceSequences.size();\r
-    if ( IsBigEndian ) { SwapEndian_32(numReferenceSequences); }\r
+    if (IsBigEndian) SwapEndian_32(numReferenceSequences);\r
     mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);\r
 \r
     // =============================\r
@@ -233,7 +239,7 @@ void BamWriter::BamWriterPrivate::Open(const string& filename, const string& sam
 \r
         // write the reference sequence name length\r
         uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;\r
-        if ( IsBigEndian ) { SwapEndian_32(referenceSequenceNameLen); }\r
+        if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen);\r
         mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);\r
 \r
         // write the reference sequence name\r
@@ -241,7 +247,7 @@ void BamWriter::BamWriterPrivate::Open(const string& filename, const string& sam
 \r
         // write the reference sequence length\r
         int32_t referenceLength = rsIter->RefLength;\r
-        if ( IsBigEndian ) { SwapEndian_32(referenceLength); }\r
+        if (IsBigEndian) SwapEndian_32(referenceLength);\r
         mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);\r
     }\r
 }\r
@@ -249,170 +255,174 @@ void BamWriter::BamWriterPrivate::Open(const string& filename, const string& sam
 // saves the alignment to the alignment archive\r
 void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) {\r
 \r
-    // initialize\r
-    const unsigned int nameLen            = al.Name.size() + 1;\r
-    const unsigned int queryLen           = al.QueryBases.size();\r
-    const unsigned int numCigarOperations = al.CigarData.size();\r
-\r
-    // create our packed cigar string\r
-    string packedCigar;\r
-    CreatePackedCigar(al.CigarData, packedCigar);\r
-    const unsigned int packedCigarLen = packedCigar.size();\r
-\r
-    // encode the query\r
-    string encodedQuery;\r
-    EncodeQuerySequence(al.QueryBases, encodedQuery);\r
-    const unsigned int encodedQueryLen = encodedQuery.size();\r
-\r
-    // store the tag data length\r
-    // -------------------------------------------\r
-    // Modified: 3-25-2010 DWB\r
-    // Contributed: ARQ\r
-    // Fixed: "off by one" error when parsing tags in files produced by BamWriter\r
-    const unsigned int tagDataLength = al.TagData.size();\r
-    // original line: \r
-    // const unsigned int tagDataLength = al.TagData.size() + 1;     \r
-    // -------------------------------------------\r
-    \r
-    // assign the BAM core data\r
-    uint32_t buffer[8];\r
-    buffer[0] = al.RefID;\r
-    buffer[1] = al.Position;\r
-    buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | nameLen;\r
-    buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;\r
-    buffer[4] = queryLen;\r
-    buffer[5] = al.MateRefID;\r
-    buffer[6] = al.MatePosition;\r
-    buffer[7] = al.InsertSize;\r
-\r
-    // write the block size\r
-    unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen + tagDataLength;\r
-    unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;\r
-    if ( IsBigEndian ) { SwapEndian_32(blockSize); }\r
-    mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
-\r
-    // write the BAM core\r
-    if ( IsBigEndian ) { \r
-        for ( int i = 0; i < 8; ++i ) { \r
-            SwapEndian_32(buffer[i]); \r
-        } \r
-    }\r
-    mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
-\r
-    // write the query name\r
-    mBGZF.Write(al.Name.c_str(), nameLen);\r
-\r
-    // write the packed cigar\r
-    if ( IsBigEndian ) {\r
+    // if BamAlignment contains only the core data and a raw char data buffer\r
+    // (as a result of BamReader::GetNextAlignmentCore())\r
+    if ( al.SupportData.HasCoreOnly ) {\r
       \r
-        char* cigarData = (char*)calloc(sizeof(char), packedCigarLen);\r
-        memcpy(cigarData, packedCigar.data(), packedCigarLen);\r
+        // write the block size\r
+        unsigned int blockSize = al.SupportData.BlockLength;\r
+        if (IsBigEndian) SwapEndian_32(blockSize);\r
+        mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
+\r
+        // assign the BAM core data\r
+        uint32_t buffer[8];\r
+        buffer[0] = al.RefID;\r
+        buffer[1] = al.Position;\r
+        buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;\r
+        buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;\r
+        buffer[4] = al.SupportData.QuerySequenceLength;\r
+        buffer[5] = al.MateRefID;\r
+        buffer[6] = al.MatePosition;\r
+        buffer[7] = al.InsertSize;\r
         \r
-        for (unsigned int i = 0; i < packedCigarLen; ++i) {\r
-            if ( IsBigEndian ) { \r
-              SwapEndian_32p(&cigarData[i]); \r
-            }\r
+        // swap BAM core endian-ness, if necessary\r
+        if ( IsBigEndian ) { \r
+            for ( int i = 0; i < 8; ++i )\r
+                SwapEndian_32(buffer[i]); \r
         }\r
         \r
-        mBGZF.Write(cigarData, packedCigarLen);\r
-        free(cigarData);\r
+        // write the BAM core\r
+        mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
         \r
-    } else { \r
-        mBGZF.Write(packedCigar.data(), packedCigarLen);\r
+        // write the raw char data\r
+        mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE); \r
     }\r
-\r
-    // write the encoded query sequence\r
-    mBGZF.Write(encodedQuery.data(), encodedQueryLen);\r
-\r
-    // write the base qualities\r
-    string baseQualities = al.Qualities;\r
-    char* pBaseQualities = (char*)al.Qualities.data();\r
-    for(unsigned int i = 0; i < queryLen; i++) { pBaseQualities[i] -= 33; }\r
-    mBGZF.Write(pBaseQualities, queryLen);\r
-\r
-    // write the read group tag\r
-    if ( IsBigEndian ) {\r
+    \r
+    // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc\r
+    // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code )\r
+    else {\r
+        \r
+        // calculate char lengths\r
+        const unsigned int nameLength         = al.Name.size() + 1;\r
+        const unsigned int numCigarOperations = al.CigarData.size();\r
+        const unsigned int queryLength        = al.QueryBases.size();\r
+        const unsigned int tagDataLength      = al.TagData.size();\r
+        \r
+        // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)\r
+        // force calculation of Bin before storing\r
+        const int endPosition = al.GetEndPosition();\r
+        const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);\r
       \r
-        char* tagData = (char*)calloc(sizeof(char), tagDataLength);\r
-        memcpy(tagData, al.TagData.data(), tagDataLength);\r
+        // create our packed cigar string\r
+        string packedCigar;\r
+        CreatePackedCigar(al.CigarData, packedCigar);\r
+        const unsigned int packedCigarLength = packedCigar.size();\r
+\r
+        // encode the query\r
+        string encodedQuery;\r
+        EncodeQuerySequence(al.QueryBases, encodedQuery);\r
+        const unsigned int encodedQueryLength = encodedQuery.size(); \r
       \r
-        int i = 0;\r
-        while ( (unsigned int)i < tagDataLength ) {\r
-            \r
-            i += 2;                                 // skip tag type (e.g. "RG", "NM", etc)\r
-            uint8_t type = toupper(tagData[i]);     // lower & upper case letters have same meaning \r
-            ++i;                                    // skip value type\r
-            \r
-            switch (type) {\r
-              \r
-                case('A') :\r
-                case('C') : \r
-                    ++i;\r
-                    break;\r
-                            \r
-                case('S') : \r
-                    SwapEndian_16p(&tagData[i]); \r
-                    i+=2; // sizeof(uint16_t)\r
-                    break;\r
-                            \r
-                case('F') :\r
-                case('I') : \r
-                    SwapEndian_32p(&tagData[i]);\r
-                    i+=4; // sizeof(uint32_t)\r
-                    break;\r
-                            \r
-                case('D') : \r
-                    SwapEndian_64p(&tagData[i]);\r
-                    i+=8; // sizeof(uint64_t)\r
-                    break;\r
-                            \r
-                case('H') :\r
-                case('Z') : \r
-                    while (tagData[i]) { ++i; }\r
-                    ++i; // increment one more for null terminator\r
-                    break;\r
-                            \r
-                default : \r
-                    printf("ERROR: Invalid tag value type\n"); // shouldn't get here\r
-                    free(tagData);\r
-                    exit(1); \r
-            }\r
+        // write the block size\r
+        const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength;\r
+        unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;\r
+        if (IsBigEndian) SwapEndian_32(blockSize);\r
+        mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
+\r
+        // assign the BAM core data\r
+        uint32_t buffer[8];\r
+        buffer[0] = al.RefID;\r
+        buffer[1] = al.Position;\r
+        buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;\r
+        buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;\r
+        buffer[4] = queryLength;\r
+        buffer[5] = al.MateRefID;\r
+        buffer[6] = al.MatePosition;\r
+        buffer[7] = al.InsertSize;\r
+        \r
+        // swap BAM core endian-ness, if necessary\r
+        if ( IsBigEndian ) { \r
+            for ( int i = 0; i < 8; ++i )\r
+                SwapEndian_32(buffer[i]); \r
         }\r
         \r
-        mBGZF.Write(tagData, tagDataLength);\r
-        free(tagData);\r
-    } else {\r
-        mBGZF.Write(al.TagData.data(), tagDataLength);\r
-    }\r
-}\r
+        // write the BAM core\r
+        mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
+        \r
+        // write the query name\r
+        mBGZF.Write(al.Name.c_str(), nameLength);\r
+\r
+        // write the packed cigar\r
+        if ( IsBigEndian ) {\r
+          \r
+            char* cigarData = (char*)calloc(sizeof(char), packedCigarLength);\r
+            memcpy(cigarData, packedCigar.data(), packedCigarLength);\r
+            \r
+            for (unsigned int i = 0; i < packedCigarLength; ++i) {\r
+                if ( IsBigEndian )\r
+                  SwapEndian_32p(&cigarData[i]); \r
+            }\r
+            \r
+            mBGZF.Write(cigarData, packedCigarLength);\r
+            free(cigarData);    \r
+        } \r
+        else \r
+            mBGZF.Write(packedCigar.data(), packedCigarLength);\r
+\r
+        // write the encoded query sequence\r
+        mBGZF.Write(encodedQuery.data(), encodedQueryLength);\r
 \r
-void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al, const BamAlignmentSupportData& supportData) {\r
-  \r
-    // assign the BAM core data\r
-    uint32_t buffer[8];\r
-    buffer[0] = al.RefID;\r
-    buffer[1] = al.Position;\r
-    buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | supportData.QueryNameLength;\r
-    buffer[3] = (al.AlignmentFlag << 16) | supportData.NumCigarOperations;\r
-    buffer[4] = supportData.QuerySequenceLength;\r
-    buffer[5] = al.MateRefID;\r
-    buffer[6] = al.MatePosition;\r
-    buffer[7] = al.InsertSize;\r
-\r
-    // write the block size\r
-    unsigned int blockSize = supportData.BlockLength;\r
-    if ( IsBigEndian ) { SwapEndian_32(blockSize); }\r
-    mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
-\r
-    // write the BAM core\r
-    if ( IsBigEndian ) { \r
-        for ( int i = 0; i < 8; ++i ) { \r
-            SwapEndian_32(buffer[i]); \r
+        // write the base qualities\r
+        string baseQualities(al.Qualities);\r
+        char* pBaseQualities = (char*)al.Qualities.data();\r
+        for(unsigned int i = 0; i < queryLength; i++) { \r
+            pBaseQualities[i] -= 33; \r
+        }\r
+        mBGZF.Write(pBaseQualities, queryLength);\r
+\r
+        // write the read group tag\r
+        if ( IsBigEndian ) {\r
+          \r
+            char* tagData = (char*)calloc(sizeof(char), tagDataLength);\r
+            memcpy(tagData, al.TagData.data(), tagDataLength);\r
+          \r
+            int i = 0;\r
+            while ( (unsigned int)i < tagDataLength ) {\r
+                \r
+                i += 2;                                 // skip tag type (e.g. "RG", "NM", etc)\r
+                uint8_t type = toupper(tagData[i]);     // lower & upper case letters have same meaning \r
+                ++i;                                    // skip value type\r
+                \r
+                switch (type) {\r
+                  \r
+                    case('A') :\r
+                    case('C') : \r
+                        ++i;\r
+                        break;\r
+                                \r
+                    case('S') : \r
+                        SwapEndian_16p(&tagData[i]); \r
+                        i+=2; // sizeof(uint16_t)\r
+                        break;\r
+                                \r
+                    case('F') :\r
+                    case('I') : \r
+                        SwapEndian_32p(&tagData[i]);\r
+                        i+=4; // sizeof(uint32_t)\r
+                        break;\r
+                                \r
+                    case('D') : \r
+                        SwapEndian_64p(&tagData[i]);\r
+                        i+=8; // sizeof(uint64_t)\r
+                        break;\r
+                                \r
+                    case('H') :\r
+                    case('Z') : \r
+                        while (tagData[i]) { ++i; }\r
+                        ++i; // increment one more for null terminator\r
+                        break;\r
+                                \r
+                    default : \r
+                        printf("ERROR: Invalid tag value type\n"); // shouldn't get here\r
+                        free(tagData);\r
+                        exit(1); \r
+                }\r
+            }\r
+            \r
+            mBGZF.Write(tagData, tagDataLength);\r
+            free(tagData);\r
         } \r
+        else \r
+            mBGZF.Write(al.TagData.data(), tagDataLength);      \r
     }\r
-    mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
-\r
-    // write the raw char data\r
-    mBGZF.Write((char*)supportData.AllCharData.data(), supportData.BlockLength-BAM_CORE_SIZE);\r
 }\r
-\r