]> git.donarmstrong.com Git - bamtools.git/commitdiff
Fixed: off by 1 in BamWriter, variable tag parsing in BamAux, endian-correctness
authorbarnett <barnett@9efb377e-2e27-44b9-b91a-ec4abb80ed8b>
Tue, 30 Mar 2010 15:00:57 +0000 (15:00 +0000)
committerbarnett <barnett@9efb377e-2e27-44b9-b91a-ec4abb80ed8b>
Tue, 30 Mar 2010 15:00:57 +0000 (15:00 +0000)
git-svn-id: svn+ssh://gene.bc.edu/home/subversion/Derek/BamTools/trunk@40 9efb377e-2e27-44b9-b91a-ec4abb80ed8b

BamAux.h
BamReader.cpp
BamWriter.cpp

index fb0464211d7c46fa6fb0b6d6071e202057b101aa..ea709270fb1a173f8bb4abc6705b5f5ef107606d 100644 (file)
--- a/BamAux.h
+++ b/BamAux.h
@@ -1,9 +1,9 @@
 // ***************************************************************************\r
-// BamAux.h (c) 2009 Derek Barnett, Michael Strömberg\r
+// BamAux.h (c) 2009 Derek Barnett, Michael Strmberg\r
 // Marth Lab, Department of Biology, Boston College\r
 // All rights reserved.\r
 // ---------------------------------------------------------------------------\r
-// Last modified: 11 Janaury 2010 (DB)\r
+// Last modified: 29 March 2010 (DB)\r
 // ---------------------------------------------------------------------------\r
 // Provides the basic constants, data structures, etc. for using BAM files\r
 // ***************************************************************************\r
 // Platform-specific type definitions\r
 #ifndef BAMTOOLS_TYPES\r
 #define BAMTOOLS_TYPES\r
-       #ifdef _MSC_VER\r
-               typedef char                 int8_t;\r
-               typedef unsigned char       uint8_t;\r
-               typedef short               int16_t;\r
-               typedef unsigned short     uint16_t;\r
-               typedef int                 int32_t;\r
-               typedef unsigned int       uint32_t;\r
-               typedef long long           int64_t;\r
-               typedef unsigned long long uint64_t;\r
-       #else\r
-               #include <stdint.h>\r
-       #endif\r
+    #ifdef _MSC_VER\r
+        typedef char                 int8_t;\r
+        typedef unsigned char       uint8_t;\r
+        typedef short               int16_t;\r
+        typedef unsigned short     uint16_t;\r
+        typedef int                 int32_t;\r
+        typedef unsigned int       uint32_t;\r
+        typedef long long           int64_t;\r
+        typedef unsigned long long uint64_t;\r
+    #else\r
+        #include <stdint.h>\r
+    #endif\r
 #endif // BAMTOOLS_TYPES\r
 \r
 namespace BamTools {\r
@@ -91,9 +91,9 @@ struct BamAlignment {
         // Returns true if alignment is second mate on read\r
         bool IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); }\r
 \r
-       // Manipulate alignment flag\r
-       public:\r
-               // Sets "PCR duplicate" bit \r
+    // Manipulate alignment flag\r
+    public:\r
+        // Sets "PCR duplicate" bit \r
         void SetIsDuplicate(bool ok) { if (ok) AlignmentFlag |= DUPLICATE; else AlignmentFlag &= ~DUPLICATE; }\r
         // Sets "failed quality control" bit\r
         void SetIsFailedQC(bool ok) { if (ok) AlignmentFlag |= QC_FAILED; else AlignmentFlag &= ~QC_FAILED; }\r
@@ -109,13 +109,13 @@ struct BamAlignment {
         void SetIsProperPair(bool ok) { if (ok) AlignmentFlag |= PROPER_PAIR; else AlignmentFlag &= ~PROPER_PAIR; }\r
         // Sets "alignment mapped to reverse strand" bit\r
         void SetIsReverseStrand(bool ok) { if (ok) AlignmentFlag |= REVERSE; else AlignmentFlag &= ~REVERSE; }\r
-               // Sets "position is primary alignment (determined by external app)"\r
+        // Sets "position is primary alignment (determined by external app)"\r
         void SetIsSecondaryAlignment(bool ok)  { if (ok) AlignmentFlag |= SECONDARY; else AlignmentFlag &= ~SECONDARY; }\r
         // Sets "alignment is second mate on read" bit\r
         void SetIsSecondMate(bool ok) { if (ok) AlignmentFlag |= READ_2; else AlignmentFlag &= ~READ_2; }\r
-               // Sets "alignment is mapped" bit\r
+        // Sets "alignment is mapped" bit\r
         void SetIsUnmapped(bool ok) { if (ok) AlignmentFlag |= UNMAPPED; else AlignmentFlag &= ~UNMAPPED; }\r
-               \r
+\r
     public:\r
 \r
         // get "RG" tag data\r
@@ -187,7 +187,7 @@ struct BamAlignment {
             if ( !foundEditDistanceTag ) { return false; }\r
 \r
             // assign the editDistance value\r
-            memcpy(&editDistance, pTagData, 1);\r
+            std::memcpy(&editDistance, pTagData, 1);\r
             return true;\r
         }\r
 \r
@@ -221,6 +221,12 @@ struct BamAlignment {
                             ++numBytesParsed;\r
                             ++pTagData;\r
                         }\r
+                        // ---------------------------\r
+                        // Added: 3-25-2010 DWB\r
+                        // Contributed: ARQ\r
+                        // Fixed: error parsing variable length tag data\r
+                        ++pTagData;\r
+                        // ---------------------------\r
                         break;\r
 \r
                 default:\r
@@ -274,7 +280,7 @@ struct CigarOp {
 struct RefData {\r
     // data members\r
     std::string RefName;          // Name of reference sequence\r
-    int         RefLength;        // Length of reference sequence\r
+    int32_t     RefLength;        // Length of reference sequence\r
     bool        RefHasAlignments; // True if BAM file contains alignments mapped to reference sequence\r
     // constructor\r
     RefData(void)\r
@@ -283,7 +289,7 @@ struct RefData {
     { }\r
 };\r
 \r
-typedef std::vector<RefData> RefVector;\r
+typedef std::vector<RefData>      RefVector;\r
 typedef std::vector<BamAlignment> BamAlignmentVector;\r
 \r
 // ----------------------------------------------------------------\r
@@ -323,6 +329,59 @@ struct ReferenceIndex {
 \r
 typedef std::vector<ReferenceIndex> BamIndex;\r
 \r
+// ----------------------------------------------------------------\r
+// Added: 3-35-2010 DWB\r
+// Fixed: Routines to provide endian-correctness\r
+// ----------------------------------------------------------------\r
+\r
+// returns true if system is big endian\r
+inline bool SystemIsBigEndian(void) {\r
+   const uint16_t one = 0x0001;\r
+   return ((*(char*) &one) == 0 );\r
+}\r
+\r
+// swaps endianness of 16-bit value 'in place'\r
+inline void SwapEndian_16(uint16_t& x) {\r
+    x = ((x >> 8) | (x << 8));\r
+}\r
+\r
+// swaps endianness of 32-bit value 'in-place'\r
+inline void SwapEndian_32(uint32_t& value) {\r
+    x = ( (x >> 24) | \r
+         ((x << 8) & 0x00FF0000) | \r
+         ((x >> 8) & 0x0000FF00) | \r
+          (x << 24)\r
+        );\r
+}\r
+\r
+// swaps endianness of 64-bit value 'in-place'\r
+inline void SwapEndian_64(uint64_t& value) {\r
+    x = ( (x >> 56) | \r
+         ((x << 40) & 0x00FF000000000000) |\r
+         ((x << 24) & 0x0000FF0000000000) |\r
+         ((x << 8)  & 0x000000FF00000000) |\r
+         ((x >> 8)  & 0x00000000FF000000) |\r
+         ((x >> 24) & 0x0000000000FF0000) |\r
+         ((x >> 40) & 0x000000000000FF00) |\r
+          (x << 56)\r
+        );\r
+}\r
+\r
+inline void SwapEndian_16p(char* data) {\r
+    uint16_t& value = (uint16_t&)*data; \r
+    SwapEndian_16(value);\r
+}\r
+\r
+inline void SwapEndian_32p(char* data) {\r
+    uint32_t& value = (uint32_t&)*data; \r
+    SwapEndian_32(value);\r
+}\r
+\r
+inline void SwapEndian_64p(char* data) {\r
+    uint64_t& value = (uint64_t&)*data; \r
+    SwapEndian_64(value);\r
+}\r
+\r
 } // namespace BamTools\r
 \r
 #endif // BAMAUX_H\r
index 5dd65f22d1d3e4e427b108405db1d73a23eb4e12..c29b14ce5e867f8e9edcb9f2552fd6b04dd638f9 100644 (file)
@@ -1,9 +1,9 @@
 // ***************************************************************************\r
-// BamReader.cpp (c) 2009 Derek Barnett, Michael Strömberg\r
+// BamReader.cpp (c) 2009 Derek Barnett, Michael Strmberg\r
 // Marth Lab, Department of Biology, Boston College\r
 // All rights reserved.\r
 // ---------------------------------------------------------------------------\r
-// Last modified: 11 January 2010(DB)\r
+// Last modified: 29 March 2010 (DB)\r
 // ---------------------------------------------------------------------------\r
 // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
 // Institute.\r
@@ -38,6 +38,8 @@ struct BamReader::BamReaderPrivate {
     int64_t   AlignmentsBeginOffset;\r
     string    Filename;\r
     string    IndexFilename;\r
+    \r
+    bool IsBigEndian;\r
 \r
     // user-specified region values\r
     bool IsRegionSpecified;\r
@@ -163,7 +165,9 @@ BamReader::BamReaderPrivate::BamReaderPrivate(void)
     , CurrentLeft(0)\r
     , DNA_LOOKUP("=ACMGRSVTWYHKDBN")\r
     , CIGAR_LOOKUP("MIDNSHP")\r
-{ }\r
+{ \r
+    IsBigEndian = SystemIsBigEndian();\r
+}\r
 \r
 // destructor\r
 BamReader::BamReaderPrivate::~BamReaderPrivate(void) {\r
@@ -361,12 +365,12 @@ bool BamReader::BamReaderPrivate::CreateIndex(void) {
     // clear out index\r
     ClearIndex();\r
 \r
-       // build (& save) index from BAM file\r
+    // build (& save) index from BAM file\r
     bool ok = true;\r
     ok &= BuildIndex();\r
     ok &= WriteIndex();\r
 \r
-       // return success/fail\r
+    // return success/fail\r
     return ok;\r
 }\r
 \r
@@ -523,22 +527,22 @@ bool BamReader::BamReaderPrivate::Jump(int refID, int position) {
     // if data exists for this reference and position is valid    \r
     if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) {\r
 \r
-               // set current region\r
+        // set current region\r
         CurrentRefID = refID;\r
         CurrentLeft  = position;\r
         IsRegionSpecified = true;\r
 \r
-               // calculate offset\r
+        // calculate offset\r
         int64_t offset = GetOffset(CurrentRefID, CurrentLeft);\r
 \r
-               // if in valid offset, return failure\r
+        // if in valid offset, return failure\r
         if ( offset == -1 ) { return false; }\r
 \r
-               // otherwise return success of seek operation\r
+        // otherwise return success of seek operation\r
         else { return mBGZF.Seek(offset); }\r
     }\r
 \r
-       // invalid jump request parameters, return failure\r
+    // invalid jump request parameters, return failure\r
     return false;\r
 }\r
 \r
@@ -559,8 +563,9 @@ void BamReader::BamReaderPrivate::LoadHeaderData(void) {
 \r
     // get BAM header text length\r
     mBGZF.Read(buffer, 4);\r
-    const unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer);\r
-\r
+    unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer);\r
+    if ( IsBigEndian ) { SwapEndian_32(headerTextLength); }\r
+    \r
     // get BAM header text\r
     char* headerText = (char*)calloc(headerTextLength + 1, 1);\r
     mBGZF.Read(headerText, headerTextLength);\r
@@ -586,7 +591,7 @@ bool BamReader::BamReaderPrivate::LoadIndex(void) {
         return false;\r
     }\r
 \r
-       size_t elementsRead = 0;\r
+    size_t elementsRead = 0;\r
        \r
     // see if index is valid BAM index\r
     char magic[4];\r
@@ -600,7 +605,8 @@ bool BamReader::BamReaderPrivate::LoadIndex(void) {
     // get number of reference sequences\r
     uint32_t numRefSeqs;\r
     elementsRead = fread(&numRefSeqs, 4, 1, indexStream);\r
-\r
+    if ( IsBigEndian ) { SwapEndian_32(numRefSeqs); }\r
+    \r
     // intialize space for BamIndex data structure\r
     Index.reserve(numRefSeqs);\r
 \r
@@ -610,6 +616,7 @@ bool BamReader::BamReaderPrivate::LoadIndex(void) {
         // get number of bins for this reference sequence\r
         int32_t numBins;\r
         elementsRead = fread(&numBins, 4, 1, indexStream);\r
+        if ( IsBigEndian ) { SwapEndian_32(numBins); }\r
 \r
         if (numBins > 0) {\r
             RefData& refEntry = References[i];\r
@@ -630,6 +637,11 @@ bool BamReader::BamReaderPrivate::LoadIndex(void) {
             uint32_t numChunks;\r
             elementsRead = fread(&numChunks, 4, 1, indexStream);\r
 \r
+            if ( IsBigEndian ) { \r
+              SwapEndian_32(binID);\r
+              SwapEndian_32(numChunks);\r
+            }\r
+            \r
             // intialize ChunkVector\r
             ChunkVector regionChunks;\r
             regionChunks.reserve(numChunks);\r
@@ -643,6 +655,11 @@ bool BamReader::BamReaderPrivate::LoadIndex(void) {
                 elementsRead = fread(&left, 8, 1, indexStream);\r
                 elementsRead = fread(&right, 8, 1, indexStream);\r
 \r
+                if ( IsBigEndian ) {\r
+                    SwapEndian_64(left);\r
+                    SwapEndian_64(right);\r
+                }\r
+                \r
                 // save ChunkPair\r
                 regionChunks.push_back( Chunk(left, right) );\r
             }\r
@@ -659,6 +676,7 @@ bool BamReader::BamReaderPrivate::LoadIndex(void) {
         // get number of linear offsets\r
         int32_t numLinearOffsets;\r
         elementsRead = fread(&numLinearOffsets, 4, 1, indexStream);\r
+        if ( IsBigEndian ) { SwapEndian_32(numLinearOffsets); }\r
 \r
         // intialize LinearOffsetVector\r
         LinearOffsetVector offsets;\r
@@ -669,6 +687,7 @@ bool BamReader::BamReaderPrivate::LoadIndex(void) {
         for (int j = 0; j < numLinearOffsets; ++j) {\r
             // read a linear offset & store\r
             elementsRead = fread(&linearOffset, 8, 1, indexStream);\r
+            if ( IsBigEndian ) { SwapEndian_64(linearOffset); }\r
             offsets.push_back(linearOffset);\r
         }\r
 \r
@@ -690,40 +709,47 @@ bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) {
     // read in the 'block length' value, make sure it's not zero\r
     char buffer[4];\r
     mBGZF.Read(buffer, 4);\r
-    const unsigned int blockLength = BgzfData::UnpackUnsignedInt(buffer);\r
+    unsigned int blockLength = BgzfData::UnpackUnsignedInt(buffer);\r
+    if ( IsBigEndian ) { SwapEndian_32(blockLength); }\r
     if ( blockLength == 0 ) { return false; }\r
 \r
     // keep track of bytes read as method progresses\r
     int bytesRead = 4;\r
 \r
     // read in core alignment data, make sure the right size of data was read\r
-    char x[BAM_CORE_SIZE];\r
+    uint32_t x[8];\r
     if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }\r
     bytesRead += BAM_CORE_SIZE;\r
 \r
+    if ( IsBigEndian ) {\r
+        for ( int i = 0; i < 8; ++i ) { \r
+          SwapEndian_32(x[i]); \r
+        }\r
+    }\r
+    \r
     // set BamAlignment 'core' data and character data lengths\r
     unsigned int tempValue;\r
     unsigned int queryNameLength;\r
     unsigned int numCigarOperations;\r
     unsigned int querySequenceLength;\r
 \r
-    bAlignment.RefID    = BgzfData::UnpackSignedInt(&x[0]);\r
-    bAlignment.Position = BgzfData::UnpackSignedInt(&x[4]);\r
-\r
-    tempValue             = BgzfData::UnpackUnsignedInt(&x[8]);\r
+    bAlignment.RefID    = BgzfData::UnpackSignedInt(&x[0]);  \r
+    bAlignment.Position = BgzfData::UnpackSignedInt(&x[1]);\r
+    \r
+    tempValue = BgzfData::UnpackUnsignedInt(&x[2]);\r
     bAlignment.Bin        = tempValue >> 16;\r
     bAlignment.MapQuality = tempValue >> 8 & 0xff;\r
     queryNameLength       = tempValue & 0xff;\r
 \r
-    tempValue                = BgzfData::UnpackUnsignedInt(&x[12]);\r
+    tempValue = BgzfData::UnpackUnsignedInt(&x[3]);\r
     bAlignment.AlignmentFlag = tempValue >> 16;\r
     numCigarOperations       = tempValue & 0xffff;\r
 \r
-    querySequenceLength     = BgzfData::UnpackUnsignedInt(&x[16]);\r
-    bAlignment.MateRefID    = BgzfData::UnpackSignedInt(&x[20]);\r
-    bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]);\r
-    bAlignment.InsertSize   = BgzfData::UnpackSignedInt(&x[28]);\r
-\r
+    querySequenceLength     = BgzfData::UnpackUnsignedInt(&x[4]);\r
+    bAlignment.MateRefID    = BgzfData::UnpackSignedInt(&x[5]);\r
+    bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[6]);\r
+    bAlignment.InsertSize   = BgzfData::UnpackSignedInt(&x[7]);\r
+    \r
     // calculate lengths/offsets\r
     const unsigned int dataLength      = blockLength - BAM_CORE_SIZE;\r
     const unsigned int cigarDataOffset = queryNameLength;\r
@@ -746,7 +772,7 @@ bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) {
         bytesRead += dataLength;\r
 \r
         // clear out any previous string data\r
-        bAlignment.Name.clear();\r
+        bAlignment.Name.clear(;)\r
         bAlignment.QueryBases.clear();\r
         bAlignment.Qualities.clear();\r
         bAlignment.AlignedBases.clear();\r
@@ -757,6 +783,12 @@ bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) {
         bAlignment.Name = (string)((const char*)(allCharData));\r
 \r
         // save query sequence\r
+       // -----------------------\r
+       // Added: 3-25-2010 DWB\r
+       // Improved: reduced repeated memory allocations as string grows\r
+       bAlignment.QueryBases.reserve(querySequenceLength);\r
+       // -----------------------\r
+       \r
         for (unsigned int i = 0; i < querySequenceLength; ++i) {\r
             char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];\r
             bAlignment.QueryBases.append( 1, singleBase );\r
@@ -766,15 +798,29 @@ bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) {
         bAlignment.Length = bAlignment.QueryBases.length();\r
 \r
         // save qualities, convert from numeric QV to FASTQ character\r
-        for (unsigned int i = 0; i < querySequenceLength; ++i) {\r
+       // -----------------------\r
+       // Added: 3-25-2010 DWB\r
+       // Improved: reduced repeated memory allocations as string grows\r
+       bAlignment.Qualities.reserve(querySequenceLength);\r
+        // -----------------------\r
+       \r
+       for (unsigned int i = 0; i < querySequenceLength; ++i) {\r
             char singleQuality = (char)(qualData[i]+33);\r
             bAlignment.Qualities.append( 1, singleQuality );\r
         }\r
 \r
         // save CIGAR-related data;\r
-        int k = 0;\r
+       // -----------------------\r
+       // Added: 3-25-2010 DWB\r
+       // Improved: reduced repeated memory allocations as string grows\r
+       bAlignment.AlignedBases.reserve(querySequenceLength);\r
+       // -----------------------\r
+       \r
+       int k = 0;\r
         for (unsigned int i = 0; i < numCigarOperations; ++i) {\r
 \r
+            if ( IsBigEndian ) { SwapEndian_32(cigarData[i]); }\r
+          \r
             // build CigarOp struct\r
             CigarOp op;\r
             op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);\r
@@ -787,28 +833,92 @@ bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) {
             switch (op.Type) {\r
 \r
                 case ('M') :\r
-                case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases\r
-                case ('S') : k += op.Length;                                                               // for 'S' - skip over query bases\r
-                             break;\r
-\r
-                case ('D') : bAlignment.AlignedBases.append( op.Length, '-' ); // for 'D' - write gap character\r
-                             break;\r
-\r
-                case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character;\r
-                             break;\r
-\r
-                case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' );  // for 'N' - write N's, skip bases in query sequence\r
-                             k += op.Length;\r
-                             break;\r
-\r
-                case ('H') : break;                                            // for 'H' - do nothing, move to next op\r
-\r
-                default    : printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here\r
-                             exit(1);\r
+                case ('I') : \r
+                    bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases\r
+                    // fall through\r
+                    \r
+                case ('S') : \r
+                    k += op.Length;  // for 'S' - skip over query bases\r
+                    break;\r
+\r
+                case ('D') : \r
+                    bAlignment.AlignedBases.append( op.Length, '-' );  // for 'D' - write gap character\r
+                    break;\r
+\r
+                case ('P') : \r
+                    bAlignment.AlignedBases.append( op.Length, '*' );  // for 'P' - write padding character;\r
+                    break;\r
+\r
+                case ('N') : \r
+                    bAlignment.AlignedBases.append( op.Length, 'N' );  // for 'N' - write N's, skip bases in query sequence\r
+                    // -----------------------\r
+                    // Removed: 3-25-2010 DWB\r
+                    // Contributed: ARQ\r
+                    // Fixed: compliance with actual 'N' definition in BAM spec\r
+                    // k += op.Length;\r
+                    // -----------------------\r
+                    break;\r
+\r
+                case ('H') : \r
+                    break;  // for 'H' - do nothing, move to next op\r
+\r
+                default : \r
+                    printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here\r
+                    free(allCharData);\r
+                    exit(1);\r
             }\r
         }\r
 \r
-        // read in the tag data\r
+        // -----------------------\r
+        // Added: 3-25-2010 DWB\r
+        // Fixed: endian-correctness for tag data\r
+        // -----------------------\r
+        if ( IsBigEndian ) {\r
+            int i = 0;\r
+            while ( i < tagDataLen ) {\r
+                \r
+                i += 2;                                 // skip tag type (e.g. "RG", "NM", etc)\r
+                uint8_t type = toupper(tagData[i]);     // lower & upper case letters have same meaning \r
+                ++i;                                    // skip value type\r
+                \r
+                switch (type) {\r
+                  \r
+                    case('A') :\r
+                    case('C') : \r
+                        ++i;\r
+                        break;\r
+                                \r
+                    case('S') : \r
+                        SwapEndian_16p(&tagData[i]); \r
+                        i+=2; // sizeof(uint16_t)\r
+                        break;\r
+                                \r
+                    case('F') :\r
+                    case('I') : \r
+                        SwapEndian_32p(&tagData[i]);\r
+                        i+=4; // sizeof(uint32_t)\r
+                        break;\r
+                                \r
+                    case('D') : \r
+                        SwapEndian_64p(&tagData[i]);\r
+                        i+=8; // sizeof(uint64_t)\r
+                        break;\r
+                                \r
+                    case('H') :\r
+                    case('Z') : \r
+                        while (tagData[i]) { ++i; }\r
+                        ++i; // increment one more for null terminator\r
+                        break;\r
+                                \r
+                    default : \r
+                        printf("ERROR: Invalid tag value type\n"); // shouldn't get here\r
+                        free(allCharData);\r
+                        exit(1); \r
+                }\r
+            }\r
+        }\r
+        \r
+        // store tag data\r
         bAlignment.TagData.resize(tagDataLen);\r
         memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLen);\r
     }\r
@@ -823,7 +933,8 @@ void BamReader::BamReaderPrivate::LoadReferenceData(void) {
     // get number of reference sequences\r
     char buffer[4];\r
     mBGZF.Read(buffer, 4);\r
-    const unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer);\r
+    unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer);\r
+    if ( IsBigEndian ) { SwapEndian_32(numberRefSeqs); }\r
     if (numberRefSeqs == 0) { return; }\r
     References.reserve((int)numberRefSeqs);\r
 \r
@@ -832,13 +943,15 @@ void BamReader::BamReaderPrivate::LoadReferenceData(void) {
 \r
         // get length of reference name\r
         mBGZF.Read(buffer, 4);\r
-        const unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer);\r
+        unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer);\r
+        if ( IsBigEndian ) { SwapEndian_32(refNameLength); }\r
         char* refName = (char*)calloc(refNameLength, 1);\r
 \r
         // get reference name and reference sequence length\r
         mBGZF.Read(refName, refNameLength);\r
         mBGZF.Read(buffer, 4);\r
-        const int refLength = BgzfData::UnpackSignedInt(buffer);\r
+        int refLength = BgzfData::UnpackSignedInt(buffer);\r
+        if ( IsBigEndian ) { SwapEndian_32(refLength); }\r
 \r
         // store data for reference\r
         RefData aReference;\r
@@ -973,6 +1086,7 @@ bool BamReader::BamReaderPrivate::WriteIndex(void) {
 \r
     // write number of reference sequences\r
     int32_t numReferenceSeqs = Index.size();\r
+    if ( IsBigEndian ) { SwapEndian_32(numReferenceSeqs); }\r
     fwrite(&numReferenceSeqs, 4, 1, indexStream);\r
 \r
     // iterate over reference sequences\r
@@ -987,6 +1101,7 @@ bool BamReader::BamReaderPrivate::WriteIndex(void) {
 \r
         // write number of bins\r
         int32_t binCount = binMap.size();\r
+        if ( IsBigEndian ) { SwapEndian_32(binCount); }\r
         fwrite(&binCount, 4, 1, indexStream);\r
 \r
         // iterate over bins\r
@@ -995,14 +1110,16 @@ bool BamReader::BamReaderPrivate::WriteIndex(void) {
         for ( ; binIter != binEnd; ++binIter ) {\r
 \r
             // get bin data (key and chunk vector)\r
-            const uint32_t& binKey = (*binIter).first;\r
+            uint32_t binKey = (*binIter).first;\r
             const ChunkVector& binChunks = (*binIter).second;\r
 \r
             // save BAM bin key\r
+            if ( IsBigEndian ) { SwapEndian_32(binKey); }\r
             fwrite(&binKey, 4, 1, indexStream);\r
 \r
             // save chunk count\r
             int32_t chunkCount = binChunks.size();\r
+            if ( IsBigEndian ) { SwapEndian_32(chunkCount); }\r
             fwrite(&chunkCount, 4, 1, indexStream);\r
 \r
             // iterate over chunks\r
@@ -1012,9 +1129,14 @@ bool BamReader::BamReaderPrivate::WriteIndex(void) {
 \r
                 // get current chunk data\r
                 const Chunk& chunk    = (*chunkIter);\r
-                const uint64_t& start = chunk.Start;\r
-                const uint64_t& stop  = chunk.Stop;\r
+                uint64_t start = chunk.Start;\r
+                uint64_t stop  = chunk.Stop;\r
 \r
+                if ( IsBigEndian ) {\r
+                    SwapEndian_64(start);\r
+                    SwapEndian_64(stop);\r
+                }\r
+                \r
                 // save chunk offsets\r
                 fwrite(&start, 8, 1, indexStream);\r
                 fwrite(&stop,  8, 1, indexStream);\r
@@ -1023,6 +1145,7 @@ bool BamReader::BamReaderPrivate::WriteIndex(void) {
 \r
         // write linear offsets size\r
         int32_t offsetSize = offsets.size();\r
+        if ( IsBigEndian ) { SwapEndian_32(offsetSize); }\r
         fwrite(&offsetSize, 4, 1, indexStream);\r
 \r
         // iterate over linear offsets\r
@@ -1031,7 +1154,8 @@ bool BamReader::BamReaderPrivate::WriteIndex(void) {
         for ( ; offsetIter != offsetEnd; ++offsetIter ) {\r
 \r
             // write linear offset value\r
-            const uint64_t& linearOffset = (*offsetIter);\r
+            uint64_t linearOffset = (*offsetIter);\r
+            if ( IsBigEndian ) { SwapEndian_64(linearOffset); }\r
             fwrite(&linearOffset, 8, 1, indexStream);\r
         }\r
     }\r
index c834d45aa86449b1a48077d3e27d320cb6530d78..075989a6f015a09340a9392d856a37ed0914ce30 100644 (file)
@@ -1,9 +1,9 @@
 // ***************************************************************************\r
-// BamWriter.cpp (c) 2009 Michael Strömberg, Derek Barnett\r
+// BamWriter.cpp (c) 2009 Michael Strmberg, Derek Barnett\r
 // Marth Lab, Department of Biology, Boston College\r
 // All rights reserved.\r
 // ---------------------------------------------------------------------------\r
-// Last modified: 8 December 2009 (DB)\r
+// Last modified: 29 March 2010 (DB)\r
 // ---------------------------------------------------------------------------\r
 // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad\r
 // Institute.\r
@@ -21,9 +21,14 @@ struct BamWriter::BamWriterPrivate {
 \r
     // data members\r
     BgzfData mBGZF;\r
-\r
+    bool IsBigEndian;\r
+    \r
+    \r
     // constructor / destructor\r
-    BamWriterPrivate(void) { }\r
+    BamWriterPrivate(void) { \r
+      IsBigEndian = SystemIsBigEndian();  \r
+    }\r
+    \r
     ~BamWriterPrivate(void) {\r
         mBGZF.Close();\r
     }\r
@@ -139,27 +144,34 @@ void BamWriter::BamWriterPrivate::EncodeQuerySequence(const string& query, strin
     while(*pQuery) {\r
 \r
         switch(*pQuery) {\r
+            \r
             case '=':\r
-                    nucleotideCode = 0;\r
-                    break;\r
+                nucleotideCode = 0;\r
+                break;\r
+                \r
             case 'A':\r
-                    nucleotideCode = 1;\r
-                    break;\r
+                nucleotideCode = 1;\r
+                break;\r
+            \r
             case 'C':\r
-                    nucleotideCode = 2;\r
-                    break;\r
+                nucleotideCode = 2;\r
+                break;\r
+            \r
             case 'G':\r
-                    nucleotideCode = 4;\r
-                    break;\r
+                nucleotideCode = 4;\r
+                break;\r
+            \r
             case 'T':\r
-                    nucleotideCode = 8;\r
-                    break;\r
+                nucleotideCode = 8;\r
+                break;\r
+            \r
             case 'N':\r
-                    nucleotideCode = 15;\r
-                    break;\r
+                nucleotideCode = 15;\r
+                break;\r
+            \r
             default:\r
-                    printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);\r
-                    exit(1);\r
+                printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);\r
+                exit(1);\r
         }\r
 \r
         // pack the nucleotide code\r
@@ -193,7 +205,8 @@ void BamWriter::BamWriterPrivate::Open(const string& filename, const string& sam
     mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH);\r
 \r
     // write the SAM header text length\r
-    const unsigned int samHeaderLen = samHeader.size();\r
+    uint32_t samHeaderLen = samHeader.size();\r
+    if ( IsBigEndian ) { SwapEndian_32(samHeaderLen); }\r
     mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT);\r
 \r
     // write the SAM header text\r
@@ -202,7 +215,8 @@ void BamWriter::BamWriterPrivate::Open(const string& filename, const string& sam
     }\r
 \r
     // write the number of reference sequences\r
-    const unsigned int numReferenceSequences = referenceSequences.size();\r
+    uint32_t numReferenceSequences = referenceSequences.size();\r
+    if ( IsBigEndian ) { SwapEndian_32(numReferenceSequences); }\r
     mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);\r
 \r
     // =============================\r
@@ -213,14 +227,17 @@ void BamWriter::BamWriterPrivate::Open(const string& filename, const string& sam
     for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {\r
 \r
         // write the reference sequence name length\r
-        const unsigned int referenceSequenceNameLen = rsIter->RefName.size() + 1;\r
+        uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;\r
+        if ( IsBigEndian ) { SwapEndian_32(referenceSequenceNameLen); }\r
         mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);\r
 \r
         // write the reference sequence name\r
         mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);\r
 \r
         // write the reference sequence length\r
-        mBGZF.Write((char*)&rsIter->RefLength, BT_SIZEOF_INT);\r
+        int32_t referenceLength = rsIter->RefLength;\r
+        if ( IsBigEndian ) { SwapEndian_32(referenceLength); }\r
+        mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);\r
     }\r
 }\r
 \r
@@ -243,10 +260,17 @@ void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) {
     const unsigned int encodedQueryLen = encodedQuery.size();\r
 \r
     // store the tag data length\r
-    const unsigned int tagDataLength = al.TagData.size() + 1;\r
-\r
+    // -------------------------------------------\r
+    // Modified: 3-25-2010 DWB\r
+    // Contributed: ARQ\r
+    // Fixed: "off by one" error when parsing tags in files produced by BamWriter\r
+    const unsigned int tagDataLength = al.TagData.size();\r
+    // original line: \r
+    // const unsigned int tagDataLength = al.TagData.size() + 1;     \r
+    // -------------------------------------------\r
+    \r
     // assign the BAM core data\r
-    unsigned int buffer[8];\r
+    uint32_t buffer[8];\r
     buffer[0] = al.RefID;\r
     buffer[1] = al.Position;\r
     buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | nameLen;\r
@@ -257,18 +281,40 @@ void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) {
     buffer[7] = al.InsertSize;\r
 \r
     // write the block size\r
-    const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen + tagDataLength;\r
-    const unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;\r
+    unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen + tagDataLength;\r
+    unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;\r
+    if ( IsBigEndian ) { SwapEndian_32(blockSize); }\r
     mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);\r
 \r
     // write the BAM core\r
+    if ( IsBigEndian ) { \r
+        for ( int i = 0; i < 8; ++i ) { \r
+            SwapEndian_32(buffer[i]); \r
+        } \r
+    }\r
     mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);\r
 \r
     // write the query name\r
     mBGZF.Write(al.Name.c_str(), nameLen);\r
 \r
     // write the packed cigar\r
-    mBGZF.Write(packedCigar.data(), packedCigarLen);\r
+    if ( IsBigEndian ) {\r
+      \r
+        char* cigarData = (char*)calloc(sizeof(char), packedCigarLen);\r
+        memcpy(cigarData, packedCigar.data(), packedCigarLen);\r
+        \r
+        for (unsigned int i = 0; i < packedCigarLen; ++i) {\r
+            if ( IsBigEndian ) { \r
+              SwapEndian_32(cigarData[i]); \r
+            }\r
+        }\r
+        \r
+        mBGZF.Write(cigarData, packedCigarLen);\r
+        free(cigarData);\r
+        \r
+    } else { \r
+        mBGZF.Write(packedCigar.data(), packedCigarLen);\r
+    }\r
 \r
     // write the encoded query sequence\r
     mBGZF.Write(encodedQuery.data(), encodedQueryLen);\r
@@ -280,5 +326,57 @@ void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) {
     mBGZF.Write(pBaseQualities, queryLen);\r
 \r
     // write the read group tag\r
-    mBGZF.Write(al.TagData.data(), tagDataLength);\r
+    if ( IsBigEndian ) {\r
+      \r
+        char* tagData = (char*)calloc(sizeof(char), tagDataLength);\r
+        memcpy(tagData, al.TagData.data(), tagDataLength);\r
+      \r
+        int i = 0;\r
+        while ( i < tagDataLength ) {\r
+            \r
+            i += 2;                                 // skip tag type (e.g. "RG", "NM", etc)\r
+            uint8_t type = toupper(tagData[i]);     // lower & upper case letters have same meaning \r
+            ++i;                                    // skip value type\r
+            \r
+            switch (type) {\r
+              \r
+                case('A') :\r
+                case('C') : \r
+                    ++i;\r
+                    break;\r
+                            \r
+                case('S') : \r
+                    SwapEndian_16p(&tagData[i]); \r
+                    i+=2; // sizeof(uint16_t)\r
+                    break;\r
+                            \r
+                case('F') :\r
+                case('I') : \r
+                    SwapEndian_32p(&tagData[i]);\r
+                    i+=4; // sizeof(uint32_t)\r
+                    break;\r
+                            \r
+                case('D') : \r
+                    SwapEndian_64p(&tagData[i]);\r
+                    i+=8; // sizeof(uint64_t)\r
+                    break;\r
+                            \r
+                case('H') :\r
+                case('Z') : \r
+                    while (tagData[i]) { ++i; }\r
+                    ++i; // increment one more for null terminator\r
+                    break;\r
+                            \r
+                default : \r
+                    printf("ERROR: Invalid tag value type\n"); // shouldn't get here\r
+                    free(tagData);\r
+                    exit(1); \r
+            }\r
+        }\r
+        \r
+        mBGZF.Write(tagData, tagDataLength);\r
+        free(tagData);\r
+    } else {\r
+        mBGZF.Write(al.TagData.data(), tagDataLength);\r
+    }\r
 }\r