]> git.donarmstrong.com Git - bamtools.git/commitdiff
Update to BamTools API 0.9.2 with exposure of SamHeader object from BamReader and...
authorderek <derekwbarnett@gmail.com>
Wed, 12 Jan 2011 03:37:31 +0000 (22:37 -0500)
committerderek <derekwbarnett@gmail.com>
Wed, 12 Jan 2011 03:37:31 +0000 (22:37 -0500)
src/api/BamReader.cpp
src/api/BamReader.h
src/api/BamWriter.cpp
src/api/BamWriter.h
src/api/CMakeLists.txt
src/api/internal/BamHeader_p.cpp [new file with mode: 0644]
src/api/internal/BamHeader_p.h [new file with mode: 0644]
src/api/internal/BamReader_p.cpp
src/api/internal/BamReader_p.h
src/api/internal/BamWriter_p.cpp
src/api/internal/BamWriter_p.h

index c357b9dfceef5858eded4222b9ec022710f449e0..473dc8941161539480bfdd6e27c8e1602717455d 100644 (file)
@@ -3,7 +3,7 @@
 // Marth Lab, Department of Biology, Boston College
 // All rights reserved.
 // ---------------------------------------------------------------------------
-// Last modified: 22 November 2010 (DB)
+// Last modified: 11 January 2011 (DB)
 // ---------------------------------------------------------------------------
 // Provides the basic functionality for reading BAM files
 // ***************************************************************************
@@ -55,6 +55,7 @@ bool BamReader::GetNextAlignment(BamAlignment& bAlignment) { return d->GetNextAl
 bool BamReader::GetNextAlignmentCore(BamAlignment& bAlignment) { return d->GetNextAlignmentCore(bAlignment); }
 
 // access auxiliary data
+SamHeader BamReader::GetHeader(void) const { return d->GetSamHeader(); }
 const string BamReader::GetHeaderText(void) const { return d->GetHeaderText(); }
 int BamReader::GetReferenceCount(void) const { return d->References.size(); }
 const RefVector& BamReader::GetReferenceData(void) const { return d->References; }
index c3452f6cb82b34c95596e68f9d673d41af051e9b..d68ab6c86d82f15417999310f44e6e809c80fe35 100644 (file)
@@ -3,7 +3,7 @@
 // Marth Lab, Department of Biology, Boston College\r
 // All rights reserved.\r
 // ---------------------------------------------------------------------------\r
-// Last modified: 19 November 2010 (DB)\r
+// Last modified: 11 January 2011 (DB)\r
 // ---------------------------------------------------------------------------\r
 // Provides the basic functionality for reading BAM files\r
 // ***************************************************************************\r
@@ -14,6 +14,7 @@
 #include <api/api_global.h>\r
 #include <api/BamAlignment.h>\r
 #include <api/BamIndex.h>\r
+#include <api/SamHeader.h>\r
 #include <string>\r
 \r
 namespace BamTools {\r
@@ -74,6 +75,8 @@ class API_EXPORT BamReader {
         // access auxiliary data\r
         // ----------------------\r
 \r
+        // returns SamHeader object - see SamHeader.h for more info\r
+        SamHeader GetHeader(void) const;\r
         // returns SAM header text\r
         const std::string GetHeaderText(void) const;\r
         // returns number of reference sequences\r
index e92d0714443744e7855d12a432ba582cf3b174b8..386755d09256aebeb0e1298a7bb222ef4a103253 100644 (file)
@@ -3,7 +3,7 @@
 // Marth Lab, Department of Biology, Boston College\r
 // All rights reserved.\r
 // ---------------------------------------------------------------------------\r
-// Last modified: 22 November 2010 (DB)\r
+// Last modified: 11 January 2011 (DB)\r
 // ---------------------------------------------------------------------------\r
 // Provides the basic functionality for producing BAM files\r
 // ***************************************************************************\r
@@ -32,7 +32,7 @@ void BamWriter::Close(void) {
     d->Close();\r
 }\r
 \r
-// opens the alignment archive\r
+// opens the alignment archive (using std::string SAM header)\r
 bool BamWriter::Open(const string& filename,\r
                      const string& samHeader,\r
                      const RefVector& referenceSequences,\r
@@ -41,6 +41,15 @@ bool BamWriter::Open(const string& filename,
     return d->Open(filename, samHeader, referenceSequences, isWriteUncompressed);\r
 }\r
 \r
+// opens the alignment archive (using SamHeader object)\r
+bool BamWriter::Open(const string& filename,\r
+                     const SamHeader& samHeader,\r
+                     const RefVector& referenceSequences,\r
+                     bool isWriteUncompressed)\r
+{\r
+    return d->Open(filename, samHeader.ToString(), referenceSequences, isWriteUncompressed);\r
+}\r
+\r
 // saves the alignment to the alignment archive\r
 void BamWriter::SaveAlignment(const BamAlignment& al) {\r
     d->SaveAlignment(al);\r
index 55f86f4f29f13b0415604b7f1c785d525239c487..2d8b52810da74764b6e1844d1dced2a93f5dd938 100644 (file)
@@ -3,7 +3,7 @@
 // Marth Lab, Department of Biology, Boston College\r
 // All rights reserved.\r
 // ---------------------------------------------------------------------------\r
-// Last modified: 19 November 2010 (DB)\r
+// Last modified: 11 January 2011 (DB)\r
 // ---------------------------------------------------------------------------\r
 // Provides the basic functionality for producing BAM files\r
 // ***************************************************************************\r
@@ -13,6 +13,7 @@
 \r
 #include <api/api_global.h>\r
 #include <api/BamAlignment.h>\r
+#include <api/SamHeader.h>\r
 #include <string>\r
 \r
 namespace BamTools {\r
@@ -32,11 +33,16 @@ class API_EXPORT BamWriter {
     public:\r
         // closes the alignment archive\r
         void Close(void);\r
-        // opens the alignment archive\r
+        // opens the alignment archive (using std::string SAM header)\r
         bool Open(const std::string& filename, \r
                   const std::string& samHeader, \r
                   const BamTools::RefVector& referenceSequences, \r
                   bool writeUncompressed = false);\r
+        // opens the alignment archive (using SamHeader object)\r
+        bool Open(const std::string& filename,\r
+                  const SamHeader& samHeader,\r
+                  const BamTools::RefVector& referenceSequences,\r
+                  bool writeUncompressed = false);\r
         // saves the alignment to the alignment archive\r
         void SaveAlignment(const BamTools::BamAlignment& al);\r
 \r
index 9e41c720408cb81a520f44d9e10c1ec27eaa6f74..6aab198dc26fa6c4ca6505ff2e7558bb7ab605c1 100644 (file)
@@ -24,6 +24,7 @@ set( BamToolsAPISources
         SamReadGroupDictionary.cpp
         SamSequence.cpp
         SamSequenceDictionary.cpp
+        internal/BamHeader_p.cpp
         internal/BamMultiReader_p.cpp
         internal/BamReader_p.cpp
         internal/BamStandardIndex_p.cpp
@@ -36,7 +37,7 @@ set( BamToolsAPISources
 
 # create main BamTools API shared library
 add_library( BamTools SHARED ${BamToolsAPISources} )
-set_target_properties( BamTools PROPERTIES SOVERSION "0.9.1" )
+set_target_properties( BamTools PROPERTIES SOVERSION "0.9.2" )
 set_target_properties( BamTools PROPERTIES OUTPUT_NAME "bamtools" )
 
 # create main BamTools API static library
diff --git a/src/api/internal/BamHeader_p.cpp b/src/api/internal/BamHeader_p.cpp
new file mode 100644 (file)
index 0000000..c613b12
--- /dev/null
@@ -0,0 +1,155 @@
+// ***************************************************************************
+// BamHeader_p.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 25 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for handling BAM headers.
+// ***************************************************************************
+
+#include <api/BamAux.h>
+#include <api/BamConstants.h>
+#include <api/BGZF.h>
+#include <api/internal/BamHeader_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+using namespace std;
+
+// ---------------------------------
+// BamHeaderPrivate implementation
+
+struct BamHeader::BamHeaderPrivate {
+
+    // data members
+    SamHeader* m_samHeader;
+
+    // ctor
+    BamHeaderPrivate(void)
+        : m_samHeader(0)
+    { }
+
+    // 'public' interface
+    bool Load(BgzfData* stream);
+
+    // internal methods
+    bool CheckMagicNumber(BgzfData* stream);
+    bool ReadHeaderLength(BgzfData* stream, uint32_t& length);
+    bool ReadHeaderText(BgzfData* stream, const uint32_t& length);
+};
+
+bool BamHeader::BamHeaderPrivate::Load(BgzfData* stream) {
+
+    // cannot load if invalid stream
+    if ( stream == 0 )
+        return false;
+
+    // cannot load if magic number is invalid
+    if ( !CheckMagicNumber(stream) )
+        return false;
+
+    // cannot load header if cannot read header length
+    uint32_t length(0);
+    if ( !ReadHeaderLength(stream, length) )
+        return false;
+
+    // cannot load header if cannot read header text
+    if ( !ReadHeaderText(stream, length) )
+        return false;
+
+    // otherwise, everything OK
+    return true;
+}
+
+bool BamHeader::BamHeaderPrivate::CheckMagicNumber(BgzfData* stream) {
+
+    // try to read magic number
+    char buffer[Constants::BAM_HEADER_MAGIC_SIZE];
+    if ( stream->Read(buffer, Constants::BAM_HEADER_MAGIC_SIZE) != (int)Constants::BAM_HEADER_MAGIC_SIZE ) {
+        fprintf(stderr, "BAM header error - could not read magic number\n");
+        return false;
+    }
+
+    // validate magic number
+    if ( strncmp(buffer, Constants::BAM_HEADER_MAGIC, Constants::BAM_HEADER_MAGIC_SIZE) != 0 ) {
+        fprintf(stderr, "BAM header error - invalid magic number\n");
+        return false;
+    }
+
+    // all checks out
+    return true;
+}
+
+bool BamHeader::BamHeaderPrivate::ReadHeaderLength(BgzfData* stream, uint32_t& length) {
+
+    // attempt to read BAM header text length
+    char buffer[sizeof(uint32_t)];
+    if ( stream->Read(buffer, sizeof(uint32_t)) != sizeof(uint32_t) ) {
+        fprintf(stderr, "BAM header error - could not read header length\n");
+        return false;
+    }
+
+    // convert char buffer to length, return success
+    length = BgzfData::UnpackUnsignedInt(buffer);
+    if ( BamTools::SystemIsBigEndian() )
+        SwapEndian_32(length);
+    return true;
+}
+
+bool BamHeader::BamHeaderPrivate::ReadHeaderText(BgzfData* stream, const uint32_t& length) {
+
+    // set up destination buffer
+    char* headerText = (char*)calloc(length + 1, 1);
+
+    // attempt to read header text
+    const unsigned bytesRead = stream->Read(headerText, length);
+    const bool readOk = ( bytesRead == length );
+    if ( readOk )
+        m_samHeader = new SamHeader( (string)((const char*)headerText) );
+    else
+        fprintf(stderr, "BAM header error - could not read header text\n");
+
+    // clean up calloc-ed temp variable (on success or fail)
+    free(headerText);
+
+    // return read success
+    return readOk;
+}
+
+// --------------------------
+// BamHeader implementation
+
+BamHeader::BamHeader(void)
+    : d(new BamHeaderPrivate)
+{ }
+
+BamHeader::~BamHeader(void) {
+    delete d;
+    d = 0;
+}
+
+void BamHeader::Clear(void) {
+    delete d->m_samHeader;
+    d->m_samHeader = new SamHeader("");
+}
+
+bool BamHeader::IsValid(void) const {
+    return d->m_samHeader->IsValid();
+}
+
+bool BamHeader::Load(BgzfData* stream) {
+    return d->Load(stream);
+}
+
+SamHeader BamHeader::ToSamHeader(void) const {
+    return *(d->m_samHeader);
+}
+
+string BamHeader::ToString(void) const {
+    return d->m_samHeader->ToString();
+}
diff --git a/src/api/internal/BamHeader_p.h b/src/api/internal/BamHeader_p.h
new file mode 100644 (file)
index 0000000..352ce88
--- /dev/null
@@ -0,0 +1,56 @@
+// ***************************************************************************
+// BamHeader_p.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 25 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for handling BAM headers.
+// ***************************************************************************
+
+#ifndef BAMHEADER_P_H
+#define BAMHEADER_P_H
+
+//  -------------
+//  W A R N I N G
+//  -------------
+//
+// This file is not part of the BamTools API.  It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <api/SamHeader.h>
+#include <string>
+
+namespace BamTools {
+
+class BgzfData;
+
+namespace Internal {
+
+class BamHeader {
+
+    public:
+        BamHeader(void);
+        ~BamHeader(void);
+
+    public:
+        void Clear(void);
+        bool IsValid(void) const;
+        bool Load(BgzfData* stream);
+
+    public:
+        SamHeader ToSamHeader(void) const;
+        std::string ToString(void) const;
+
+    private:
+        struct BamHeaderPrivate;
+        BamHeaderPrivate* d;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMHEADER_P_H
index f14dcee646a28eba84d2ee9e6afcf96601b07387..7aa18f3b0a1f44176732107f52c9ee98722cdd46 100644 (file)
@@ -3,13 +3,14 @@
 // Marth Lab, Department of Biology, Boston College
 // All rights reserved.
 // ---------------------------------------------------------------------------
-// Last modified: 22 November 2010 (DB)
+// Last modified: 11 January 2011 (DB)
 // ---------------------------------------------------------------------------
 // Provides the basic functionality for reading BAM files
 // ***************************************************************************
 
 #include <api/BamReader.h>
 #include <api/BGZF.h>
+#include <api/internal/BamHeader_p.h>
 #include <api/internal/BamReader_p.h>
 #include <api/internal/BamStandardIndex_p.h>
 #include <api/internal/BamToolsIndex_p.h>
@@ -24,14 +25,13 @@ using namespace std;
 
 // constructor
 BamReaderPrivate::BamReaderPrivate(BamReader* parent)
-    : HeaderText("")
-    , Index(0)
+    : Index(0)
     , HasIndex(false)
     , AlignmentsBeginOffset(0)
-    //    , m_header(0)
     , IndexCacheMode(BamIndex::LimitedIndexCaching)
     , HasAlignmentsInRegion(true)
     , Parent(parent)
+    , m_header(new BamHeader)
     , DNA_LOOKUP("=ACMGRSVTWYHKDBN")
     , CIGAR_LOOKUP("MIDNSHP")
 {
@@ -40,7 +40,11 @@ BamReaderPrivate::BamReaderPrivate(BamReader* parent)
 
 // destructor
 BamReaderPrivate::~BamReaderPrivate(void) {
+
     Close();
+
+    delete m_header;
+    m_header = 0;
 }
 
 // adjusts requested region if necessary (depending on where data actually begins)
@@ -88,12 +92,7 @@ void BamReaderPrivate::Close(void) {
     ClearIndex();
 
     // clear out header data
-    HeaderText.clear();
-
-    //    if ( m_header ) {
-    // delete m_header;
-    // m_header = 0;
-    //    }
+    m_header->Clear();
 
     // clear out region flags
     Region.clear();
@@ -135,13 +134,11 @@ bool BamReaderPrivate::CreateIndex(bool useStandardIndex) {
 }
 
 const string BamReaderPrivate::GetHeaderText(void) const {
+    return m_header->ToString();
+}
 
-    return HeaderText;
-
-    //    if ( m_header )
-    // return m_header->Text();
-    //    else
-    // return string("");
+SamHeader BamReaderPrivate::GetSamHeader(void) const {
+    return m_header->ToSamHeader();
 }
 
 // get next alignment (from specified region, if given)
@@ -278,36 +275,7 @@ BamReaderPrivate::RegionState BamReaderPrivate::IsOverlap(BamAlignment& bAlignme
 
 // load BAM header data
 void BamReaderPrivate::LoadHeaderData(void) {
-
-    //    m_header = new BamHeader(&mBGZF);
-    //    bool headerLoadedOk = m_header->Load();
-    //    if ( !headerLoadedOk )
-    // cerr << "BamReader could not load header" << endl;
-
-    // check to see if proper BAM header
-    char buffer[4];
-    if (mBGZF.Read(buffer, 4) != 4) {
-        fprintf(stderr, "Could not read header type\n");
-        exit(1);
-    }
-
-    if (strncmp(buffer, "BAM\001", 4)) {
-        fprintf(stderr, "wrong header type!\n");
-        exit(1);
-    }
-
-    // get BAM header text length
-    mBGZF.Read(buffer, 4);
-    unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer);
-    if ( IsBigEndian ) SwapEndian_32(headerTextLength);
-
-    // get BAM header text
-    char* headerText = (char*)calloc(headerTextLength + 1, 1);
-    mBGZF.Read(headerText, headerTextLength);
-    HeaderText = (string)((const char*)headerText);
-
-    // clean up calloc-ed temp variable
-    free(headerText);
+     m_header->Load(&mBGZF);
 }
 
 // load existing index data from BAM index file (".bti" OR ".bai"), return success/fail
index ed3940a504ba50e2678b2d5e5b9214b42b0c66a4..79bcddb3698023a67e1db28e461e547c3813cf21 100644 (file)
@@ -3,7 +3,7 @@
 // Marth Lab, Department of Biology, Boston College
 // All rights reserved.
 // ---------------------------------------------------------------------------
-// Last modified: 19 November 2010 (DB)
+// Last modified: 11 January 2011 (DB)
 // ---------------------------------------------------------------------------
 // Provides the basic functionality for reading BAM files
 // ***************************************************************************
 #include <api/BamAlignment.h>
 #include <api/BamIndex.h>
 #include <api/BGZF.h>
+#include <api/SamHeader.h>
 #include <string>
 
 namespace BamTools {
 
 class BamReader;
+class SamHeader;
 
 namespace Internal {
 
+class BamHeader;
+
 class BamReaderPrivate {
 
     // enums
@@ -63,6 +67,7 @@ class BamReaderPrivate {
 
         // access auxiliary data
         const std::string GetHeaderText(void) const;
+        SamHeader GetSamHeader(void) const;
         int GetReferenceID(const std::string& refName) const;
 
         // index operations
@@ -101,7 +106,6 @@ class BamReaderPrivate {
 
         // general file data
         BgzfData  mBGZF;
-        std::string HeaderText;
         BamIndex* Index;
         RefVector References;
         bool      HasIndex;
@@ -109,7 +113,6 @@ class BamReaderPrivate {
         std::string    Filename;
         std::string    IndexFilename;
 
-        //     Internal::BamHeader* m_header;
 
         // index caching mode
         BamIndex::BamIndexCacheMode IndexCacheMode;
@@ -123,6 +126,7 @@ class BamReaderPrivate {
 
         // parent BamReader
         BamReader* Parent;
+        BamHeader* m_header;
 
         // BAM character constants
         const char* DNA_LOOKUP;
index a75eb44a3b6b7783445a97abbd9f078bd808d039..90959b6a7882dae23878edd234b3f51f61fa09c5 100644 (file)
@@ -3,7 +3,7 @@
 // Marth Lab, Department of Biology, Boston College
 // All rights reserved.
 // ---------------------------------------------------------------------------
-// Last modified: 22 November 2010 (DB)
+// Last modified: 11 January 2011 (DB)
 // ---------------------------------------------------------------------------
 // Provides the basic functionality for producing BAM files
 // ***************************************************************************
@@ -14,19 +14,16 @@ using namespace BamTools;
 using namespace BamTools::Internal;
 using namespace std;
 
+// ctor
 BamWriterPrivate::BamWriterPrivate(void) {
     IsBigEndian = SystemIsBigEndian();
 }
 
+// dtor
 BamWriterPrivate::~BamWriterPrivate(void) {
     mBGZF.Close();
 }
 
-// closes the alignment archive
-void BamWriterPrivate::Close(void) {
-    mBGZF.Close();
-}
-
 // calculates minimum bin for a BAM alignment interval
 const unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const {
     --end;
@@ -38,6 +35,11 @@ const unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int en
     return 0;
 }
 
+// closes the alignment archive
+void BamWriterPrivate::Close(void) {
+    mBGZF.Close();
+}
+
 // creates a cigar string from the supplied alignment
 void BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
 
@@ -52,35 +54,21 @@ void BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations,
     vector<CigarOp>::const_iterator coIter;
     for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) {
 
-       switch(coIter->Type) {
-           case 'M':
-                 cigarOp = BAM_CMATCH;
-                 break;
-           case 'I':
-                 cigarOp = BAM_CINS;
-                 break;
-           case 'D':
-                 cigarOp = BAM_CDEL;
-                 break;
-           case 'N':
-                 cigarOp = BAM_CREF_SKIP;
-                 break;
-           case 'S':
-                 cigarOp = BAM_CSOFT_CLIP;
-                 break;
-           case 'H':
-                 cigarOp = BAM_CHARD_CLIP;
-                 break;
-           case 'P':
-                 cigarOp = BAM_CPAD;
-                 break;
-           default:
-                 fprintf(stderr, "ERROR: Unknown cigar operation found: %c\n", coIter->Type);
-                 exit(1);
-       }
-
-       *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;
-       pPackedCigar++;
+        switch(coIter->Type) {
+            case 'M': cigarOp = BAM_CMATCH; break;
+            case 'I': cigarOp = BAM_CINS; break;
+            case 'D': cigarOp = BAM_CDEL; break;
+            case 'N': cigarOp = BAM_CREF_SKIP; break;
+            case 'S': cigarOp = BAM_CSOFT_CLIP; break;
+            case 'H': cigarOp = BAM_CHARD_CLIP; break;
+            case 'P': cigarOp = BAM_CPAD; break;
+            default:
+              fprintf(stderr, "ERROR: Unknown cigar operation found: %c\n", coIter->Type);
+              exit(1);
+        }
+
+        *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;
+        pPackedCigar++;
     }
 }
 
@@ -99,49 +87,30 @@ void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQ
 
     while(*pQuery) {
 
-       switch(*pQuery) {
-
-           case '=':
-               nucleotideCode = 0;
-               break;
-
-           case 'A':
-               nucleotideCode = 1;
-               break;
-
-           case 'C':
-               nucleotideCode = 2;
-               break;
-
-           case 'G':
-               nucleotideCode = 4;
-               break;
-
-           case 'T':
-               nucleotideCode = 8;
-               break;
-
-           case 'N':
-               nucleotideCode = 15;
-               break;
-
-           default:
-               fprintf(stderr, "ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
-               exit(1);
-       }
-
-       // pack the nucleotide code
-       if(useHighWord) {
-           *pEncodedQuery = nucleotideCode << 4;
-           useHighWord = false;
-       } else {
-           *pEncodedQuery |= nucleotideCode;
-           pEncodedQuery++;
-           useHighWord = true;
-       }
-
-       // increment the query position
-       pQuery++;
+        switch(*pQuery) {
+            case '=': nucleotideCode = 0; break;
+            case 'A': nucleotideCode = 1; break;
+            case 'C': nucleotideCode = 2; break;
+            case 'G': nucleotideCode = 4; break;
+            case 'T': nucleotideCode = 8; break;
+            case 'N': nucleotideCode = 15; break;
+            default:
+                fprintf(stderr, "ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
+                exit(1);
+        }
+
+        // pack the nucleotide code
+        if(useHighWord) {
+            *pEncodedQuery = nucleotideCode << 4;
+            useHighWord = false;
+        } else {
+            *pEncodedQuery |= nucleotideCode;
+            pEncodedQuery++;
+            useHighWord = true;
+        }
+
+        // increment the query position
+        pQuery++;
     }
 }
 
@@ -153,7 +122,7 @@ bool BamWriterPrivate::Open(const string& filename,
 {
     // open the BGZF file for writing, return failure if error
     if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) )
-       return false;
+        return false;
 
     // ================
     // write the header
@@ -182,21 +151,22 @@ bool BamWriterPrivate::Open(const string& filename,
     // write the sequence dictionary
     // =============================
 
-    RefVector::const_iterator rsIter;
-    for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {
+    RefVector::const_iterator rsIter = referenceSequences.begin();
+    RefVector::const_iterator rsEnd  = referenceSequences.end();
+    for( ; rsIter != rsEnd; ++rsIter ) {
 
-       // write the reference sequence name length
-       uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;
-       if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen);
-       mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);
+        // write the reference sequence name length
+        uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1;
+        if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen);
+        mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);
 
-       // write the reference sequence name
-       mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);
+        // write the reference sequence name
+        mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);
 
-       // write the reference sequence length
-       int32_t referenceLength = rsIter->RefLength;
-       if (IsBigEndian) SwapEndian_32(referenceLength);
-       mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);
+        // write the reference sequence length
+        int32_t referenceLength = rsIter->RefLength;
+        if (IsBigEndian) SwapEndian_32(referenceLength);
+        mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT);
     }
 
     // return success
@@ -210,170 +180,172 @@ void BamWriterPrivate::SaveAlignment(const BamAlignment& al) {
     // (as a result of BamReader::GetNextAlignmentCore())
     if ( al.SupportData.HasCoreOnly ) {
 
-       // write the block size
-       unsigned int blockSize = al.SupportData.BlockLength;
-       if (IsBigEndian) SwapEndian_32(blockSize);
-       mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
-
-       // assign the BAM core data
-       uint32_t buffer[8];
-       buffer[0] = al.RefID;
-       buffer[1] = al.Position;
-       buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;
-       buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;
-       buffer[4] = al.SupportData.QuerySequenceLength;
-       buffer[5] = al.MateRefID;
-       buffer[6] = al.MatePosition;
-       buffer[7] = al.InsertSize;
-
-       // swap BAM core endian-ness, if necessary
-       if ( IsBigEndian ) {
-           for ( int i = 0; i < 8; ++i )
-               SwapEndian_32(buffer[i]);
-       }
-
-       // write the BAM core
-       mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
-
-       // write the raw char data
-       mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE);
+        // write the block size
+        unsigned int blockSize = al.SupportData.BlockLength;
+        if (IsBigEndian) SwapEndian_32(blockSize);
+        mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
+
+        // assign the BAM core data
+        uint32_t buffer[8];
+        buffer[0] = al.RefID;
+        buffer[1] = al.Position;
+        buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;
+        buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;
+        buffer[4] = al.SupportData.QuerySequenceLength;
+        buffer[5] = al.MateRefID;
+        buffer[6] = al.MatePosition;
+        buffer[7] = al.InsertSize;
+
+        // swap BAM core endian-ness, if necessary
+        if ( IsBigEndian ) {
+            for ( int i = 0; i < 8; ++i )
+            SwapEndian_32(buffer[i]);
+        }
+
+        // write the BAM core
+        mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
+
+        // write the raw char data
+        mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE);
     }
 
     // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc
     // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code )
     else {
 
-       // calculate char lengths
-       const unsigned int nameLength         = al.Name.size() + 1;
-       const unsigned int numCigarOperations = al.CigarData.size();
-       const unsigned int queryLength        = al.QueryBases.size();
-       const unsigned int tagDataLength      = al.TagData.size();
-
-       // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)
-       // force calculation of Bin before storing
-       const int endPosition = al.GetEndPosition();
-       const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);
-
-       // create our packed cigar string
-       string packedCigar;
-       CreatePackedCigar(al.CigarData, packedCigar);
-       const unsigned int packedCigarLength = packedCigar.size();
-
-       // encode the query
-       string encodedQuery;
-       EncodeQuerySequence(al.QueryBases, encodedQuery);
-       const unsigned int encodedQueryLength = encodedQuery.size();
-
-       // write the block size
-       const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength;
-       unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
-       if (IsBigEndian) SwapEndian_32(blockSize);
-       mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
-
-       // assign the BAM core data
-       uint32_t buffer[8];
-       buffer[0] = al.RefID;
-       buffer[1] = al.Position;
-       buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;
-       buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
-       buffer[4] = queryLength;
-       buffer[5] = al.MateRefID;
-       buffer[6] = al.MatePosition;
-       buffer[7] = al.InsertSize;
-
-       // swap BAM core endian-ness, if necessary
-       if ( IsBigEndian ) {
-           for ( int i = 0; i < 8; ++i )
-               SwapEndian_32(buffer[i]);
-       }
-
-       // write the BAM core
-       mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
-
-       // write the query name
-       mBGZF.Write(al.Name.c_str(), nameLength);
-
-       // write the packed cigar
-       if ( IsBigEndian ) {
-
-           char* cigarData = (char*)calloc(sizeof(char), packedCigarLength);
-           memcpy(cigarData, packedCigar.data(), packedCigarLength);
-
-           for (unsigned int i = 0; i < packedCigarLength; ++i) {
-               if ( IsBigEndian )
-                 SwapEndian_32p(&cigarData[i]);
-           }
-
-           mBGZF.Write(cigarData, packedCigarLength);
-           free(cigarData);
-       }
-       else
-           mBGZF.Write(packedCigar.data(), packedCigarLength);
-
-       // write the encoded query sequence
-       mBGZF.Write(encodedQuery.data(), encodedQueryLength);
-
-       // write the base qualities
-       string baseQualities(al.Qualities);
-       char* pBaseQualities = (char*)al.Qualities.data();
-       for(unsigned int i = 0; i < queryLength; i++) {
-           pBaseQualities[i] -= 33;
-       }
-       mBGZF.Write(pBaseQualities, queryLength);
-
-       // write the read group tag
-       if ( IsBigEndian ) {
-
-           char* tagData = (char*)calloc(sizeof(char), tagDataLength);
-           memcpy(tagData, al.TagData.data(), tagDataLength);
-
-           int i = 0;
-           while ( (unsigned int)i < tagDataLength ) {
-
-               i += 2;                                 // skip tag type (e.g. "RG", "NM", etc)
-               uint8_t type = toupper(tagData[i]);     // lower & upper case letters have same meaning
-               ++i;                                    // skip value type
-
-               switch (type) {
-
-                   case('A') :
-                   case('C') :
-                       ++i;
-                       break;
-
-                   case('S') :
-                       SwapEndian_16p(&tagData[i]);
-                       i+=2; // sizeof(uint16_t)
-                       break;
-
-                   case('F') :
-                   case('I') :
-                       SwapEndian_32p(&tagData[i]);
-                       i+=4; // sizeof(uint32_t)
-                       break;
-
-                   case('D') :
-                       SwapEndian_64p(&tagData[i]);
-                       i+=8; // sizeof(uint64_t)
-                       break;
-
-                   case('H') :
-                   case('Z') :
-                       while (tagData[i]) { ++i; }
-                       ++i; // increment one more for null terminator
-                       break;
-
-                   default :
-                       fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here
-                       free(tagData);
-                       exit(1);
-               }
-           }
-
-           mBGZF.Write(tagData, tagDataLength);
-           free(tagData);
-       }
-       else
-           mBGZF.Write(al.TagData.data(), tagDataLength);
+        // calculate char lengths
+        const unsigned int nameLength         = al.Name.size() + 1;
+        const unsigned int numCigarOperations = al.CigarData.size();
+        const unsigned int queryLength        = al.QueryBases.size();
+        const unsigned int tagDataLength      = al.TagData.size();
+
+        // no way to tell if BamAlignment.Bin is already defined (no default, invalid value)
+        // force calculation of Bin before storing
+        const int endPosition = al.GetEndPosition();
+        const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition);
+
+        // create our packed cigar string
+        string packedCigar;
+        CreatePackedCigar(al.CigarData, packedCigar);
+        const unsigned int packedCigarLength = packedCigar.size();
+
+        // encode the query
+        string encodedQuery;
+        EncodeQuerySequence(al.QueryBases, encodedQuery);
+        const unsigned int encodedQueryLength = encodedQuery.size();
+
+        // write the block size
+        const unsigned int dataBlockSize = nameLength +
+                                           packedCigarLength +
+                                           encodedQueryLength +
+                                           queryLength +
+                                           tagDataLength;
+        unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
+        if (IsBigEndian) SwapEndian_32(blockSize);
+        mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
+
+        // assign the BAM core data
+        uint32_t buffer[8];
+        buffer[0] = al.RefID;
+        buffer[1] = al.Position;
+        buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;
+        buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
+        buffer[4] = queryLength;
+        buffer[5] = al.MateRefID;
+        buffer[6] = al.MatePosition;
+        buffer[7] = al.InsertSize;
+
+        // swap BAM core endian-ness, if necessary
+        if ( IsBigEndian ) {
+            for ( int i = 0; i < 8; ++i )
+            SwapEndian_32(buffer[i]);
+        }
+
+        // write the BAM core
+        mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
+
+        // write the query name
+        mBGZF.Write(al.Name.c_str(), nameLength);
+
+        // write the packed cigar
+        if ( IsBigEndian ) {
+
+            char* cigarData = (char*)calloc(sizeof(char), packedCigarLength);
+            memcpy(cigarData, packedCigar.data(), packedCigarLength);
+
+            for (unsigned int i = 0; i < packedCigarLength; ++i) {
+                if ( IsBigEndian )
+                    SwapEndian_32p(&cigarData[i]);
+            }
+
+            mBGZF.Write(cigarData, packedCigarLength);
+            free(cigarData);
+        }
+        else
+            mBGZF.Write(packedCigar.data(), packedCigarLength);
+
+        // write the encoded query sequence
+        mBGZF.Write(encodedQuery.data(), encodedQueryLength);
+
+        // write the base qualities
+        char* pBaseQualities = (char*)al.Qualities.data();
+        for(unsigned int i = 0; i < queryLength; i++)
+            pBaseQualities[i] -= 33; // FASTQ conversion
+        mBGZF.Write(pBaseQualities, queryLength);
+
+        // write the read group tag
+        if ( IsBigEndian ) {
+
+            char* tagData = (char*)calloc(sizeof(char), tagDataLength);
+            memcpy(tagData, al.TagData.data(), tagDataLength);
+
+            int i = 0;
+            while ( (unsigned int)i < tagDataLength ) {
+
+                i += 2;                                 // skip tag type (e.g. "RG", "NM", etc)
+                uint8_t type = toupper(tagData[i]);     // lower & upper case letters have same meaning
+                ++i;                                    // skip value type
+
+                switch (type) {
+
+                    case('A') :
+                    case('C') :
+                        ++i;
+                        break;
+
+                    case('S') :
+                        SwapEndian_16p(&tagData[i]);
+                        i+=2; // sizeof(uint16_t)
+                        break;
+
+                    case('F') :
+                    case('I') :
+                        SwapEndian_32p(&tagData[i]);
+                        i+=4; // sizeof(uint32_t)
+                        break;
+
+                    case('D') :
+                        SwapEndian_64p(&tagData[i]);
+                        i+=8; // sizeof(uint64_t)
+                        break;
+
+                    case('H') :
+                    case('Z') :
+                        while (tagData[i]) { ++i; }
+                        ++i; // increment one more for null terminator
+                        break;
+
+                    default :
+                        fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here
+                        free(tagData);
+                        exit(1);
+                }
+            }
+
+            mBGZF.Write(tagData, tagDataLength);
+            free(tagData);
+        }
+        else
+            mBGZF.Write(al.TagData.data(), tagDataLength);
     }
 }
index 4fe626a892b5bbdfb4ffbebdf3dd9ce9f4bff275..6eecd73b9d147dc89f9ae201b4792a9bae24d11e 100644 (file)
@@ -27,6 +27,9 @@
 #include <vector>
 
 namespace BamTools {
+
+class SamHeader;
+
 namespace Internal {
 
 class BamWriterPrivate {
@@ -43,6 +46,10 @@ class BamWriterPrivate {
                  const std::string& samHeader,
                  const BamTools::RefVector& referenceSequences,
                  bool isWriteUncompressed);
+    bool Open(const std::string& filename,
+              const std::string& samHeader,
+              const BamTools::RefVector& referenceSequences,
+              bool isWriteUncompressed);
        void SaveAlignment(const BamAlignment& al);
 
     // internal methods