]> git.donarmstrong.com Git - bamtools.git/commitdiff
Added SAM header-handling classes for read/write/validate.
authorderek <derekwbarnett@gmail.com>
Fri, 24 Dec 2010 03:33:33 +0000 (22:33 -0500)
committerderek <derekwbarnett@gmail.com>
Fri, 24 Dec 2010 03:33:33 +0000 (22:33 -0500)
  * Not fully connected to the BamReader/Writer API yet, but will be
phased in soon.
  * Will enable clients to query, modify & validate a BAM file's SAM
header data using the BamTools API, instead of having to use hand-rolled
string-parsing code on the result of BamReader::GetHeaderText().

19 files changed:
src/api/CMakeLists.txt
src/api/SamConstants.h [new file with mode: 0644]
src/api/SamHeader.cpp [new file with mode: 0644]
src/api/SamHeader.h [new file with mode: 0644]
src/api/SamReadGroup.cpp [new file with mode: 0644]
src/api/SamReadGroup.h [new file with mode: 0644]
src/api/SamReadGroupDictionary.cpp [new file with mode: 0644]
src/api/SamReadGroupDictionary.h [new file with mode: 0644]
src/api/SamSequence.cpp [new file with mode: 0644]
src/api/SamSequence.h [new file with mode: 0644]
src/api/SamSequenceDictionary.cpp [new file with mode: 0644]
src/api/SamSequenceDictionary.h [new file with mode: 0644]
src/api/internal/SamFormatParser_p.cpp [new file with mode: 0644]
src/api/internal/SamFormatParser_p.h [new file with mode: 0644]
src/api/internal/SamFormatPrinter_p.cpp [new file with mode: 0644]
src/api/internal/SamFormatPrinter_p.h [new file with mode: 0644]
src/api/internal/SamHeaderValidator_p.cpp [new file with mode: 0644]
src/api/internal/SamHeaderValidator_p.h [new file with mode: 0644]
src/api/internal/SamHeaderVersion_p.h [new file with mode: 0644]

index 951ba87e91699737bc542fcf69ef069254641225..9e41c720408cb81a520f44d9e10c1ec27eaa6f74 100644 (file)
@@ -11,39 +11,36 @@ include_directories( ${BamTools_SOURCE_DIR}/src )
 # add compiler definitions 
 add_definitions( -DBAMTOOLS_API_LIBRARY ) # (for proper exporting of library symbols)
 
+# list of all BamTools API source (.cpp) files
+set( BamToolsAPISources
+        BamAlignment.cpp
+        BamIndex.cpp
+        BamMultiReader.cpp
+        BamReader.cpp
+        BamWriter.cpp
+        BGZF.cpp
+        SamHeader.cpp
+        SamReadGroup.cpp
+        SamReadGroupDictionary.cpp
+        SamSequence.cpp
+        SamSequenceDictionary.cpp
+        internal/BamMultiReader_p.cpp
+        internal/BamReader_p.cpp
+        internal/BamStandardIndex_p.cpp
+        internal/BamToolsIndex_p.cpp
+        internal/BamWriter_p.cpp
+        internal/SamFormatParser_p.cpp
+        internal/SamFormatPrinter_p.cpp
+        internal/SamHeaderValidator_p.cpp
+)
+
 # create main BamTools API shared library
-add_library( BamTools SHARED
-             BamAlignment.cpp
-             BamIndex.cpp
-             BamMultiReader.cpp
-             BamReader.cpp
-             BamWriter.cpp
-             BGZF.cpp
-             internal/BamMultiReader_p.cpp
-             internal/BamReader_p.cpp
-             internal/BamStandardIndex_p.cpp
-             internal/BamToolsIndex_p.cpp
-             internal/BamWriter_p.cpp
-           )
-# set shared lib properties
+add_library( BamTools SHARED ${BamToolsAPISources} )
 set_target_properties( BamTools PROPERTIES SOVERSION "0.9.1" )
 set_target_properties( BamTools PROPERTIES OUTPUT_NAME "bamtools" )
 
 # create main BamTools API static library
-add_library( BamTools-static STATIC
-             BamAlignment.cpp
-             BamIndex.cpp
-             BamMultiReader.cpp
-             BamReader.cpp
-             BamWriter.cpp
-             BGZF.cpp
-             internal/BamMultiReader_p.cpp
-             internal/BamReader_p.cpp
-             internal/BamStandardIndex_p.cpp
-             internal/BamToolsIndex_p.cpp
-             internal/BamWriter_p.cpp
-           )
-# set static lib properties
+add_library( BamTools-static STATIC ${BamToolsAPISources} )
 set_target_properties( BamTools-static PROPERTIES OUTPUT_NAME "bamtools" )
 set_target_properties( BamTools-static PROPERTIES PREFIX "lib" )
 
@@ -58,11 +55,17 @@ install( TARGETS BamTools-static ARCHIVE DESTINATION "lib/bamtools")
 # export API headers
 include(../ExportHeader.cmake)
 set(ApiIncludeDir "api")
-ExportHeader(APIHeaders api_global.h     ${ApiIncludeDir})
-ExportHeader(APIHeaders BamAlignment.h   ${ApiIncludeDir})
-ExportHeader(APIHeaders BamAux.h         ${ApiIncludeDir})
-ExportHeader(APIHeaders BamIndex.h       ${ApiIncludeDir})
-ExportHeader(APIHeaders BamMultiReader.h ${ApiIncludeDir})
-ExportHeader(APIHeaders BamReader.h      ${ApiIncludeDir})
-ExportHeader(APIHeaders BamWriter.h      ${ApiIncludeDir})
-ExportHeader(APIHeaders BGZF.h           ${ApiIncludeDir})
+ExportHeader(APIHeaders api_global.h             ${ApiIncludeDir})
+ExportHeader(APIHeaders BamAlignment.h           ${ApiIncludeDir})
+ExportHeader(APIHeaders BamAux.h                 ${ApiIncludeDir})
+ExportHeader(APIHeaders BamIndex.h               ${ApiIncludeDir})
+ExportHeader(APIHeaders BamMultiReader.h         ${ApiIncludeDir})
+ExportHeader(APIHeaders BamReader.h              ${ApiIncludeDir})
+ExportHeader(APIHeaders BamWriter.h              ${ApiIncludeDir})
+ExportHeader(APIHeaders BGZF.h                   ${ApiIncludeDir})
+ExportHeader(APIHeaders SamConstants.h           ${ApiIncludeDir})
+ExportHeader(APIHeaders SamHeader.h              ${ApiIncludeDir})
+ExportHeader(APIHeaders SamReadGroup.h           ${ApiIncludeDir})
+ExportHeader(APIHeaders SamReadGroupDictionary.h ${ApiIncludeDir})
+ExportHeader(APIHeaders SamSequence.h            ${ApiIncludeDir})
+ExportHeader(APIHeaders SamSequenceDictionary.h  ${ApiIncludeDir})
diff --git a/src/api/SamConstants.h b/src/api/SamConstants.h
new file mode 100644 (file)
index 0000000..6412b3d
--- /dev/null
@@ -0,0 +1,104 @@
+// ***************************************************************************
+// SamConstants.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides constants for SAM header
+// ***************************************************************************
+
+#ifndef SAM_CONSTANTS_H
+#define SAM_CONSTANTS_H
+
+#include <api/api_global.h>
+#include <string>
+
+namespace BamTools {
+namespace Constants {
+
+const char SAM_COLON  = ':';
+const char SAM_EQUAL  = '=';
+const char SAM_PERIOD = '.';
+const char SAM_STAR   = '*';
+const char SAM_TAB    = '\t';
+const std::string SAM_DIGITS = "0123456789";
+
+// HD entries
+const std::string SAM_HD_BEGIN_TOKEN    = "@HD";
+const std::string SAM_HD_VERSION_TAG    = "VN";
+const std::string SAM_HD_SORTORDER_TAG  = "SO";
+const std::string SAM_HD_GROUPORDER_TAG = "GO";
+
+// SQ entries
+const std::string SAM_SQ_BEGIN_TOKEN    = "@SQ";
+const std::string SAM_SQ_NAME_TAG       = "SN";
+const std::string SAM_SQ_LENGTH_TAG     = "LN";
+const std::string SAM_SQ_ASSEMBLYID_TAG = "AS";
+const std::string SAM_SQ_URI_TAG        = "UR";
+const std::string SAM_SQ_CHECKSUM_TAG   = "M5";
+const std::string SAM_SQ_SPECIES_TAG    = "SP";
+
+// RG entries
+const std::string SAM_RG_BEGIN_TOKEN             = "@RG";
+const std::string SAM_RG_ID_TAG                  = "ID";
+const std::string SAM_RG_SAMPLE_TAG              = "SM";
+const std::string SAM_RG_LIBRARY_TAG             = "LB";
+const std::string SAM_RG_DESCRIPTION_TAG         = "DS";
+const std::string SAM_RG_PLATFORMUNIT_TAG        = "PU";
+const std::string SAM_RG_PREDICTEDINSERTSIZE_TAG = "PI";
+const std::string SAM_RG_SEQCENTER_TAG           = "CN";
+const std::string SAM_RG_PRODUCTIONDATE_TAG      = "DT";
+const std::string SAM_RG_SEQTECHNOLOGY_TAG       = "PL";
+
+// PG entries
+const std::string SAM_PG_BEGIN_TOKEN     = "@PG";
+const std::string SAM_PG_NAME_TAG        = "ID";
+const std::string SAM_PG_VERSION_TAG     = "VN";
+const std::string SAM_PG_COMMANDLINE_TAG = "CL";
+
+// CO entries
+const std::string SAM_CO_BEGIN_TOKEN = "@CO";
+
+// HD:SO values
+const std::string SAM_HD_SORTORDER_COORDINATE = "coordinate";
+const std::string SAM_HD_SORTORDER_QUERYNAME  = "queryname";
+const std::string SAM_HD_SORTORDER_UNSORTED   = "unsorted";
+
+// HD:GO values
+const std::string SAM_HD_GROUPORDER_NONE      = "none";
+const std::string SAM_HD_GROUPORDER_QUERY     = "query";
+const std::string SAM_HD_GROUPORDER_REFERENCE = "reference";
+
+// SQ:LN values
+const unsigned int SAM_SQ_LENGTH_MIN = 1;
+const unsigned int SAM_SQ_LENGTH_MAX = 536870911; // 2^29 - 1
+
+// --------------
+// RG:PL values
+
+// 454
+const std::string SAM_RG_SEQTECHNOLOGY_454         = "454";
+const std::string SAM_RG_SEQTECHNOLOGY_LS454_LOWER = "ls454";
+const std::string SAM_RG_SEQTECHNOLOGY_LS454_UPPER = "LS454";
+
+// Helicos
+const std::string SAM_RG_SEQTECHNOLOGY_HELICOS_LOWER = "helicos";
+const std::string SAM_RG_SEQTECHNOLOGY_HELICOS_UPPER = "HELICOS";
+
+// Illumina
+const std::string SAM_RG_SEQTECHNOLOGY_ILLUMINA_LOWER = "illumina";
+const std::string SAM_RG_SEQTECHNOLOGY_ILLUMINA_UPPER = "ILLUMINA";
+
+// PacBio
+const std::string SAM_RG_SEQTECHNOLOGY_PACBIO_LOWER = "pacbio";
+const std::string SAM_RG_SEQTECHNOLOGY_PACBIO_UPPER = "PACBIO";
+
+// SOLiD
+const std::string SAM_RG_SEQTECHNOLOGY_SOLID_LOWER = "solid";
+const std::string SAM_RG_SEQTECHNOLOGY_SOLID_UPPER = "SOLID";
+
+} // namespace Constants
+} // namespace BamTools
+
+#endif // SAM_CONSTANTS_H
diff --git a/src/api/SamHeader.cpp b/src/api/SamHeader.cpp
new file mode 100644 (file)
index 0000000..405033f
--- /dev/null
@@ -0,0 +1,102 @@
+// ***************************************************************************
+// SamHeader.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for querying/manipulating SAM header data
+// **************************************************************************
+
+#include <api/SamHeader.h>
+#include <api/internal/SamFormatParser_p.h>
+#include <api/internal/SamFormatPrinter_p.h>
+#include <api/internal/SamHeaderValidator_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+using namespace std;
+
+// ctor: starts with every string field empty, then hands this object to a
+// SamFormatParser which is asked to parse the supplied header text
+SamHeader::SamHeader(const string& headerText)
+    : Version()
+    , SortOrder()
+    , GroupOrder()
+    , ProgramName()
+    , ProgramVersion()
+    , ProgramCommandLine()
+{
+    SamFormatParser headerParser(*this);
+    headerParser.Parse(headerText);
+}
+
+// dtor: empties all header fields via Clear()
+SamHeader::~SamHeader(void) {
+    this->Clear();
+}
+
+// resets the header to an empty state
+void SamHeader::Clear(void) {
+
+    // @HD fields
+    Version.clear();
+    SortOrder.clear();
+    GroupOrder.clear();
+
+    // @SQ & @RG dictionaries
+    Sequences.Clear();
+    ReadGroups.Clear();
+
+    // @PG fields
+    ProgramName.clear();
+    ProgramVersion.clear();
+    ProgramCommandLine.clear();
+
+    // @CO lines
+    Comments.clear();
+}
+
+// returns the text produced by a SamFormatPrinter for the current header
+// contents (so any local modifications are included)
+string SamHeader::ToString(void) const {
+    return SamFormatPrinter(*this).ToString();
+}
+
+// query if header contains @HD VN:<Version>
+// (true when the Version field is non-empty)
+bool SamHeader::HasVersion(void) const {
+    return !Version.empty();
+}
+
+// query if header contains @HD SO:<SortOrder>
+// (true when the SortOrder field is non-empty)
+bool SamHeader::HasSortOrder(void) const {
+    return !SortOrder.empty();
+}
+
+// query if header contains @HD GO:<GroupOrder>
+// (true when the GroupOrder field is non-empty)
+bool SamHeader::HasGroupOrder(void) const {
+    return !GroupOrder.empty();
+}
+
+// query if header contains @SQ entries
+// (true when the sequence dictionary is not empty)
+bool SamHeader::HasSequences(void) const {
+    return !Sequences.IsEmpty();
+}
+
+// query if header contains @RG entries
+// (true when the read group dictionary is not empty)
+bool SamHeader::HasReadGroups(void) const {
+    return !ReadGroups.IsEmpty();
+}
+
+// query if header contains @PG ID:<ProgramName>
+// (true when the ProgramName field is non-empty)
+bool SamHeader::HasProgramName(void) const {
+    return !ProgramName.empty();
+}
+
+// query if header contains @PG VN:<ProgramVersion>
+// (true when the ProgramVersion field is non-empty)
+bool SamHeader::HasProgramVersion(void) const {
+    return !ProgramVersion.empty();
+}
+
+// query if header contains @PG CL:<ProgramCommandLine>
+// (true when the ProgramCommandLine field is non-empty)
+bool SamHeader::HasProgramCommandLine(void) const {
+    return !ProgramCommandLine.empty();
+}
+
+// query if header contains @CO entries
+// (true when at least one comment line is stored)
+bool SamHeader::HasComments(void) const {
+    return !Comments.empty();
+}
+
+// validation: runs a SamHeaderValidator over this header
+// @verbose is forwarded unchanged to the validator
+bool SamHeader::IsValid(bool verbose) const {
+    return SamHeaderValidator(*this).Validate(verbose);
+}
diff --git a/src/api/SamHeader.h b/src/api/SamHeader.h
new file mode 100644 (file)
index 0000000..b51f175
--- /dev/null
@@ -0,0 +1,81 @@
+// ***************************************************************************
+// SamHeader.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for querying/manipulating SAM header data
+// **************************************************************************
+
+#ifndef SAM_HEADER_H
+#define SAM_HEADER_H
+
+#include <api/api_global.h>
+#include <api/SamReadGroupDictionary.h>
+#include <api/SamSequenceDictionary.h>
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+// In-memory representation of a SAM header.
+// All data members are public, so clients can read or modify them directly;
+// ToString() renders the current contents back to SAM-formatted text.
+struct API_EXPORT SamHeader {
+
+    // ctor & dtor
+    public:
+        // @headerText - SAM-formatted header text to parse (defaults to an empty string)
+        explicit SamHeader(const std::string& headerText = "");
+        ~SamHeader(void);
+
+    // query/modify entire SamHeader at once
+    public:
+
+        // clear all header contents
+        void Clear(void);
+
+        // checks if SAM header is well-formed
+        // @verbose - if true, validation errors & warnings will be printed to stderr
+        // otherwise, output is suppressed and only validation check occurs
+        bool IsValid(bool verbose = false) const;
+
+        // retrieves the printable, SAM-formatted header
+        // (with any local modifications since construction)
+        std::string ToString(void) const;
+
+    // query if header contains data elements
+    // (each returns true when the corresponding data member below is non-empty)
+    public:
+        bool HasVersion(void) const;
+        bool HasSortOrder(void) const;
+        bool HasGroupOrder(void) const;
+        bool HasSequences(void) const;
+        bool HasReadGroups(void) const;
+        bool HasProgramName(void) const;
+        bool HasProgramVersion(void) const;
+        bool HasProgramCommandLine(void) const;
+        bool HasComments(void) const;
+
+    // data members
+    public:
+
+        // header metadata (@HD line)
+        std::string Version;                // VN:<Version>
+        std::string SortOrder;              // SO:<SortOrder>
+        std::string GroupOrder;             // GO:<GroupOrder>
+
+        // header sequences (@SQ entries)
+        SamSequenceDictionary Sequences;
+
+        // header read groups (@RG entries)
+        SamReadGroupDictionary ReadGroups;
+
+        // header program data (@PG entries)
+        std::string ProgramName;            // ID:<ProgramName>
+        std::string ProgramVersion;         // VN:<ProgramVersion>
+        std::string ProgramCommandLine;     // CL:<ProgramCommandLine>
+
+        // header comments (@CO entries)
+        std::vector<std::string> Comments;
+};
+
+} // namespace BamTools
+
+#endif // SAM_HEADER_H
diff --git a/src/api/SamReadGroup.cpp b/src/api/SamReadGroup.cpp
new file mode 100644 (file)
index 0000000..96c8e4e
--- /dev/null
@@ -0,0 +1,68 @@
+// ***************************************************************************
+// SamReadGroup.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for querying/manipulating read group data
+// **************************************************************************
+
+#include <api/SamReadGroup.h>
+using namespace BamTools;
+using namespace std;
+
+// default ctor: every field starts out as an empty string
+SamReadGroup::SamReadGroup(void)
+    : ID()
+    , Sample()
+    , Library()
+    , Description()
+    , PlatformUnit()
+    , PredictedInsertSize()
+    , SequencingCenter()
+    , ProductionDate()
+    , SequencingTechnology()
+{ }
+
+// ctor with provided ID: stores the ID, all remaining fields start out empty
+SamReadGroup::SamReadGroup(const string& id)
+    : ID(id)
+    , Sample()
+    , Library()
+    , Description()
+    , PlatformUnit()
+    , PredictedInsertSize()
+    , SequencingCenter()
+    , ProductionDate()
+    , SequencingTechnology()
+{ }
+
+// dtor: empties all fields via Clear()
+SamReadGroup::~SamReadGroup(void) {
+    this->Clear();
+}
+
+// resets every field of the read group to an empty string
+void SamReadGroup::Clear(void) {
+
+    // identifying tag
+    ID.clear();
+
+    // remaining descriptive tags
+    Sample.clear();
+    Library.clear();
+    Description.clear();
+    PlatformUnit.clear();
+    PredictedInsertSize.clear();
+    SequencingCenter.clear();
+    ProductionDate.clear();
+    SequencingTechnology.clear();
+}
+
+// convenience methods to check if SamReadGroup contains these values:
+bool SamReadGroup::HasID(void) const                   { return (!ID.empty());                   }
+bool SamReadGroup::HasSample(void) const               { return (!Sample.empty());               }
+bool SamReadGroup::HasLibrary(void) const              { return (!Library.empty());              }
+bool SamReadGroup::HasDescription(void) const          { return (!Description.empty());          }
+bool SamReadGroup::HasPlatformUnit(void) const         { return (!PlatformUnit.empty());         }
+bool SamReadGroup::HasPredictedInsertSize(void) const  { return (!PredictedInsertSize.empty());  }
+bool SamReadGroup::HasSequencingCenter(void) const     { return (!SequencingCenter.empty());     }
+bool SamReadGroup::HasProductionDate(void) const       { return (!ProductionDate.empty());       }
+bool SamReadGroup::HasSequencingTechnology(void) const { return (!SequencingTechnology.empty()); }
diff --git a/src/api/SamReadGroup.h b/src/api/SamReadGroup.h
new file mode 100644 (file)
index 0000000..fac4612
--- /dev/null
@@ -0,0 +1,67 @@
+// ***************************************************************************
+// SamReadGroup.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for querying/manipulating read group data
+// **************************************************************************
+
+#ifndef SAM_READGROUP_H
+#define SAM_READGROUP_H
+
+#include "api/api_global.h"
+#include <string>
+
+namespace BamTools {
+
+// Holds the tag values of a single read group (@RG) header entry.
+// Every value is stored as a std::string; an empty string means "not set".
+class API_EXPORT SamReadGroup {
+
+    // ctor & dtor
+    public:
+        SamReadGroup(void);
+        // @id - value stored in the ID field; all other fields start out empty
+        SamReadGroup(const std::string& id);
+        ~SamReadGroup(void);
+
+    // public methods
+    public:
+
+        // clear all contents
+        void Clear(void);
+
+        // convenience methods to check if SamReadGroup contains these values:
+        // (each returns true when the corresponding data member is non-empty)
+        bool HasID(void) const;
+        bool HasSample(void) const;
+        bool HasLibrary(void) const;
+        bool HasDescription(void) const;
+        bool HasPlatformUnit(void) const;
+        bool HasPredictedInsertSize(void) const;
+        bool HasSequencingCenter(void) const;
+        bool HasProductionDate(void) const;
+        bool HasSequencingTechnology(void) const;
+
+    // data members
+    public:
+        std::string ID;                   // ID:<ID>
+        std::string Sample;               // SM:<Sample>
+        std::string Library;              // LB:<Library>
+        std::string Description;          // DS:<Description>
+        std::string PlatformUnit;         // PU:<PlatformUnit>
+        std::string PredictedInsertSize;  // PI:<PredictedInsertSize>
+        std::string SequencingCenter;     // CN:<SequencingCenter>
+        std::string ProductionDate;       // DT:<ProductionDate>
+        std::string SequencingTechnology; // PL:<SequencingTechnology>
+};
+
+// ---------------------------------------------------
+// comparison operators
+
+// equality is decided by the ID field alone; all other fields are ignored
+inline bool operator==(const SamReadGroup& lhs, const SamReadGroup& rhs) {
+    const bool sameId = ( lhs.ID == rhs.ID );
+    return sameId;
+}
+
+} // namespace BamTools
+
+#endif // SAM_READGROUP_H
diff --git a/src/api/SamReadGroupDictionary.cpp b/src/api/SamReadGroupDictionary.cpp
new file mode 100644 (file)
index 0000000..fb03fac
--- /dev/null
@@ -0,0 +1,168 @@
+// ***************************************************************************
+// SamReadGroupDictionary.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides container operations for collection of read group entries
+// *************************************************************************
+
+#include <api/SamReadGroupDictionary.h>
+using namespace BamTools;
+
+#include <algorithm>
+#include <iostream>
+using namespace std;
+
+// ctor: starts with an empty container
+SamReadGroupDictionary::SamReadGroupDictionary(void)
+    : m_data()
+{ }
+
+// dtor: drops all stored read groups
+SamReadGroupDictionary::~SamReadGroupDictionary(void) {
+    Clear();
+}
+
+// appends the read group unless an entry comparing equal to it is already stored
+void SamReadGroupDictionary::Add(const SamReadGroup& readGroup) {
+
+    // already present: nothing to do
+    // (Contains() is false for an empty container, so no separate empty check is needed)
+    if ( Contains(readGroup) )
+        return;
+
+    m_data.push_back(readGroup);
+}
+
+// overload to support std::string:
+// wraps the ID in a SamReadGroup and forwards to Add(const SamReadGroup&)
+void SamReadGroupDictionary::Add(const string& readGroupId) {
+    const SamReadGroup newEntry(readGroupId);
+    Add(newEntry);
+}
+
+// add multiple read groups, one at a time, in the order given
+void SamReadGroupDictionary::Add(const vector<SamReadGroup>& readGroups) {
+    vector<SamReadGroup>::const_iterator current = readGroups.begin();
+    const vector<SamReadGroup>::const_iterator last = readGroups.end();
+    while ( current != last ) {
+        Add(*current);
+        ++current;
+    }
+}
+
+// overload to support std::string: adds each ID, one at a time, in the order given
+void SamReadGroupDictionary::Add(const vector<string>& readGroupIds) {
+    vector<string>::const_iterator current = readGroupIds.begin();
+    const vector<string>::const_iterator last = readGroupIds.end();
+    while ( current != last ) {
+        Add(*current);
+        ++current;
+    }
+}
+
+// returns a mutable iterator to the first stored read group
+SamReadGroupIterator SamReadGroupDictionary::Begin(void) {
+    SamReadGroupIterator first = m_data.begin();
+    return first;
+}
+
+// returns a const_iterator to the first stored read group
+SamReadGroupConstIterator SamReadGroupDictionary::Begin(void) const {
+    SamReadGroupConstIterator first = m_data.begin();
+    return first;
+}
+
+// removes every stored read group
+void SamReadGroupDictionary::Clear(void) {
+    m_data.erase( m_data.begin(), m_data.end() );
+}
+
+// explicit request for a const_iterator to the first stored read group
+SamReadGroupConstIterator SamReadGroupDictionary::ConstBegin(void) const {
+    SamReadGroupConstIterator first = m_data.begin();
+    return first;
+}
+
+// explicit request for a const_iterator one past the last stored read group
+SamReadGroupConstIterator SamReadGroupDictionary::ConstEnd(void) const {
+    SamReadGroupConstIterator pastLast = m_data.end();
+    return pastLast;
+}
+
+// returns true if container contains a read group with this ID tag
+// (IndexOf() reports "not found" by returning the container size)
+bool SamReadGroupDictionary::Contains(const string& readGroupId) const {
+    const int position = IndexOf(readGroupId);
+    return position != static_cast<int>( m_data.size() );
+}
+
+// returns true if container holds an entry comparing equal to this read group
+// (IndexOf() reports "not found" by returning the container size)
+bool SamReadGroupDictionary::Contains(const SamReadGroup& readGroup) const {
+    const int position = IndexOf(readGroup);
+    return position != static_cast<int>( m_data.size() );
+}
+
+// returns a mutable iterator one past the last stored read group
+SamReadGroupIterator SamReadGroupDictionary::End(void) {
+    SamReadGroupIterator pastLast = m_data.end();
+    return pastLast;
+}
+
+// returns const_iterator to container end
+SamReadGroupConstIterator SamReadGroupDictionary::End(void) const {
+    SamReadGroupConstIterator pastLast = m_data.end();
+    return pastLast;
+}
+
+// returns vector index of the first entry comparing equal to readGroup
+// returns vector::size() (invalid index) if no such entry exists
+int SamReadGroupDictionary::IndexOf(const SamReadGroup& readGroup) const {
+    const SamReadGroupConstIterator first = m_data.begin();
+    const SamReadGroupConstIterator match = std::find( first, m_data.end(), readGroup );
+    return distance( first, match );
+}
+
+// overload to support std::string:
+// wraps the ID in a SamReadGroup and forwards to IndexOf(const SamReadGroup&)
+int SamReadGroupDictionary::IndexOf(const string& readGroupId) const {
+    const SamReadGroup probe(readGroupId);
+    return IndexOf(probe);
+}
+
+// returns true if no read groups are stored
+bool SamReadGroupDictionary::IsEmpty(void) const {
+    return ( m_data.size() == 0 );
+}
+
+// removes read group (if it exists); does nothing otherwise
+//
+// The entry is looked up once and the resulting index is reused for the erase,
+// rather than scanning the container in Contains() and again in IndexOf().
+void SamReadGroupDictionary::Remove(const SamReadGroup& readGroup) {
+
+    // IndexOf() returns the container size when no matching entry exists
+    const int index = IndexOf(readGroup);
+    if ( index != static_cast<int>( m_data.size() ) )
+        m_data.erase( m_data.begin() + index );
+}
+
+// overload to support std::string:
+// wraps the ID in a SamReadGroup and forwards to Remove(const SamReadGroup&)
+void SamReadGroupDictionary::Remove(const string& readGroupId) {
+    const SamReadGroup target(readGroupId);
+    Remove(target);
+}
+
+// remove multiple read groups, one at a time, in the order given
+void SamReadGroupDictionary::Remove(const vector<SamReadGroup>& readGroups) {
+    vector<SamReadGroup>::const_iterator current = readGroups.begin();
+    const vector<SamReadGroup>::const_iterator last = readGroups.end();
+    while ( current != last ) {
+        Remove(*current);
+        ++current;
+    }
+}
+
+// overload to support std::string: removes each ID, one at a time, in the order given
+void SamReadGroupDictionary::Remove(const vector<string>& readGroupIds) {
+    vector<string>::const_iterator current = readGroupIds.begin();
+    const vector<string>::const_iterator last = readGroupIds.end();
+    while ( current != last ) {
+        Remove(*current);
+        ++current;
+    }
+}
+
+// returns the number of read groups currently stored
+// (the container's size is converted to int to match the declared return type)
+int SamReadGroupDictionary::Size(void) const {
+    return static_cast<int>( m_data.size() );
+}
+
+// looks up the read group stored under this ID and returns a reference to it
+// an unknown ID causes a new SamReadGroup (holding only that ID) to be appended,
+// and a reference to the new entry is returned (like std::map)
+SamReadGroup& SamReadGroupDictionary::operator[](const std::string& readGroupId) {
+
+    // IndexOf() returns the container size when the ID is not present
+    const int position = IndexOf(readGroupId);
+    const bool isKnown = ( position != static_cast<int>( m_data.size() ) );
+
+    // unknown ID: append a fresh entry and hand it back
+    if ( !isKnown ) {
+        m_data.push_back( SamReadGroup(readGroupId) );
+        return m_data.back();
+    }
+
+    // known ID: return the existing entry
+    return m_data[position];
+}
diff --git a/src/api/SamReadGroupDictionary.h b/src/api/SamReadGroupDictionary.h
new file mode 100644 (file)
index 0000000..d21ccf8
--- /dev/null
@@ -0,0 +1,95 @@
+// ***************************************************************************
+// SamReadGroupDictionary.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides container operations for collection of read group entries
+// *************************************************************************
+
+#ifndef SAM_READGROUP_DICTIONARY_H
+#define SAM_READGROUP_DICTIONARY_H
+
+#include <api/api_global.h>
+#include <api/SamReadGroup.h>
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+typedef std::vector<SamReadGroup>             SamReadGroupContainer;
+typedef SamReadGroupContainer::iterator       SamReadGroupIterator;
+typedef SamReadGroupContainer::const_iterator SamReadGroupConstIterator;
+
+// stores read groups
+// can access read groups using SamReadGroup object or (std::string) read group ID tag
+class API_EXPORT SamReadGroupDictionary {
+
+    // ctor & dtor
+    public:
+        SamReadGroupDictionary(void);
+        ~SamReadGroupDictionary(void);
+
+    // query/modify read group data
+    public:
+        // add a read group
+        void Add(const SamReadGroup& readGroup);
+        void Add(const std::string& readGroupIds);
+
+        // add multiple read groups
+        void Add(const std::vector<SamReadGroup>& readGroups);
+        void Add(const std::vector<std::string>& readGroupIds);
+
+        // clear all read groups records
+        void Clear(void);
+
+        // returns true if dictionary contains this read group
+        bool Contains(const SamReadGroup& readGroup) const;
+        bool Contains(const std::string& readGroupId) const;
+
+        // returns true if dictionary is empty
+        bool IsEmpty(void) const;
+
+        // remove a single read group (does nothing if read group not found)
+        void Remove(const SamReadGroup& readGroup);
+        void Remove(const std::string& readGroupId);
+
+        // remove multiple read groups
+        void Remove(const std::vector<SamReadGroup>& readGroups);
+        void Remove(const std::vector<std::string>& readGroupIds);
+
+        // returns size of dictionary (number of current elements)
+        int Size(void) const;
+
+        // retrieves the SamReadGroup object associated with this ID
+        // if readGroupId is unknown, a new SamReadGroup is created with this ID (and no other data)
+        // and a reference to this new read group entry is returned (like std::map)
+        //
+        // * To avoid these partial entries being created, it is recommended to check
+        //   for existence first using Contains()
+        SamReadGroup& operator[](const std::string& readGroupId);
+
+    // retrieve read group iterators
+    // these are typedefs for STL iterators and thus are compatible with STL containers/algorithms
+    public:
+        SamReadGroupIterator      Begin(void);
+        SamReadGroupConstIterator Begin(void) const;
+        SamReadGroupConstIterator ConstBegin(void) const;
+        SamReadGroupIterator      End(void);
+        SamReadGroupConstIterator End(void) const;
+        SamReadGroupConstIterator ConstEnd(void) const;
+
+    // internal methods
+    private:
+        int IndexOf(const SamReadGroup& readGroup) const;
+        int IndexOf(const std::string& readGroupId) const;
+
+    // data members
+    private:
+        SamReadGroupContainer m_data;
+};
+
+} // namespace BamTools
+
+#endif // SAM_READGROUP_DICTIONARY_H
diff --git a/src/api/SamSequence.cpp b/src/api/SamSequence.cpp
new file mode 100644 (file)
index 0000000..0554604
--- /dev/null
@@ -0,0 +1,46 @@
+// ***************************************************************************
+// SamSequence.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for querying/manipulating sequence data
+// *************************************************************************
+
+#include <api/SamSequence.h>
+using namespace BamTools;
+using namespace std;
+
+// ctor: stores the sequence name, all remaining fields start out empty
+SamSequence::SamSequence(const string& name)
+    : Name(name)
+    , Length()
+    , AssemblyID()
+    , Checksum()
+    , URI()
+    , Species()
+{ }
+
+// dtor: empties all fields via Clear()
+SamSequence::~SamSequence(void) {
+    this->Clear();
+}
+
+// resets every field of the sequence entry to an empty string
+void SamSequence::Clear(void) {
+
+    // fields used by operator== (name, length, checksum)
+    Name.clear();
+    Length.clear();
+    Checksum.clear();
+
+    // remaining descriptive fields
+    AssemblyID.clear();
+    URI.clear();
+    Species.clear();
+}
+
+// convenience methods to check if SamSequence contains these values:
+bool SamSequence::HasName(void) const       { return (!Name.empty());       }
+bool SamSequence::HasLength(void) const     { return (!Length.empty());     }
+bool SamSequence::HasAssemblyID(void) const { return (!AssemblyID.empty()); }
+bool SamSequence::HasChecksum(void) const   { return (!Checksum.empty());   }
+bool SamSequence::HasURI(void) const        { return (!URI.empty());        }
+bool SamSequence::HasSpecies(void) const    { return (!Species.empty());    }
diff --git a/src/api/SamSequence.h b/src/api/SamSequence.h
new file mode 100644 (file)
index 0000000..db6891d
--- /dev/null
@@ -0,0 +1,64 @@
+// ***************************************************************************
+// SamSequence.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for querying/manipulating sequence data
+// **************************************************************************
+
+#ifndef SAM_SEQUENCE_H
+#define SAM_SEQUENCE_H
+
+#include <api/api_global.h>
+#include <string>
+
+namespace BamTools {
+
+class API_EXPORT SamSequence {
+
+    // construction & destruction
+    public:
+        // 'name' is stored in Name; all other fields start out empty
+        SamSequence(const std::string& name = "");
+        ~SamSequence();
+
+    // operations
+    public:
+
+        // empties every field, Name included
+        void Clear();
+
+        // each predicate reports whether the matching field holds a non-empty string
+        bool HasName() const;
+        bool HasLength() const;
+        bool HasAssemblyID() const;
+        bool HasChecksum() const;
+        bool HasURI() const;
+        bool HasSpecies() const;
+
+    // fields, kept as plain strings; the trailing note shows the @SQ tag each one carries
+    public:
+        std::string Name;       // SN:<Name>
+        std::string Length;     // LN:<Length>
+        std::string AssemblyID; // AS:<AssemblyID>
+        std::string Checksum;   // M5:<Checksum>
+        std::string URI;        // UR:<URI>
+        std::string Species;    // SP:<Species>
+};
+
+// ---------------------------------------------------
+// comparison operators
+
+// equality: Name and Length must match; Checksum is only compared when both
+// sides actually carry one (a missing checksum never makes two entries differ)
+inline bool operator==(const SamSequence& lhs, const SamSequence& rhs) {
+    const bool sameNameAndLength = ( lhs.Name == rhs.Name ) && ( lhs.Length == rhs.Length );
+    if ( !sameNameAndLength )
+        return false;
+
+    const bool bothHaveChecksum = lhs.HasChecksum() && rhs.HasChecksum();
+    return ( !bothHaveChecksum || lhs.Checksum == rhs.Checksum );
+}
+
+} // namespace BamTools
+
+#endif // SAM_SEQUENCE_H
diff --git a/src/api/SamSequenceDictionary.cpp b/src/api/SamSequenceDictionary.cpp
new file mode 100644 (file)
index 0000000..c023a39
--- /dev/null
@@ -0,0 +1,159 @@
+#include <api/SamSequenceDictionary.h>
+using namespace BamTools;
+
+#include <iostream>
+using namespace std;
+
+// ctor: the dictionary starts out empty
+SamSequenceDictionary::SamSequenceDictionary() { }
+
+// dtor: the underlying vector releases its own elements
+SamSequenceDictionary::~SamSequenceDictionary() { }
+
+// appends 'sequence' unless an equal entry (per SamSequence's operator==) is already stored
+void SamSequenceDictionary::Add(const SamSequence& sequence) {
+
+    // an empty dictionary contains nothing, so this single check covers that case too
+    if ( Contains(sequence) )
+        return;
+
+    m_data.push_back(sequence);
+}
+
+// overload to support std::string
+// appends a name-only entry, unless an entry with this name is already stored
+void SamSequenceDictionary::Add(const string& sequenceName) {
+
+    // Match on Name alone. Going through SamSequence's operator== would also
+    // compare Length, so a name-only temporary would never match a stored
+    // entry that has a length, and a second entry with the same name would
+    // be appended.
+    SamSequenceConstIterator seqIter = m_data.begin();
+    SamSequenceConstIterator seqEnd  = m_data.end();
+    for ( ; seqIter != seqEnd; ++seqIter )
+        if ( seqIter->Name == sequenceName ) return;
+
+    m_data.push_back( SamSequence(sequenceName) );
+}
+
+// add multiple sequences, one at a time, through the single-sequence Add()
+void SamSequenceDictionary::Add(const vector<SamSequence>& sequences) {
+    for ( vector<SamSequence>::size_type i = 0; i < sequences.size(); ++i )
+        Add( sequences[i] );
+}
+
+// overload to support std::string: each name goes through the single-name Add()
+void SamSequenceDictionary::Add(const vector<string>& sequenceNames) {
+    for ( vector<string>::size_type i = 0; i < sequenceNames.size(); ++i )
+        Add( sequenceNames[i] );
+}
+
+// mutable iterator positioned at the first stored sequence
+SamSequenceIterator SamSequenceDictionary::Begin() {
+    return m_data.begin();
+}
+
+// read-only iterator positioned at the first stored sequence
+SamSequenceConstIterator SamSequenceDictionary::Begin() const {
+    return m_data.begin();
+}
+
+// drop every stored sequence
+void SamSequenceDictionary::Clear() {
+    m_data.clear();
+}
+
+// always yields a const_iterator to the first stored sequence
+SamSequenceConstIterator SamSequenceDictionary::ConstBegin() const {
+    return m_data.begin();
+}
+
+// always yields a const_iterator to one-past-the-last stored sequence
+SamSequenceConstIterator SamSequenceDictionary::ConstEnd() const {
+    return m_data.end();
+}
+
+// true when IndexOf() finds an entry for this name
+// (IndexOf() signals "not found" by returning the container size)
+bool SamSequenceDictionary::Contains(const string& sequenceName) const {
+    const int notFound = (int)m_data.size();
+    return IndexOf(sequenceName) != notFound;
+}
+
+// true when IndexOf() finds an entry equal to 'seq'
+bool SamSequenceDictionary::Contains(const SamSequence& seq) const {
+    const int notFound = (int)m_data.size();
+    return IndexOf(seq) != notFound;
+}
+
+// mutable iterator positioned one past the last stored sequence
+SamSequenceIterator SamSequenceDictionary::End() {
+    return m_data.end();
+}
+
+// read-only iterator positioned one past the last stored sequence
+SamSequenceConstIterator SamSequenceDictionary::End() const {
+    return m_data.end();
+}
+
+// position of the first stored entry that compares equal to 'sequence'
+// yields the container size (an invalid index) when there is no such entry
+int SamSequenceDictionary::IndexOf(const SamSequence& sequence) const {
+    int position = 0;
+    SamSequenceConstIterator seqIter = m_data.begin();
+    SamSequenceConstIterator seqEnd  = m_data.end();
+    while ( seqIter != seqEnd && !(*seqIter == sequence) ) {
+        ++seqIter;
+        ++position;
+    }
+    return position;
+}
+
+// overload to support std::string
+// returns vector index of the first sequence with this name
+// returns vector::size() (invalid index) if not found
+int SamSequenceDictionary::IndexOf(const string& sequenceName) const {
+
+    // Compare names directly. Wrapping the name in a temporary SamSequence and
+    // using operator== would also compare Length, and the temporary's empty
+    // Length never equals the Length of a real entry - so the lookup would
+    // miss every sequence that has a length.
+    SamSequenceConstIterator begin = ConstBegin();
+    SamSequenceConstIterator iter  = begin;
+    SamSequenceConstIterator end   = ConstEnd();
+    for ( ; iter != end; ++iter )
+        if ( iter->Name == sequenceName ) break;
+    return distance( begin, iter );
+}
+
+// true when no sequences are stored
+bool SamSequenceDictionary::IsEmpty() const {
+    return m_data.empty();
+}
+
+// erases the first entry equal to 'sequence'; no-op when there is none
+void SamSequenceDictionary::Remove(const SamSequence& sequence) {
+
+    // IndexOf() reports "not found" as the container size
+    const int position = IndexOf(sequence);
+    if ( position == (int)m_data.size() )
+        return;
+
+    m_data.erase( m_data.begin() + position );
+}
+
+// overload to support std::string
+// removes the first sequence with this name (does nothing if no such sequence)
+void SamSequenceDictionary::Remove(const string& sequenceName) {
+
+    // Match on Name alone. A name-only temporary compared through
+    // SamSequence's operator== (which also checks Length) would never equal
+    // a stored entry that has a length, so nothing would ever be removed.
+    SamSequenceIterator seqIter = m_data.begin();
+    SamSequenceIterator seqEnd  = m_data.end();
+    for ( ; seqIter != seqEnd; ++seqIter ) {
+        if ( seqIter->Name == sequenceName ) {
+            m_data.erase(seqIter);
+            return; // iterators are invalid after erase, and only one entry is removed
+        }
+    }
+}
+
+// remove multiple sequences, one at a time, through the single-sequence Remove()
+void SamSequenceDictionary::Remove(const vector<SamSequence>& sequences) {
+    for ( vector<SamSequence>::size_type i = 0; i < sequences.size(); ++i )
+        Remove( sequences[i] );
+}
+
+// overload to support std::string: each name goes through the single-name Remove()
+void SamSequenceDictionary::Remove(const vector<string>& sequenceNames) {
+    for ( vector<string>::size_type i = 0; i < sequenceNames.size(); ++i )
+        Remove( sequenceNames[i] );
+}
+
+// number of sequences currently stored, converted to int
+int SamSequenceDictionary::Size() const {
+    return static_cast<int>( m_data.size() );
+}
+
+// map-like access by sequence name
+// a known name yields a reference to the stored entry; an unknown name causes a
+// placeholder entry (that name, Length "0") to be appended and returned instead
+SamSequence& SamSequenceDictionary::operator[](const std::string& sequenceName) {
+
+    // IndexOf() reports "not found" as the container size
+    const int  position = IndexOf(sequenceName);
+    const bool isKnown  = ( position != (int)m_data.size() );
+
+    // unknown name: append the placeholder so there is an entry to hand back
+    if ( !isKnown ) {
+        SamSequence placeholder(sequenceName);
+        placeholder.Length = "0";
+        m_data.push_back(placeholder);
+        return m_data.back();
+    }
+
+    return m_data[position];
+}
+
diff --git a/src/api/SamSequenceDictionary.h b/src/api/SamSequenceDictionary.h
new file mode 100644 (file)
index 0000000..bcd1652
--- /dev/null
@@ -0,0 +1,94 @@
+// ***************************************************************************
+// SamSequenceDictionary.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides container operations for collection of sequence entries
+// *************************************************************************
+
+#ifndef SAM_SEQUENCE_DICTIONARY_H
+#define SAM_SEQUENCE_DICTIONARY_H
+
+#include <api/api_global.h>
+#include <api/SamSequence.h>
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+typedef std::vector<SamSequence>             SamSequenceContainer;
+typedef SamSequenceContainer::iterator       SamSequenceIterator;
+typedef SamSequenceContainer::const_iterator SamSequenceConstIterator;
+
+class API_EXPORT SamSequenceDictionary {
+
+    // ctor & dtor
+    public:
+        SamSequenceDictionary(void);
+        ~SamSequenceDictionary(void);
+
+    // query/modify sequence data
+    public:
+        // add a single sequence (skipped if the dictionary already contains it)
+        void Add(const SamSequence& sequence);
+        void Add(const std::string& sequenceName);
+
+        // add multiple sequences
+        void Add(const std::vector<SamSequence>& sequences);
+        void Add(const std::vector<std::string>& sequenceNames);
+
+        // clear all sequence records
+        void Clear(void);
+
+        // returns true if dictionary contains this sequence
+        bool Contains(const SamSequence& sequence) const;
+        bool Contains(const std::string& sequenceName) const;
+
+        // returns true if dictionary is empty
+        bool IsEmpty(void) const;
+
+        // remove a single sequence (does nothing if sequence not found)
+        void Remove(const SamSequence& sequence);
+        void Remove(const std::string& sequenceName);
+
+        // remove multiple sequences
+        void Remove(const std::vector<SamSequence>& sequences);
+        void Remove(const std::vector<std::string>& sequenceNames);
+
+        // returns size of dictionary (number of current elements)
+        int Size(void) const;
+
+        // retrieves the SamSequence object associated with this name
+        // if sequenceName is unknown, a new SamSequence is created with this name (and invalid length 0)
+        // and a reference to this new sequence entry is returned (like std::map)
+        //
+        // * To avoid these partial entries being created, it is recommended to check
+        //   for existence first using Contains()
+        SamSequence& operator[](const std::string& sequenceName);
+
+    // retrieve sequence iterators
+    // these are typedefs for STL iterators and thus are compatible with STL containers/algorithms
+    public:
+        SamSequenceIterator      Begin(void);
+        SamSequenceConstIterator Begin(void) const;
+        SamSequenceConstIterator ConstBegin(void) const;
+        SamSequenceIterator      End(void);
+        SamSequenceConstIterator End(void) const;
+        SamSequenceConstIterator ConstEnd(void) const;
+
+    // internal methods
+    private:
+        // both return the index of the matching entry, or the container size if there is none
+        int IndexOf(const SamSequence& sequence) const;
+        int IndexOf(const std::string& sequenceName) const;
+
+    // data members
+    private:
+        SamSequenceContainer m_data; // sequences, kept in insertion order
+};
+
+} // namespace BamTools
+
+#endif // SAM_SEQUENCE_DICTIONARY_H
+
diff --git a/src/api/internal/SamFormatParser_p.cpp b/src/api/internal/SamFormatParser_p.cpp
new file mode 100644 (file)
index 0000000..aa690b8
--- /dev/null
@@ -0,0 +1,221 @@
+// ***************************************************************************
+// SamFormatParser.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for parsing SAM header text into SamHeader object
+// ***************************************************************************
+
+#include <api/SamConstants.h>
+#include <api/SamHeader.h>
+#include <api/internal/SamFormatParser_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <iostream>
+#include <sstream>
+#include <vector>
+using namespace std;
+
+// ctor: binds the parser to the header object that Parse() will fill in
+SamFormatParser::SamFormatParser(SamHeader& header) : m_header(header) { }
+
+// dtor: the parser only holds a reference, so there is nothing to release
+SamFormatParser::~SamFormatParser() { }
+
+// replaces the bound header's contents with whatever can be parsed from 'headerText'
+void SamFormatParser::Parse(const string& headerText) {
+
+    // always start from a clean header, even when there is nothing to parse
+    m_header.Clear();
+
+    // an empty header is legal; there are simply no lines to process
+    if ( headerText.empty() )
+        return;
+
+    // otherwise hand each line of the text to the per-line parser
+    istringstream headerLines(headerText);
+    for ( string currentLine; getline(headerLines, currentLine); )
+        ParseSamLine(currentLine);
+}
+
+// dispatches one header line to the parser for its record type, chosen by
+// the line's first three characters; unrecognized tokens are reported on stderr
+void SamFormatParser::ParseSamLine(const string& line) {
+
+    // lines shorter than 5 characters are silently ignored
+    if ( line.length() < 5 )
+        return;
+
+    // first 3 characters select the record type; parsing resumes at index 4,
+    // which skips the single character that follows the token
+    const string recordToken = line.substr(0,3);
+    const string payload     = line.substr(4);
+
+    if ( recordToken == Constants::SAM_HD_BEGIN_TOKEN ) { ParseHDLine(payload); return; }
+    if ( recordToken == Constants::SAM_SQ_BEGIN_TOKEN ) { ParseSQLine(payload); return; }
+    if ( recordToken == Constants::SAM_RG_BEGIN_TOKEN ) { ParseRGLine(payload); return; }
+    if ( recordToken == Constants::SAM_PG_BEGIN_TOKEN ) { ParsePGLine(payload); return; }
+    if ( recordToken == Constants::SAM_CO_BEGIN_TOKEN ) { ParseCOLine(payload); return; }
+
+    cerr << "SAM Format Error - unknown token: " << recordToken << endl;
+}
+
+// parses the tag:value tokens of an @HD line ('line' is the text after the
+// record token) into the header's Version, GroupOrder and SortOrder fields
+// unknown or malformed tokens are reported on stderr and skipped
+void SamFormatParser::ParseHDLine(const string& line) {
+
+    // split HD lines into tokens
+    vector<string> tokens = Split(line, Constants::SAM_TAB);
+
+    // iterate over tokens
+    vector<string>::const_iterator tokenIter = tokens.begin();
+    vector<string>::const_iterator tokenEnd  = tokens.end();
+    for ( ; tokenIter != tokenEnd; ++tokenIter ) {
+        const string& token = (*tokenIter);
+
+        // a token needs a 2-character tag plus a separator, i.e. at least
+        // 3 characters; substr(3) throws std::out_of_range on anything shorter
+        // (e.g. the empty token left by two consecutive tabs), so skip it here
+        if ( token.length() < 3 ) {
+            cerr << "SAM Format Error - malformed HD token: " << token << endl;
+            continue;
+        }
+
+        // get tag/value
+        const string tokenTag   = token.substr(0,2);
+        const string tokenValue = token.substr(3);
+
+        // set header contents
+        if      ( tokenTag == Constants::SAM_HD_VERSION_TAG    ) m_header.Version    = tokenValue;
+        else if ( tokenTag == Constants::SAM_HD_GROUPORDER_TAG ) m_header.GroupOrder = tokenValue;
+        else if ( tokenTag == Constants::SAM_HD_SORTORDER_TAG  ) m_header.SortOrder  = tokenValue;
+        else
+            cerr << "SAM Format Error - unknown HD tag: " << tokenTag << endl;
+    }
+
+    // if @HD line exists, VN must be provided
+    if ( !m_header.HasVersion() )
+        cerr << "SAM Format Error - @HD line is missing VN tag!" << endl;
+}
+
+// parses the tag:value tokens of an @SQ line ('line' is the text after the
+// record token) into a SamSequence and adds it to the header's Sequences
+// the entry is dropped (with a message on stderr) if SN or LN is missing;
+// unknown or malformed tokens are reported on stderr and skipped
+void SamFormatParser::ParseSQLine(const string& line) {
+
+    SamSequence seq;
+
+    // split SQ line into tokens
+    vector<string> tokens = Split(line, Constants::SAM_TAB);
+
+    // iterate over tokens
+    vector<string>::const_iterator tokenIter = tokens.begin();
+    vector<string>::const_iterator tokenEnd  = tokens.end();
+    for ( ; tokenIter != tokenEnd; ++tokenIter ) {
+        const string& token = (*tokenIter);
+
+        // a token needs a 2-character tag plus a separator, i.e. at least
+        // 3 characters; substr(3) throws std::out_of_range on anything shorter
+        // (e.g. the empty token left by two consecutive tabs), so skip it here
+        if ( token.length() < 3 ) {
+            cerr << "SAM Format Error - malformed SQ token: " << token << endl;
+            continue;
+        }
+
+        // get tag/value
+        const string tokenTag   = token.substr(0,2);
+        const string tokenValue = token.substr(3);
+
+        // set sequence contents
+        if      ( tokenTag == Constants::SAM_SQ_NAME_TAG       ) seq.Name = tokenValue;
+        else if ( tokenTag == Constants::SAM_SQ_LENGTH_TAG     ) seq.Length = tokenValue;
+        else if ( tokenTag == Constants::SAM_SQ_ASSEMBLYID_TAG ) seq.AssemblyID = tokenValue;
+        else if ( tokenTag == Constants::SAM_SQ_URI_TAG        ) seq.URI = tokenValue;
+        else if ( tokenTag == Constants::SAM_SQ_CHECKSUM_TAG   ) seq.Checksum = tokenValue;
+        else if ( tokenTag == Constants::SAM_SQ_SPECIES_TAG    ) seq.Species = tokenValue;
+        else
+            cerr << "SAM Format Error - unknown SQ tag: " << tokenTag << endl;
+    }
+
+    // if @SQ line exists, SN must be provided
+    if ( !seq.HasName() ) {
+        cerr << "SAM Format Error - @SQ line is missing SN tag!" << endl;
+        return;
+    }
+
+    // if @SQ line exists, LN must be provided
+    if ( !seq.HasLength() ) {
+        cerr << "SAM Format Error - @SQ line is missing LN tag!" << endl;
+        return;
+    }
+
+    // store SAM sequence entry
+    m_header.Sequences.Add(seq);
+}
+
+// parses the tag:value tokens of an @RG line ('line' is the text after the
+// record token) into a SamReadGroup and adds it to the header's ReadGroups
+// the entry is dropped (with a message on stderr) if ID or SM is missing;
+// unknown or malformed tokens are reported on stderr and skipped
+void SamFormatParser::ParseRGLine(const string& line) {
+
+    SamReadGroup rg;
+
+    // split string into tokens
+    vector<string> tokens = Split(line, Constants::SAM_TAB);
+
+    // iterate over tokens
+    vector<string>::const_iterator tokenIter = tokens.begin();
+    vector<string>::const_iterator tokenEnd  = tokens.end();
+    for ( ; tokenIter != tokenEnd; ++tokenIter ) {
+        const string& token = (*tokenIter);
+
+        // a token needs a 2-character tag plus a separator, i.e. at least
+        // 3 characters; substr(3) throws std::out_of_range on anything shorter
+        // (e.g. the empty token left by two consecutive tabs), so skip it here
+        if ( token.length() < 3 ) {
+            cerr << "SAM Format Error - malformed RG token: " << token << endl;
+            continue;
+        }
+
+        // get token tag/value
+        const string tokenTag   = token.substr(0,2);
+        const string tokenValue = token.substr(3);
+
+        // set read group contents
+        if      ( tokenTag == Constants::SAM_RG_ID_TAG                  ) rg.ID = tokenValue;
+        else if ( tokenTag == Constants::SAM_RG_SAMPLE_TAG              ) rg.Sample = tokenValue;
+        else if ( tokenTag == Constants::SAM_RG_LIBRARY_TAG             ) rg.Library = tokenValue;
+        else if ( tokenTag == Constants::SAM_RG_DESCRIPTION_TAG         ) rg.Description = tokenValue;
+        else if ( tokenTag == Constants::SAM_RG_PLATFORMUNIT_TAG        ) rg.PlatformUnit = tokenValue;
+        else if ( tokenTag == Constants::SAM_RG_PREDICTEDINSERTSIZE_TAG ) rg.PredictedInsertSize = tokenValue;
+        else if ( tokenTag == Constants::SAM_RG_SEQCENTER_TAG           ) rg.SequencingCenter = tokenValue;
+        else if ( tokenTag == Constants::SAM_RG_PRODUCTIONDATE_TAG      ) rg.ProductionDate = tokenValue;
+        else if ( tokenTag == Constants::SAM_RG_SEQTECHNOLOGY_TAG       ) rg.SequencingTechnology = tokenValue;
+        else
+            cerr << "SAM Format Error - unknown RG tag: " << tokenTag << endl;
+    }
+
+    // if @RG line exists, ID must be provided
+    if ( !rg.HasID() ) {
+        cerr << "SAM Format Error - @RG line is missing ID tag!" << endl;
+        return;
+    }
+
+    // if @RG line exists, SM must be provided
+    if ( !rg.HasSample() ) {
+        cerr << "SAM Format Error - @RG line is missing SM tag!" << endl;
+        return;
+    }
+
+    // store SAM read group entry
+    m_header.ReadGroups.Add(rg);
+}
+
+// parses the tag:value tokens of a @PG line ('line' is the text after the
+// record token) into the header's ProgramName, ProgramVersion and
+// ProgramCommandLine fields
+// unknown or malformed tokens are reported on stderr and skipped
+void SamFormatParser::ParsePGLine(const string& line) {
+
+    // split string into tokens
+    vector<string> tokens = Split(line, Constants::SAM_TAB);
+
+    // iterate over tokens
+    vector<string>::const_iterator tokenIter = tokens.begin();
+    vector<string>::const_iterator tokenEnd  = tokens.end();
+    for ( ; tokenIter != tokenEnd; ++tokenIter ) {
+        const string& token = (*tokenIter);
+
+        // a token needs a 2-character tag plus a separator, i.e. at least
+        // 3 characters; substr(3) throws std::out_of_range on anything shorter
+        // (e.g. the empty token left by two consecutive tabs), so skip it here
+        if ( token.length() < 3 ) {
+            cerr << "SAM Format Error - malformed PG token: " << token << endl;
+            continue;
+        }
+
+        // get token tag/value
+        const string tokenTag   = token.substr(0,2);
+        const string tokenValue = token.substr(3);
+
+        // set header contents
+        if      ( tokenTag == Constants::SAM_PG_NAME_TAG        ) m_header.ProgramName = tokenValue;
+        else if ( tokenTag == Constants::SAM_PG_VERSION_TAG     ) m_header.ProgramVersion = tokenValue;
+        else if ( tokenTag == Constants::SAM_PG_COMMANDLINE_TAG ) m_header.ProgramCommandLine = tokenValue;
+        else
+            cerr << "SAM Format Error - unknown PG tag: " << tokenTag << endl;
+    }
+
+    // if @PG line exists, ID must be provided
+    if ( !m_header.HasProgramName() )
+        cerr << "SAM Format Error - @PG line is missing ID tag!" << endl;
+}
+
+// comment lines carry no tags: the text is appended to the header's comment list as-is
+void SamFormatParser::ParseCOLine(const string& line) {
+    m_header.Comments.insert( m_header.Comments.end(), line );
+}
+
+// breaks 'line' into the pieces separated by 'delim', in order of appearance
+// (pieces are whatever successive getline() calls on the text produce)
+const vector<string> SamFormatParser::Split(const string& line, const char delim) {
+    vector<string> pieces;
+    stringstream source(line);
+    for ( string piece; getline(source, piece, delim); )
+        pieces.push_back(piece);
+    return pieces;
+}
diff --git a/src/api/internal/SamFormatParser_p.h b/src/api/internal/SamFormatParser_p.h
new file mode 100644 (file)
index 0000000..daabe39
--- /dev/null
@@ -0,0 +1,62 @@
+// ***************************************************************************
+// SamFormatParser.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for parsing SAM header text into SamHeader object
+// ***************************************************************************
+
+#ifndef SAM_FORMAT_PARSER_H
+#define SAM_FORMAT_PARSER_H
+
+//  -------------
+//  W A R N I N G
+//  -------------
+//
+// This file is not part of the BamTools API.  It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+class SamHeader;
+
+namespace Internal {
+
+class SamFormatParser {
+
+    // construction & destruction
+    public:
+        // 'header' is held by reference and is the object Parse() writes into
+        SamFormatParser(BamTools::SamHeader& header);
+        ~SamFormatParser();
+
+    // entry point: clears the header, then fills it from the given SAM header text
+    public:
+        void Parse(const std::string& headerText);
+
+    // helpers: one parser per record type, plus a delimiter-based tokenizer
+    private:
+        void ParseSamLine(const std::string& line);
+        void ParseHDLine(const std::string& line);
+        void ParseSQLine(const std::string& line);
+        void ParseRGLine(const std::string& line);
+        void ParsePGLine(const std::string& line);
+        void ParseCOLine(const std::string& line);
+        const std::vector<std::string> Split(const std::string& line, const char delim);
+
+    // state
+    private:
+        SamHeader& m_header; // parse target; not owned by the parser
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // SAM_FORMAT_PARSER_H
diff --git a/src/api/internal/SamFormatPrinter_p.cpp b/src/api/internal/SamFormatPrinter_p.cpp
new file mode 100644 (file)
index 0000000..dcde46e
--- /dev/null
@@ -0,0 +1,185 @@
+// ***************************************************************************
+// SamFormatPrinter.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for printing formatted SAM header to string
+// ***************************************************************************
+
+#include <api/SamConstants.h>
+#include <api/SamHeader.h>
+#include <api/internal/SamFormatPrinter_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <iostream>
+#include <sstream>
+#include <vector>
+using namespace std;
+
+// ctor: binds the printer to the (read-only) header it will format
+SamFormatPrinter::SamFormatPrinter(const SamHeader& header) : m_header(header) { }
+
+// dtor: the printer only holds a reference, so there is nothing to release
+SamFormatPrinter::~SamFormatPrinter() { }
+
+// builds one header field: the tab separator, then 'tag', the colon separator and 'value'
+const string SamFormatPrinter::FormatTag(const string &tag, const string &value) const {
+    string field;
+    field += Constants::SAM_TAB;
+    field += tag;
+    field += Constants::SAM_COLON;
+    field += value;
+    return field;
+}
+
+// renders the whole header as text, section by section: HD, SQ, RG, PG, then CO
+const string SamFormatPrinter::ToString() const {
+
+    // each Print* helper appends its section (possibly nothing) to this buffer
+    stringstream headerText;
+    PrintHD(headerText);
+    PrintSQ(headerText);
+    PrintRG(headerText);
+    PrintPG(headerText);
+    PrintCO(headerText);
+
+    return headerText.str();
+}
+
+// writes the @HD line; nothing is written when the header has no version
+void SamFormatPrinter::PrintHD(std::stringstream& out) const {
+
+    // no version, no @HD line at all
+    if ( !m_header.HasVersion() )
+        return;
+
+    // @HD VN:<Version>
+    out << Constants::SAM_HD_BEGIN_TOKEN;
+    out << FormatTag(Constants::SAM_HD_VERSION_TAG, m_header.Version);
+
+    // SO:<SortOrder>, only when set
+    if ( m_header.HasSortOrder() )
+        out << FormatTag(Constants::SAM_HD_SORTORDER_TAG, m_header.SortOrder);
+
+    // GO:<GroupOrder>, only when set
+    if ( m_header.HasGroupOrder() )
+        out << FormatTag(Constants::SAM_HD_GROUPORDER_TAG, m_header.GroupOrder);
+
+    // end of line
+    out << endl;
+}
+
+// writes one @SQ line per entry of the header's sequence dictionary
+void SamFormatPrinter::PrintSQ(std::stringstream& out) const {
+
+    const SamSequenceConstIterator lastSeq = m_header.Sequences.ConstEnd();
+    SamSequenceConstIterator currentSeq    = m_header.Sequences.ConstBegin();
+    while ( currentSeq != lastSeq ) {
+
+        // always written: @SQ SN:<Name> LN:<Length>
+        out << Constants::SAM_SQ_BEGIN_TOKEN;
+        out << FormatTag(Constants::SAM_SQ_NAME_TAG,   currentSeq->Name);
+        out << FormatTag(Constants::SAM_SQ_LENGTH_TAG, currentSeq->Length);
+
+        // the remaining tags are written only when the entry has a value for them
+
+        // AS:<AssemblyID>
+        if ( currentSeq->HasAssemblyID() )
+            out << FormatTag(Constants::SAM_SQ_ASSEMBLYID_TAG, currentSeq->AssemblyID);
+
+        // M5:<Checksum>
+        if ( currentSeq->HasChecksum() )
+            out << FormatTag(Constants::SAM_SQ_CHECKSUM_TAG, currentSeq->Checksum);
+
+        // UR:<URI>
+        if ( currentSeq->HasURI() )
+            out << FormatTag(Constants::SAM_SQ_URI_TAG, currentSeq->URI);
+
+        // SP:<Species>
+        if ( currentSeq->HasSpecies() )
+            out << FormatTag(Constants::SAM_SQ_SPECIES_TAG, currentSeq->Species);
+
+        // end of line
+        out << endl;
+        ++currentSeq;
+    }
+}
+
+// writes one @RG line per entry of the header's read group dictionary
+void SamFormatPrinter::PrintRG(std::stringstream& out) const {
+
+    const SamReadGroupConstIterator lastGroup = m_header.ReadGroups.ConstEnd();
+    SamReadGroupConstIterator currentGroup    = m_header.ReadGroups.ConstBegin();
+    while ( currentGroup != lastGroup ) {
+
+        // always written: @RG ID:<ID> SM:<Sample>
+        out << Constants::SAM_RG_BEGIN_TOKEN;
+        out << FormatTag(Constants::SAM_RG_ID_TAG,     currentGroup->ID);
+        out << FormatTag(Constants::SAM_RG_SAMPLE_TAG, currentGroup->Sample);
+
+        // the remaining tags are written only when the entry has a value for them
+
+        // LB:<Library>
+        if ( currentGroup->HasLibrary() )
+            out << FormatTag(Constants::SAM_RG_LIBRARY_TAG, currentGroup->Library);
+
+        // DS:<Description>
+        if ( currentGroup->HasDescription() )
+            out << FormatTag(Constants::SAM_RG_DESCRIPTION_TAG, currentGroup->Description);
+
+        // PU:<PlatformUnit>
+        if ( currentGroup->HasPlatformUnit() )
+            out << FormatTag(Constants::SAM_RG_PLATFORMUNIT_TAG, currentGroup->PlatformUnit);
+
+        // PI:<PredictedInsertSize>
+        if ( currentGroup->HasPredictedInsertSize() )
+            out << FormatTag(Constants::SAM_RG_PREDICTEDINSERTSIZE_TAG, currentGroup->PredictedInsertSize);
+
+        // CN:<SequencingCenter>
+        if ( currentGroup->HasSequencingCenter() )
+            out << FormatTag(Constants::SAM_RG_SEQCENTER_TAG, currentGroup->SequencingCenter);
+
+        // DT:<ProductionDate>
+        if ( currentGroup->HasProductionDate() )
+            out << FormatTag(Constants::SAM_RG_PRODUCTIONDATE_TAG, currentGroup->ProductionDate);
+
+        // PL:<SequencingTechnology>
+        if ( currentGroup->HasSequencingTechnology() )
+            out << FormatTag(Constants::SAM_RG_SEQTECHNOLOGY_TAG, currentGroup->SequencingTechnology);
+
+        // end of line
+        out << endl;
+        ++currentGroup;
+    }
+}
+
+// writes the @PG line; nothing is written when the header has no program name
+void SamFormatPrinter::PrintPG(std::stringstream& out) const {
+
+    // no program name, no @PG line at all
+    if ( !m_header.HasProgramName() )
+        return;
+
+    // @PG ID:<ProgramName>
+    out << Constants::SAM_PG_BEGIN_TOKEN;
+    out << FormatTag(Constants::SAM_PG_NAME_TAG, m_header.ProgramName);
+
+    // VN:<ProgramVersion>, only when set
+    if ( m_header.HasProgramVersion() )
+        out << FormatTag(Constants::SAM_PG_VERSION_TAG, m_header.ProgramVersion);
+
+    // CL:<ProgramCommandLine>, only when set
+    if ( m_header.HasProgramCommandLine() )
+        out << FormatTag(Constants::SAM_PG_COMMANDLINE_TAG, m_header.ProgramCommandLine);
+
+    // end of line
+    out << endl;
+}
+
+// writes one @CO line per stored comment, in stored order
+void SamFormatPrinter::PrintCO(std::stringstream& out) const {
+
+    const vector<string>& comments = m_header.Comments;
+    for ( vector<string>::size_type i = 0; i < comments.size(); ++i ) {
+
+        // @CO <Comment>
+        out << Constants::SAM_CO_BEGIN_TOKEN << Constants::SAM_TAB << comments[i] << endl;
+    }
+}
diff --git a/src/api/internal/SamFormatPrinter_p.h b/src/api/internal/SamFormatPrinter_p.h
new file mode 100644 (file)
index 0000000..5e28e97
--- /dev/null
@@ -0,0 +1,61 @@
+// ***************************************************************************
+// SamFormatPrinter.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for printing formatted SAM header to string
+// ***************************************************************************
+
+#ifndef SAM_FORMAT_PRINTER_H
+#define SAM_FORMAT_PRINTER_H
+
+//  -------------
+//  W A R N I N G
+//  -------------
+//
+// This file is not part of the BamTools API.  It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <sstream>
+#include <string>
+
+namespace BamTools {
+
+class SamHeader;
+
+namespace Internal {
+
+class SamFormatPrinter {
+
+    // construction & destruction
+    public:
+        // 'header' is held by const reference and is the object ToString() reads from
+        SamFormatPrinter(const BamTools::SamHeader& header);
+        ~SamFormatPrinter();
+
+    // entry point: returns the header rendered as SAM-formatted text
+    public:
+        const std::string ToString() const;
+
+    // helpers: a field formatter, plus one writer per record type
+    private:
+        const std::string FormatTag(const std::string& tag, const std::string& value) const;
+        void PrintHD(std::stringstream& out) const;
+        void PrintSQ(std::stringstream& out) const;
+        void PrintRG(std::stringstream& out) const;
+        void PrintPG(std::stringstream& out) const;
+        void PrintCO(std::stringstream& out) const;
+
+    // state
+    private:
+        const SamHeader& m_header; // source of the printed data; not owned by the printer
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // SAM_FORMAT_PRINTER_H
diff --git a/src/api/internal/SamHeaderValidator_p.cpp b/src/api/internal/SamHeaderValidator_p.cpp
new file mode 100644 (file)
index 0000000..4409411
--- /dev/null
@@ -0,0 +1,545 @@
+// ***************************************************************************
+// SamHeaderValidator.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for validating SamHeader data
+// ***************************************************************************
+
+#include <api/SamConstants.h>
+#include <api/SamHeader.h>
+#include <api/internal/SamHeaderValidator_p.h>
+#include <api/internal/SamHeaderVersion_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <iostream>
+#include <set>
+#include <sstream>
+#include <vector>
+using namespace std;
+
+// -------------------------------------------------------------------
+// Allow validation rules to vary between SAM header versions
+//
+// Use SAM_VERSION_X_Y constants to tag the versions in which important
+// changes were introduced.
+//
+// Together with the validator's m_version member (and the comparison
+// operators defined for SamHeaderVersion), they allow for comparisons like:
+//
+// if ( m_version < SAM_VERSION_2_0 ) {
+//     // use some older rule
+// } else {
+//     // use rule introduced with version 2.0
+// }
+
+static const SamHeaderVersion SAM_VERSION_1_0 = SamHeaderVersion(1,0);
+static const SamHeaderVersion SAM_VERSION_1_3 = SamHeaderVersion(1,3);
+
+// -----------------------------------------
+// SamHeaderValidatorPrivate implementation
+
+// Private implementation behind SamHeaderValidator.
+//
+// Runs a series of checks over the metadata, sequence dictionary, read group
+// dictionary and program data of one SamHeader. Problems found along the way
+// are collected as error/warning messages, which can be printed to stderr.
+class SamHeaderValidator::SamHeaderValidatorPrivate {
+
+    // ctor & dtor
+    public:
+        // 'header' is stored by reference, so it must outlive this object
+        SamHeaderValidatorPrivate(const SamHeader& header);
+        ~SamHeaderValidatorPrivate(void) { }
+
+    // 'public' methods
+    public:
+        // runs all checks; prints collected messages when (verbose == true)
+        bool Validate(bool verbose);
+
+    // internal validation methods
+    // (each returns true if its check passed, false if an error was recorded)
+    private:
+
+        // validate header metadata
+        bool ValidateMetadata(void);
+        bool ValidateVersion(void);
+        bool ContainsOnlyDigits(const string& s);
+        bool ValidateSortOrder(void);
+        bool ValidateGroupOrder(void);
+
+        // validate sequence dictionary
+        bool ValidateSequenceDictionary(void);
+        bool ContainsUniqueSequenceNames(void);
+        bool CheckNameFormat(const string& name);
+        bool ValidateSequence(const SamSequence& seq);
+        bool CheckLengthInRange(const string& length);
+
+        // validate read group dictionary
+        bool ValidateReadGroupDictionary(void);
+        bool ValidateReadGroup(const SamReadGroup& rg);
+        bool ContainsUniqueIDsAndPlatformUnits(void);
+        bool CheckReadGroupID(const string& id);
+        bool CheckSequencingTechnology(const string& technology);
+        bool Is454(const string& technology);
+        bool IsHelicos(const string& technology);
+        bool IsIllumina(const string& technology);
+        bool IsPacBio(const string& technology);
+        bool IsSolid(const string& technology);
+
+        // validate program data
+        bool ValidateProgramData(void);
+        bool ContainsUniqueProgramIds(void);
+        bool ValidatePreviousProgramIds(void);
+
+    // error reporting
+    private:
+        // store a message (prefix and newline are added)
+        void AddError(const string& message);
+        void AddWarning(const string& message);
+        // write the stored messages to stderr
+        void PrintErrorMessages(void);
+        void PrintWarningMessages(void);
+
+    // data members
+    // (declaration order is also the initialization order used by the ctor)
+    private:
+        const SamHeader&       m_header;   // header under validation (not owned)
+        const SamHeaderVersion m_version;  // built from the header's Version field
+
+        bool m_isVerboseOutput;            // set on each call to Validate()
+        const string ERROR_PREFIX;         // prepended to every error message
+        const string WARN_PREFIX;          // prepended to every warning message
+        const string NEWLINE;              // appended to every message
+        vector<string> m_errorMessages;
+        vector<string> m_warningMessages;
+};
+
+// Binds the validator to 'header' (kept by reference, not copied), builds
+// m_version from the header's Version field and sets up the fixed message
+// decorations. Output is non-verbose until Validate() says otherwise.
+SamHeaderValidator::SamHeaderValidatorPrivate::SamHeaderValidatorPrivate(const SamHeader& header)
+    : m_header(header)
+    , m_version( header.Version )
+    , m_isVerboseOutput(false)
+    , ERROR_PREFIX("ERROR: ")
+    , WARN_PREFIX("WARNING: ")
+    , NEWLINE("\n")
+{ }
+
+// Runs every header check and reports whether the header passed all of them.
+//
+// verbose: when true, the collected error and warning messages are printed
+//          to stderr after all checks have run.
+//
+// Returns true only if no check recorded an error.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::Validate(bool verbose) {
+
+    // set error reporting mode
+    m_isVerboseOutput = verbose;
+
+    // start from a clean slate: without this, calling Validate() again on the
+    // same validator would re-report the messages gathered by earlier runs
+    m_errorMessages.clear();
+    m_warningMessages.clear();
+
+    // validate header components
+    // ('&=' does not short-circuit, so every check runs and all problems
+    //  are collected in a single pass)
+    bool isValid = true;
+    isValid &= ValidateMetadata();
+    isValid &= ValidateSequenceDictionary();
+    isValid &= ValidateReadGroupDictionary();
+    isValid &= ValidateProgramData();
+
+    // report errors if desired
+    if ( m_isVerboseOutput ) {
+        PrintErrorMessages();
+        PrintWarningMessages();
+    }
+
+    // return validation status
+    return isValid;
+}
+
+// Checks the header metadata fields: version, sort order and group order.
+// All three checks always run; the result is true only if every one passed.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateMetadata(void) {
+    const bool versionOk    = ValidateVersion();
+    const bool sortOrderOk  = ValidateSortOrder();
+    const bool groupOrderOk = ValidateGroupOrder();
+    return ( versionOk && sortOrderOk && groupOrderOk );
+}
+
+// Checks the syntax of the header's version (VN) field.
+//
+// A missing version only produces a warning (returns true). Otherwise the
+// text must be "<major>.<minor>", split at the first period, where both parts
+// are non-empty and made up of digits only; anything else records an error
+// and returns false.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateVersion(void) {
+
+    const string& version = m_header.Version;
+
+    // a missing version is tolerated, but worth a warning
+    if ( version.empty() ) {
+        AddWarning("Version (VN) missing. Not required, but strongly recommended");
+        return true;
+    }
+
+    // well-formed means: a period is present, with a non-empty, all-digit
+    // major part before it and a non-empty, all-digit minor part after it
+    bool isWellFormed = false;
+    const size_t periodPosition = version.find(Constants::SAM_PERIOD);
+    if ( periodPosition != string::npos ) {
+        const string majorPart = version.substr(0, periodPosition);
+        const string minorPart = version.substr(periodPosition + 1);
+        isWellFormed = !majorPart.empty() && ContainsOnlyDigits(majorPart) &&
+                       !minorPart.empty() && ContainsOnlyDigits(minorPart);
+    }
+
+    // every kind of malformed version is reported with the same message
+    if ( !isWellFormed ) {
+        AddError("Invalid version (VN) format: " + version);
+        return false;
+    }
+
+    // TODO: check if version is not just syntactically OK,
+    // but is also a valid SAM version ( 1.0 .. CURRENT )
+
+    return true;
+}
+
+// Returns true if no character of 's' falls outside Constants::SAM_DIGITS.
+// Callers are expected to pass a non-empty string (an empty one has no
+// offending character and therefore yields true).
+bool SamHeaderValidator::SamHeaderValidatorPrivate::ContainsOnlyDigits(const string& s) {
+    return s.find_first_not_of(Constants::SAM_DIGITS) == string::npos;
+}
+
+// Checks the header's sort order (SO) field.
+//
+// A missing sort order only produces a warning (returns true). A value that
+// is present must match one of the sort order keywords in Constants,
+// otherwise an error is recorded and false is returned.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateSortOrder(void) {
+
+    const string& sortOrder = m_header.SortOrder;
+
+    // a missing sort order is tolerated, but worth a warning
+    if ( sortOrder.empty() ) {
+        AddWarning("Sort order (SO) missing. Not required, but strongly recommended");
+        return true;
+    }
+
+    // compare against the accepted keywords
+    const bool isKnownKeyword = ( sortOrder == Constants::SAM_HD_SORTORDER_COORDINATE ||
+                                  sortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME  ||
+                                  sortOrder == Constants::SAM_HD_SORTORDER_UNSORTED );
+
+    // anything else is an error
+    if ( !isKnownKeyword )
+        AddError("Invalid sort order (SO): " + sortOrder);
+
+    return isKnownKeyword;
+}
+
+// Checks the header's group order (GO) field.
+//
+// A missing group order is accepted silently. A value that is present must
+// match one of the group order keywords in Constants, otherwise an error is
+// recorded and false is returned.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateGroupOrder(void) {
+
+    const string& groupOrder = m_header.GroupOrder;
+
+    // nothing to check when no group order was given
+    if ( groupOrder.empty() )
+        return true;
+
+    // compare against the accepted keywords
+    const bool isKnownKeyword = ( groupOrder == Constants::SAM_HD_GROUPORDER_NONE  ||
+                                  groupOrder == Constants::SAM_HD_GROUPORDER_QUERY ||
+                                  groupOrder == Constants::SAM_HD_GROUPORDER_REFERENCE );
+
+    // anything else is an error
+    if ( !isKnownKeyword )
+        AddError("Invalid group order (GO): " + groupOrder);
+
+    return isKnownKeyword;
+}
+
+// Checks the header's sequence dictionary: sequence names must be unique and
+// every individual sequence entry must pass ValidateSequence().
+// All entries are checked even after a failure, so every problem is recorded.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateSequenceDictionary(void) {
+
+    // TODO: warn/error if no sequences ?
+
+    // dictionary-wide check first: names must not repeat
+    bool isValid = ContainsUniqueSequenceNames();
+
+    // then check each entry on its own
+    const SamSequenceDictionary& sequences = m_header.Sequences;
+    const SamSequenceConstIterator lastSequence = sequences.ConstEnd();
+    SamSequenceConstIterator currentSequence = sequences.ConstBegin();
+    while ( currentSequence != lastSequence ) {
+        if ( !ValidateSequence(*currentSequence) )
+            isValid = false;
+        ++currentSequence;
+    }
+
+    return isValid;
+}
+
+// Returns true if no sequence name (SN) occurs more than once in the header's
+// sequence dictionary. An error is recorded for every repeated occurrence.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::ContainsUniqueSequenceNames(void) {
+
+    bool allUnique = true;
+    set<string> namesSeen;
+
+    const SamSequenceDictionary& sequences = m_header.Sequences;
+    const SamSequenceConstIterator lastSequence = sequences.ConstEnd();
+    for ( SamSequenceConstIterator it = sequences.ConstBegin(); it != lastSequence; ++it ) {
+        const string& name = (*it).Name;
+
+        // insert() leaves the set untouched and reports 'false' in .second
+        // when the name is already present, i.e. when this is a duplicate
+        const bool isFirstOccurrence = namesSeen.insert(name).second;
+        if ( !isFirstOccurrence ) {
+            AddError("Sequence name (SN): " + name + " is not unique");
+            allUnique = false;
+        }
+    }
+
+    return allUnique;
+}
+
+// Checks a single sequence entry: its name format and its length.
+// Both checks always run; the result is true only if both passed.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateSequence(const SamSequence& seq) {
+    const bool nameOk   = CheckNameFormat(seq.Name);
+    const bool lengthOk = CheckLengthInRange(seq.Length);
+    return ( nameOk && lengthOk );
+}
+
+// Checks a sequence name (SN): it must be present and must not begin with
+// one of the reserved characters Constants::SAM_EQUAL or Constants::SAM_STAR.
+// Records an error and returns false otherwise.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::CheckNameFormat(const string& name) {
+
+    // the SN tag is mandatory
+    if ( name.empty() ) {
+        AddError("Sequence entry (@SQ) is missing SN tag");
+        return false;
+    }
+
+    // the name may not start with a reserved character
+    // (indexing is safe here: 'name' is known to be non-empty)
+    const char leadingChar = name[0];
+    const bool startsWithReservedChar = ( leadingChar == Constants::SAM_EQUAL ||
+                                          leadingChar == Constants::SAM_STAR );
+    if ( startsWithReservedChar ) {
+        AddError("Invalid sequence name (SN): " + name);
+        return false;
+    }
+
+    return true;
+}
+
+// Checks a sequence length (LN), given as text.
+//
+// The length must be present, must consist of digits only and its value must
+// lie within [Constants::SAM_SQ_LENGTH_MIN, Constants::SAM_SQ_LENGTH_MAX].
+// Records an error and returns false otherwise.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::CheckLengthInRange(const string& length) {
+
+    // invalid if empty
+    if ( length.empty() ) {
+        AddError("Sequence entry (@SQ) is missing LN tag");
+        return false;
+    }
+
+    // invalid if not a plain, unsigned decimal number
+    // (stream extraction alone would accept input such as "100x", and would
+    //  wrap a negative value like "-5" around to a large unsigned number)
+    if ( !ContainsOnlyDigits(length) ) {
+        AddError("Invalid sequence length (LN): " + length);
+        return false;
+    }
+
+    // convert string length to numeric
+    // (initialized, so that a failed extraction never leaves us reading an
+    //  indeterminate value)
+    stringstream lengthStream(length);
+    unsigned int sequenceLength = 0;
+    lengthStream >> sequenceLength;
+
+    // invalid if conversion failed (e.g. too many digits to fit)
+    // or if length is outside accepted range
+    if ( lengthStream.fail() ||
+         sequenceLength < Constants::SAM_SQ_LENGTH_MIN ||
+         sequenceLength > Constants::SAM_SQ_LENGTH_MAX )
+    {
+        AddError("Sequence length (LN): " + length + " out of range");
+        return false;
+    }
+
+    // otherwise OK
+    return true;
+}
+
+// Checks the header's read group dictionary: IDs and platform units must be
+// unique and every individual read group must pass ValidateReadGroup().
+// All entries are checked even after a failure, so every problem is recorded.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateReadGroupDictionary(void) {
+
+    // TODO: warn/error if no read groups ?
+
+    // dictionary-wide check first: IDs & platform units must not repeat
+    bool isValid = ContainsUniqueIDsAndPlatformUnits();
+
+    // then check each read group on its own
+    const SamReadGroupDictionary& readGroups = m_header.ReadGroups;
+    const SamReadGroupConstIterator lastReadGroup = readGroups.ConstEnd();
+    SamReadGroupConstIterator currentReadGroup = readGroups.ConstBegin();
+    while ( currentReadGroup != lastReadGroup ) {
+        if ( !ValidateReadGroup(*currentReadGroup) )
+            isValid = false;
+        ++currentReadGroup;
+    }
+
+    return isValid;
+}
+
+// Returns true if no read group ID and no platform unit (PU) occurs more than
+// once in the header's read group dictionary. An error is recorded for every
+// repeated occurrence.
+//
+// Read groups that carry no platform unit are left out of the PU comparison:
+// an absent value is not a duplicate of another absent value.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::ContainsUniqueIDsAndPlatformUnits(void) {
+
+    bool isValid = true;
+    set<string> readGroupIds;
+    set<string> platformUnits;
+
+    // iterate over read groups
+    const SamReadGroupDictionary& readGroups = m_header.ReadGroups;
+    SamReadGroupConstIterator rgIter = readGroups.ConstBegin();
+    SamReadGroupConstIterator rgEnd  = readGroups.ConstEnd();
+    for ( ; rgIter != rgEnd; ++rgIter ) {
+        const SamReadGroup& rg = (*rgIter);
+
+        // --------------------------------
+        // check for unique ID
+
+        // store ID; insert() reports 'false' in .second if it was already there
+        const string& id = rg.ID;
+        if ( !readGroupIds.insert(id).second ) {
+            AddError("Read group ID (ID): " + id + " is not unique");
+            isValid = false;
+        }
+
+        // --------------------------------
+        // check for unique platform unit
+
+        // only platform units that were actually provided take part; otherwise
+        // any two read groups without a PU would be flagged as duplicates
+        const string& pu = rg.PlatformUnit;
+        if ( !pu.empty() && !platformUnits.insert(pu).second ) {
+            AddError("Platform unit (PU): " + pu + " is not unique");
+            isValid = false;
+        }
+    }
+
+    // return validation state
+    return isValid;
+}
+
+// Checks a single read group: its ID and its sequencing technology.
+// Both checks always run; the result is true only if both passed.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateReadGroup(const SamReadGroup& rg) {
+    const bool idOk         = CheckReadGroupID(rg.ID);
+    const bool technologyOk = CheckSequencingTechnology(rg.SequencingTechnology);
+    return ( idOk && technologyOk );
+}
+
+// Checks a read group ID: it only has to be present.
+// Records an error and returns false when it is empty.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::CheckReadGroupID(const string& id) {
+
+    const bool hasId = !id.empty();
+
+    // the ID tag is mandatory
+    if ( !hasId )
+        AddError("Read group entry (@RG) is missing ID tag");
+
+    return hasId;
+}
+
+// Checks a read group's sequencing technology (PL).
+//
+// A missing technology is accepted silently. A value that is present must be
+// recognized by one of the IsXXX() helpers, otherwise an error is recorded
+// and false is returned.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::CheckSequencingTechnology(const string& technology) {
+
+    // nothing to check when no technology was given
+    if ( technology.empty() ) return true;
+
+    // accept as soon as one of the known platforms matches
+    if ( Is454(technology) )      return true;
+    if ( IsHelicos(technology) )  return true;
+    if ( IsIllumina(technology) ) return true;
+    if ( IsPacBio(technology) )   return true;
+    if ( IsSolid(technology) )    return true;
+
+    // no platform matched
+    AddError("Invalid read group sequencing platform (PL): " + technology);
+    return false;
+}
+
+// Returns true if 'technology' equals one of the 454 platform keywords
+// defined in Constants.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::Is454(const string& technology) {
+    if ( technology == Constants::SAM_RG_SEQTECHNOLOGY_454 )         return true;
+    if ( technology == Constants::SAM_RG_SEQTECHNOLOGY_LS454_LOWER ) return true;
+    return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_LS454_UPPER );
+}
+
+// Returns true if 'technology' equals one of the Helicos platform keywords
+// defined in Constants.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::IsHelicos(const string& technology) {
+    if ( technology == Constants::SAM_RG_SEQTECHNOLOGY_HELICOS_LOWER ) return true;
+    return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_HELICOS_UPPER );
+}
+
+// Returns true if 'technology' equals one of the Illumina platform keywords
+// defined in Constants.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::IsIllumina(const string& technology) {
+    if ( technology == Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA_LOWER ) return true;
+    return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA_UPPER );
+}
+
+// Returns true if 'technology' equals one of the PacBio platform keywords
+// defined in Constants.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::IsPacBio(const string& technology) {
+    if ( technology == Constants::SAM_RG_SEQTECHNOLOGY_PACBIO_LOWER ) return true;
+    return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_PACBIO_UPPER );
+}
+
+// Returns true if 'technology' equals one of the SOLiD platform keywords
+// defined in Constants.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::IsSolid(const string& technology) {
+    if ( technology == Constants::SAM_RG_SEQTECHNOLOGY_SOLID_LOWER ) return true;
+    return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_SOLID_UPPER );
+}
+
+// Checks the header's program data: program IDs and previous-program IDs.
+// Both checks always run; the result is true only if both passed.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateProgramData(void) {
+    const bool programIdsOk  = ContainsUniqueProgramIds();
+    const bool previousIdsOk = ValidatePreviousProgramIds();
+    return ( programIdsOk && previousIdsOk );
+}
+
+// Placeholder: no check is performed yet, so this always returns true.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::ContainsUniqueProgramIds(void) {
+    // TODO: once we have ability to handle multiple @PG entries,
+    // check here for duplicate ID's
+    return true;
+}
+
+// Placeholder: no check is performed yet, so this always returns true.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidatePreviousProgramIds(void) {
+    // TODO: check that PP entries are valid later, after we get multiple @PG-entry handling
+    return true;
+}
+// Stores 'message' as an error, wrapped in the error prefix and a newline.
+void SamHeaderValidator::SamHeaderValidatorPrivate::AddError(const string& message) {
+    string entry(ERROR_PREFIX);
+    entry += message;
+    entry += NEWLINE;
+    m_errorMessages.push_back(entry);
+}
+
+// Stores 'message' as a warning, wrapped in the warning prefix and a newline.
+void SamHeaderValidator::SamHeaderValidatorPrivate::AddWarning(const string& message) {
+    string entry(WARN_PREFIX);
+    entry += message;
+    entry += NEWLINE;
+    m_warningMessages.push_back(entry);
+}
+
+// Writes a summary line followed by every stored error message to stderr.
+// Prints nothing at all when there are no errors.
+void SamHeaderValidator::SamHeaderValidatorPrivate::PrintErrorMessages(void) {
+
+    // nothing to report
+    const size_t errorCount = m_errorMessages.size();
+    if ( errorCount == 0 ) return;
+
+    // summary line
+    cerr << "* SAM header has " << errorCount << " errors:" << endl;
+
+    // the messages themselves (each already ends with a newline)
+    for ( size_t i = 0; i < errorCount; ++i )
+        cerr << m_errorMessages[i];
+}
+
+// Writes a summary line followed by every stored warning message to stderr.
+// Prints nothing at all when there are no warnings.
+void SamHeaderValidator::SamHeaderValidatorPrivate::PrintWarningMessages(void) {
+
+    // nothing to report
+    const size_t warningCount = m_warningMessages.size();
+    if ( warningCount == 0 ) return;
+
+    // summary line
+    cerr << "* SAM header has " << warningCount << " warnings:" << endl;
+
+    // the messages themselves (each already ends with a newline)
+    for ( size_t i = 0; i < warningCount; ++i )
+        cerr << m_warningMessages[i];
+}
+
+// -----------------------------------
+// SamHeaderValidator implementation
+
+// Creates the validator for 'header' by allocating its private implementation
+// object, which is released again in the destructor.
+SamHeaderValidator::SamHeaderValidator(const BamTools::SamHeader& header)
+    : d( new SamHeaderValidatorPrivate(header) )
+{ }
+
+// Releases the private implementation object allocated by the constructor.
+SamHeaderValidator::~SamHeaderValidator(void) {
+    delete d;
+    d = 0;
+}
+
+// Validates the header; all of the work is done by the private implementation.
+bool SamHeaderValidator::Validate(bool verbose) {
+    const bool isValid = d->Validate(verbose);
+    return isValid;
+}
diff --git a/src/api/internal/SamHeaderValidator_p.h b/src/api/internal/SamHeaderValidator_p.h
new file mode 100644 (file)
index 0000000..41c04ee
--- /dev/null
@@ -0,0 +1,52 @@
+// ***************************************************************************
+// SamHeaderValidator.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for validating SamHeader data
+// ***************************************************************************
+
+#ifndef SAM_HEADER_VALIDATOR_P_H
+#define SAM_HEADER_VALIDATOR_P_H
+
+//  -------------
+//  W A R N I N G
+//  -------------
+//
+// This file is not part of the BamTools API.  It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+class SamHeader;
+
+namespace Internal {
+
+// Validates the data held by a SamHeader.
+//
+// All of the work is delegated to a private implementation object (d), which
+// is created in the constructor and destroyed in the destructor.
+class SamHeaderValidator {
+
+    public:
+        SamHeaderValidator(const BamTools::SamHeader& header);
+        ~SamHeaderValidator(void);
+
+    public:
+        // validates SamHeader data
+        // prints error & warning messages to stderr when (verbose == true)
+        bool Validate(bool verbose = false);
+
+    // copying is disabled (declared, but intentionally never defined):
+    // a copy would share the same 'd' pointer, and both objects would
+    // then delete it in their destructors
+    private:
+        SamHeaderValidator(const SamHeaderValidator& other);
+        SamHeaderValidator& operator=(const SamHeaderValidator& other);
+
+    private:
+        // forward-declared with the same class-key ('class') that its
+        // definition in the .cpp file uses
+        class SamHeaderValidatorPrivate;
+        SamHeaderValidatorPrivate* d;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // SAM_HEADER_VALIDATOR_P_H
diff --git a/src/api/internal/SamHeaderVersion_p.h b/src/api/internal/SamHeaderVersion_p.h
new file mode 100644 (file)
index 0000000..ff96471
--- /dev/null
@@ -0,0 +1,115 @@
+#ifndef SAM_HEADERVERSION_P_H
+#define SAM_HEADERVERSION_P_H
+
+#include <api/SamConstants.h>
+#include <sstream>
+#include <string>
+
+namespace BamTools {
+namespace Internal {
+
+// Holds a SAM header version as a pair of numbers (major, minor), so that
+// versions can be compared numerically instead of as text.
+class SamHeaderVersion {
+
+    // ctors & dtor
+    public:
+        // creates version 0.0
+        SamHeaderVersion(void)
+            : m_majorVersion(0)
+            , m_minorVersion(0)
+        { }
+
+        // starts at version 0.0, then hands 'version' to SetVersion()
+        explicit SamHeaderVersion(const std::string& version)
+            : m_majorVersion(0)
+            , m_minorVersion(0)
+        {
+            SetVersion(version);
+        }
+
+        // creates version major.minor
+        SamHeaderVersion(const unsigned int& major, const unsigned int& minor)
+            : m_majorVersion(major)
+            , m_minorVersion(minor)
+        { }
+
+        // resets the stored numbers; there is nothing else to release
+        ~SamHeaderVersion(void) {
+            m_majorVersion = 0;
+            m_minorVersion = 0;
+        }
+    
+    // access data
+    public:
+        unsigned int MajorVersion(void) const { return m_majorVersion; }
+        unsigned int MinorVersion(void) const { return m_minorVersion; }
+
+        // defined below, after the class
+        inline void SetVersion(const std::string& version);
+        inline std::string ToString(void) const;
+
+    // data members
+    private:
+        unsigned int m_majorVersion;
+        unsigned int m_minorVersion;
+};
+
+// Parses a version string of the form "<major>.<minor>" (split at the first
+// period) and stores its numbers.
+//
+// The major and minor parts are handled independently: each one is stored
+// only if it is non-empty and consists of digits only, otherwise the
+// corresponding stored number is left as it was. Nothing at all is changed
+// when 'version' is empty or contains no period.
+inline
+void SamHeaderVersion::SetVersion(const std::string& version) {
+
+    // do nothing if version is empty
+    if ( version.empty() ) return;
+
+    // do nothing if period not found
+    const size_t periodFound = version.find(Constants::SAM_PERIOD);
+    if ( periodFound == std::string::npos ) return;
+
+    // NOTE: the streams below are deliberately not named 'major' / 'minor',
+    // since those names are function-like macros on some platforms
+    // (e.g. via glibc's <sys/sysmacros.h>)
+
+    // store major version if non-empty and contains only digits
+    const std::string majorVersion = version.substr(0, periodFound);
+    if ( !majorVersion.empty() &&
+         majorVersion.find_first_not_of(Constants::SAM_DIGITS) == std::string::npos )
+    {
+        std::stringstream majorStream(majorVersion);
+        majorStream >> m_majorVersion;
+    }
+
+    // store minor version if non-empty and contains only digits
+    const std::string minorVersion = version.substr(periodFound + 1);
+    if ( !minorVersion.empty() &&
+         minorVersion.find_first_not_of(Constants::SAM_DIGITS) == std::string::npos )
+    {
+        std::stringstream minorStream(minorVersion);
+        minorStream >> m_minorVersion;
+    }
+}
+
+// -----------------------------------------------------
+// printing
+
+// Returns the version as text: major number, Constants::SAM_PERIOD, minor number.
+inline std::string SamHeaderVersion::ToString(void) const {
+    std::ostringstream text;
+    text << m_majorVersion;
+    text << Constants::SAM_PERIOD;
+    text << m_minorVersion;
+    return text.str();
+}
+
+// -----------------------------------------------------
+// comparison operators
+
+// Two versions are equal when both their major and minor numbers match.
+inline bool operator==(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) {
+    if ( lhs.MajorVersion() != rhs.MajorVersion() )
+        return false;
+    return lhs.MinorVersion() == rhs.MinorVersion();
+}
+
+// Orders versions by major number first; the minor number only breaks ties.
+inline bool operator<(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) {
+    if ( lhs.MajorVersion() != rhs.MajorVersion() )
+        return lhs.MajorVersion() < rhs.MajorVersion();
+    return lhs.MinorVersion() < rhs.MinorVersion();
+}
+
+// The remaining comparisons are all expressed in terms of operator<.
+inline bool operator>(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) {
+    return ( rhs < lhs );
+}
+
+inline bool operator<=(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) {
+    return !( rhs < lhs );
+}
+
+inline bool operator>=(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) {
+    return !( lhs < rhs );
+}
+
+} // namespace Internal 
+} // namespace BamTools
+
+#endif // SAM_HEADERVERSION_P_H