# add compiler definitions
add_definitions( -DBAMTOOLS_API_LIBRARY ) # (for proper exporting of library symbols)
+# list of all BamTools API source (.cpp) files (consumed by both the shared and static library targets)
+set( BamToolsAPISources
+ BamAlignment.cpp
+ BamIndex.cpp
+ BamMultiReader.cpp
+ BamReader.cpp
+ BamWriter.cpp
+ BGZF.cpp
+ SamHeader.cpp
+ SamReadGroup.cpp
+ SamReadGroupDictionary.cpp
+ SamSequence.cpp
+ SamSequenceDictionary.cpp
+ internal/BamMultiReader_p.cpp
+ internal/BamReader_p.cpp
+ internal/BamStandardIndex_p.cpp
+ internal/BamToolsIndex_p.cpp
+ internal/BamWriter_p.cpp
+ internal/SamFormatParser_p.cpp
+ internal/SamFormatPrinter_p.cpp
+ internal/SamHeaderValidator_p.cpp
+)
+
# create main BamTools API shared library
-add_library( BamTools SHARED
- BamAlignment.cpp
- BamIndex.cpp
- BamMultiReader.cpp
- BamReader.cpp
- BamWriter.cpp
- BGZF.cpp
- internal/BamMultiReader_p.cpp
- internal/BamReader_p.cpp
- internal/BamStandardIndex_p.cpp
- internal/BamToolsIndex_p.cpp
- internal/BamWriter_p.cpp
- )
-# set shared lib properties
+add_library( BamTools SHARED ${BamToolsAPISources} ) # built from the common API source list
set_target_properties( BamTools PROPERTIES SOVERSION "0.9.1" )
set_target_properties( BamTools PROPERTIES OUTPUT_NAME "bamtools" )
# create main BamTools API static library
-add_library( BamTools-static STATIC
- BamAlignment.cpp
- BamIndex.cpp
- BamMultiReader.cpp
- BamReader.cpp
- BamWriter.cpp
- BGZF.cpp
- internal/BamMultiReader_p.cpp
- internal/BamReader_p.cpp
- internal/BamStandardIndex_p.cpp
- internal/BamToolsIndex_p.cpp
- internal/BamWriter_p.cpp
- )
-# set static lib properties
+add_library( BamTools-static STATIC ${BamToolsAPISources} ) # same source list as the shared library
set_target_properties( BamTools-static PROPERTIES OUTPUT_NAME "bamtools" )
set_target_properties( BamTools-static PROPERTIES PREFIX "lib" )
# export API headers
include(../ExportHeader.cmake)
set(ApiIncludeDir "api")
-ExportHeader(APIHeaders api_global.h ${ApiIncludeDir})
-ExportHeader(APIHeaders BamAlignment.h ${ApiIncludeDir})
-ExportHeader(APIHeaders BamAux.h ${ApiIncludeDir})
-ExportHeader(APIHeaders BamIndex.h ${ApiIncludeDir})
-ExportHeader(APIHeaders BamMultiReader.h ${ApiIncludeDir})
-ExportHeader(APIHeaders BamReader.h ${ApiIncludeDir})
-ExportHeader(APIHeaders BamWriter.h ${ApiIncludeDir})
-ExportHeader(APIHeaders BGZF.h ${ApiIncludeDir})
+ExportHeader(APIHeaders api_global.h ${ApiIncludeDir})
+ExportHeader(APIHeaders BamAlignment.h ${ApiIncludeDir})
+ExportHeader(APIHeaders BamAux.h ${ApiIncludeDir})
+ExportHeader(APIHeaders BamIndex.h ${ApiIncludeDir})
+ExportHeader(APIHeaders BamMultiReader.h ${ApiIncludeDir})
+ExportHeader(APIHeaders BamReader.h ${ApiIncludeDir})
+ExportHeader(APIHeaders BamWriter.h ${ApiIncludeDir})
+ExportHeader(APIHeaders BGZF.h ${ApiIncludeDir})
+ExportHeader(APIHeaders SamConstants.h ${ApiIncludeDir}) # SAM header API headers start here
+ExportHeader(APIHeaders SamHeader.h ${ApiIncludeDir})
+ExportHeader(APIHeaders SamReadGroup.h ${ApiIncludeDir})
+ExportHeader(APIHeaders SamReadGroupDictionary.h ${ApiIncludeDir})
+ExportHeader(APIHeaders SamSequence.h ${ApiIncludeDir})
+ExportHeader(APIHeaders SamSequenceDictionary.h ${ApiIncludeDir})
--- /dev/null
+// ***************************************************************************
+// SamConstants.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides constants for SAM header
+// ***************************************************************************
+
+#ifndef SAM_CONSTANTS_H
+#define SAM_CONSTANTS_H
+
+#include <api/api_global.h>
+#include <string>
+
+namespace BamTools {
+namespace Constants {
+
+const char SAM_COLON = ':';
+const char SAM_EQUAL = '=';
+const char SAM_PERIOD = '.';
+const char SAM_STAR = '*';
+const char SAM_TAB = '\t';
+const std::string SAM_DIGITS = "0123456789";
+
+// @HD line: record token and field tags
+const std::string SAM_HD_BEGIN_TOKEN = "@HD";
+const std::string SAM_HD_VERSION_TAG = "VN";
+const std::string SAM_HD_SORTORDER_TAG = "SO";
+const std::string SAM_HD_GROUPORDER_TAG = "GO";
+
+// @SQ line: record token and field tags
+const std::string SAM_SQ_BEGIN_TOKEN = "@SQ";
+const std::string SAM_SQ_NAME_TAG = "SN";
+const std::string SAM_SQ_LENGTH_TAG = "LN";
+const std::string SAM_SQ_ASSEMBLYID_TAG = "AS";
+const std::string SAM_SQ_URI_TAG = "UR";
+const std::string SAM_SQ_CHECKSUM_TAG = "M5";
+const std::string SAM_SQ_SPECIES_TAG = "SP";
+
+// @RG line: record token and field tags
+const std::string SAM_RG_BEGIN_TOKEN = "@RG";
+const std::string SAM_RG_ID_TAG = "ID";
+const std::string SAM_RG_SAMPLE_TAG = "SM";
+const std::string SAM_RG_LIBRARY_TAG = "LB";
+const std::string SAM_RG_DESCRIPTION_TAG = "DS";
+const std::string SAM_RG_PLATFORMUNIT_TAG = "PU";
+const std::string SAM_RG_PREDICTEDINSERTSIZE_TAG = "PI";
+const std::string SAM_RG_SEQCENTER_TAG = "CN";
+const std::string SAM_RG_PRODUCTIONDATE_TAG = "DT";
+const std::string SAM_RG_SEQTECHNOLOGY_TAG = "PL";
+
+// @PG line: record token and field tags
+const std::string SAM_PG_BEGIN_TOKEN = "@PG";
+const std::string SAM_PG_NAME_TAG = "ID";
+const std::string SAM_PG_VERSION_TAG = "VN";
+const std::string SAM_PG_COMMANDLINE_TAG = "CL";
+
+// @CO line: record token
+const std::string SAM_CO_BEGIN_TOKEN = "@CO";
+
+// HD:SO (sort order) values
+const std::string SAM_HD_SORTORDER_COORDINATE = "coordinate";
+const std::string SAM_HD_SORTORDER_QUERYNAME = "queryname";
+const std::string SAM_HD_SORTORDER_UNSORTED = "unsorted";
+
+// HD:GO (group order) values
+const std::string SAM_HD_GROUPORDER_NONE = "none";
+const std::string SAM_HD_GROUPORDER_QUERY = "query";
+const std::string SAM_HD_GROUPORDER_REFERENCE = "reference";
+
+// SQ:LN (sequence length) limits
+const unsigned int SAM_SQ_LENGTH_MIN = 1;
+const unsigned int SAM_SQ_LENGTH_MAX = 536870911; // 2^29 - 1
+
+// --------------
+// RG:PL (sequencing technology) values, in lower- and upper-case spellings
+
+// 454
+const std::string SAM_RG_SEQTECHNOLOGY_454 = "454";
+const std::string SAM_RG_SEQTECHNOLOGY_LS454_LOWER = "ls454";
+const std::string SAM_RG_SEQTECHNOLOGY_LS454_UPPER = "LS454";
+
+// Helicos
+const std::string SAM_RG_SEQTECHNOLOGY_HELICOS_LOWER = "helicos";
+const std::string SAM_RG_SEQTECHNOLOGY_HELICOS_UPPER = "HELICOS";
+
+// Illumina
+const std::string SAM_RG_SEQTECHNOLOGY_ILLUMINA_LOWER = "illumina";
+const std::string SAM_RG_SEQTECHNOLOGY_ILLUMINA_UPPER = "ILLUMINA";
+
+// PacBio
+const std::string SAM_RG_SEQTECHNOLOGY_PACBIO_LOWER = "pacbio";
+const std::string SAM_RG_SEQTECHNOLOGY_PACBIO_UPPER = "PACBIO";
+
+// SOLiD
+const std::string SAM_RG_SEQTECHNOLOGY_SOLID_LOWER = "solid";
+const std::string SAM_RG_SEQTECHNOLOGY_SOLID_UPPER = "SOLID";
+
+} // namespace Constants
+} // namespace BamTools
+
+#endif // SAM_CONSTANTS_H
--- /dev/null
+// ***************************************************************************
+// SamHeader.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for querying/manipulating SAM header data
+// **************************************************************************
+
+#include <api/SamHeader.h>
+#include <api/internal/SamFormatParser_p.h>
+#include <api/internal/SamFormatPrinter_p.h>
+#include <api/internal/SamHeaderValidator_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+using namespace std;
+
+// ctor: every string member starts out empty, then a SamFormatParser
+// (constructed on this header) is asked to parse 'headerText'
+SamHeader::SamHeader(const string& headerText)
+    : Version()
+    , SortOrder()
+    , GroupOrder()
+    , ProgramName()
+    , ProgramVersion()
+    , ProgramCommandLine()
+{
+    SamFormatParser headerParser(*this);
+    headerParser.Parse(headerText);
+}
+
+// dtor: empties all header contents via Clear()
+SamHeader::~SamHeader(void) { Clear(); }
+
+// resets every member to its empty state
+void SamHeader::Clear(void) {
+
+    // @HD fields
+    Version.clear();
+    SortOrder.clear();
+    GroupOrder.clear();
+
+    // @PG fields
+    ProgramName.clear();
+    ProgramVersion.clear();
+    ProgramCommandLine.clear();
+
+    // @SQ, @RG and @CO collections
+    Sequences.Clear();
+    ReadGroups.Clear();
+    Comments.clear();
+}
+
+// builds the header text by running a SamFormatPrinter over the
+// current member values (so local modifications are reflected)
+string SamHeader::ToString(void) const {
+    SamFormatPrinter headerPrinter(*this);
+    const string headerText = headerPrinter.ToString();
+    return headerText;
+}
+
+// true if header contains @HD VN:<Version>
+bool SamHeader::HasVersion(void) const {
+    return !Version.empty();
+}
+
+// true if header contains @HD SO:<SortOrder>
+bool SamHeader::HasSortOrder(void) const {
+    return !SortOrder.empty();
+}
+
+// true if header contains @HD GO:<GroupOrder>
+bool SamHeader::HasGroupOrder(void) const {
+    return !GroupOrder.empty();
+}
+
+// true if header contains at least one @SQ entry
+bool SamHeader::HasSequences(void) const {
+    return !Sequences.IsEmpty();
+}
+
+// true if header contains at least one @RG entry
+bool SamHeader::HasReadGroups(void) const {
+    return !ReadGroups.IsEmpty();
+}
+
+// true if header contains @PG ID:<ProgramName>
+bool SamHeader::HasProgramName(void) const {
+    return !ProgramName.empty();
+}
+
+// true if header contains @PG VN:<ProgramVersion>
+bool SamHeader::HasProgramVersion(void) const {
+    return !ProgramVersion.empty();
+}
+
+// true if header contains @PG CL:<ProgramCommandLine>
+bool SamHeader::HasProgramCommandLine(void) const {
+    return !ProgramCommandLine.empty();
+}
+
+// true if header contains at least one @CO entry
+bool SamHeader::HasComments(void) const {
+    return !Comments.empty();
+}
+
+// runs a SamHeaderValidator over this header and returns its verdict;
+// 'verbose' is passed straight through to the validator
+bool SamHeader::IsValid(bool verbose) const {
+    SamHeaderValidator headerValidator(*this);
+    const bool headerIsValid = headerValidator.Validate(verbose);
+    return headerIsValid;
+}
--- /dev/null
+// ***************************************************************************
+// SamHeader.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for querying/manipulating SAM header data
+// **************************************************************************
+
+#ifndef SAM_HEADER_H
+#define SAM_HEADER_H
+
+#include <api/api_global.h>
+#include <api/SamReadGroupDictionary.h>
+#include <api/SamSequenceDictionary.h>
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+struct API_EXPORT SamHeader {
+
+ // ctor & dtor (headerText defaults to an empty string)
+ public:
+ explicit SamHeader(const std::string& headerText = "");
+ ~SamHeader(void);
+
+ // query/modify entire SamHeader at once
+ public:
+
+ // clear all header contents
+ void Clear(void);
+
+ // checks if SAM header is well-formed
+ // @verbose - if true, validation errors & warnings will be printed to stderr
+ // otherwise, output is suppressed and only validation check occurs
+ bool IsValid(bool verbose = false) const;
+
+ // retrieves the printable, SAM-formatted header
+ // (with any local modifications since construction)
+ std::string ToString(void) const;
+
+ // query if header contains data elements (each returns true when the matching member is non-empty)
+ public:
+ bool HasVersion(void) const;
+ bool HasSortOrder(void) const;
+ bool HasGroupOrder(void) const;
+ bool HasSequences(void) const;
+ bool HasReadGroups(void) const;
+ bool HasProgramName(void) const;
+ bool HasProgramVersion(void) const;
+ bool HasProgramCommandLine(void) const;
+ bool HasComments(void) const;
+
+ // data members (all public; modify directly, then call ToString() to get the updated text)
+ public:
+
+ // header metadata (@HD line)
+ std::string Version; // VN:<Version>
+ std::string SortOrder; // SO:<SortOrder>
+ std::string GroupOrder; // GO:<GroupOrder>
+
+ // header sequences (@SQ entries)
+ SamSequenceDictionary Sequences;
+
+ // header read groups (@RG entries)
+ SamReadGroupDictionary ReadGroups;
+
+ // header program data (@PG entries)
+ std::string ProgramName; // ID:<ProgramName>
+ std::string ProgramVersion; // VN:<ProgramVersion>
+ std::string ProgramCommandLine; // CL:<ProgramCommandLine>
+
+ // header comments (@CO entries)
+ std::vector<std::string> Comments;
+};
+
+} // namespace BamTools
+
+#endif // SAM_HEADER_H
--- /dev/null
+// ***************************************************************************
+// SamReadGroup.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for querying/manipulating read group data
+// **************************************************************************
+
+#include <api/SamReadGroup.h>
+using namespace BamTools;
+using namespace std;
+
+// default ctor: every field starts out as an empty string
+SamReadGroup::SamReadGroup(void)
+    : ID()
+    , Sample()
+    , Library()
+    , Description()
+    , PlatformUnit()
+    , PredictedInsertSize()
+    , SequencingCenter()
+    , ProductionDate()
+    , SequencingTechnology()
+{ }
+
+// ctor taking the read group ID; all remaining fields start out empty
+SamReadGroup::SamReadGroup(const string& id)
+    : ID(id)
+    , Sample()
+    , Library()
+    , Description()
+    , PlatformUnit()
+    , PredictedInsertSize()
+    , SequencingCenter()
+    , ProductionDate()
+    , SequencingTechnology()
+{ }
+
+// dtor: empties all fields via Clear()
+SamReadGroup::~SamReadGroup(void) { Clear(); }
+
+// resets every field to an empty string
+void SamReadGroup::Clear(void) {
+
+    // identity
+    ID.clear();
+    Description.clear();
+
+    // sample & library
+    Sample.clear();
+    Library.clear();
+    PredictedInsertSize.clear();
+
+    // sequencing run details
+    PlatformUnit.clear();
+    SequencingCenter.clear();
+    SequencingTechnology.clear();
+    ProductionDate.clear();
+}
+
+// convenience queries: each returns true when the corresponding field is non-empty
+bool SamReadGroup::HasID(void) const {
+    return !ID.empty();
+}
+bool SamReadGroup::HasSample(void) const {
+    return !Sample.empty();
+}
+bool SamReadGroup::HasLibrary(void) const {
+    return !Library.empty();
+}
+bool SamReadGroup::HasDescription(void) const {
+    return !Description.empty();
+}
+bool SamReadGroup::HasPlatformUnit(void) const {
+    return !PlatformUnit.empty();
+}
+bool SamReadGroup::HasPredictedInsertSize(void) const {
+    return !PredictedInsertSize.empty();
+}
+bool SamReadGroup::HasSequencingCenter(void) const {
+    return !SequencingCenter.empty();
+}
+bool SamReadGroup::HasProductionDate(void) const {
+    return !ProductionDate.empty();
+}
+bool SamReadGroup::HasSequencingTechnology(void) const {
+    return !SequencingTechnology.empty();
+}
--- /dev/null
+// ***************************************************************************
+// SamReadGroup.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for querying/manipulating read group data
+// **************************************************************************
+
+#ifndef SAM_READGROUP_H
+#define SAM_READGROUP_H
+
+#include "api/api_global.h"
+#include <string>
+
+namespace BamTools {
+
+class API_EXPORT SamReadGroup {
+
+ // ctor & dtor (the std::string ctor is not 'explicit', so a read group ID converts implicitly)
+ public:
+ SamReadGroup(void);
+ SamReadGroup(const std::string& id);
+ ~SamReadGroup(void);
+
+ // public methods
+ public:
+
+ // clear all contents
+ void Clear(void);
+
+ // convenience methods to check if SamReadGroup contains these values (i.e. the field is non-empty):
+ bool HasID(void) const;
+ bool HasSample(void) const;
+ bool HasLibrary(void) const;
+ bool HasDescription(void) const;
+ bool HasPlatformUnit(void) const;
+ bool HasPredictedInsertSize(void) const;
+ bool HasSequencingCenter(void) const;
+ bool HasProductionDate(void) const;
+ bool HasSequencingTechnology(void) const;
+
+ // data members (all stored as text)
+ public:
+ std::string ID; // ID:<ID>
+ std::string Sample; // SM:<Sample>
+ std::string Library; // LB:<Library>
+ std::string Description; // DS:<Description>
+ std::string PlatformUnit; // PU:<PlatformUnit>
+ std::string PredictedInsertSize; // PI:<PredictedInsertSize>
+ std::string SequencingCenter; // CN:<SequencingCenter>
+ std::string ProductionDate; // DT:<ProductionDate>
+ std::string SequencingTechnology; // PL:<SequencingTechnology>
+};
+
+// ---------------------------------------------------
+// comparison operators
+
+// two read groups are equal when their IDs match (no other field is considered)
+inline bool operator==(const SamReadGroup& lhs, const SamReadGroup& rhs) {
+    const bool sameId = ( lhs.ID.compare(rhs.ID) == 0 );
+    return sameId;
+}
+
+} // namespace BamTools
+
+#endif // SAM_READGROUP_H
--- /dev/null
+// ***************************************************************************
+// SamReadGroupDictionary.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides container operations for collection of read group entries
+// *************************************************************************
+
+#include <api/SamReadGroupDictionary.h>
+using namespace BamTools;
+
+#include <algorithm>
+#include <iostream>
+using namespace std;
+
+// ctor: dictionary starts out empty
+SamReadGroupDictionary::SamReadGroupDictionary(void)
+{ }
+
+// dtor: drops all stored read groups
+SamReadGroupDictionary::~SamReadGroupDictionary(void) { m_data.clear(); }
+
+// appends 'readGroup' unless an equal read group is already stored
+void SamReadGroupDictionary::Add(const SamReadGroup& readGroup) {
+
+    // nothing to do when a matching entry exists
+    if ( !IsEmpty() && Contains(readGroup) )
+        return;
+
+    m_data.push_back(readGroup);
+}
+
+// std::string overload: wraps the ID in a SamReadGroup and adds that
+void SamReadGroupDictionary::Add(const string& readGroupId) {
+    const SamReadGroup newReadGroup(readGroupId);
+    Add(newReadGroup);
+}
+
+// adds each read group in turn (entries already stored are skipped by Add)
+void SamReadGroupDictionary::Add(const vector<SamReadGroup>& readGroups) {
+    const vector<SamReadGroup>::size_type count = readGroups.size();
+    for ( vector<SamReadGroup>::size_type i = 0; i < count; ++i )
+        Add( readGroups[i] );
+}
+
+// std::string overload: adds a read group for each ID in turn
+void SamReadGroupDictionary::Add(const vector<string>& readGroupIds) {
+    const vector<string>::size_type count = readGroupIds.size();
+    for ( vector<string>::size_type i = 0; i < count; ++i )
+        Add( readGroupIds[i] );
+}
+
+// mutable iterator positioned at the first stored read group
+SamReadGroupIterator SamReadGroupDictionary::Begin(void) {
+    SamReadGroupIterator first = m_data.begin();
+    return first;
+}
+
+// const_iterator positioned at the first stored read group
+SamReadGroupConstIterator SamReadGroupDictionary::Begin(void) const {
+    SamReadGroupConstIterator first = m_data.begin();
+    return first;
+}
+
+// removes every stored read group
+void SamReadGroupDictionary::Clear(void) { m_data.clear(); }
+
+// const_iterator to the first read group, obtainable even on a non-const dictionary
+SamReadGroupConstIterator SamReadGroupDictionary::ConstBegin(void) const {
+    SamReadGroupConstIterator first = m_data.begin();
+    return first;
+}
+
+// const_iterator to one-past-the-last read group, obtainable even on a non-const dictionary
+SamReadGroupConstIterator SamReadGroupDictionary::ConstEnd(void) const {
+    SamReadGroupConstIterator last = m_data.end();
+    return last;
+}
+
+// true if a read group with this ID is stored
+// (IndexOf reports "not found" as the container size)
+bool SamReadGroupDictionary::Contains(const string& readGroupId) const {
+    const int notFound = (int)m_data.size();
+    return IndexOf(readGroupId) != notFound;
+}
+
+// true if a read group equal to 'readGroup' is stored
+bool SamReadGroupDictionary::Contains(const SamReadGroup& readGroup) const {
+    const int notFound = (int)m_data.size();
+    return IndexOf(readGroup) != notFound;
+}
+
+// mutable iterator positioned one past the last stored read group
+SamReadGroupIterator SamReadGroupDictionary::End(void) {
+    SamReadGroupIterator last = m_data.end();
+    return last;
+}
+
+// const_iterator positioned one past the last stored read group
+SamReadGroupConstIterator SamReadGroupDictionary::End(void) const {
+    SamReadGroupConstIterator last = m_data.end();
+    return last;
+}
+
+// position of the first stored read group equal to 'readGroup',
+// or the container size (an invalid index) when there is none
+int SamReadGroupDictionary::IndexOf(const SamReadGroup& readGroup) const {
+    const SamReadGroupConstIterator first = ConstBegin();
+    const SamReadGroupConstIterator last  = ConstEnd();
+
+    // advance until a match is hit or the container is exhausted
+    SamReadGroupConstIterator pos = first;
+    while ( pos != last && !(*pos == readGroup) )
+        ++pos;
+
+    return distance( first, pos );
+}
+
+// std::string overload: looks up a read group built from this ID
+int SamReadGroupDictionary::IndexOf(const string& readGroupId) const {
+    const SamReadGroup probe(readGroupId);
+    return IndexOf(probe);
+}
+
+// true when no read groups are stored
+bool SamReadGroupDictionary::IsEmpty(void) const { return m_data.empty(); }
+
+// erases the first stored read group equal to 'readGroup'; does nothing if there is none
+void SamReadGroupDictionary::Remove(const SamReadGroup& readGroup) {
+
+    // IndexOf reports "not found" as the container size
+    const int index = IndexOf(readGroup);
+    if ( index == (int)m_data.size() )
+        return;
+
+    m_data.erase( m_data.begin() + index );
+}
+
+// std::string overload: removes the read group built from this ID
+void SamReadGroupDictionary::Remove(const string& readGroupId) {
+    const SamReadGroup target(readGroupId);
+    Remove(target);
+}
+
+// removes each listed read group in turn (unknown ones are ignored by Remove)
+void SamReadGroupDictionary::Remove(const vector<SamReadGroup>& readGroups) {
+    const vector<SamReadGroup>::size_type count = readGroups.size();
+    for ( vector<SamReadGroup>::size_type i = 0; i < count; ++i )
+        Remove( readGroups[i] );
+}
+
+// std::string overload: removes the read group for each listed ID in turn
+void SamReadGroupDictionary::Remove(const vector<string>& readGroupIds) {
+    const vector<string>::size_type count = readGroupIds.size();
+    for ( vector<string>::size_type i = 0; i < count; ++i )
+        Remove( readGroupIds[i] );
+}
+
+// number of read groups currently stored
+int SamReadGroupDictionary::Size(void) const {
+    return static_cast<int>( m_data.size() );
+}
+
+// returns a reference to the read group stored under 'readGroupId'
+// an unknown ID gets a fresh SamReadGroup (ID set, nothing else) appended,
+// and that new entry is returned - the same convention as std::map
+SamReadGroup& SamReadGroupDictionary::operator[](const std::string& readGroupId) {
+
+    // IndexOf reports "not found" as the container size
+    const int index = IndexOf(readGroupId);
+
+    // unknown ID: create the entry first, then hand it back
+    if ( index == (int)m_data.size() ) {
+        m_data.push_back( SamReadGroup(readGroupId) );
+        return m_data.back();
+    }
+
+    // known ID: hand back the stored entry
+    return m_data[index];
+}
--- /dev/null
+// ***************************************************************************
+// SamReadGroupDictionary.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides container operations for collection of read group entries
+// *************************************************************************
+
+#ifndef SAM_READGROUP_DICTIONARY_H
+#define SAM_READGROUP_DICTIONARY_H
+
+#include <api/api_global.h>
+#include <api/SamReadGroup.h>
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+typedef std::vector<SamReadGroup> SamReadGroupContainer;
+typedef SamReadGroupContainer::iterator SamReadGroupIterator;
+typedef SamReadGroupContainer::const_iterator SamReadGroupConstIterator;
+
+// stores read groups (kept in a std::vector, in insertion order)
+// can access read groups using SamReadGroup object or (std::string) read group ID tag
+class API_EXPORT SamReadGroupDictionary {
+
+ // ctor & dtor
+ public:
+ SamReadGroupDictionary(void);
+ ~SamReadGroupDictionary(void);
+
+ // query/modify read group data
+ public:
+ // add a read group (does nothing if an equal read group is already stored)
+ void Add(const SamReadGroup& readGroup);
+ void Add(const std::string& readGroupIds);
+
+ // add multiple read groups
+ void Add(const std::vector<SamReadGroup>& readGroups);
+ void Add(const std::vector<std::string>& readGroupIds);
+
+ // clear all read groups records
+ void Clear(void);
+
+ // returns true if dictionary contains this read group
+ bool Contains(const SamReadGroup& readGroup) const;
+ bool Contains(const std::string& readGroupId) const;
+
+ // returns true if dictionary is empty
+ bool IsEmpty(void) const;
+
+ // remove a single read group (does nothing if read group not found)
+ void Remove(const SamReadGroup& readGroup);
+ void Remove(const std::string& readGroupId);
+
+ // remove multiple read groups
+ void Remove(const std::vector<SamReadGroup>& readGroups);
+ void Remove(const std::vector<std::string>& readGroupIds);
+
+ // returns size of dictionary (number of current elements)
+ int Size(void) const;
+
+ // retrieves the SamReadGroup object associated with this ID
+ // if readGroupId is unknown, a new SamReadGroup is created with this ID (and no other data)
+ // and a reference to this new read group entry is returned (like std::map)
+ //
+ // * To avoid these partial entries being created, it is recommended to check
+ // for existence first using Contains()
+ SamReadGroup& operator[](const std::string& readGroupId);
+
+ // retrieve read group iterators
+ // these are typedefs for STL iterators and thus are compatible with STL containers/algorithms
+ public:
+ SamReadGroupIterator Begin(void);
+ SamReadGroupConstIterator Begin(void) const;
+ SamReadGroupConstIterator ConstBegin(void) const;
+ SamReadGroupIterator End(void);
+ SamReadGroupConstIterator End(void) const;
+ SamReadGroupConstIterator ConstEnd(void) const;
+
+ // internal methods (IndexOf returns the container size when nothing matches)
+ private:
+ int IndexOf(const SamReadGroup& readGroup) const;
+ int IndexOf(const std::string& readGroupId) const;
+
+ // data members
+ private:
+ SamReadGroupContainer m_data;
+};
+
+} // namespace BamTools
+
+#endif // SAM_READGROUP_DICTIONARY_H
--- /dev/null
+// ***************************************************************************
+// SamSequence.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for querying/manipulating sequence data
+// *************************************************************************
+
+#include <api/SamSequence.h>
+using namespace BamTools;
+using namespace std;
+
+// ctor taking the sequence name; all remaining fields start out empty
+SamSequence::SamSequence(const string& name)
+    : Name(name)
+    , Length()
+    , AssemblyID()
+    , Checksum()
+    , URI()
+    , Species()
+{ }
+
+// dtor: empties all fields via Clear()
+SamSequence::~SamSequence(void) { Clear(); }
+
+// resets every field to an empty string
+void SamSequence::Clear(void) {
+
+    // fields used by operator==
+    Name.clear();
+    Length.clear();
+    Checksum.clear();
+
+    // remaining descriptive fields
+    AssemblyID.clear();
+    URI.clear();
+    Species.clear();
+}
+
+// convenience queries: each returns true when the corresponding field is non-empty
+bool SamSequence::HasName(void) const {
+    return !Name.empty();
+}
+bool SamSequence::HasLength(void) const {
+    return !Length.empty();
+}
+bool SamSequence::HasAssemblyID(void) const {
+    return !AssemblyID.empty();
+}
+bool SamSequence::HasChecksum(void) const {
+    return !Checksum.empty();
+}
+bool SamSequence::HasURI(void) const {
+    return !URI.empty();
+}
+bool SamSequence::HasSpecies(void) const {
+    return !Species.empty();
+}
--- /dev/null
+// ***************************************************************************
+// SamSequence.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for querying/manipulating sequence data
+// **************************************************************************
+
+#ifndef SAM_SEQUENCE_H
+#define SAM_SEQUENCE_H
+
+#include <api/api_global.h>
+#include <string>
+
+namespace BamTools {
+
+class API_EXPORT SamSequence {
+
+ // ctor & dtor (name defaults to an empty string; ctor is not 'explicit', so a name converts implicitly)
+ public:
+ SamSequence(const std::string& name = "");
+ ~SamSequence(void);
+
+ // public methods
+ public:
+
+ // clear all contents
+ void Clear(void);
+
+ // convenience methods to check if SamSequence contains these values (i.e. the field is non-empty):
+ bool HasName(void) const;
+ bool HasLength(void) const;
+ bool HasAssemblyID(void) const;
+ bool HasChecksum(void) const;
+ bool HasURI(void) const;
+ bool HasSpecies(void) const;
+
+ // data members (all stored as text)
+ public:
+ std::string Name; // SN:<Name>
+ std::string Length; // LN:<Length> (kept as text, not as a number)
+ std::string AssemblyID; // AS:<AssemblyID>
+ std::string Checksum; // M5:<Checksum>
+ std::string URI; // UR:<URI>
+ std::string Species; // SP:<Species>
+};
+
+// ---------------------------------------------------
+// comparison operators
+
+// equality: Name and Length must match; Checksum must also match,
+// but only when both sides actually carry one
+inline bool operator==(const SamSequence& lhs, const SamSequence& rhs) {
+
+    const bool sameNameAndLength = ( lhs.Name == rhs.Name ) && ( lhs.Length == rhs.Length );
+    if ( !sameNameAndLength )
+        return false;
+
+    // a missing checksum on either side means "nothing to compare"
+    const bool bothHaveChecksum = lhs.HasChecksum() && rhs.HasChecksum();
+    return ( !bothHaveChecksum || lhs.Checksum == rhs.Checksum );
+}
+
+} // namespace BamTools
+
+#endif // SAM_SEQUENCE_H
--- /dev/null
+#include <api/SamSequenceDictionary.h>
+using namespace BamTools;
+
+#include <iostream>
+using namespace std;
+
+// ctor: dictionary starts out empty
+SamSequenceDictionary::SamSequenceDictionary(void)
+{ }
+
+// dtor: drops all stored sequences
+SamSequenceDictionary::~SamSequenceDictionary(void) { m_data.clear(); }
+
+// appends 'sequence' unless an equal sequence is already stored
+void SamSequenceDictionary::Add(const SamSequence& sequence) {
+
+    // nothing to do when a matching entry exists
+    if ( !IsEmpty() && Contains(sequence) )
+        return;
+
+    m_data.push_back(sequence);
+}
+
+// std::string overload: appends a sequence with this name (and no other data)
+// unless the dictionary already reports containing 'sequenceName'
+//
+// the existence check is delegated to Contains(const std::string&), so this
+// overload always agrees with the other name-based queries on what counts
+// as "already present"
+void SamSequenceDictionary::Add(const string& sequenceName) {
+    if ( Contains(sequenceName) )
+        return;
+    m_data.push_back( SamSequence(sequenceName) );
+}
+
+// adds each sequence in turn (entries already stored are skipped by Add)
+void SamSequenceDictionary::Add(const vector<SamSequence>& sequences) {
+    const vector<SamSequence>::size_type count = sequences.size();
+    for ( vector<SamSequence>::size_type i = 0; i < count; ++i )
+        Add( sequences[i] );
+}
+
+// std::string overload: adds a sequence for each name in turn
+void SamSequenceDictionary::Add(const vector<string>& sequenceNames) {
+    const vector<string>::size_type count = sequenceNames.size();
+    for ( vector<string>::size_type i = 0; i < count; ++i )
+        Add( sequenceNames[i] );
+}
+
+// mutable iterator positioned at the first stored sequence
+SamSequenceIterator SamSequenceDictionary::Begin(void) {
+    SamSequenceIterator first = m_data.begin();
+    return first;
+}
+
+// const_iterator positioned at the first stored sequence
+SamSequenceConstIterator SamSequenceDictionary::Begin(void) const {
+    SamSequenceConstIterator first = m_data.begin();
+    return first;
+}
+
+// removes every stored sequence
+void SamSequenceDictionary::Clear(void) { m_data.clear(); }
+
+// const_iterator to the first sequence, obtainable even on a non-const dictionary
+SamSequenceConstIterator SamSequenceDictionary::ConstBegin(void) const {
+    SamSequenceConstIterator first = m_data.begin();
+    return first;
+}
+
+// const_iterator to one-past-the-last sequence, obtainable even on a non-const dictionary
+SamSequenceConstIterator SamSequenceDictionary::ConstEnd(void) const {
+    SamSequenceConstIterator last = m_data.end();
+    return last;
+}
+
+// true if IndexOf finds an entry for this sequence name
+// (IndexOf reports "not found" as the container size)
+bool SamSequenceDictionary::Contains(const string& sequenceName) const {
+    const int notFound = (int)m_data.size();
+    return IndexOf(sequenceName) != notFound;
+}
+
+// true if a sequence equal to 'seq' is stored
+bool SamSequenceDictionary::Contains(const SamSequence& seq) const {
+    const int notFound = (int)m_data.size();
+    return IndexOf(seq) != notFound;
+}
+
+// mutable iterator positioned one past the last stored sequence
+SamSequenceIterator SamSequenceDictionary::End(void) {
+    SamSequenceIterator last = m_data.end();
+    return last;
+}
+
+// const_iterator positioned one past the last stored sequence
+SamSequenceConstIterator SamSequenceDictionary::End(void) const {
+    SamSequenceConstIterator last = m_data.end();
+    return last;
+}
+
+// position of the first stored sequence equal to 'sequence',
+// or the container size (an invalid index) when there is none
+int SamSequenceDictionary::IndexOf(const SamSequence& sequence) const {
+    const SamSequenceConstIterator first = ConstBegin();
+    const SamSequenceConstIterator last  = ConstEnd();
+
+    // advance until a match is hit or the container is exhausted
+    SamSequenceConstIterator pos = first;
+    while ( pos != last && !(*pos == sequence) )
+        ++pos;
+
+    return distance( first, pos );
+}
+
+// returns vector index of the first sequence whose Name equals 'sequenceName'
+// returns vector::size() (invalid index) if not found
+//
+// N.B. - only names are compared here. Delegating to IndexOf(SamSequence)
+// with a temporary SamSequence(sequenceName) would go through SamSequence's
+// operator==, which also compares Length; the temporary's Length is empty,
+// so a stored sequence with a length set could never be found by name.
+int SamSequenceDictionary::IndexOf(const string& sequenceName) const {
+    SamSequenceConstIterator begin = ConstBegin();
+    SamSequenceConstIterator iter  = begin;
+    SamSequenceConstIterator end   = ConstEnd();
+    for ( ; iter != end; ++iter )
+        if ( iter->Name == sequenceName ) break;
+    return distance( begin, iter );
+}
+
+// true when no sequences are stored
+bool SamSequenceDictionary::IsEmpty(void) const { return m_data.empty(); }
+
+// erases the first stored sequence equal to 'sequence'; does nothing if there is none
+void SamSequenceDictionary::Remove(const SamSequence& sequence) {
+
+    // IndexOf reports "not found" as the container size
+    const int index = IndexOf(sequence);
+    if ( index == (int)m_data.size() )
+        return;
+
+    m_data.erase( m_data.begin() + index );
+}
+
+// std::string overload: erases the entry that IndexOf(const std::string&)
+// reports for 'sequenceName'; does nothing if no entry is reported
+//
+// the lookup is delegated to the name-based IndexOf overload, so removal
+// always agrees with Contains(const std::string&) and operator[] on which
+// entry a given name refers to
+void SamSequenceDictionary::Remove(const string& sequenceName) {
+
+    // IndexOf reports "not found" as the container size
+    const int index = IndexOf(sequenceName);
+    if ( index == (int)m_data.size() )
+        return;
+
+    m_data.erase( m_data.begin() + index );
+}
+
+// removes each listed sequence in turn, in the order given
+// (entries that are not present are skipped by the single-item Remove)
+void SamSequenceDictionary::Remove(const vector<SamSequence>& sequences) {
+    const size_t numSequences = sequences.size();
+    for ( size_t i = 0; i < numSequences; ++i )
+        Remove( sequences[i] );
+}
+
+// removes each named sequence in turn, in the order given
+// (each name is forwarded to the single-name Remove overload)
+void SamSequenceDictionary::Remove(const vector<string>& sequenceNames) {
+    const size_t numNames = sequenceNames.size();
+    for ( size_t i = 0; i < numNames; ++i )
+        Remove( sequenceNames[i] );
+}
+
+// number of sequence entries currently stored, as a signed int
+int SamSequenceDictionary::Size(void) const {
+    const int numSequences = static_cast<int>( m_data.size() );
+    return numSequences;
+}
+
+// map-like access by sequence name
+// - known name: returns a reference to the stored entry
+// - unknown name: appends a new entry carrying this name and the
+//   placeholder length "0", then returns a reference to that new entry
+SamSequence& SamSequenceDictionary::operator[](const std::string& sequenceName) {
+
+    // IndexOf() yields the container size when the name is not present
+    const int index = IndexOf(sequenceName);
+    const bool isKnown = ( index != (int)m_data.size() );
+
+    // unknown name: create the placeholder entry at the back
+    if ( !isKnown ) {
+        SamSequence placeholder(sequenceName);
+        placeholder.Length = "0";
+        m_data.push_back(placeholder);
+        return m_data.back();
+    }
+
+    // known name: hand back the existing entry
+    return m_data[index];
+}
+
--- /dev/null
+// ***************************************************************************
+// SamSequenceDictionary.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides container operations for collection of sequence entries
+// *************************************************************************
+
+#ifndef SAM_SEQUENCE_DICTIONARY_H
+#define SAM_SEQUENCE_DICTIONARY_H
+
+#include <api/api_global.h>
+#include <api/SamSequence.h>
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+typedef std::vector<SamSequence> SamSequenceContainer;
+typedef SamSequenceContainer::iterator SamSequenceIterator;
+typedef SamSequenceContainer::const_iterator SamSequenceConstIterator;
+
+// Container of SamSequence entries, stored in a std::vector (see m_data).
+// Lookups by name or by SamSequence are linear searches over that vector.
+class API_EXPORT SamSequenceDictionary {
+
+ // ctor & dtor
+ public:
+ SamSequenceDictionary(void);
+ ~SamSequenceDictionary(void);
+
+ // query/modify sequence data
+ public:
+ // add a sequence
+ void Add(const SamSequence& sequence);
+ void Add(const std::string& sequenceNames);
+
+ // add multiple sequences
+ void Add(const std::vector<SamSequence>& sequences);
+ void Add(const std::vector<std::string>& sequenceNames);
+
+ // clear all sequence records
+ void Clear(void);
+
+ // returns true if dictionary contains this sequence
+ bool Contains(const SamSequence& sequence) const;
+ bool Contains(const std::string& sequenceName) const;
+
+ // returns true if dictionary is empty
+ bool IsEmpty(void) const;
+
+ // remove a single sequence (does nothing if sequence not found)
+ void Remove(const SamSequence& sequence);
+ void Remove(const std::string& sequenceName);
+
+ // remove multiple sequences
+ void Remove(const std::vector<SamSequence>& sequences);
+ void Remove(const std::vector<std::string>& sequenceNames);
+
+ // returns size of dictionary (number of current elements)
+ int Size(void) const;
+
+ // retrieves the SamSequence object associated with this name
+ // if sequenceName is unknown, a new SamSequence is created with this name (and invalid length 0)
+ // and a reference to this new sequence entry is returned (like std::map)
+ //
+ // * To avoid these partial entries being created, it is recommended to check
+ // for existence first using Contains()
+ SamSequence& operator[](const std::string& sequenceName);
+
+ // retrieve sequence iterators
+ // these are typedefs for STL iterators and thus are compatible with STL containers/algorithms
+ public:
+ SamSequenceIterator Begin(void);
+ SamSequenceConstIterator Begin(void) const;
+ SamSequenceConstIterator ConstBegin(void) const;
+ SamSequenceIterator End(void);
+ SamSequenceConstIterator End(void) const;
+ SamSequenceConstIterator ConstEnd(void) const;
+
+ // internal methods
+ private:
+ // position of the first matching entry in m_data,
+ // or m_data.size() (an invalid index) when there is no match
+ int IndexOf(const SamSequence& sequence) const;
+ int IndexOf(const std::string& sequenceName) const;
+
+ // data members
+ private:
+ // the stored sequence entries
+ SamSequenceContainer m_data;
+};
+
+} // namespace BamTools
+
+#endif // SAM_SEQUENCE_DICTIONARY_H
+
--- /dev/null
+// ***************************************************************************
+// SamFormatParser.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for parsing SAM header text into SamHeader object
+// ***************************************************************************
+
+#include <api/SamConstants.h>
+#include <api/SamHeader.h>
+#include <api/internal/SamFormatParser_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <iostream>
+#include <sstream>
+#include <vector>
+using namespace std;
+
+// keeps a reference to the header object that Parse() will populate
+SamFormatParser::SamFormatParser(SamHeader& header) : m_header(header) { }
+
+// nothing to release: the parser only holds a reference
+SamFormatParser::~SamFormatParser(void) { }
+
+// Clears the bound header, then feeds 'headerText' through the
+// line parser one newline-delimited line at a time.
+// Empty text is accepted and simply leaves the header cleared.
+void SamFormatParser::Parse(const string& headerText) {
+
+    // always start from a blank header
+    m_header.Clear();
+
+    // only non-empty text needs line-by-line processing
+    if ( !headerText.empty() ) {
+        istringstream textStream(headerText);
+        string currentLine;
+        while ( getline(textStream, currentLine) )
+            ParseSamLine(currentLine);
+    }
+}
+
+// Dispatches one header line to the matching record parser, based on
+// its first three characters. Lines shorter than 5 characters are ignored;
+// an unrecognized leading token is reported on stderr.
+void SamFormatParser::ParseSamLine(const string& line) {
+
+    // too short to hold a record token, a separator, and any content
+    if ( line.length() < 5 )
+        return;
+
+    // first 3 chars identify the record type; content starts at offset 4
+    const string recordType = line.substr(0, 3);
+    const string recordBody = line.substr(4);
+
+    if ( recordType == Constants::SAM_HD_BEGIN_TOKEN ) {
+        ParseHDLine(recordBody);
+        return;
+    }
+    if ( recordType == Constants::SAM_SQ_BEGIN_TOKEN ) {
+        ParseSQLine(recordBody);
+        return;
+    }
+    if ( recordType == Constants::SAM_RG_BEGIN_TOKEN ) {
+        ParseRGLine(recordBody);
+        return;
+    }
+    if ( recordType == Constants::SAM_PG_BEGIN_TOKEN ) {
+        ParsePGLine(recordBody);
+        return;
+    }
+    if ( recordType == Constants::SAM_CO_BEGIN_TOKEN ) {
+        ParseCOLine(recordBody);
+        return;
+    }
+
+    // none of the known record types matched
+    cerr << "SAM Format Error - unknown token: " << recordType << endl;
+}
+
+// Parses the content of an @HD line (everything after the "@HD" token and
+// its separator) and stores recognized tag values in the header.
+// Unknown tags and malformed tokens are reported on stderr and skipped.
+void SamFormatParser::ParseHDLine(const string& line) {
+
+    // split HD lines into tokens
+    vector<string> tokens = Split(line, Constants::SAM_TAB);
+
+    // iterate over tokens
+    vector<string>::const_iterator tokenIter = tokens.begin();
+    vector<string>::const_iterator tokenEnd = tokens.end();
+    for ( ; tokenIter != tokenEnd; ++tokenIter ) {
+        const string& token = (*tokenIter);
+
+        // a token needs at least a 2-char tag plus a separator; substr(3)
+        // throws std::out_of_range on anything shorter (e.g. the empty
+        // token produced by two consecutive tabs), so reject those here
+        if ( token.length() < 3 ) {
+            cerr << "SAM Format Error - malformed HD token: " << token << endl;
+            continue;
+        }
+
+        // get tag/value
+        const string tokenTag = token.substr(0,2);
+        const string tokenValue = token.substr(3);
+
+        // set header contents
+        if ( tokenTag == Constants::SAM_HD_VERSION_TAG ) m_header.Version = tokenValue;
+        else if ( tokenTag == Constants::SAM_HD_GROUPORDER_TAG ) m_header.GroupOrder = tokenValue;
+        else if ( tokenTag == Constants::SAM_HD_SORTORDER_TAG ) m_header.SortOrder = tokenValue;
+        else
+            cerr << "SAM Format Error - unknown HD tag: " << tokenTag << endl;
+    }
+
+    // if @HD line exists, VN must be provided
+    if ( !m_header.HasVersion() ) {
+        cerr << "SAM Format Error - @HD line is missing VN tag!" << endl;
+        return;
+    }
+}
+
+// Parses the content of an @SQ line into a SamSequence and adds it to the
+// header's sequence dictionary. The entry is only stored when both the
+// SN and LN tags were present; otherwise an error is printed to stderr.
+// Unknown tags and malformed tokens are reported on stderr and skipped.
+void SamFormatParser::ParseSQLine(const string& line) {
+
+    SamSequence seq;
+
+    // split SQ line into tokens
+    vector<string> tokens = Split(line, Constants::SAM_TAB);
+
+    // iterate over tokens
+    vector<string>::const_iterator tokenIter = tokens.begin();
+    vector<string>::const_iterator tokenEnd = tokens.end();
+    for ( ; tokenIter != tokenEnd; ++tokenIter ) {
+        const string& token = (*tokenIter);
+
+        // a token needs at least a 2-char tag plus a separator; substr(3)
+        // throws std::out_of_range on anything shorter (e.g. the empty
+        // token produced by two consecutive tabs), so reject those here
+        if ( token.length() < 3 ) {
+            cerr << "SAM Format Error - malformed SQ token: " << token << endl;
+            continue;
+        }
+
+        // get tag/value
+        const string tokenTag = token.substr(0,2);
+        const string tokenValue = token.substr(3);
+
+        // set sequence contents
+        if ( tokenTag == Constants::SAM_SQ_NAME_TAG ) seq.Name = tokenValue;
+        else if ( tokenTag == Constants::SAM_SQ_LENGTH_TAG ) seq.Length = tokenValue;
+        else if ( tokenTag == Constants::SAM_SQ_ASSEMBLYID_TAG ) seq.AssemblyID = tokenValue;
+        else if ( tokenTag == Constants::SAM_SQ_URI_TAG ) seq.URI = tokenValue;
+        else if ( tokenTag == Constants::SAM_SQ_CHECKSUM_TAG ) seq.Checksum = tokenValue;
+        else if ( tokenTag == Constants::SAM_SQ_SPECIES_TAG ) seq.Species = tokenValue;
+        else
+            cerr << "SAM Format Error - unknown SQ tag: " << tokenTag << endl;
+    }
+
+    // if @SQ line exists, SN must be provided
+    if ( !seq.HasName() ) {
+        cerr << "SAM Format Error - @SQ line is missing SN tag!" << endl;
+        return;
+    }
+
+    // if @SQ line exists, LN must be provided
+    if ( !seq.HasLength() ) {
+        cerr << "SAM Format Error - @SQ line is missing LN tag!" << endl;
+        return;
+    }
+
+    // store SAM sequence entry
+    m_header.Sequences.Add(seq);
+}
+
+// Parses the content of an @RG line into a SamReadGroup and adds it to the
+// header's read group dictionary. The entry is only stored when both the
+// ID and SM tags were present; otherwise an error is printed to stderr.
+// Unknown tags and malformed tokens are reported on stderr and skipped.
+void SamFormatParser::ParseRGLine(const string& line) {
+
+    SamReadGroup rg;
+
+    // split string into tokens
+    vector<string> tokens = Split(line, Constants::SAM_TAB);
+
+    // iterate over tokens
+    vector<string>::const_iterator tokenIter = tokens.begin();
+    vector<string>::const_iterator tokenEnd = tokens.end();
+    for ( ; tokenIter != tokenEnd; ++tokenIter ) {
+        const string& token = (*tokenIter);
+
+        // a token needs at least a 2-char tag plus a separator; substr(3)
+        // throws std::out_of_range on anything shorter (e.g. the empty
+        // token produced by two consecutive tabs), so reject those here
+        if ( token.length() < 3 ) {
+            cerr << "SAM Format Error - malformed RG token: " << token << endl;
+            continue;
+        }
+
+        // get token tag/value
+        const string tokenTag = token.substr(0,2);
+        const string tokenValue = token.substr(3);
+
+        // set read group contents
+        if ( tokenTag == Constants::SAM_RG_ID_TAG ) rg.ID = tokenValue;
+        else if ( tokenTag == Constants::SAM_RG_SAMPLE_TAG ) rg.Sample = tokenValue;
+        else if ( tokenTag == Constants::SAM_RG_LIBRARY_TAG ) rg.Library = tokenValue;
+        else if ( tokenTag == Constants::SAM_RG_DESCRIPTION_TAG ) rg.Description = tokenValue;
+        else if ( tokenTag == Constants::SAM_RG_PLATFORMUNIT_TAG ) rg.PlatformUnit = tokenValue;
+        else if ( tokenTag == Constants::SAM_RG_PREDICTEDINSERTSIZE_TAG ) rg.PredictedInsertSize = tokenValue;
+        else if ( tokenTag == Constants::SAM_RG_SEQCENTER_TAG ) rg.SequencingCenter = tokenValue;
+        else if ( tokenTag == Constants::SAM_RG_PRODUCTIONDATE_TAG ) rg.ProductionDate = tokenValue;
+        else if ( tokenTag == Constants::SAM_RG_SEQTECHNOLOGY_TAG ) rg.SequencingTechnology = tokenValue;
+        else
+            cerr << "SAM Format Error - unknown RG tag: " << tokenTag << endl;
+    }
+
+    // if @RG line exists, ID must be provided
+    if ( !rg.HasID() ) {
+        cerr << "SAM Format Error - @RG line is missing ID tag!" << endl;
+        return;
+    }
+
+    // if @RG line exists, SM must be provided
+    if ( !rg.HasSample() ) {
+        cerr << "SAM Format Error - @RG line is missing SM tag!" << endl;
+        return;
+    }
+
+    // store SAM read group entry
+    m_header.ReadGroups.Add(rg);
+}
+
+// Parses the content of a @PG line and stores recognized tag values
+// (program name, version, command line) in the header.
+// Unknown tags and malformed tokens are reported on stderr and skipped.
+void SamFormatParser::ParsePGLine(const string& line) {
+
+    // split string into tokens
+    vector<string> tokens = Split(line, Constants::SAM_TAB);
+
+    // iterate over tokens
+    vector<string>::const_iterator tokenIter = tokens.begin();
+    vector<string>::const_iterator tokenEnd = tokens.end();
+    for ( ; tokenIter != tokenEnd; ++tokenIter ) {
+        const string& token = (*tokenIter);
+
+        // a token needs at least a 2-char tag plus a separator; substr(3)
+        // throws std::out_of_range on anything shorter (e.g. the empty
+        // token produced by two consecutive tabs), so reject those here
+        if ( token.length() < 3 ) {
+            cerr << "SAM Format Error - malformed PG token: " << token << endl;
+            continue;
+        }
+
+        // get token tag/value
+        const string tokenTag = token.substr(0,2);
+        const string tokenValue = token.substr(3);
+
+        // set header contents
+        if ( tokenTag == Constants::SAM_PG_NAME_TAG ) m_header.ProgramName = tokenValue;
+        else if ( tokenTag == Constants::SAM_PG_VERSION_TAG ) m_header.ProgramVersion = tokenValue;
+        else if ( tokenTag == Constants::SAM_PG_COMMANDLINE_TAG ) m_header.ProgramCommandLine = tokenValue;
+        else
+            cerr << "SAM Format Error - unknown PG tag: " << tokenTag << endl;
+    }
+
+    // if @PG line exists, ID must be provided
+    if ( !m_header.HasProgramName() ) {
+        cerr << "SAM Format Error - @PG line is missing ID tag!" << endl;
+        return;
+    }
+}
+
+// Comment lines carry no tags: the text is appended verbatim
+// to the end of the header's comment list.
+void SamFormatParser::ParseCOLine(const string& line) {
+    m_header.Comments.insert( m_header.Comments.end(), line );
+}
+
+// Breaks 'line' into the fields separated by 'delim', in order of appearance.
+// Fields are extracted with std::getline, so an empty input yields no fields.
+const vector<string> SamFormatParser::Split(const string& line, const char delim) {
+    vector<string> fields;
+    istringstream fieldStream(line);
+    for ( string field; getline(fieldStream, field, delim); )
+        fields.push_back(field);
+    return fields;
+}
--- /dev/null
+// ***************************************************************************
+// SamFormatParser.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for parsing SAM header text into SamHeader object
+// ***************************************************************************
+
+#ifndef SAM_FORMAT_PARSER_H
+#define SAM_FORMAT_PARSER_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+class SamHeader;
+
+namespace Internal {
+
+// Parses SAM header text and populates the SamHeader object supplied at
+// construction. The header is held by reference, not owned.
+class SamFormatParser {
+
+ // ctor & dtor
+ public:
+ SamFormatParser(BamTools::SamHeader& header);
+ ~SamFormatParser(void);
+
+ // parse text & populate header data
+ public:
+ // clears the header, then parses 'headerText' line by line
+ void Parse(const std::string& headerText);
+
+ // internal methods
+ private:
+ // dispatches a single header line to the record-specific parser below
+ void ParseSamLine(const std::string& line);
+ // record-specific parsers; each receives the line content that follows
+ // the leading record token
+ void ParseHDLine(const std::string& line);
+ void ParseSQLine(const std::string& line);
+ void ParseRGLine(const std::string& line);
+ void ParsePGLine(const std::string& line);
+ void ParseCOLine(const std::string& line);
+ // splits 'line' into the fields separated by 'delim'
+ const std::vector<std::string> Split(const std::string& line, const char delim);
+
+ // data members
+ private:
+ // header being populated
+ SamHeader& m_header;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // SAM_FORMAT_PARSER_H
--- /dev/null
+// ***************************************************************************
+// SamFormatPrinter.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for printing formatted SAM header to string
+// ***************************************************************************
+
+#include <api/SamConstants.h>
+#include <api/SamHeader.h>
+#include <api/internal/SamFormatPrinter_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <iostream>
+#include <sstream>
+#include <vector>
+using namespace std;
+
+// keeps a read-only reference to the header that ToString() will format
+SamFormatPrinter::SamFormatPrinter(const SamHeader& header) : m_header(header) { }
+
+// nothing to release: the printer only holds a reference
+SamFormatPrinter::~SamFormatPrinter(void) { }
+
+// Builds one header field: SAM_TAB, then the tag, then SAM_COLON, then the value.
+const string SamFormatPrinter::FormatTag(const string &tag, const string &value) const {
+    string field;
+    field += Constants::SAM_TAB;
+    field += tag;
+    field += Constants::SAM_COLON;
+    field += value;
+    return field;
+}
+
+// Produces the complete header text by emitting each record type in a
+// fixed order: @HD, @SQ, @RG, @PG, then @CO.
+const string SamFormatPrinter::ToString(void) const {
+
+    // fresh, empty output buffer
+    stringstream formatted;
+
+    // append each section in turn
+    PrintHD(formatted);
+    PrintSQ(formatted);
+    PrintRG(formatted);
+    PrintPG(formatted);
+    PrintCO(formatted);
+
+    // hand back the accumulated text
+    const string headerText = formatted.str();
+    return headerText;
+}
+
+// Writes the @HD line to 'out'. Nothing is written unless the header
+// has a version; SO and GO are appended only when present.
+void SamFormatPrinter::PrintHD(std::stringstream& out) const {
+
+    // no version means no @HD line at all
+    if ( !m_header.HasVersion() )
+        return;
+
+    // @HD VN:<Version>
+    out << Constants::SAM_HD_BEGIN_TOKEN;
+    out << FormatTag(Constants::SAM_HD_VERSION_TAG, m_header.Version);
+
+    // SO:<SortOrder>
+    if ( m_header.HasSortOrder() ) {
+        out << FormatTag(Constants::SAM_HD_SORTORDER_TAG, m_header.SortOrder);
+    }
+
+    // GO:<GroupOrder>
+    if ( m_header.HasGroupOrder() ) {
+        out << FormatTag(Constants::SAM_HD_GROUPORDER_TAG, m_header.GroupOrder);
+    }
+
+    // terminate the line
+    out << endl;
+}
+
+// Writes one @SQ line to 'out' per sequence in the header's dictionary,
+// in dictionary order. SN and LN are always written; AS, M5, UR and SP
+// are appended only when present.
+void SamFormatPrinter::PrintSQ(std::stringstream& out) const {
+
+    const SamSequenceConstIterator lastSeq = m_header.Sequences.ConstEnd();
+    SamSequenceConstIterator currentSeq = m_header.Sequences.ConstBegin();
+    while ( currentSeq != lastSeq ) {
+
+        // @SQ SN:<Name> LN:<Length>
+        out << Constants::SAM_SQ_BEGIN_TOKEN;
+        out << FormatTag(Constants::SAM_SQ_NAME_TAG, currentSeq->Name);
+        out << FormatTag(Constants::SAM_SQ_LENGTH_TAG, currentSeq->Length);
+
+        // AS:<AssemblyID>
+        if ( currentSeq->HasAssemblyID() ) {
+            out << FormatTag(Constants::SAM_SQ_ASSEMBLYID_TAG, currentSeq->AssemblyID);
+        }
+
+        // M5:<Checksum>
+        if ( currentSeq->HasChecksum() ) {
+            out << FormatTag(Constants::SAM_SQ_CHECKSUM_TAG, currentSeq->Checksum);
+        }
+
+        // UR:<URI>
+        if ( currentSeq->HasURI() ) {
+            out << FormatTag(Constants::SAM_SQ_URI_TAG, currentSeq->URI);
+        }
+
+        // SP:<Species>
+        if ( currentSeq->HasSpecies() ) {
+            out << FormatTag(Constants::SAM_SQ_SPECIES_TAG, currentSeq->Species);
+        }
+
+        // terminate the line and move on
+        out << endl;
+        ++currentSeq;
+    }
+}
+
+// Writes one @RG line to 'out' per read group in the header's dictionary,
+// in dictionary order. ID and SM are always written; LB, DS, PU, PI, CN,
+// DT and PL are appended only when present.
+void SamFormatPrinter::PrintRG(std::stringstream& out) const {
+
+    const SamReadGroupConstIterator lastGroup = m_header.ReadGroups.ConstEnd();
+    SamReadGroupConstIterator currentGroup = m_header.ReadGroups.ConstBegin();
+    while ( currentGroup != lastGroup ) {
+
+        // @RG ID:<ID> SM:<Sample>
+        out << Constants::SAM_RG_BEGIN_TOKEN;
+        out << FormatTag(Constants::SAM_RG_ID_TAG, currentGroup->ID);
+        out << FormatTag(Constants::SAM_RG_SAMPLE_TAG, currentGroup->Sample);
+
+        // LB:<Library>
+        if ( currentGroup->HasLibrary() ) {
+            out << FormatTag(Constants::SAM_RG_LIBRARY_TAG, currentGroup->Library);
+        }
+
+        // DS:<Description>
+        if ( currentGroup->HasDescription() ) {
+            out << FormatTag(Constants::SAM_RG_DESCRIPTION_TAG, currentGroup->Description);
+        }
+
+        // PU:<PlatformUnit>
+        if ( currentGroup->HasPlatformUnit() ) {
+            out << FormatTag(Constants::SAM_RG_PLATFORMUNIT_TAG, currentGroup->PlatformUnit);
+        }
+
+        // PI:<PredictedInsertSize>
+        if ( currentGroup->HasPredictedInsertSize() ) {
+            out << FormatTag(Constants::SAM_RG_PREDICTEDINSERTSIZE_TAG, currentGroup->PredictedInsertSize);
+        }
+
+        // CN:<SequencingCenter>
+        if ( currentGroup->HasSequencingCenter() ) {
+            out << FormatTag(Constants::SAM_RG_SEQCENTER_TAG, currentGroup->SequencingCenter);
+        }
+
+        // DT:<ProductionDate>
+        if ( currentGroup->HasProductionDate() ) {
+            out << FormatTag(Constants::SAM_RG_PRODUCTIONDATE_TAG, currentGroup->ProductionDate);
+        }
+
+        // PL:<SequencingTechnology>
+        if ( currentGroup->HasSequencingTechnology() ) {
+            out << FormatTag(Constants::SAM_RG_SEQTECHNOLOGY_TAG, currentGroup->SequencingTechnology);
+        }
+
+        // terminate the line and move on
+        out << endl;
+        ++currentGroup;
+    }
+}
+
+// Writes the @PG line to 'out'. Nothing is written unless the header
+// has a program name; VN and CL are appended only when present.
+void SamFormatPrinter::PrintPG(std::stringstream& out) const {
+
+    // no program name means no @PG line at all
+    if ( !m_header.HasProgramName() )
+        return;
+
+    // @PG ID:<ProgramName>
+    out << Constants::SAM_PG_BEGIN_TOKEN;
+    out << FormatTag(Constants::SAM_PG_NAME_TAG, m_header.ProgramName);
+
+    // VN:<ProgramVersion>
+    if ( m_header.HasProgramVersion() ) {
+        out << FormatTag(Constants::SAM_PG_VERSION_TAG, m_header.ProgramVersion);
+    }
+
+    // CL:<ProgramCommandLine>
+    if ( m_header.HasProgramCommandLine() ) {
+        out << FormatTag(Constants::SAM_PG_COMMANDLINE_TAG, m_header.ProgramCommandLine);
+    }
+
+    // terminate the line
+    out << endl;
+}
+
+// Writes one @CO line to 'out' per stored comment, in stored order:
+// the @CO token, SAM_TAB, the comment text, then a line terminator.
+void SamFormatPrinter::PrintCO(std::stringstream& out) const {
+
+    const vector<string>& comments = m_header.Comments;
+    const vector<string>::const_iterator lastComment = comments.end();
+    vector<string>::const_iterator currentComment = comments.begin();
+    while ( currentComment != lastComment ) {
+
+        // @CO <Comment>
+        out << Constants::SAM_CO_BEGIN_TOKEN;
+        out << Constants::SAM_TAB;
+        out << (*currentComment);
+        out << endl;
+
+        ++currentComment;
+    }
+}
--- /dev/null
+// ***************************************************************************
+// SamFormatPrinter.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for printing formatted SAM header to string
+// ***************************************************************************
+
+#ifndef SAM_FORMAT_PRINTER_H
+#define SAM_FORMAT_PRINTER_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <sstream>
+#include <string>
+
+namespace BamTools {
+
+class SamHeader;
+
+namespace Internal {
+
+// Formats the contents of a SamHeader as SAM header text.
+// The header is held by const reference, not owned or modified.
+class SamFormatPrinter {
+
+ // ctor & dtor
+ public:
+ SamFormatPrinter(const BamTools::SamHeader& header);
+ ~SamFormatPrinter(void);
+
+ // generates SAM-formatted string from header data
+ public:
+ // emits @HD, @SQ, @RG, @PG and @CO records, in that order
+ const std::string ToString(void) const;
+
+ // internal methods
+ private:
+ // builds one field: SAM_TAB + tag + SAM_COLON + value
+ const std::string FormatTag(const std::string& tag, const std::string& value) const;
+ // each Print* method appends the lines for one record type to 'out'
+ void PrintHD(std::stringstream& out) const;
+ void PrintSQ(std::stringstream& out) const;
+ void PrintRG(std::stringstream& out) const;
+ void PrintPG(std::stringstream& out) const;
+ void PrintCO(std::stringstream& out) const;
+
+ // data members
+ private:
+ // header being formatted
+ const SamHeader& m_header;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // SAM_FORMAT_PRINTER_H
--- /dev/null
+// ***************************************************************************
+// SamHeaderValidator.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for validating SamHeader data
+// ***************************************************************************
+
+#include <api/SamConstants.h>
+#include <api/SamHeader.h>
+#include <api/internal/SamHeaderValidator_p.h>
+#include <api/internal/SamHeaderVersion_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <iostream>
+#include <set>
+#include <sstream>
+#include <vector>
+using namespace std;
+
+// -------------------------------------------------------------------
+// Allow validation rules to vary between SAM header versions
+//
+// use SAM_VERSION_X_Y to tag important changes
+//
+// Together, they will allow for comparisons like:
+// if ( m_version < SAM_VERSION_2_0 ) {
+//     // use some older rule
+// } else {
+//     // use rule introduced with version 2.0
+// }
+
+static const SamHeaderVersion SAM_VERSION_1_0 = SamHeaderVersion(1,0);
+static const SamHeaderVersion SAM_VERSION_1_3 = SamHeaderVersion(1,3);
+
+// -----------------------------------------
+// SamHeaderValidatorPrivate implementation
+
+// Private implementation of SamHeaderValidator.
+// Holds a const reference to the header under inspection and collects
+// error and warning messages while the Validate* methods run.
+class SamHeaderValidator::SamHeaderValidatorPrivate {
+
+ // ctor & dtor
+ public:
+ SamHeaderValidatorPrivate(const SamHeader& header);
+ ~SamHeaderValidatorPrivate(void) { }
+
+ // 'public' methods
+ public:
+ // runs every validation group; when 'verbose' is true the collected
+ // error and warning messages are printed afterwards
+ bool Validate(bool verbose);
+
+ // internal validation methods
+ private:
+
+ // validate header metadata
+ bool ValidateMetadata(void);
+ bool ValidateVersion(void);
+ bool ContainsOnlyDigits(const string& s);
+ bool ValidateSortOrder(void);
+ bool ValidateGroupOrder(void);
+
+ // validate sequence dictionary
+ bool ValidateSequenceDictionary(void);
+ bool ContainsUniqueSequenceNames(void);
+ bool CheckNameFormat(const string& name);
+ bool ValidateSequence(const SamSequence& seq);
+ bool CheckLengthInRange(const string& length);
+
+ // validate read group dictionary
+ bool ValidateReadGroupDictionary(void);
+ bool ValidateReadGroup(const SamReadGroup& rg);
+ bool ContainsUniqueIDsAndPlatformUnits(void);
+ bool CheckReadGroupID(const string& id);
+ bool CheckSequencingTechnology(const string& technology);
+ bool Is454(const string& technology);
+ bool IsHelicos(const string& technology);
+ bool IsIllumina(const string& technology);
+ bool IsPacBio(const string& technology);
+ bool IsSolid(const string& technology);
+
+ // validate program data
+ bool ValidateProgramData(void);
+ bool ContainsUniqueProgramIds(void);
+ bool ValidatePreviousProgramIds(void);
+
+ // error reporting
+ private:
+ void AddError(const string& message);
+ void AddWarning(const string& message);
+ void PrintErrorMessages(void);
+ void PrintWarningMessages(void);
+
+ // data members
+ private:
+ // header under inspection (not owned)
+ const SamHeader& m_header;
+ // initialized from header.Version in the constructor
+ const SamHeaderVersion m_version;
+
+ // set from the 'verbose' argument on each Validate() call
+ bool m_isVerboseOutput;
+ // initialized in the constructor to "ERROR: ", "WARNING: " and "\n"
+ const string ERROR_PREFIX;
+ const string WARN_PREFIX;
+ const string NEWLINE;
+ vector<string> m_errorMessages;
+ vector<string> m_warningMessages;
+};
+
+// Binds the validator to 'header', derives the version object from the
+// header's version string, and sets up the fixed message decorations.
+// Verbose output starts disabled; Validate() sets it on each call.
+SamHeaderValidator::SamHeaderValidatorPrivate::SamHeaderValidatorPrivate(const SamHeader& header) :
+    m_header(header),
+    m_version( header.Version ),
+    m_isVerboseOutput(false),
+    ERROR_PREFIX("ERROR: "),
+    WARN_PREFIX("WARNING: "),
+    NEWLINE("\n")
+{ }
+
+// Runs all four validation groups (metadata, sequences, read groups,
+// program data) regardless of earlier failures, optionally prints the
+// collected messages, and returns true only if every group passed.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::Validate(bool verbose) {
+
+    // remember the requested reporting mode
+    m_isVerboseOutput = verbose;
+
+    // every group is evaluated so that all problems get recorded
+    const bool metadataOk   = ValidateMetadata();
+    const bool sequencesOk  = ValidateSequenceDictionary();
+    const bool readGroupsOk = ValidateReadGroupDictionary();
+    const bool programOk    = ValidateProgramData();
+
+    // errors first, then warnings
+    if ( m_isVerboseOutput ) {
+        PrintErrorMessages();
+        PrintWarningMessages();
+    }
+
+    return ( metadataOk && sequencesOk && readGroupsOk && programOk );
+}
+
+// Checks version, sort order and group order. All three checks always
+// run; the result is true only if each of them passed.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateMetadata(void) {
+    const bool versionOk    = ValidateVersion();
+    const bool sortOrderOk  = ValidateSortOrder();
+    const bool groupOrderOk = ValidateGroupOrder();
+    return ( versionOk && sortOrderOk && groupOrderOk );
+}
+
+// Checks the syntax of the header version (VN).
+// - missing version: records a warning, still counts as valid
+// - otherwise the text must contain a period, with a non-empty, digits-only
+//   part before the first period and a non-empty, digits-only part after it;
+//   any violation records a single error and yields false
+bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateVersion(void) {
+
+    const string& version = m_header.Version;
+
+    // absent version is tolerated, but flagged
+    if ( version.empty() ) {
+        AddWarning("Version (VN) missing. Not required, but strongly recommended");
+        return true;
+    }
+
+    // well-formed means <digits>.<digits>, split at the first period
+    const size_t periodPosition = version.find(Constants::SAM_PERIOD);
+    bool isWellFormed = ( periodPosition != string::npos );
+    if ( isWellFormed ) {
+        const string majorPart = version.substr(0, periodPosition);
+        const string minorPart = version.substr(periodPosition + 1);
+        isWellFormed = !majorPart.empty() && ContainsOnlyDigits(majorPart) &&
+                       !minorPart.empty() && ContainsOnlyDigits(minorPart);
+    }
+
+    // TODO: check if version is not just syntactically OK,
+    // but is also a valid SAM version ( 1.0 .. CURRENT )
+
+    if ( !isWellFormed )
+        AddError("Invalid version (VN) format: " + version);
+    return isWellFormed;
+}
+
+// true when no character of 's' falls outside Constants::SAM_DIGITS
+// (callers are expected to pass a non-empty string)
+bool SamHeaderValidator::SamHeaderValidatorPrivate::ContainsOnlyDigits(const string& s) {
+    return ( s.find_first_not_of(Constants::SAM_DIGITS) == string::npos );
+}
+
+// Checks the header sort order (SO).
+// - missing: records a warning, still counts as valid
+// - present: must equal one of the coordinate / queryname / unsorted
+//   keywords, otherwise an error is recorded
+bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateSortOrder(void) {
+
+    const string& sortOrder = m_header.SortOrder;
+
+    // absent sort order is tolerated, but flagged
+    if ( sortOrder.empty() ) {
+        AddWarning("Sort order (SO) missing. Not required, but strongly recommended");
+        return true;
+    }
+
+    // compare against the accepted keywords
+    const bool isKnownKeyword = ( sortOrder == Constants::SAM_HD_SORTORDER_COORDINATE ) ||
+                                ( sortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME  ) ||
+                                ( sortOrder == Constants::SAM_HD_SORTORDER_UNSORTED   );
+    if ( !isKnownKeyword )
+        AddError("Invalid sort order (SO): " + sortOrder);
+    return isKnownKeyword;
+}
+
+// Checks the header group order (GO).
+// - missing: valid, nothing recorded
+// - present: must equal one of the none / query / reference keywords,
+//   otherwise an error is recorded
+bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateGroupOrder(void) {
+
+    const string& groupOrder = m_header.GroupOrder;
+
+    // absent group order is fine
+    if ( groupOrder.empty() )
+        return true;
+
+    // compare against the accepted keywords
+    const bool isKnownKeyword = ( groupOrder == Constants::SAM_HD_GROUPORDER_NONE      ) ||
+                                ( groupOrder == Constants::SAM_HD_GROUPORDER_QUERY     ) ||
+                                ( groupOrder == Constants::SAM_HD_GROUPORDER_REFERENCE );
+    if ( !isKnownKeyword )
+        AddError("Invalid group order (GO): " + groupOrder);
+    return isKnownKeyword;
+}
+
+// Validates the sequence dictionary: names must be unique, and every
+// entry must pass ValidateSequence(). All entries are always examined.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateSequenceDictionary(void) {
+
+    // TODO: warn/error if no sequences ?
+
+    // dictionary-wide check first
+    bool isValid = ContainsUniqueSequenceNames();
+
+    // then each entry on its own
+    const SamSequenceDictionary& sequences = m_header.Sequences;
+    const SamSequenceConstIterator lastSeq = sequences.ConstEnd();
+    SamSequenceConstIterator currentSeq = sequences.ConstBegin();
+    while ( currentSeq != lastSeq ) {
+        if ( !ValidateSequence(*currentSeq) )
+            isValid = false;
+        ++currentSeq;
+    }
+
+    return isValid;
+}
+
+// Verifies that no sequence name (SN) appears more than once.
+// An error is recorded for every repeated occurrence of a name.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::ContainsUniqueSequenceNames(void) {
+
+    bool allUnique = true;
+    set<string> namesSeen;
+
+    const SamSequenceDictionary& sequences = m_header.Sequences;
+    const SamSequenceConstIterator lastSeq = sequences.ConstEnd();
+    SamSequenceConstIterator currentSeq = sequences.ConstBegin();
+    while ( currentSeq != lastSeq ) {
+        const string& name = currentSeq->Name;
+
+        // set::insert reports (via .second) whether the name was new;
+        // a failed insert means this name was already recorded
+        const bool isNewName = namesSeen.insert(name).second;
+        if ( !isNewName ) {
+            AddError("Sequence name (SN): " + name + " is not unique");
+            allUnique = false;
+        }
+
+        ++currentSeq;
+    }
+
+    return allUnique;
+}
+
+// Validates one sequence entry: name format and length range.
+// Both checks always run; the result is true only if both passed.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateSequence(const SamSequence& seq) {
+    const bool nameOk   = CheckNameFormat(seq.Name);
+    const bool lengthOk = CheckLengthInRange(seq.Length);
+    return ( nameOk && lengthOk );
+}
+
+// Checks a sequence name (SN): it must be non-empty and must not start
+// with SAM_EQUAL or SAM_STAR. Records an error on failure.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::CheckNameFormat(const string& name) {
+
+    // an empty name means the SN tag was never provided
+    if ( name.empty() ) {
+        AddError("Sequence entry (@SQ) is missing SN tag");
+        return false;
+    }
+
+    // the leading character may not be one of the reserved characters
+    const char leadingChar = name[0];
+    const bool startsWithReserved = ( leadingChar == Constants::SAM_EQUAL ) ||
+                                    ( leadingChar == Constants::SAM_STAR  );
+    if ( startsWithReserved ) {
+        AddError("Invalid sequence name (SN): " + name);
+        return false;
+    }
+
+    return true;
+}
+
+// Checks a sequence length (LN) given as text: it must be present, must
+// parse as an unsigned integer, and must lie within
+// [SAM_SQ_LENGTH_MIN, SAM_SQ_LENGTH_MAX]. Records an error on failure.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::CheckLengthInRange(const string& length) {
+
+    // invalid if empty
+    if ( length.empty() ) {
+        AddError("Sequence entry (@SQ) is missing LN tag");
+        return false;
+    }
+
+    // convert string length to numeric
+    // the extraction result must be checked: on failure the target holds no
+    // meaningful value (left unmodified before C++11), so comparing it
+    // against the range would be meaningless
+    stringstream lengthStream(length);
+    unsigned int sequenceLength = 0;
+    if ( !(lengthStream >> sequenceLength) ) {
+        AddError("Sequence length (LN): " + length + " is not a valid number");
+        return false;
+    }
+
+    // invalid if length outside accepted range
+    if ( sequenceLength < Constants::SAM_SQ_LENGTH_MIN || sequenceLength > Constants::SAM_SQ_LENGTH_MAX ) {
+        AddError("Sequence length (LN): " + length + " out of range");
+        return false;
+    }
+
+    // otherwise OK
+    return true;
+}
+
+// Validates the read group dictionary: IDs and platform units must be
+// unique, and every entry must pass ValidateReadGroup().
+// All entries are always examined.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateReadGroupDictionary(void) {
+
+    // TODO: warn/error if no read groups ?
+
+    // dictionary-wide check first
+    bool isValid = ContainsUniqueIDsAndPlatformUnits();
+
+    // then each entry on its own
+    const SamReadGroupDictionary& readGroups = m_header.ReadGroups;
+    const SamReadGroupConstIterator lastGroup = readGroups.ConstEnd();
+    SamReadGroupConstIterator currentGroup = readGroups.ConstBegin();
+    while ( currentGroup != lastGroup ) {
+        if ( !ValidateReadGroup(*currentGroup) )
+            isValid = false;
+        ++currentGroup;
+    }
+
+    return isValid;
+}
+
+// Verifies that no read group ID and no platform unit (PU) appears more
+// than once. An error is recorded for every repeated occurrence.
+// Read groups without a platform unit are skipped for the PU check:
+// an absent PU is stored as an empty string, and several read groups
+// lacking it must not be reported as duplicates of each other.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::ContainsUniqueIDsAndPlatformUnits(void) {
+
+    bool isValid = true;
+    set<string> readGroupIds;
+    set<string> platformUnits;
+    set<string>::iterator idIter;
+    set<string>::iterator puIter;
+
+    // iterate over read groups
+    const SamReadGroupDictionary& readGroups = m_header.ReadGroups;
+    SamReadGroupConstIterator rgIter = readGroups.ConstBegin();
+    SamReadGroupConstIterator rgEnd = readGroups.ConstEnd();
+    for ( ; rgIter != rgEnd; ++rgIter ) {
+        const SamReadGroup& rg = (*rgIter);
+
+        // --------------------------------
+        // check for unique ID
+
+        // lookup read group ID
+        const string& id = rg.ID;
+        idIter = readGroupIds.find(id);
+
+        // error if found (duplicate entry)
+        if ( idIter != readGroupIds.end() ) {
+            AddError("Read group ID (ID): " + id + " is not unique");
+            isValid = false;
+        }
+
+        // otherwise ok, store id
+        readGroupIds.insert(id);
+
+        // --------------------------------
+        // check for unique platform unit
+
+        // nothing to compare when this read group has no platform unit
+        const string& pu = rg.PlatformUnit;
+        if ( pu.empty() )
+            continue;
+
+        // lookup platform unit
+        puIter = platformUnits.find(pu);
+
+        // error if found (duplicate entry)
+        if ( puIter != platformUnits.end() ) {
+            AddError("Platform unit (PU): " + pu + " is not unique");
+            isValid = false;
+        }
+
+        // otherwise ok, store platform unit
+        platformUnits.insert(pu);
+    }
+
+    // return validation state
+    return isValid;
+}
+
+// Validates a single read group: its ID and its sequencing technology.
+// Both checks are always run. Returns true only if both passed.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateReadGroup(const SamReadGroup& rg) {
+    const bool idOk         = CheckReadGroupID(rg.ID);
+    const bool technologyOk = CheckSequencingTechnology(rg.SequencingTechnology);
+    return ( idOk && technologyOk );
+}
+
+// Checks the ID of a single @RG entry.
+// Records an error and returns false when the ID is empty; returns true otherwise.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::CheckReadGroupID(const string& id) {
+
+    const bool hasId = !id.empty();
+
+    // an empty ID is reported as a missing ID tag
+    if ( !hasId )
+        AddError("Read group entry (@RG) is missing ID tag");
+
+    return hasId;
+}
+
+// Checks the sequencing technology (PL) of a single @RG entry.
+//
+// An empty value is accepted. A non-empty value must be recognized by one of
+// Is454(), IsHelicos(), IsIllumina(), IsPacBio() or IsSolid(); otherwise an
+// error is recorded and false is returned.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::CheckSequencingTechnology(const string& technology) {
+
+    // no technology provided: nothing to check
+    if ( technology.empty() )
+        return true;
+
+    // is the value one of the accepted keywords ?
+    const bool isKnownKeyword = Is454(technology)      ||
+                                IsHelicos(technology)  ||
+                                IsIllumina(technology) ||
+                                IsPacBio(technology)   ||
+                                IsSolid(technology);
+
+    // report anything unrecognized
+    if ( !isKnownKeyword )
+        AddError("Invalid read group sequencing platform (PL): " + technology);
+
+    return isKnownKeyword;
+}
+
+// Returns true if 'technology' equals one of the three 454 keyword constants.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::Is454(const string& technology) {
+    if ( technology == Constants::SAM_RG_SEQTECHNOLOGY_454 )         return true;
+    if ( technology == Constants::SAM_RG_SEQTECHNOLOGY_LS454_LOWER ) return true;
+    return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_LS454_UPPER );
+}
+
+// Returns true if 'technology' equals either of the Helicos keyword constants.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::IsHelicos(const string& technology) {
+    if ( technology == Constants::SAM_RG_SEQTECHNOLOGY_HELICOS_LOWER ) return true;
+    return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_HELICOS_UPPER );
+}
+
+// Returns true if 'technology' equals either of the Illumina keyword constants.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::IsIllumina(const string& technology) {
+    if ( technology == Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA_LOWER ) return true;
+    return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA_UPPER );
+}
+
+// Returns true if 'technology' equals either of the PacBio keyword constants.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::IsPacBio(const string& technology) {
+    if ( technology == Constants::SAM_RG_SEQTECHNOLOGY_PACBIO_LOWER ) return true;
+    return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_PACBIO_UPPER );
+}
+
+// Returns true if 'technology' equals either of the SOLiD keyword constants.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::IsSolid(const string& technology) {
+    if ( technology == Constants::SAM_RG_SEQTECHNOLOGY_SOLID_LOWER ) return true;
+    return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_SOLID_UPPER );
+}
+
+// Validates the program data of the header.
+// Both program checks are always run. Returns true only if both passed.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateProgramData(void) {
+    const bool idsAreUnique     = ContainsUniqueProgramIds();
+    const bool previousIdsAreOk = ValidatePreviousProgramIds();
+    return ( idsAreUnique && previousIdsAreOk );
+}
+
+// Placeholder for the duplicate program ID check; currently always returns true.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::ContainsUniqueProgramIds(void) {
+    // TODO: once we have ability to handle multiple @PG entries,
+    //       check here for duplicate ID's
+    //       but for now, nothing is checked
+    return true;
+}
+
+// Placeholder for the previous program ID (PP) check; currently always returns true.
+bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidatePreviousProgramIds(void) {
+    // TODO: check that PP entries are valid later, after we get multiple @PG-entry handling
+    //       for now, nothing is checked
+    return true;
+}
+void SamHeaderValidator::SamHeaderValidatorPrivate::AddError(const string& message) {
+ m_errorMessages.push_back(ERROR_PREFIX + message + NEWLINE);
+}
+
+// Stores a warning message, wrapped as WARN_PREFIX + message + NEWLINE,
+// at the end of m_warningMessages.
+void SamHeaderValidator::SamHeaderValidatorPrivate::AddWarning(const string& message) {
+    const string entry = WARN_PREFIX + message + NEWLINE;
+    m_warningMessages.push_back(entry);
+}
+
+void SamHeaderValidator::SamHeaderValidatorPrivate::PrintErrorMessages(void) {
+
+ // skip if no error messages
+ if ( m_errorMessages.empty() ) return;
+
+ // print error header line
+ cerr << "* SAM header has " << m_errorMessages.size() << " errors:" << endl;
+
+ // print each error message
+ vector<string>::const_iterator errorIter = m_errorMessages.begin();
+ vector<string>::const_iterator errorEnd = m_errorMessages.end();
+ for ( ; errorIter != errorEnd; ++errorIter )
+ cerr << (*errorIter);
+}
+
+// Writes all stored warning messages to stderr, preceded by a summary line
+// giving their count. Prints nothing when there are no warning messages.
+void SamHeaderValidator::SamHeaderValidatorPrivate::PrintWarningMessages(void) {
+
+    // nothing to report
+    if ( m_warningMessages.empty() )
+        return;
+
+    // summary line
+    cerr << "* SAM header has " << m_warningMessages.size() << " warnings:" << endl;
+
+    // messages, in the order they were recorded
+    for ( vector<string>::size_type i = 0; i < m_warningMessages.size(); ++i )
+        cerr << m_warningMessages[i];
+}
+
+// -----------------------------------
+// SamHeaderValidator implementation
+
+// Creates a validator for 'header'.
+// Allocates the private implementation object, which is released in the destructor.
+SamHeaderValidator::SamHeaderValidator(const BamTools::SamHeader& header)
+    : d(new SamHeaderValidatorPrivate(header))
+{
+}
+
+// Releases the private implementation object.
+SamHeaderValidator::~SamHeaderValidator(void)
+{
+    delete d;
+    d = 0;  // leave no dangling pointer behind
+}
+
+// Forwards the validation request to the private implementation.
+bool SamHeaderValidator::Validate(bool verbose) {
+    return d->Validate(verbose);
+}
--- /dev/null
+// ***************************************************************************
+// SamHeaderValidator_p.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 December 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides functionality for validating SamHeader data
+// ***************************************************************************
+
+#ifndef SAM_HEADER_VALIDATOR_P_H
+#define SAM_HEADER_VALIDATOR_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+#include <string>
+#include <vector>
+
+namespace BamTools {
+
+class SamHeader;
+
+namespace Internal {
+
+// Validates the data stored in a SamHeader.
+//
+// The actual work is done by a private implementation object (pointer 'd'),
+// whose type is only forward-declared here.
+class SamHeaderValidator {
+
+    // ctor & dtor
+    public:
+        SamHeaderValidator(const BamTools::SamHeader& header);
+        ~SamHeaderValidator(void);
+
+    // validation interface
+    public:
+        // validates SamHeader data
+        // prints error & warning messages to stderr when (verbose == true)
+        // returns true if the header data is valid
+        bool Validate(bool verbose = false);
+
+    // copying is disabled (declared private, intentionally not implemented):
+    // this class holds a raw pointer to its implementation object and the
+    // destructor deletes it, so a compiler-generated member-wise copy would
+    // lead to the same object being deleted twice
+    private:
+        SamHeaderValidator(const SamHeaderValidator& other);
+        SamHeaderValidator& operator=(const SamHeaderValidator& other);
+
+    // private implementation
+    private:
+        struct SamHeaderValidatorPrivate;
+        SamHeaderValidatorPrivate* d;
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // SAM_HEADER_VALIDATOR_P_H
--- /dev/null
+#ifndef SAM_HEADERVERSION_P_H
+#define SAM_HEADERVERSION_P_H
+
+#include <api/SamConstants.h>
+#include <sstream>
+#include <string>
+
+namespace BamTools {
+namespace Internal {
+
+// Holds a version number as two unsigned components (major & minor).
+// The version can be set from text via SetVersion() and converted back to
+// text via ToString().
+class SamHeaderVersion {
+
+    // construction & destruction
+    public:
+        // creates version 0.0
+        SamHeaderVersion(void)
+            : m_majorVersion(0), m_minorVersion(0)
+        { }
+
+        // starts at version 0.0, then applies SetVersion(version)
+        explicit SamHeaderVersion(const std::string& version)
+            : m_majorVersion(0), m_minorVersion(0)
+        {
+            SetVersion(version);
+        }
+
+        // creates version from explicit components
+        SamHeaderVersion(const unsigned int& majorVersion, const unsigned int& minorVersion)
+            : m_majorVersion(majorVersion), m_minorVersion(minorVersion)
+        { }
+
+        // resets both components
+        ~SamHeaderVersion(void) {
+            m_majorVersion = 0;
+            m_minorVersion = 0;
+        }
+
+    // access data
+    public:
+        unsigned int MajorVersion(void) const { return m_majorVersion; }
+        unsigned int MinorVersion(void) const { return m_minorVersion; }
+
+        inline void SetVersion(const std::string& version);
+        inline std::string ToString(void) const;
+
+    // data members
+    private:
+        unsigned int m_majorVersion;
+        unsigned int m_minorVersion;
+};
+
+// Parses a version string of the form "<major><period><minor>", where the
+// separator is Constants::SAM_PERIOD, and stores the components.
+//
+// Nothing is changed if 'version' is empty or contains no separator.
+// Each component is stored independently, and only if it is non-empty,
+// consists solely of characters from Constants::SAM_DIGITS, and can be read
+// as an unsigned integer; otherwise the corresponding member keeps its
+// current value.
+inline
+void SamHeaderVersion::SetVersion(const std::string& version) {
+
+    // do nothing if version is empty
+    if ( version.empty() )
+        return;
+
+    // do nothing if period not found
+    const size_t periodFound = version.find(Constants::SAM_PERIOD);
+    if ( periodFound == std::string::npos )
+        return;
+
+    // NOTE: the stream variables below are deliberately not named 'major' / 'minor':
+    // some C libraries define function-like macros with those names, which would
+    // be expanded in a declaration such as 'std::stringstream major(text);'
+
+    // store major version if non-empty and contains only digits
+    const std::string majorText = version.substr(0, periodFound);
+    if ( !majorText.empty() &&
+         majorText.find_first_not_of(Constants::SAM_DIGITS) == std::string::npos )
+    {
+        std::stringstream majorStream(majorText);
+        unsigned int parsedMajor = 0;
+        if ( majorStream >> parsedMajor )   // ignore values that cannot be read
+            m_majorVersion = parsedMajor;
+    }
+
+    // store minor version if non-empty and contains only digits
+    const std::string minorText = version.substr(periodFound + 1);
+    if ( !minorText.empty() &&
+         minorText.find_first_not_of(Constants::SAM_DIGITS) == std::string::npos )
+    {
+        std::stringstream minorStream(minorText);
+        unsigned int parsedMinor = 0;
+        if ( minorStream >> parsedMinor )   // ignore values that cannot be read
+            m_minorVersion = parsedMinor;
+    }
+}
+
+// -----------------------------------------------------
+// printing
+
+// Returns the version as text: major component, Constants::SAM_PERIOD, minor component.
+inline std::string SamHeaderVersion::ToString(void) const {
+    std::stringstream formatted;
+    formatted << m_majorVersion;
+    formatted << Constants::SAM_PERIOD;
+    formatted << m_minorVersion;
+    return formatted.str();
+}
+
+// -----------------------------------------------------
+// comparison operators
+
+// Two versions are equal when both their major and minor components match.
+inline bool operator==(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) {
+    if ( lhs.MajorVersion() != rhs.MajorVersion() )
+        return false;
+    return ( lhs.MinorVersion() == rhs.MinorVersion() );
+}
+
+// Orders versions by major component first; the minor component decides
+// only when the major components are equal.
+inline bool operator<(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) {
+    if ( lhs.MajorVersion() != rhs.MajorVersion() )
+        return ( lhs.MajorVersion() < rhs.MajorVersion() );
+    return ( lhs.MinorVersion() < rhs.MinorVersion() );
+}
+
+// remaining comparisons, all expressed in terms of operator<
+inline bool operator> (const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) {
+    return ( rhs < lhs );
+}
+inline bool operator<=(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) {
+    return !( rhs < lhs );
+}
+inline bool operator>=(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) {
+    return !( lhs < rhs );
+}
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // SAM_HEADERVERSION_P_H