From ff5f2ec7c437660185a406d01739f42534105412 Mon Sep 17 00:00:00 2001 From: derek Date: Thu, 23 Dec 2010 22:33:33 -0500 Subject: [PATCH] Added SAM header-handling classes for read/write/validate. * Not fully connected to the BamReader/Writer API yet, but will be phased in soon. * Will enable clients to query, modify & validate a BAM file's SAM header data using the BamTools API, instead of having to use hand-rolled string-parsing code on the result of BamReader::GetHeaderText(). --- src/api/CMakeLists.txt | 75 +-- src/api/SamConstants.h | 104 +++++ src/api/SamHeader.cpp | 102 ++++ src/api/SamHeader.h | 81 ++++ src/api/SamReadGroup.cpp | 68 +++ src/api/SamReadGroup.h | 67 +++ src/api/SamReadGroupDictionary.cpp | 168 +++++++ src/api/SamReadGroupDictionary.h | 95 ++++ src/api/SamSequence.cpp | 46 ++ src/api/SamSequence.h | 64 +++ src/api/SamSequenceDictionary.cpp | 159 +++++++ src/api/SamSequenceDictionary.h | 94 ++++ src/api/internal/SamFormatParser_p.cpp | 221 +++++++++ src/api/internal/SamFormatParser_p.h | 62 +++ src/api/internal/SamFormatPrinter_p.cpp | 185 ++++++++ src/api/internal/SamFormatPrinter_p.h | 61 +++ src/api/internal/SamHeaderValidator_p.cpp | 545 ++++++++++++++++++++++ src/api/internal/SamHeaderValidator_p.h | 52 +++ src/api/internal/SamHeaderVersion_p.h | 115 +++++ 19 files changed, 2328 insertions(+), 36 deletions(-) create mode 100644 src/api/SamConstants.h create mode 100644 src/api/SamHeader.cpp create mode 100644 src/api/SamHeader.h create mode 100644 src/api/SamReadGroup.cpp create mode 100644 src/api/SamReadGroup.h create mode 100644 src/api/SamReadGroupDictionary.cpp create mode 100644 src/api/SamReadGroupDictionary.h create mode 100644 src/api/SamSequence.cpp create mode 100644 src/api/SamSequence.h create mode 100644 src/api/SamSequenceDictionary.cpp create mode 100644 src/api/SamSequenceDictionary.h create mode 100644 src/api/internal/SamFormatParser_p.cpp create mode 100644 src/api/internal/SamFormatParser_p.h create mode 100644 
src/api/internal/SamFormatPrinter_p.cpp create mode 100644 src/api/internal/SamFormatPrinter_p.h create mode 100644 src/api/internal/SamHeaderValidator_p.cpp create mode 100644 src/api/internal/SamHeaderValidator_p.h create mode 100644 src/api/internal/SamHeaderVersion_p.h diff --git a/src/api/CMakeLists.txt b/src/api/CMakeLists.txt index 951ba87..9e41c72 100644 --- a/src/api/CMakeLists.txt +++ b/src/api/CMakeLists.txt @@ -11,39 +11,36 @@ include_directories( ${BamTools_SOURCE_DIR}/src ) # add compiler definitions add_definitions( -DBAMTOOLS_API_LIBRARY ) # (for proper exporting of library symbols) +# list of all BamTools API source (.cpp) files +set( BamToolsAPISources + BamAlignment.cpp + BamIndex.cpp + BamMultiReader.cpp + BamReader.cpp + BamWriter.cpp + BGZF.cpp + SamHeader.cpp + SamReadGroup.cpp + SamReadGroupDictionary.cpp + SamSequence.cpp + SamSequenceDictionary.cpp + internal/BamMultiReader_p.cpp + internal/BamReader_p.cpp + internal/BamStandardIndex_p.cpp + internal/BamToolsIndex_p.cpp + internal/BamWriter_p.cpp + internal/SamFormatParser_p.cpp + internal/SamFormatPrinter_p.cpp + internal/SamHeaderValidator_p.cpp +) + # create main BamTools API shared library -add_library( BamTools SHARED - BamAlignment.cpp - BamIndex.cpp - BamMultiReader.cpp - BamReader.cpp - BamWriter.cpp - BGZF.cpp - internal/BamMultiReader_p.cpp - internal/BamReader_p.cpp - internal/BamStandardIndex_p.cpp - internal/BamToolsIndex_p.cpp - internal/BamWriter_p.cpp - ) -# set shared lib properties +add_library( BamTools SHARED ${BamToolsAPISources} ) set_target_properties( BamTools PROPERTIES SOVERSION "0.9.1" ) set_target_properties( BamTools PROPERTIES OUTPUT_NAME "bamtools" ) # create main BamTools API static library -add_library( BamTools-static STATIC - BamAlignment.cpp - BamIndex.cpp - BamMultiReader.cpp - BamReader.cpp - BamWriter.cpp - BGZF.cpp - internal/BamMultiReader_p.cpp - internal/BamReader_p.cpp - internal/BamStandardIndex_p.cpp - internal/BamToolsIndex_p.cpp - 
internal/BamWriter_p.cpp - ) -# set static lib properties +add_library( BamTools-static STATIC ${BamToolsAPISources} ) set_target_properties( BamTools-static PROPERTIES OUTPUT_NAME "bamtools" ) set_target_properties( BamTools-static PROPERTIES PREFIX "lib" ) @@ -58,11 +55,17 @@ install( TARGETS BamTools-static ARCHIVE DESTINATION "lib/bamtools") # export API headers include(../ExportHeader.cmake) set(ApiIncludeDir "api") -ExportHeader(APIHeaders api_global.h ${ApiIncludeDir}) -ExportHeader(APIHeaders BamAlignment.h ${ApiIncludeDir}) -ExportHeader(APIHeaders BamAux.h ${ApiIncludeDir}) -ExportHeader(APIHeaders BamIndex.h ${ApiIncludeDir}) -ExportHeader(APIHeaders BamMultiReader.h ${ApiIncludeDir}) -ExportHeader(APIHeaders BamReader.h ${ApiIncludeDir}) -ExportHeader(APIHeaders BamWriter.h ${ApiIncludeDir}) -ExportHeader(APIHeaders BGZF.h ${ApiIncludeDir}) +ExportHeader(APIHeaders api_global.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BamAlignment.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BamAux.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BamIndex.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BamMultiReader.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BamReader.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BamWriter.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BGZF.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamConstants.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamHeader.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamReadGroup.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamReadGroupDictionary.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamSequence.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamSequenceDictionary.h ${ApiIncludeDir}) diff --git a/src/api/SamConstants.h b/src/api/SamConstants.h new file mode 100644 index 0000000..6412b3d --- /dev/null +++ b/src/api/SamConstants.h @@ -0,0 +1,104 @@ +// *************************************************************************** +// SamConstants.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, 
Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 23 December 2010 (DB) +// --------------------------------------------------------------------------- +// Provides constants for SAM header +// *************************************************************************** + +#ifndef SAM_CONSTANTS_H +#define SAM_CONSTANTS_H + +#include +#include + +namespace BamTools { +namespace Constants { + +const char SAM_COLON = ':'; +const char SAM_EQUAL = '='; +const char SAM_PERIOD = '.'; +const char SAM_STAR = '*'; +const char SAM_TAB = '\t'; +const std::string SAM_DIGITS = "0123456789"; + +// HD entries +const std::string SAM_HD_BEGIN_TOKEN = "@HD"; +const std::string SAM_HD_VERSION_TAG = "VN"; +const std::string SAM_HD_SORTORDER_TAG = "SO"; +const std::string SAM_HD_GROUPORDER_TAG = "GO"; + +// SQ entries +const std::string SAM_SQ_BEGIN_TOKEN = "@SQ"; +const std::string SAM_SQ_NAME_TAG = "SN"; +const std::string SAM_SQ_LENGTH_TAG = "LN"; +const std::string SAM_SQ_ASSEMBLYID_TAG = "AS"; +const std::string SAM_SQ_URI_TAG = "UR"; +const std::string SAM_SQ_CHECKSUM_TAG = "M5"; +const std::string SAM_SQ_SPECIES_TAG = "SP"; + +// RG entries +const std::string SAM_RG_BEGIN_TOKEN = "@RG"; +const std::string SAM_RG_ID_TAG = "ID"; +const std::string SAM_RG_SAMPLE_TAG = "SM"; +const std::string SAM_RG_LIBRARY_TAG = "LB"; +const std::string SAM_RG_DESCRIPTION_TAG = "DS"; +const std::string SAM_RG_PLATFORMUNIT_TAG = "PU"; +const std::string SAM_RG_PREDICTEDINSERTSIZE_TAG = "PI"; +const std::string SAM_RG_SEQCENTER_TAG = "CN"; +const std::string SAM_RG_PRODUCTIONDATE_TAG = "DT"; +const std::string SAM_RG_SEQTECHNOLOGY_TAG = "PL"; + +// PG entries +const std::string SAM_PG_BEGIN_TOKEN = "@PG"; +const std::string SAM_PG_NAME_TAG = "ID"; +const std::string SAM_PG_VERSION_TAG = "VN"; +const std::string SAM_PG_COMMANDLINE_TAG = "CL"; + +// CO entries +const std::string SAM_CO_BEGIN_TOKEN = "@CO"; + +// 
HD:SO values +const std::string SAM_HD_SORTORDER_COORDINATE = "coordinate"; +const std::string SAM_HD_SORTORDER_QUERYNAME = "queryname"; +const std::string SAM_HD_SORTORDER_UNSORTED = "unsorted"; + +// HD:GO values +const std::string SAM_HD_GROUPORDER_NONE = "none"; +const std::string SAM_HD_GROUPORDER_QUERY = "query"; +const std::string SAM_HD_GROUPORDER_REFERENCE = "reference"; + +// SQ:LN values +const unsigned int SAM_SQ_LENGTH_MIN = 1; +const unsigned int SAM_SQ_LENGTH_MAX = 536870911; // 2^29 - 1 + +// -------------- +// RG:PL values + +// 454 +const std::string SAM_RG_SEQTECHNOLOGY_454 = "454"; +const std::string SAM_RG_SEQTECHNOLOGY_LS454_LOWER = "ls454"; +const std::string SAM_RG_SEQTECHNOLOGY_LS454_UPPER = "LS454"; + +// Helicos +const std::string SAM_RG_SEQTECHNOLOGY_HELICOS_LOWER = "helicos"; +const std::string SAM_RG_SEQTECHNOLOGY_HELICOS_UPPER = "HELICOS"; + +// Illumina +const std::string SAM_RG_SEQTECHNOLOGY_ILLUMINA_LOWER = "illumina"; +const std::string SAM_RG_SEQTECHNOLOGY_ILLUMINA_UPPER = "ILLUMINA"; + +// PacBio +const std::string SAM_RG_SEQTECHNOLOGY_PACBIO_LOWER = "pacbio"; +const std::string SAM_RG_SEQTECHNOLOGY_PACBIO_UPPER = "PACBIO"; + +// SOLiD +const std::string SAM_RG_SEQTECHNOLOGY_SOLID_LOWER = "solid"; +const std::string SAM_RG_SEQTECHNOLOGY_SOLID_UPPER = "SOLID"; + +} // namespace Constants +} // namespace BamTools + +#endif // SAM_CONSTANTS_H diff --git a/src/api/SamHeader.cpp b/src/api/SamHeader.cpp new file mode 100644 index 0000000..405033f --- /dev/null +++ b/src/api/SamHeader.cpp @@ -0,0 +1,102 @@ +// *************************************************************************** +// SamHeader.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. 
+// --------------------------------------------------------------------------- +// Last modified: 23 December 2010 (DB) +// --------------------------------------------------------------------------- +// Provides functionality for querying/manipulating SAM header data +// ************************************************************************** + +#include +#include +#include +#include +using namespace BamTools; +using namespace BamTools::Internal; +using namespace std; + +SamHeader::SamHeader(const string& headerText) + : Version("") + , SortOrder("") + , GroupOrder("") + , ProgramName("") + , ProgramVersion("") + , ProgramCommandLine("") +{ + SamFormatParser parser(*this); + parser.Parse(headerText); +} + +SamHeader::~SamHeader(void) { + Clear(); +} + +void SamHeader::Clear(void) { + Version.clear(); + SortOrder.clear(); + GroupOrder.clear(); + Sequences.Clear(); + ReadGroups.Clear(); + ProgramName.clear(); + ProgramVersion.clear(); + ProgramCommandLine.clear(); + Comments.clear(); +} + +// retrieve the SAM header, with any local modifications +string SamHeader::ToString(void) const { + SamFormatPrinter printer(*this); + return printer.ToString(); +} + +// query if header contains @HD VN: +bool SamHeader::HasVersion(void) const { + return (!Version.empty()); +} + +// query if header contains @HD SO: +bool SamHeader::HasSortOrder(void) const { + return (!SortOrder.empty()); +} + +// query if header contains @HD GO: +bool SamHeader::HasGroupOrder(void) const { + return (!GroupOrder.empty()); +} + +// query if header contains @SQ entries +bool SamHeader::HasSequences(void) const { + return (!Sequences.IsEmpty()); +} + +// query if header contains @RG entries +bool SamHeader::HasReadGroups(void) const { + return (!ReadGroups.IsEmpty()); +} + +// query if header contains @PG ID: +bool SamHeader::HasProgramName(void) const { + return (!ProgramName.empty()); +} + +// query if header contains @PG VN: +bool SamHeader::HasProgramVersion(void) const { + return
(!ProgramVersion.empty()); +} + +// query if header contains @PG CL: +bool SamHeader::HasProgramCommandLine(void) const { + return (!ProgramCommandLine.empty()); +} + +// query if header contains @CO entries +bool SamHeader::HasComments(void) const { + return (!Comments.empty()); +} + +// validation +bool SamHeader::IsValid(bool verbose) const { + SamHeaderValidator validator(*this); + return validator.Validate(verbose); +} diff --git a/src/api/SamHeader.h b/src/api/SamHeader.h new file mode 100644 index 0000000..b51f175 --- /dev/null +++ b/src/api/SamHeader.h @@ -0,0 +1,81 @@ +// *************************************************************************** +// SamHeader.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 23 December 2010 (DB) +// --------------------------------------------------------------------------- +// Provides functionality for querying/manipulating SAM header data +// ************************************************************************** + +#ifndef SAM_HEADER_H +#define SAM_HEADER_H + +#include +#include +#include +#include +#include + +namespace BamTools { + +struct API_EXPORT SamHeader { + + // ctor & dtor + public: + explicit SamHeader(const std::string& headerText = ""); + ~SamHeader(void); + + // query/modify entire SamHeader at once + public: + + // clear all header contents + void Clear(void); + + // checks if SAM header is well-formed + // @verbose - if true, validation errors & warnings will be printed to stderr + // otherwise, output is suppressed and only validation check occurs + bool IsValid(bool verbose = false) const; + + // retrieves the printable, SAM-formatted header + // (with any local modifications since construction) + std::string ToString(void) const; + + // query if header contains data elements + public: + bool HasVersion(void) const; + bool HasSortOrder(void)
const; + bool HasGroupOrder(void) const; + bool HasSequences(void) const; + bool HasReadGroups(void) const; + bool HasProgramName(void) const; + bool HasProgramVersion(void) const; + bool HasProgramCommandLine(void) const; + bool HasComments(void) const; + + // data members + public: + + // header metadata (@HD line) + std::string Version; // VN: + std::string SortOrder; // SO: + std::string GroupOrder; // GO: + + // header sequences (@SQ entries) + SamSequenceDictionary Sequences; + + // header read groups (@RG entries) + SamReadGroupDictionary ReadGroups; + + // header program data (@PG entries) + std::string ProgramName; // ID: + std::string ProgramVersion; // VN: + std::string ProgramCommandLine; // CL: + + // header comments (@CO entries) + std::vector Comments; +}; + +} // namespace BamTools + +#endif // SAM_HEADER_H diff --git a/src/api/SamReadGroup.cpp b/src/api/SamReadGroup.cpp new file mode 100644 index 0000000..96c8e4e --- /dev/null +++ b/src/api/SamReadGroup.cpp @@ -0,0 +1,68 @@ +// *************************************************************************** +// SamReadGroup.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. 
+// --------------------------------------------------------------------------- +// Last modified: 23 December 2010 (DB) +// --------------------------------------------------------------------------- +// Provides functionality for querying/manipulating read group data +// ************************************************************************** + +#include +using namespace BamTools; +using namespace std; + +// default ctor +SamReadGroup::SamReadGroup(void) + : ID("") + , Sample("") + , Library("") + , Description("") + , PlatformUnit("") + , PredictedInsertSize("") + , SequencingCenter("") + , ProductionDate("") + , SequencingTechnology("") +{ } + +// ctor with provided ID +SamReadGroup::SamReadGroup(const string& id) + : ID(id) + , Sample("") + , Library("") + , Description("") + , PlatformUnit("") + , PredictedInsertSize("") + , SequencingCenter("") + , ProductionDate("") + , SequencingTechnology("") +{ } + +// dtor +SamReadGroup::~SamReadGroup(void) { + Clear(); +} + +// clear all contents +void SamReadGroup::Clear(void) { + ID.clear(); + Sample.clear(); + Library.clear(); + Description.clear(); + PlatformUnit.clear(); + PredictedInsertSize.clear(); + SequencingCenter.clear(); + ProductionDate.clear(); + SequencingTechnology.clear(); +} + +// convenience methods to check if SamReadGroup contains these values: +bool SamReadGroup::HasID(void) const { return (!ID.empty()); } +bool SamReadGroup::HasSample(void) const { return (!Sample.empty()); } +bool SamReadGroup::HasLibrary(void) const { return (!Library.empty()); } +bool SamReadGroup::HasDescription(void) const { return (!Description.empty()); } +bool SamReadGroup::HasPlatformUnit(void) const { return (!PlatformUnit.empty()); } +bool SamReadGroup::HasPredictedInsertSize(void) const { return (!PredictedInsertSize.empty()); } +bool SamReadGroup::HasSequencingCenter(void) const { return (!SequencingCenter.empty()); } +bool SamReadGroup::HasProductionDate(void) const { return (!ProductionDate.empty()); } +bool 
SamReadGroup::HasSequencingTechnology(void) const { return (!SequencingTechnology.empty()); } diff --git a/src/api/SamReadGroup.h b/src/api/SamReadGroup.h new file mode 100644 index 0000000..fac4612 --- /dev/null +++ b/src/api/SamReadGroup.h @@ -0,0 +1,67 @@ +// *************************************************************************** +// SamReadGroup.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 23 December 2010 (DB) +// --------------------------------------------------------------------------- +// Provides functionality for querying/manipulating read group data +// ************************************************************************** + +#ifndef SAM_READGROUP_H +#define SAM_READGROUP_H + +#include "api/api_global.h" +#include + +namespace BamTools { + +class API_EXPORT SamReadGroup { + + // ctor & dtor + public: + SamReadGroup(void); + SamReadGroup(const std::string& id); + ~SamReadGroup(void); + + // public methods + public: + + // clear all contents + void Clear(void); + + // convenience methods to check if SamReadGroup contains these values: + bool HasID(void) const; + bool HasSample(void) const; + bool HasLibrary(void) const; + bool HasDescription(void) const; + bool HasPlatformUnit(void) const; + bool HasPredictedInsertSize(void) const; + bool HasSequencingCenter(void) const; + bool HasProductionDate(void) const; + bool HasSequencingTechnology(void) const; + + // data members + public: + std::string ID; // ID: + std::string Sample; // SM: + std::string Library; // LB: + std::string Description; // DS: + std::string PlatformUnit; // PU: + std::string PredictedInsertSize; // PI: + std::string SequencingCenter; // CN: + std::string ProductionDate; // DT: + std::string SequencingTechnology; // PL: +}; + +// --------------------------------------------------- +// comparison operators + +// for equality: 
compare IDs +inline bool operator==(const SamReadGroup& lhs, const SamReadGroup& rhs) { + return lhs.ID == rhs.ID; +} + +} // namespace BamTools + +#endif // SAM_READGROUP_H diff --git a/src/api/SamReadGroupDictionary.cpp b/src/api/SamReadGroupDictionary.cpp new file mode 100644 index 0000000..fb03fac --- /dev/null +++ b/src/api/SamReadGroupDictionary.cpp @@ -0,0 +1,168 @@ +// *************************************************************************** +// SamReadGroupDictionary.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 23 December 2010 (DB) +// --------------------------------------------------------------------------- +// Provides container operations for collection of read group entries +// ************************************************************************* + +#include +using namespace BamTools; + +#include +#include +using namespace std; + +// ctor +SamReadGroupDictionary::SamReadGroupDictionary(void) { } + +// dtor +SamReadGroupDictionary::~SamReadGroupDictionary(void) { + m_data.clear(); +} + +// adds read group if not already in container +void SamReadGroupDictionary::Add(const SamReadGroup& readGroup) { + if ( IsEmpty() || !Contains(readGroup) ) + m_data.push_back(readGroup); +} + +// overload to support std::string +void SamReadGroupDictionary::Add(const string& readGroupId) { + Add( SamReadGroup(readGroupId) ); +} + +// add multiple read groups +void SamReadGroupDictionary::Add(const vector& readGroups) { + vector::const_iterator rgIter = readGroups.begin(); + vector::const_iterator rgEnd = readGroups.end(); + for ( ; rgIter!= rgEnd; ++rgIter ) + Add(*rgIter); +} + +// overload to support std::string +void SamReadGroupDictionary::Add(const vector& readGroupIds) { + vector::const_iterator rgIter = readGroupIds.begin(); + vector::const_iterator rgEnd = readGroupIds.end(); + for ( ; rgIter!= 
rgEnd; ++rgIter ) + Add(*rgIter); +} + +// returns iterator to container begin +SamReadGroupIterator SamReadGroupDictionary::Begin(void) { + return m_data.begin(); +} + +// returns const_iterator to container begin +SamReadGroupConstIterator SamReadGroupDictionary::Begin(void) const { + return m_data.begin(); +} + +// clear read group container +void SamReadGroupDictionary::Clear(void) { + m_data.clear(); +} + +// explicit request for const_iterator to container begin +SamReadGroupConstIterator SamReadGroupDictionary::ConstBegin(void) const { + return m_data.begin(); +} + +// explicit request for const_iterator to container end +SamReadGroupConstIterator SamReadGroupDictionary::ConstEnd(void) const { + return m_data.end(); +} + +// returns true if container contains a read group with this ID tag +bool SamReadGroupDictionary::Contains(const string& readGroupId) const { + return ( IndexOf(readGroupId) != (int)m_data.size() ); +} + +bool SamReadGroupDictionary::Contains(const SamReadGroup& readGroup) const { + return ( IndexOf(readGroup) != (int)m_data.size() ); +} + +// returns iterator to container end +SamReadGroupIterator SamReadGroupDictionary::End(void) { + return m_data.end(); +} + +// returns const_iterator to container end +SamReadGroupConstIterator SamReadGroupDictionary::End(void) const { + return m_data.end(); +} + +// returns vector index of read group if found +// returns vector::size() (invalid index) if not found +int SamReadGroupDictionary::IndexOf(const SamReadGroup& readGroup) const { + SamReadGroupConstIterator begin = ConstBegin(); + SamReadGroupConstIterator iter = begin; + SamReadGroupConstIterator end = ConstEnd(); + for ( ; iter != end; ++iter ) + if ( *iter == readGroup ) break; + return distance( begin, iter ); +} + +// overload to support std::string +int SamReadGroupDictionary::IndexOf(const string& readGroupId) const { + return IndexOf( SamReadGroup(readGroupId) ); +} + +// returns true if container is empty +bool
SamReadGroupDictionary::IsEmpty(void) const { + return m_data.empty(); +} + +// removes read group (if it exists) +void SamReadGroupDictionary::Remove(const SamReadGroup& readGroup) { + if ( Contains(readGroup) ) + m_data.erase( m_data.begin() + IndexOf(readGroup) ); +} + +// overload to support std::string +void SamReadGroupDictionary::Remove(const string& readGroupId) { + Remove( SamReadGroup(readGroupId) ); +} + +// remove multiple read groups +void SamReadGroupDictionary::Remove(const vector& readGroups) { + vector::const_iterator rgIter = readGroups.begin(); + vector::const_iterator rgEnd = readGroups.end(); + for ( ; rgIter!= rgEnd; ++rgIter ) + Remove(*rgIter); +} + +// overload to support std::string +void SamReadGroupDictionary::Remove(const vector& readGroupIds) { + vector::const_iterator rgIter = readGroupIds.begin(); + vector::const_iterator rgEnd = readGroupIds.end(); + for ( ; rgIter!= rgEnd; ++rgIter ) + Remove(*rgIter); +} + +// returns size of container (number of current read groups) +int SamReadGroupDictionary::Size(void) const { + return m_data.size(); +} + +// retrieves the SamReadGroup object associated with this ID +// if readGroupId is unknown, a new SamReadGroup is created with this ID +// and a reference to this new read group entry is returned (like std::map) +SamReadGroup& SamReadGroupDictionary::operator[](const std::string& readGroupId) { + + // look up read group ID + int index = IndexOf(readGroupId); + + // if found, return read group at index + if ( index != (int)m_data.size() ) + return m_data[index]; + + // otherwise, append new read group and return reference + else { + SamReadGroup rg(readGroupId); + m_data.push_back(rg); + return m_data.back(); + } +} diff --git a/src/api/SamReadGroupDictionary.h b/src/api/SamReadGroupDictionary.h new file mode 100644 index 0000000..d21ccf8 --- /dev/null +++ b/src/api/SamReadGroupDictionary.h @@ -0,0 +1,95 @@ +// *************************************************************************** +//
SamReadGroupDictionary.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 23 December 2010 (DB) +// --------------------------------------------------------------------------- +// Provides container operations for collection of read group entries +// ************************************************************************* + +#ifndef SAM_READGROUP_DICTIONARY_H +#define SAM_READGROUP_DICTIONARY_H + +#include +#include +#include +#include + +namespace BamTools { + +typedef std::vector SamReadGroupContainer; +typedef SamReadGroupContainer::iterator SamReadGroupIterator; +typedef SamReadGroupContainer::const_iterator SamReadGroupConstIterator; + +// stores read groups +// can access read groups using SamReadGroup object or (std::string) read group ID tag +class API_EXPORT SamReadGroupDictionary { + + // ctor & dtor + public: + SamReadGroupDictionary(void); + ~SamReadGroupDictionary(void); + + // query/modify read group data + public: + // add a read group + void Add(const SamReadGroup& readGroup); + void Add(const std::string& readGroupIds); + + // add multiple read groups + void Add(const std::vector& readGroups); + void Add(const std::vector& readGroupIds); + + // clear all read groups records + void Clear(void); + + // returns true if dictionary contains this read group + bool Contains(const SamReadGroup& readGroup) const; + bool Contains(const std::string& readGroupId) const; + + // returns true if dictionary is empty + bool IsEmpty(void) const; + + // remove a single read group (does nothing if read group not found) + void Remove(const SamReadGroup& readGroup); + void Remove(const std::string& readGroupId); + + // remove multiple read groups + void Remove(const std::vector& readGroups); + void Remove(const std::vector& readGroupIds); + + // returns size of dictionary (number of current elements) + int Size(void) 
const; + + // retrieves the SamReadGroup object associated with this ID + // if readGroupId is unknown, a new SamReadGroup is created with this ID (and no other data) + // and a reference to this new read group entry is returned (like std::map) + // + // * To avoid these partial entries being created, it is recommended to check + // for existence first using Contains() + SamReadGroup& operator[](const std::string& readGroupId); + + // retrieve read group iterators + // these are typedefs for STL iterators and thus are compatible with STL containers/algorithms + public: + SamReadGroupIterator Begin(void); + SamReadGroupConstIterator Begin(void) const; + SamReadGroupConstIterator ConstBegin(void) const; + SamReadGroupIterator End(void); + SamReadGroupConstIterator End(void) const; + SamReadGroupConstIterator ConstEnd(void) const; + + // internal methods + private: + int IndexOf(const SamReadGroup& readGroup) const; + int IndexOf(const std::string& readGroupId) const; + + // data members + private: + SamReadGroupContainer m_data; +}; + +} // namespace BamTools + +#endif // SAM_READGROUP_DICTIONARY_H diff --git a/src/api/SamSequence.cpp b/src/api/SamSequence.cpp new file mode 100644 index 0000000..0554604 --- /dev/null +++ b/src/api/SamSequence.cpp @@ -0,0 +1,46 @@ +// *************************************************************************** +// SamSequence.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved.
+// --------------------------------------------------------------------------- +// Last modified: 23 December 2010 (DB) +// --------------------------------------------------------------------------- +// Provides functionality for querying/manipulating sequence data +// ************************************************************************* + +#include +using namespace BamTools; +using namespace std; + +// ctor +SamSequence::SamSequence(const string& name) + : Name(name) + , Length("") + , AssemblyID("") + , Checksum("") + , URI("") + , Species("") +{ } + +// dtor +SamSequence::~SamSequence(void) { + Clear(); +} + +// clear all contents +void SamSequence::Clear(void) { + Name.clear(); + Length.clear(); + AssemblyID.clear(); + Checksum.clear(); + URI.clear(); + Species.clear(); +} + +// convenience methods to check if SamSequence contains these values: +bool SamSequence::HasName(void) const { return (!Name.empty()); } +bool SamSequence::HasLength(void) const { return (!Length.empty()); } +bool SamSequence::HasAssemblyID(void) const { return (!AssemblyID.empty()); } +bool SamSequence::HasChecksum(void) const { return (!Checksum.empty()); } +bool SamSequence::HasURI(void) const { return (!URI.empty()); } +bool SamSequence::HasSpecies(void) const { return (!Species.empty()); } diff --git a/src/api/SamSequence.h b/src/api/SamSequence.h new file mode 100644 index 0000000..db6891d --- /dev/null +++ b/src/api/SamSequence.h @@ -0,0 +1,64 @@ +// *************************************************************************** +// SamSequence.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. 
+// --------------------------------------------------------------------------- +// Last modified: 23 December 2010 (DB) +// --------------------------------------------------------------------------- +// Provides functionality for querying/manipulating sequence data +// ************************************************************************** + +#ifndef SAM_SEQUENCE_H +#define SAM_SEQUENCE_H + +#include +#include + +namespace BamTools { + +class API_EXPORT SamSequence { + + // ctor & dtor + public: + SamSequence(const std::string& name = ""); + ~SamSequence(void); + + // public methods + public: + + // clear all contents + void Clear(void); + + // convenience methods to check if SamSequence contains these values: + bool HasName(void) const; + bool HasLength(void) const; + bool HasAssemblyID(void) const; + bool HasChecksum(void) const; + bool HasURI(void) const; + bool HasSpecies(void) const; + + // data members + public: + std::string Name; // SN: + std::string Length; // LN: + std::string AssemblyID; // AS: + std::string Checksum; // M5: + std::string URI; // UR: + std::string Species; // SP: +}; + +// --------------------------------------------------- +// comparison operators + +// for equality: compare Name, Length, & Checksum (if it exists for both) +inline bool operator==(const SamSequence& lhs, const SamSequence& rhs) { + if ( lhs.Name != rhs.Name ) return false; + if ( lhs.Length != rhs.Length ) return false; + if ( lhs.HasChecksum() && rhs.HasChecksum() ) + return (lhs.Checksum == rhs.Checksum); + else return true; +} + +} // namespace BamTools + +#endif // SAM_SEQUENCE_H diff --git a/src/api/SamSequenceDictionary.cpp b/src/api/SamSequenceDictionary.cpp new file mode 100644 index 0000000..c023a39 --- /dev/null +++ b/src/api/SamSequenceDictionary.cpp @@ -0,0 +1,159 @@ +#include +using namespace BamTools; + +#include +using namespace std; + +// ctor +SamSequenceDictionary::SamSequenceDictionary(void) { } + +// dtor 
+SamSequenceDictionary::~SamSequenceDictionary(void) { + m_data.clear(); +} + +// adds sequence if not already in container +void SamSequenceDictionary::Add(const SamSequence& sequence) { + if ( IsEmpty() || !Contains(sequence) ) + m_data.push_back(sequence); +} + +// overload to support std::string +void SamSequenceDictionary::Add(const string& sequenceName) { + Add( SamSequence(sequenceName) ); +} + +// add multiple sequences +void SamSequenceDictionary::Add(const vector& sequences) { + vector::const_iterator rgIter = sequences.begin(); + vector::const_iterator rgEnd = sequences.end(); + for ( ; rgIter!= rgEnd; ++rgIter ) + Add(*rgIter); +} + +// overload to support std::string +void SamSequenceDictionary::Add(const vector& sequenceNames) { + vector::const_iterator rgIter = sequenceNames.begin(); + vector::const_iterator rgEnd = sequenceNames.end(); + for ( ; rgIter!= rgEnd; ++rgIter ) + Add(*rgIter); +} + +// returns iterator to container begin +SamSequenceIterator SamSequenceDictionary::Begin(void) { + return m_data.begin(); +} + +// returns const_iterator to container begin +SamSequenceConstIterator SamSequenceDictionary::Begin(void) const { + return m_data.begin(); +} + +// clear sequence container +void SamSequenceDictionary::Clear(void) { + m_data.clear(); +} + +// explicit request for const_iterator to container begin +SamSequenceConstIterator SamSequenceDictionary::ConstBegin(void) const { + return m_data.begin(); +} + +// explicit request for const_iterator to container end +SamSequenceConstIterator SamSequenceDictionary::ConstEnd(void) const { + return m_data.end(); +} + +// returns true if container contains a sequence with this name +bool SamSequenceDictionary::Contains(const string& sequenceName) const { + return ( IndexOf(sequenceName) != (int)m_data.size() ); +} + +bool SamSequenceDictionary::Contains(const SamSequence& seq) const { + return ( IndexOf(seq) != (int)m_data.size() ); +} + +// returns iterator to container end +SamSequenceIterator
SamSequenceDictionary::End(void) { + return m_data.end(); +} + +// returns const_iterator to container begin +SamSequenceConstIterator SamSequenceDictionary::End(void) const { + return m_data.end(); +} + +// returns vector index of sequence if found +// returns vector::size() (invalid index) if not found +int SamSequenceDictionary::IndexOf(const SamSequence& sequence) const { + SamSequenceConstIterator begin = ConstBegin(); + SamSequenceConstIterator iter = begin; + SamSequenceConstIterator end = ConstEnd(); + for ( ; iter != end; ++iter ) + if ( *iter == sequence ) break; + return distance( begin, iter ); +} + +// overload to support std::string +int SamSequenceDictionary::IndexOf(const string& sequenceName) const { + return IndexOf( SamSequence(sequenceName) ); +} + +// returns true if container is empty +bool SamSequenceDictionary::IsEmpty(void) const { + return m_data.empty(); +} + +// removes sequence (if it exists) +void SamSequenceDictionary::Remove(const SamSequence& sequence) { + if ( Contains(sequence) ) + m_data.erase( m_data.begin() + IndexOf(sequence) ); +} + +// overlaod to support std::string +void SamSequenceDictionary::Remove(const string& sequenceName) { + Remove( SamSequence(sequenceName) ); +} + +// remove multiple sequences +void SamSequenceDictionary::Remove(const vector& sequences) { + vector::const_iterator rgIter = sequences.begin(); + vector::const_iterator rgEnd = sequences.end(); + for ( ; rgIter!= rgEnd; ++rgIter ) + Remove(*rgIter); +} + +// overload to support std::string +void SamSequenceDictionary::Remove(const vector& sequenceNames) { + vector::const_iterator rgIter = sequenceNames.begin(); + vector::const_iterator rgEnd = sequenceNames.end(); + for ( ; rgIter!= rgEnd; ++rgIter ) + Remove(*rgIter); +} + +// returns size of container (number of current sequences) +int SamSequenceDictionary::Size(void) const { + return m_data.size(); +} + +// retrieves the SamSequence object associated with this name +// if sequenceName is unknown, a 
new SamSequence is created with this name (and invalid length 0) +// and a reference to this new sequence entry is returned (like std::map) +SamSequence& SamSequenceDictionary::operator[](const std::string& sequenceName) { + + // look up sequence ID + int index = IndexOf(sequenceName); + + // if found, return sequence at index + if ( index != (int)m_data.size() ) + return m_data[index]; + + // otherwise, append new sequence and return reference + else { + SamSequence seq(sequenceName); + seq.Length = "0"; + m_data.push_back(seq); + return m_data.back(); + } +} + diff --git a/src/api/SamSequenceDictionary.h b/src/api/SamSequenceDictionary.h new file mode 100644 index 0000000..bcd1652 --- /dev/null +++ b/src/api/SamSequenceDictionary.h @@ -0,0 +1,94 @@ +// *************************************************************************** +// SamSequenceDictionary.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 23 December 2010 (DB) +// --------------------------------------------------------------------------- +// Provides container operations for collection of sequence entries +// ************************************************************************* + +#ifndef SAM_SEQUENCE_DICTIONARY_H +#define SAM_SEQUENCE_DICTIONARY_H + +#include +#include +#include +#include + +namespace BamTools { + +typedef std::vector SamSequenceContainer; +typedef SamSequenceContainer::iterator SamSequenceIterator; +typedef SamSequenceContainer::const_iterator SamSequenceConstIterator; + +class API_EXPORT SamSequenceDictionary { + + // ctor & dtor + public: + SamSequenceDictionary(void); + ~SamSequenceDictionary(void); + + // query/modify sequence data + public: + // add a sequence + void Add(const SamSequence& sequence); + void Add(const std::string& sequenceNames); + + // add multiple sequences + void Add(const std::vector& sequences); + void 
Add(const std::vector& sequenceNames); + + // clear all sequence records + void Clear(void); + + // returns true if dictionary contains this sequence + bool Contains(const SamSequence& sequence) const; + bool Contains(const std::string& sequenceName) const; + + // returns true if dictionary is empty + bool IsEmpty(void) const; + + // remove a single sequence (does nothing if sequence not found) + void Remove(const SamSequence& sequence); + void Remove(const std::string& sequenceName); + + // remove multiple sequences + void Remove(const std::vector& sequences); + void Remove(const std::vector& sequenceNames); + + // returns size of dictionary (number of current elements) + int Size(void) const; + + // retrieves the SamSequence object associated with this name + // if sequenceName is unknown, a new SamSequence is created with this name (and invalid length 0) + // and a reference to this new sequence entry is returned (like std::map) + // + // * To avoid these partial entries being created, it is recommended to check + // for existence first using Contains() + SamSequence& operator[](const std::string& sequenceName); + + // retrieve sequence iterators + // these are typedefs for STL iterators and thus are compatible with STL containers/algorithms + public: + SamSequenceIterator Begin(void); + SamSequenceConstIterator Begin(void) const; + SamSequenceConstIterator ConstBegin(void) const; + SamSequenceIterator End(void); + SamSequenceConstIterator End(void) const; + SamSequenceConstIterator ConstEnd(void) const; + + // internal methods + private: + int IndexOf(const SamSequence& sequence) const; + int IndexOf(const std::string& sequenceName) const; + + // data members + private: + SamSequenceContainer m_data; +}; + +} // namespace BamTools + +#endif // SAM_SEQUENCE_DICTIONARY + diff --git a/src/api/internal/SamFormatParser_p.cpp b/src/api/internal/SamFormatParser_p.cpp new file mode 100644 index 0000000..aa690b8 --- /dev/null +++ b/src/api/internal/SamFormatParser_p.cpp 
@@ -0,0 +1,221 @@ +// *************************************************************************** +// SamFormatParser.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 23 December 2010 (DB) +// --------------------------------------------------------------------------- +// Provides functionality for parsing SAM header text into SamHeader object +// *************************************************************************** + +#include +#include +#include +using namespace BamTools; +using namespace BamTools::Internal; + +#include +#include +#include +using namespace std; + +SamFormatParser::SamFormatParser(SamHeader& header) + : m_header(header) +{ } + +SamFormatParser::~SamFormatParser(void) { } + +void SamFormatParser::Parse(const string& headerText) { + + // clear header's prior contents + m_header.Clear(); + + // empty header is OK, but skip processing + if ( headerText.empty() ) + return; + + // other wise parse SAM lines + istringstream headerStream(headerText); + string headerLine = ""; + while ( getline(headerStream, headerLine) ) + ParseSamLine(headerLine); + return; +} + +void SamFormatParser::ParseSamLine(const string& line) { + + // skip if line is not long enough to contain true values + if (line.length() < 5 ) return; + + // determine token at beginning of line + const string firstToken = line.substr(0,3); + string restOfLine = line.substr(4); + if ( firstToken == Constants::SAM_HD_BEGIN_TOKEN) ParseHDLine(restOfLine); + else if ( firstToken == Constants::SAM_SQ_BEGIN_TOKEN) ParseSQLine(restOfLine); + else if ( firstToken == Constants::SAM_RG_BEGIN_TOKEN) ParseRGLine(restOfLine); + else if ( firstToken == Constants::SAM_PG_BEGIN_TOKEN) ParsePGLine(restOfLine); + else if ( firstToken == Constants::SAM_CO_BEGIN_TOKEN) ParseCOLine(restOfLine); + else cerr << "SAM Format Error - unknown token: " << 
firstToken << endl; + return; +} + +void SamFormatParser::ParseHDLine(const string& line) { + + // split HD lines into tokens + vector tokens = Split(line, Constants::SAM_TAB); + + // iterate over tokens + vector::const_iterator tokenIter = tokens.begin(); + vector::const_iterator tokenEnd = tokens.end(); + for ( ; tokenIter != tokenEnd; ++tokenIter ) { + + // get tag/value + const string tokenTag = (*tokenIter).substr(0,2); + const string tokenValue = (*tokenIter).substr(3); + + // set header contents + if ( tokenTag == Constants::SAM_HD_VERSION_TAG ) m_header.Version = tokenValue; + else if ( tokenTag == Constants::SAM_HD_GROUPORDER_TAG ) m_header.GroupOrder = tokenValue; + else if ( tokenTag == Constants::SAM_HD_SORTORDER_TAG ) m_header.SortOrder = tokenValue; + else + cerr << "SAM Format Error - unknown HD tag: " << tokenTag << endl; + } + + // if @HD line exists, VN must be provided + if ( !m_header.HasVersion() ) { + cerr << "SAM Format Error - @HD line is missing VN tag!" << endl; + return; + } +} + +void SamFormatParser::ParseSQLine(const string& line) { + + SamSequence seq; + + // split SQ line into tokens + vector tokens = Split(line, Constants::SAM_TAB); + + // iterate over tokens + vector::const_iterator tokenIter = tokens.begin(); + vector::const_iterator tokenEnd = tokens.end(); + for ( ; tokenIter != tokenEnd; ++tokenIter ) { + + // get tag/value + const string tokenTag = (*tokenIter).substr(0,2); + const string tokenValue = (*tokenIter).substr(3); + + // set sequence contents + if ( tokenTag == Constants::SAM_SQ_NAME_TAG ) seq.Name = tokenValue; + else if ( tokenTag == Constants::SAM_SQ_LENGTH_TAG ) seq.Length = tokenValue; + else if ( tokenTag == Constants::SAM_SQ_ASSEMBLYID_TAG ) seq.AssemblyID = tokenValue; + else if ( tokenTag == Constants::SAM_SQ_URI_TAG ) seq.URI = tokenValue; + else if ( tokenTag == Constants::SAM_SQ_CHECKSUM_TAG ) seq.Checksum = tokenValue; + else if ( tokenTag == Constants::SAM_SQ_SPECIES_TAG ) seq.Species = tokenValue; + 
else + cerr << "SAM Format Error - unknown SQ tag: " << tokenTag << endl; + } + + // if @SQ line exists, SN must be provided + if ( !seq.HasName() ) { + cerr << "SAM Format Error - @SQ line is missing SN tag!" << endl; + return; + } + + // if @SQ line exists, LN must be provided + if ( !seq.HasLength() ) { + cerr << "SAM Format Error - @SQ line is missing LN tag!" << endl; + return; + } + + // store SAM sequence entry + m_header.Sequences.Add(seq); +} + +void SamFormatParser::ParseRGLine(const string& line) { + + SamReadGroup rg; + + // split string into tokens + vector tokens = Split(line, Constants::SAM_TAB); + + // iterate over tokens + vector::const_iterator tokenIter = tokens.begin(); + vector::const_iterator tokenEnd = tokens.end(); + for ( ; tokenIter != tokenEnd; ++tokenIter ) { + + // get token tag/value + const string tokenTag = (*tokenIter).substr(0,2); + const string tokenValue = (*tokenIter).substr(3); + + // set read group contents + if ( tokenTag == Constants::SAM_RG_ID_TAG ) rg.ID = tokenValue; + else if ( tokenTag == Constants::SAM_RG_SAMPLE_TAG ) rg.Sample = tokenValue; + else if ( tokenTag == Constants::SAM_RG_LIBRARY_TAG ) rg.Library = tokenValue; + else if ( tokenTag == Constants::SAM_RG_DESCRIPTION_TAG ) rg.Description = tokenValue; + else if ( tokenTag == Constants::SAM_RG_PLATFORMUNIT_TAG ) rg.PlatformUnit = tokenValue; + else if ( tokenTag == Constants::SAM_RG_PREDICTEDINSERTSIZE_TAG ) rg.PredictedInsertSize = tokenValue; + else if ( tokenTag == Constants::SAM_RG_SEQCENTER_TAG ) rg.SequencingCenter = tokenValue; + else if ( tokenTag == Constants::SAM_RG_PRODUCTIONDATE_TAG ) rg.ProductionDate = tokenValue; + else if ( tokenTag == Constants::SAM_RG_SEQTECHNOLOGY_TAG ) rg.SequencingTechnology = tokenValue; + else + cerr << "SAM Format Error - unknown RG tag: " << tokenTag << endl; + } + + // if @RG line exists, ID must be provided + if ( !rg.HasID() ) { + cerr << "SAM Format Error - @RG line is missing ID tag!" 
<< endl; + return; + } + + // if @RG line exists, SM must be provided + if ( !rg.HasSample() ) { + cerr << "SAM Format Error - @RG line is missing SM tag!" << endl; + return; + } + + // store SAM read group entry + m_header.ReadGroups.Add(rg); +} + +void SamFormatParser::ParsePGLine(const string& line) { + + // split string into tokens + vector tokens = Split(line, Constants::SAM_TAB); + + // iterate over tokens + vector::const_iterator tokenIter = tokens.begin(); + vector::const_iterator tokenEnd = tokens.end(); + for ( ; tokenIter != tokenEnd; ++tokenIter ) { + + // get token tag/value + const string tokenTag = (*tokenIter).substr(0,2); + const string tokenValue = (*tokenIter).substr(3); + + // set header contents + if ( tokenTag == Constants::SAM_PG_NAME_TAG ) m_header.ProgramName = tokenValue; + else if ( tokenTag == Constants::SAM_PG_VERSION_TAG ) m_header.ProgramVersion = tokenValue; + else if ( tokenTag == Constants::SAM_PG_COMMANDLINE_TAG ) m_header.ProgramCommandLine = tokenValue; + else + cerr << "SAM Format Error - unknown PG tag: " << tokenTag << endl; + } + + // if @PG line exists, ID must be provided + if ( !m_header.HasProgramName() ) { + cerr << "SAM Format Error - @PG line is missing ID tag!" 
<< endl; + return; + } +} + +void SamFormatParser::ParseCOLine(const string& line) { + // simply add line to comments list + m_header.Comments.push_back(line); +} + +const vector SamFormatParser::Split(const string& line, const char delim) { + vector tokens; + stringstream lineStream(line); + string token; + while ( getline(lineStream, token, delim) ) + tokens.push_back(token); + return tokens; +} diff --git a/src/api/internal/SamFormatParser_p.h b/src/api/internal/SamFormatParser_p.h new file mode 100644 index 0000000..daabe39 --- /dev/null +++ b/src/api/internal/SamFormatParser_p.h @@ -0,0 +1,62 @@ +// *************************************************************************** +// SamFormatParser.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 23 December 2010 (DB) +// --------------------------------------------------------------------------- +// Provides functionality for parsing SAM header text into SamHeader object +// *************************************************************************** + +#ifndef SAM_FORMAT_PARSER_H +#define SAM_FORMAT_PARSER_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. 
+ +#include +#include + +namespace BamTools { + +class SamHeader; + +namespace Internal { + +class SamFormatParser { + + // ctor & dtor + public: + SamFormatParser(BamTools::SamHeader& header); + ~SamFormatParser(void); + + // parse text & populate header data + public: + void Parse(const std::string& headerText); + + // internal methods + private: + void ParseSamLine(const std::string& line); + void ParseHDLine(const std::string& line); + void ParseSQLine(const std::string& line); + void ParseRGLine(const std::string& line); + void ParsePGLine(const std::string& line); + void ParseCOLine(const std::string& line); + const std::vector Split(const std::string& line, const char delim); + + // data members + private: + SamHeader& m_header; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // SAM_FORMAT_PARSER_H diff --git a/src/api/internal/SamFormatPrinter_p.cpp b/src/api/internal/SamFormatPrinter_p.cpp new file mode 100644 index 0000000..dcde46e --- /dev/null +++ b/src/api/internal/SamFormatPrinter_p.cpp @@ -0,0 +1,185 @@ +// *************************************************************************** +// SamFormatPrinter.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. 
+// --------------------------------------------------------------------------- +// Last modified: 23 December 2010 (DB) +// --------------------------------------------------------------------------- +// Provides functionality for printing formatted SAM header to string +// *************************************************************************** + +#include +#include +#include +using namespace BamTools; +using namespace BamTools::Internal; + +#include +#include +#include +using namespace std; + +SamFormatPrinter::SamFormatPrinter(const SamHeader& header) + : m_header(header) +{ } + +SamFormatPrinter::~SamFormatPrinter(void) { } + +const string SamFormatPrinter::FormatTag(const string &tag, const string &value) const { + return string(Constants::SAM_TAB + tag + Constants::SAM_COLON + value); +} + +const string SamFormatPrinter::ToString(void) const { + + // clear out stream + stringstream out(""); + + // generate formatted header text + PrintHD(out); + PrintSQ(out); + PrintRG(out); + PrintPG(out); + PrintCO(out); + + // return result + return out.str(); +} + +void SamFormatPrinter::PrintHD(std::stringstream& out) const { + + // if header has @HD data + if ( m_header.HasVersion() ) { + + // @HD VN: + out << Constants::SAM_HD_BEGIN_TOKEN + << FormatTag(Constants::SAM_HD_VERSION_TAG, m_header.Version); + + // SO: + if ( m_header.HasSortOrder() ) + out << FormatTag(Constants::SAM_HD_SORTORDER_TAG, m_header.SortOrder); + + // GO: + if ( m_header.HasGroupOrder() ) + out << FormatTag(Constants::SAM_HD_GROUPORDER_TAG, m_header.GroupOrder); + + // newline + out << endl; + } +} + +void SamFormatPrinter::PrintSQ(std::stringstream& out) const { + + // iterate over sequence entries + SamSequenceConstIterator seqIter = m_header.Sequences.ConstBegin(); + SamSequenceConstIterator seqEnd = m_header.Sequences.ConstEnd(); + for ( ; seqIter != seqEnd; ++seqIter ) { + const SamSequence& seq = (*seqIter); + + // @SQ SN: LN: + out << Constants::SAM_SQ_BEGIN_TOKEN + << 
FormatTag(Constants::SAM_SQ_NAME_TAG, seq.Name) + << FormatTag(Constants::SAM_SQ_LENGTH_TAG, seq.Length); + + // AS: + if ( seq.HasAssemblyID() ) + out << FormatTag(Constants::SAM_SQ_ASSEMBLYID_TAG, seq.AssemblyID); + + // M5: + if ( seq.HasChecksum() ) + out << FormatTag(Constants::SAM_SQ_CHECKSUM_TAG, seq.Checksum); + + // UR: + if ( seq.HasURI() ) + out << FormatTag(Constants::SAM_SQ_URI_TAG, seq.URI); + + // SP: + if ( seq.HasSpecies() ) + out << FormatTag(Constants::SAM_SQ_SPECIES_TAG, seq.Species); + + // newline + out << endl; + } +} + +void SamFormatPrinter::PrintRG(std::stringstream& out) const { + + // iterate over read group entries + SamReadGroupConstIterator rgIter = m_header.ReadGroups.ConstBegin(); + SamReadGroupConstIterator rgEnd = m_header.ReadGroups.ConstEnd(); + for ( ; rgIter != rgEnd; ++rgIter ) { + const SamReadGroup& rg = (*rgIter); + + // @RG ID: SM: + out << Constants::SAM_RG_BEGIN_TOKEN + << FormatTag(Constants::SAM_RG_ID_TAG, rg.ID) + << FormatTag(Constants::SAM_RG_SAMPLE_TAG, rg.Sample); + + // LB: + if ( rg.HasLibrary() ) + out << FormatTag(Constants::SAM_RG_LIBRARY_TAG, rg.Library); + + // DS: + if ( rg.HasDescription() ) + out << FormatTag(Constants::SAM_RG_DESCRIPTION_TAG, rg.Description); + + // PU: + if ( rg.HasPlatformUnit() ) + out << FormatTag(Constants::SAM_RG_PLATFORMUNIT_TAG, rg.PlatformUnit); + + // PI: + if ( rg.HasPredictedInsertSize() ) + out << FormatTag(Constants::SAM_RG_PREDICTEDINSERTSIZE_TAG, rg.PredictedInsertSize); + + // CN: + if ( rg.HasSequencingCenter() ) + out << FormatTag(Constants::SAM_RG_SEQCENTER_TAG, rg.SequencingCenter); + + // DT: + if ( rg.HasProductionDate() ) + out << FormatTag(Constants::SAM_RG_PRODUCTIONDATE_TAG, rg.ProductionDate); + + // PL: + if ( rg.HasSequencingTechnology() ) + out << FormatTag(Constants::SAM_RG_SEQTECHNOLOGY_TAG, rg.SequencingTechnology); + + // newline + out << endl; + } +} + +void SamFormatPrinter::PrintPG(std::stringstream& out) const { + + // if header has @PG data + if 
( m_header.HasProgramName() ) { + + // @PG ID: + out << Constants::SAM_PG_BEGIN_TOKEN + << FormatTag(Constants::SAM_PG_NAME_TAG, m_header.ProgramName); + + // VN: + if ( m_header.HasProgramVersion() ) + out << FormatTag(Constants::SAM_PG_VERSION_TAG, m_header.ProgramVersion); + + // CL: + if ( m_header.HasProgramCommandLine() ) + out << FormatTag(Constants::SAM_PG_COMMANDLINE_TAG, m_header.ProgramCommandLine); + + // newline + out << endl; + } +} + +void SamFormatPrinter::PrintCO(std::stringstream& out) const { + + // iterate over comments + vector::const_iterator commentIter = m_header.Comments.begin(); + vector::const_iterator commentEnd = m_header.Comments.end(); + for ( ; commentIter != commentEnd; ++commentIter ) { + + // @CO + out << Constants::SAM_CO_BEGIN_TOKEN + << Constants::SAM_TAB + << (*commentIter) + << endl; + } +} diff --git a/src/api/internal/SamFormatPrinter_p.h b/src/api/internal/SamFormatPrinter_p.h new file mode 100644 index 0000000..5e28e97 --- /dev/null +++ b/src/api/internal/SamFormatPrinter_p.h @@ -0,0 +1,61 @@ +// *************************************************************************** +// SamFormatPrinter.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 23 December 2010 (DB) +// --------------------------------------------------------------------------- +// Provides functionality for printing formatted SAM header to string +// *************************************************************************** + +#ifndef SAM_FORMAT_PRINTER_H +#define SAM_FORMAT_PRINTER_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. 
+ +#include +#include + +namespace BamTools { + +class SamHeader; + +namespace Internal { + +class SamFormatPrinter { + + // ctor & dtor + public: + SamFormatPrinter(const BamTools::SamHeader& header); + ~SamFormatPrinter(void); + + // generates SAM-formatted string from header data + public: + const std::string ToString(void) const; + + // internal methods + private: + const std::string FormatTag(const std::string& tag, const std::string& value) const; + void PrintHD(std::stringstream& out) const; + void PrintSQ(std::stringstream& out) const; + void PrintRG(std::stringstream& out) const; + void PrintPG(std::stringstream& out) const; + void PrintCO(std::stringstream& out) const; + + // data members + private: + const SamHeader& m_header; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // SAM_FORMAT_PRINTER_H diff --git a/src/api/internal/SamHeaderValidator_p.cpp b/src/api/internal/SamHeaderValidator_p.cpp new file mode 100644 index 0000000..4409411 --- /dev/null +++ b/src/api/internal/SamHeaderValidator_p.cpp @@ -0,0 +1,545 @@ +// *************************************************************************** +// SamHeaderValidator.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. 
+// --------------------------------------------------------------------------- +// Last modified: 23 December 2010 (DB) +// --------------------------------------------------------------------------- +// Provides functionality for validating SamHeader data +// *************************************************************************** + +#include +#include +#include +#include +using namespace BamTools; +using namespace BamTools::Internal; + +#include +#include +#include +#include +using namespace std; + +// ------------------------------------------------------------------- +// Allow validation rules to vary between SAM header versions +// +// use SAM_VERSION_X_Y to tag important changes +// +// Together, they will allow for comparisons like: +// if ( m_version < SAM_VERSION_2_0 ) { +// // use some older rule +// else +// // use rule introduced with version 2.0 + +static const SamHeaderVersion SAM_VERSION_1_0 = SamHeaderVersion(1,0); +static const SamHeaderVersion SAM_VERSION_1_3 = SamHeaderVersion(1,3); + +// ----------------------------------------- +// SamHeaderValidatorPrivate implementation + +class SamHeaderValidator::SamHeaderValidatorPrivate { + + // ctor & dtor + public: + SamHeaderValidatorPrivate(const SamHeader& header); + ~SamHeaderValidatorPrivate(void) { } + + // 'public' methods + public: + bool Validate(bool verbose); + + // internal validation methods + private: + + // validate header metadata + bool ValidateMetadata(void); + bool ValidateVersion(void); + bool ContainsOnlyDigits(const string& s); + bool ValidateSortOrder(void); + bool ValidateGroupOrder(void); + + // validate sequence dictionary + bool ValidateSequenceDictionary(void); + bool ContainsUniqueSequenceNames(void); + bool CheckNameFormat(const string& name); + bool ValidateSequence(const SamSequence& seq); + bool CheckLengthInRange(const string& length); + + // validate read group dictionary + bool ValidateReadGroupDictionary(void); + bool ValidateReadGroup(const SamReadGroup& rg); + 
bool ContainsUniqueIDsAndPlatformUnits(void); + bool CheckReadGroupID(const string& id); + bool CheckSequencingTechnology(const string& technology); + bool Is454(const string& technology); + bool IsHelicos(const string& technology); + bool IsIllumina(const string& technology); + bool IsPacBio(const string& technology); + bool IsSolid(const string& technology); + + // validate program data + bool ValidateProgramData(void); + bool ContainsUniqueProgramIds(void); + bool ValidatePreviousProgramIds(void); + + // error reporting + private: + void AddError(const string& message); + void AddWarning(const string& message); + void PrintErrorMessages(void); + void PrintWarningMessages(void); + + // data members + private: + const SamHeader& m_header; + const SamHeaderVersion m_version; + + bool m_isVerboseOutput; + const string ERROR_PREFIX; + const string WARN_PREFIX; + const string NEWLINE; + vector m_errorMessages; + vector m_warningMessages; +}; + +SamHeaderValidator::SamHeaderValidatorPrivate::SamHeaderValidatorPrivate(const SamHeader& header) + : m_header(header) + , m_version( header.Version ) + , m_isVerboseOutput(false) + , ERROR_PREFIX("ERROR: ") + , WARN_PREFIX("WARNING: ") + , NEWLINE("\n") +{ } + +bool SamHeaderValidator::SamHeaderValidatorPrivate::Validate(bool verbose) { + + // set error reporting mode + m_isVerboseOutput = verbose; + + // validate header components + bool isValid = true; + isValid &= ValidateMetadata(); + isValid &= ValidateSequenceDictionary(); + isValid &= ValidateReadGroupDictionary(); + isValid &= ValidateProgramData(); + + // report errors if desired + if ( m_isVerboseOutput ) { + PrintErrorMessages(); + PrintWarningMessages(); + } + + // return validation status + return isValid; +} + +bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateMetadata(void) { + bool isValid = true; + isValid &= ValidateVersion(); + isValid &= ValidateSortOrder(); + isValid &= ValidateGroupOrder(); + return isValid; +} + +bool 
SamHeaderValidator::SamHeaderValidatorPrivate::ValidateVersion(void) { + + const string& version = m_header.Version; + + // warn if version not present + if ( version.empty() ) { + AddWarning("Version (VN) missing. Not required, but strongly recommended"); + return true; + } + + // invalid if version does not contain a period + const size_t periodFound = version.find(Constants::SAM_PERIOD); + if ( periodFound == string::npos ) { + AddError("Invalid version (VN) format: " + version); + return false; + } + + // invalid if major version is empty or contains non-digits + const string majorVersion = version.substr(0, periodFound); + if ( majorVersion.empty() || !ContainsOnlyDigits(majorVersion) ) { + AddError("Invalid version (VN) format: " + version); + return false; + } + + // invalid if major version is empty or contains non-digits + const string minorVersion = version.substr(periodFound + 1); + if ( minorVersion.empty() || !ContainsOnlyDigits(minorVersion) ) { + AddError("Invalid version (VN) format: " + version); + return false; + } + + // TODO: check if version is not just syntactically OK, + // but is also a valid SAM version ( 1.0 .. CURRENT ) + + // all checked out this far, then version is OK + return true; +} + +// assumes non-empty input string +bool SamHeaderValidator::SamHeaderValidatorPrivate::ContainsOnlyDigits(const string& s) { + const size_t nonDigitPosition = s.find_first_not_of(Constants::SAM_DIGITS); + return ( nonDigitPosition == string::npos ) ; +} + +bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateSortOrder(void) { + + const string& sortOrder = m_header.SortOrder; + + // warn if sort order not present + if ( sortOrder.empty() ) { + AddWarning("Sort order (SO) missing. 
Not required, but strongly recommended"); + return true; + } + + // if sort order is valid keyword + if ( sortOrder == Constants::SAM_HD_SORTORDER_COORDINATE || + sortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME || + sortOrder == Constants::SAM_HD_SORTORDER_UNSORTED + ) + { return true; } + + // otherwise + AddError("Invalid sort order (SO): " + sortOrder); + return false; +} + +bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateGroupOrder(void) { + + const string& groupOrder = m_header.GroupOrder; + + // if no group order, no problem, just return OK + if ( groupOrder.empty() ) return true; + + // if group order is valid keyword + if ( groupOrder == Constants::SAM_HD_GROUPORDER_NONE || + groupOrder == Constants::SAM_HD_GROUPORDER_QUERY || + groupOrder == Constants::SAM_HD_GROUPORDER_REFERENCE + ) + { return true; } + + // otherwise + AddError("Invalid group order (GO): " + groupOrder); + return false; +} + +bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateSequenceDictionary(void) { + + // TODO: warn/error if no sequences ? 
+ + bool isValid = true; + + // check for unique sequence names + isValid &= ContainsUniqueSequenceNames(); + + // iterate over sequences + const SamSequenceDictionary& sequences = m_header.Sequences; + SamSequenceConstIterator seqIter = sequences.ConstBegin(); + SamSequenceConstIterator seqEnd = sequences.ConstEnd(); + for ( ; seqIter != seqEnd; ++seqIter ) { + const SamSequence& seq = (*seqIter); + isValid &= ValidateSequence(seq); + } + + // return validation state + return isValid; +} + +bool SamHeaderValidator::SamHeaderValidatorPrivate::ContainsUniqueSequenceNames(void) { + + bool isValid = true; + set sequenceNames; + set::iterator nameIter; + + // iterate over sequences + const SamSequenceDictionary& sequences = m_header.Sequences; + SamSequenceConstIterator seqIter = sequences.ConstBegin(); + SamSequenceConstIterator seqEnd = sequences.ConstEnd(); + for ( ; seqIter != seqEnd; ++seqIter ) { + const SamSequence& seq = (*seqIter); + const string& name = seq.Name; + + // lookup sequence name + nameIter = sequenceNames.find(name); + + // error if found (duplicate entry) + if ( nameIter != sequenceNames.end() ) { + AddError("Sequence name (SN): " + name + " is not unique"); + isValid = false; + } + + // otherwise ok, store name + sequenceNames.insert(name); + } + + // return validation state + return isValid; +} + +bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateSequence(const SamSequence& seq) { + bool isValid = true; + isValid &= CheckNameFormat(seq.Name); + isValid &= CheckLengthInRange(seq.Length); + return isValid; +} + +bool SamHeaderValidator::SamHeaderValidatorPrivate::CheckNameFormat(const string& name) { + + // invalid if name is empty + if ( name.empty() ) { + AddError("Sequence entry (@SQ) is missing SN tag"); + return false; + } + + // invalid if first character is a reserved char + const char firstChar = name.at(0); + if ( firstChar == Constants::SAM_EQUAL || firstChar == Constants::SAM_STAR ) { + AddError("Invalid sequence name (SN): " 
+ name); + return false; + } + // otherwise OK + return true; +} + +bool SamHeaderValidator::SamHeaderValidatorPrivate::CheckLengthInRange(const string& length) { + + // invalid if empty + if ( length.empty() ) { + AddError("Sequence entry (@SQ) is missing LN tag"); + return false; + } + + // convert string length to numeric + stringstream lengthStream(length); + unsigned int sequenceLength; + lengthStream >> sequenceLength; + + // invalid if length outside accepted range + if ( sequenceLength < Constants::SAM_SQ_LENGTH_MIN || sequenceLength > Constants::SAM_SQ_LENGTH_MAX ) { + AddError("Sequence length (LN): " + length + " out of range"); + return false; + } + + // otherwise OK + return true; +} + +bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateReadGroupDictionary(void) { + + // TODO: warn/error if no read groups ? + + bool isValid = true; + + // check for unique read group IDs & platform units + isValid &= ContainsUniqueIDsAndPlatformUnits(); + + // iterate over read groups + const SamReadGroupDictionary& readGroups = m_header.ReadGroups; + SamReadGroupConstIterator rgIter = readGroups.ConstBegin(); + SamReadGroupConstIterator rgEnd = readGroups.ConstEnd(); + for ( ; rgIter != rgEnd; ++rgIter ) { + const SamReadGroup& rg = (*rgIter); + isValid &= ValidateReadGroup(rg); + } + + // return validation state + return isValid; +} + +bool SamHeaderValidator::SamHeaderValidatorPrivate::ContainsUniqueIDsAndPlatformUnits(void) { + + bool isValid = true; + set readGroupIds; + set platformUnits; + set::iterator idIter; + set::iterator puIter; + + // iterate over sequences + const SamReadGroupDictionary& readGroups = m_header.ReadGroups; + SamReadGroupConstIterator rgIter = readGroups.ConstBegin(); + SamReadGroupConstIterator rgEnd = readGroups.ConstEnd(); + for ( ; rgIter != rgEnd; ++rgIter ) { + const SamReadGroup& rg = (*rgIter); + + // -------------------------------- + // check for unique ID + + // lookup read group ID + const string& id = rg.ID; + idIter = 
readGroupIds.find(id); + + // error if found (duplicate entry) + if ( idIter != readGroupIds.end() ) { + AddError("Read group ID (ID): " + id + " is not unique"); + isValid = false; + } + + // otherwise ok, store id + readGroupIds.insert(id); + + // -------------------------------- + // check for unique platform unit + + // lookup platform unit + const string& pu = rg.PlatformUnit; + puIter = platformUnits.find(pu); + + // error if found (duplicate entry) + if ( puIter != platformUnits.end() ) { + AddError("Platform unit (PU): " + pu + " is not unique"); + isValid = false; + } + + // otherwise ok, store platform unit + platformUnits.insert(pu); + } + + // return validation state + return isValid; +} + +bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateReadGroup(const SamReadGroup& rg) { + bool isValid = true; + isValid &= CheckReadGroupID(rg.ID); + isValid &= CheckSequencingTechnology(rg.SequencingTechnology); + return isValid; +} + +bool SamHeaderValidator::SamHeaderValidatorPrivate::CheckReadGroupID(const string& id) { + + // invalid if empty + if ( id.empty() ) { + AddError("Read group entry (@RG) is missing ID tag"); + return false; + } + + // otherwise OK + return true; +} + +bool SamHeaderValidator::SamHeaderValidatorPrivate::CheckSequencingTechnology(const string& technology) { + + // if no technology provided, no problem, just return OK + if ( technology.empty() ) return true; + + // if technology is valid keyword + if ( Is454(technology) || + IsHelicos(technology) || + IsIllumina(technology) || + IsPacBio(technology) || + IsSolid(technology) + ) + { return true; } + + // otherwise + AddError("Invalid read group sequencing platform (PL): " + technology); + return false; +} + +bool SamHeaderValidator::SamHeaderValidatorPrivate::Is454(const string& technology) { + return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_454 || + technology == Constants::SAM_RG_SEQTECHNOLOGY_LS454_LOWER || + technology == Constants::SAM_RG_SEQTECHNOLOGY_LS454_UPPER + 
); +} + +bool SamHeaderValidator::SamHeaderValidatorPrivate::IsHelicos(const string& technology) { + return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_HELICOS_LOWER || + technology == Constants::SAM_RG_SEQTECHNOLOGY_HELICOS_UPPER + ); +} + +bool SamHeaderValidator::SamHeaderValidatorPrivate::IsIllumina(const string& technology) { + return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA_LOWER || + technology == Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA_UPPER + ); +} + +bool SamHeaderValidator::SamHeaderValidatorPrivate::IsPacBio(const string& technology) { + return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_PACBIO_LOWER || + technology == Constants::SAM_RG_SEQTECHNOLOGY_PACBIO_UPPER + ); +} + +bool SamHeaderValidator::SamHeaderValidatorPrivate::IsSolid(const string& technology) { + return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_SOLID_LOWER || + technology == Constants::SAM_RG_SEQTECHNOLOGY_SOLID_UPPER + ); +} + +bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateProgramData(void) { + bool isValid = true; + isValid &= ContainsUniqueProgramIds(); + isValid &= ValidatePreviousProgramIds(); + return isValid; +} + +bool SamHeaderValidator::SamHeaderValidatorPrivate::ContainsUniqueProgramIds(void) { + bool isValid = true; + // TODO: once we have ability to handle multiple @PG entries, + // check here for duplicate ID's + // but for now, just return true + return isValid; +} + +bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidatePreviousProgramIds(void) { + bool isValid = true; + // TODO: check that PP entries are valid later, after we get multiple @PG-entry handling + // just return true for now + return isValid; +} +void SamHeaderValidator::SamHeaderValidatorPrivate::AddError(const string& message) { + m_errorMessages.push_back(ERROR_PREFIX + message + NEWLINE); +} + +void SamHeaderValidator::SamHeaderValidatorPrivate::AddWarning(const string& message) { + m_warningMessages.push_back(WARN_PREFIX + message + NEWLINE); +} + +void 
SamHeaderValidator::SamHeaderValidatorPrivate::PrintErrorMessages(void) { + + // skip if no error messages + if ( m_errorMessages.empty() ) return; + + // print error header line + cerr << "* SAM header has " << m_errorMessages.size() << " errors:" << endl; + + // print each error message + vector::const_iterator errorIter = m_errorMessages.begin(); + vector::const_iterator errorEnd = m_errorMessages.end(); + for ( ; errorIter != errorEnd; ++errorIter ) + cerr << (*errorIter); +} + +void SamHeaderValidator::SamHeaderValidatorPrivate::PrintWarningMessages(void) { + + // skip if no warning messages + if ( m_warningMessages.empty() ) return; + + // print warning header line + cerr << "* SAM header has " << m_warningMessages.size() << " warnings:" << endl; + + // print each warning message + vector::const_iterator warnIter = m_warningMessages.begin(); + vector::const_iterator warnEnd = m_warningMessages.end(); + for ( ; warnIter != warnEnd; ++warnIter ) + cerr << (*warnIter); +} + +// ----------------------------------- +// SamHeaderValidator implementation + +SamHeaderValidator::SamHeaderValidator(const BamTools::SamHeader& header) + : d( new SamHeaderValidatorPrivate(header) ) +{ } + +SamHeaderValidator::~SamHeaderValidator(void) { + delete d; + d = 0; +} + +bool SamHeaderValidator::Validate(bool verbose) { return d->Validate(verbose); } diff --git a/src/api/internal/SamHeaderValidator_p.h b/src/api/internal/SamHeaderValidator_p.h new file mode 100644 index 0000000..41c04ee --- /dev/null +++ b/src/api/internal/SamHeaderValidator_p.h @@ -0,0 +1,52 @@ +// *************************************************************************** +// SamHeaderValidator.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. 
+// --------------------------------------------------------------------------- +// Last modified: 23 December 2010 (DB) +// --------------------------------------------------------------------------- +// Provides functionality for validating SamHeader data +// *************************************************************************** + +#ifndef SAM_HEADER_VALIDATOR_P_H +#define SAM_HEADER_VALIDATOR_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. + +#include +#include + +namespace BamTools { + +class SamHeader; + +namespace Internal { + +class SamHeaderValidator { + + public: + SamHeaderValidator(const BamTools::SamHeader& header); + ~SamHeaderValidator(void); + + public: + // validates SamHeader data + // prints error & warning messages to stderr when (verbose == true) + bool Validate(bool verbose = false); + + private: + struct SamHeaderValidatorPrivate; + SamHeaderValidatorPrivate* d; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // SAM_HEADER_VALIDATOR_P_H diff --git a/src/api/internal/SamHeaderVersion_p.h b/src/api/internal/SamHeaderVersion_p.h new file mode 100644 index 0000000..ff96471 --- /dev/null +++ b/src/api/internal/SamHeaderVersion_p.h @@ -0,0 +1,115 @@ +#ifndef SAM_HEADERVERSION_P_H +#define SAM_HEADERVERSION_P_H + +#include +#include +#include + +namespace BamTools { +namespace Internal { + +class SamHeaderVersion { + + // ctors & dtor + public: + SamHeaderVersion(void) + : m_majorVersion(0) + , m_minorVersion(0) + { } + + explicit SamHeaderVersion(const std::string& version) + : m_majorVersion(0) + , m_minorVersion(0) + { + SetVersion(version); + } + + SamHeaderVersion(const unsigned int& major, const unsigned int& minor) + : m_majorVersion(major) + , m_minorVersion(minor) + { } + + ~SamHeaderVersion(void) 
{ + m_majorVersion = 0; + m_minorVersion = 0; + } + + // acess data + public: + unsigned int MajorVersion(void) const { return m_majorVersion; } + unsigned int MinorVersion(void) const { return m_minorVersion; } + + inline void SetVersion(const std::string& version); + inline std::string ToString(void) const; + + // data members + private: + unsigned int m_majorVersion; + unsigned int m_minorVersion; +}; + +inline +void SamHeaderVersion::SetVersion(const std::string& version) { + + // do nothing if version is empty + if ( !version.empty() ) { + + // do nothing if period not found + const size_t periodFound = version.find(Constants::SAM_PERIOD); + if ( periodFound != std::string::npos ) { + + // store major version if non-empty and contains only digits + const std::string& majorVersion = version.substr(0, periodFound); + if ( majorVersion.empty() ) { + const size_t nonDigitFound = majorVersion.find_first_not_of(Constants::SAM_DIGITS); + if ( nonDigitFound == std::string::npos ) { + std::stringstream major(majorVersion); + major >> m_majorVersion; + } + } + + // store minor version if non-empty and contains only digits + const std::string& minorVersion = version.substr(periodFound + 1); + if ( minorVersion.empty() ) { + const size_t nonDigitFound = minorVersion.find_first_not_of(Constants::SAM_DIGITS); + if ( nonDigitFound == std::string::npos ) { + std::stringstream minor(minorVersion); + minor >> m_minorVersion; + } + } + } + } +} + +// ----------------------------------------------------- +// printing + +inline std::string SamHeaderVersion::ToString(void) const { + std::stringstream version; + version << m_majorVersion << Constants::SAM_PERIOD << m_minorVersion; + return version.str(); +} + +// ----------------------------------------------------- +// comparison operators + +inline bool operator==(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) { + return (lhs.MajorVersion() == rhs.MajorVersion()) && + (lhs.MinorVersion() == rhs.MinorVersion()); +} + 
+// Orders versions by major number first; the minor number only breaks ties.
+inline bool operator<(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) {
+    if ( lhs.MajorVersion() != rhs.MajorVersion() )
+        return lhs.MajorVersion() < rhs.MajorVersion();
+    return lhs.MinorVersion() < rhs.MinorVersion();
+}
+
+// The remaining relational operators are all expressed through operator<.
+inline bool operator> (const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) {
+    return rhs < lhs;
+}
+inline bool operator<=(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) {
+    return !(rhs < lhs);
+}
+inline bool operator>=(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) { return !(lhs