X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=src%2Fapi%2Finternal%2FSamHeaderValidator_p.cpp;h=094e79adb795ecb4a3560fab9700dce91ecbf80d;hb=9f1ce8c47aeadb6dc1320b52ee671c3341b97935;hp=4aa6395bf519269f32f4de9da897ec554af27eba;hpb=8c80d760637f8df39262683cd2570f0589423d36;p=bamtools.git diff --git a/src/api/internal/SamHeaderValidator_p.cpp b/src/api/internal/SamHeaderValidator_p.cpp index 4aa6395..094e79a 100644 --- a/src/api/internal/SamHeaderValidator_p.cpp +++ b/src/api/internal/SamHeaderValidator_p.cpp @@ -1,25 +1,47 @@ // *************************************************************************** // SamHeaderValidator.cpp (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College -// All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 21 March 2011 (DB) +// Last modified: 10 October 2011 (DB) // --------------------------------------------------------------------------- // Provides functionality for validating SamHeader data // *************************************************************************** -#include -#include -#include -#include +#include "api/SamConstants.h" +#include "api/SamHeader.h" +#include "api/internal/SamHeaderValidator_p.h" +#include "api/internal/SamHeaderVersion_p.h" using namespace BamTools; using namespace BamTools::Internal; -#include +#include #include #include using namespace std; +// ------------------------ +// static utility methods +// ------------------------- + +static +bool caseInsensitiveCompare(const string& lhs, const string& rhs) { + + // can omit checking chars if lengths not equal + const int lhsLength = lhs.length(); + const int rhsLength = rhs.length(); + if ( lhsLength != rhsLength ) + return false; + + // do *basic* toupper checks on each string char's + for ( int i = 0; i < lhsLength; ++i ) { + if ( toupper( (int)lhs.at(i)) != toupper( (int)rhs.at(i)) ) + return false; + } + + // otherwise OK + return true; +} + // ------------------------------------------------------------------------ // Allow validation rules to vary, as needed, between SAM header versions // @@ -32,7 +54,10 @@ using namespace std; // // use rule introduced with version 2.0 static const SamHeaderVersion SAM_VERSION_1_0 = SamHeaderVersion(1,0); +static const SamHeaderVersion SAM_VERSION_1_1 = SamHeaderVersion(1,1); +static const SamHeaderVersion SAM_VERSION_1_2 = SamHeaderVersion(1,2); static const SamHeaderVersion SAM_VERSION_1_3 = SamHeaderVersion(1,3); +static const SamHeaderVersion SAM_VERSION_1_4 = SamHeaderVersion(1,4); // TODO: This functionality is currently unused. // Make validation "version-aware." @@ -49,25 +74,62 @@ SamHeaderValidator::SamHeaderValidator(const SamHeader& header) SamHeaderValidator::~SamHeaderValidator(void) { } -bool SamHeaderValidator::Validate(bool verbose) { +void SamHeaderValidator::AddError(const string& message) { + m_errorMessages.push_back(ERROR_PREFIX + message + NEWLINE); +} + +void SamHeaderValidator::AddWarning(const string& message) { + m_warningMessages.push_back(WARN_PREFIX + message + NEWLINE); +} + +void SamHeaderValidator::PrintErrorMessages(ostream& stream) { + + // skip if no error messages + if ( m_errorMessages.empty() ) + return; + + // print error header line + stream << "* SAM header has " << m_errorMessages.size() << " errors:" << endl; + + // print each error message + vector::const_iterator errorIter = m_errorMessages.begin(); + vector::const_iterator errorEnd = m_errorMessages.end(); + for ( ; errorIter != errorEnd; ++errorIter ) + stream << (*errorIter); +} + +void SamHeaderValidator::PrintMessages(ostream& stream) { + PrintErrorMessages(stream); + PrintWarningMessages(stream); +} + +void SamHeaderValidator::PrintWarningMessages(ostream& stream) { + + // skip if no warning messages + if ( m_warningMessages.empty() ) + return; + + // print warning header line + stream << "* SAM header has " << m_warningMessages.size() << " warnings:" << endl; + + // print each warning message + vector::const_iterator warnIter = m_warningMessages.begin(); + vector::const_iterator warnEnd = m_warningMessages.end(); + for ( ; warnIter != warnEnd; ++warnIter ) + stream << (*warnIter); +} - // validate header components +// entry point for validation +bool SamHeaderValidator::Validate(void) { bool isValid = true; isValid &= ValidateMetadata(); isValid &= ValidateSequenceDictionary(); isValid &= ValidateReadGroupDictionary(); - isValid &= ValidateProgramData(); - - // report errors if desired - if ( verbose ) { - PrintErrorMessages(); - PrintWarningMessages(); - } - - // return validation status + isValid &= ValidateProgramChain(); return isValid; } +// check all SAM header 'metadata' bool SamHeaderValidator::ValidateMetadata(void) { bool isValid = true; isValid &= ValidateVersion(); @@ -76,6 +138,7 @@ bool SamHeaderValidator::ValidateMetadata(void) { return isValid; } +// check SAM header version tag bool SamHeaderValidator::ValidateVersion(void) { const string& version = m_header.Version; @@ -120,6 +183,7 @@ bool SamHeaderValidator::ContainsOnlyDigits(const string& s) { return ( nonDigitPosition == string::npos ) ; } +// validate SAM header sort order tag bool SamHeaderValidator::ValidateSortOrder(void) { const string& sortOrder = m_header.SortOrder; @@ -144,6 +208,7 @@ bool SamHeaderValidator::ValidateSortOrder(void) { return false; } +// validate SAM header group order tag bool SamHeaderValidator::ValidateGroupOrder(void) { const string& groupOrder = m_header.GroupOrder; @@ -166,10 +231,9 @@ bool SamHeaderValidator::ValidateGroupOrder(void) { return false; } +// validate SAM header sequence dictionary bool SamHeaderValidator::ValidateSequenceDictionary(void) { - // TODO: warn/error if no sequences ? - bool isValid = true; // check for unique sequence names @@ -188,6 +252,7 @@ bool SamHeaderValidator::ValidateSequenceDictionary(void) { return isValid; } +// make sure all SQ names are unique bool SamHeaderValidator::ContainsUniqueSequenceNames(void) { bool isValid = true; @@ -200,9 +265,9 @@ bool SamHeaderValidator::ContainsUniqueSequenceNames(void) { SamSequenceConstIterator seqEnd = sequences.ConstEnd(); for ( ; seqIter != seqEnd; ++seqIter ) { const SamSequence& seq = (*seqIter); - const string& name = seq.Name; // lookup sequence name + const string& name = seq.Name; nameIter = sequenceNames.find(name); // error if found (duplicate entry) @@ -219,6 +284,7 @@ bool SamHeaderValidator::ContainsUniqueSequenceNames(void) { return isValid; } +// validate SAM header sequence entry bool SamHeaderValidator::ValidateSequence(const SamSequence& seq) { bool isValid = true; isValid &= CheckNameFormat(seq.Name); @@ -226,6 +292,7 @@ bool SamHeaderValidator::ValidateSequence(const SamSequence& seq) { return isValid; } +// check sequence name is valid format bool SamHeaderValidator::CheckNameFormat(const string& name) { // invalid if name is empty @@ -244,6 +311,7 @@ bool SamHeaderValidator::CheckNameFormat(const string& name) { return true; } +// check that sequence length is within accepted range bool SamHeaderValidator::CheckLengthInRange(const string& length) { // invalid if empty @@ -267,10 +335,9 @@ bool SamHeaderValidator::CheckLengthInRange(const string& length) { return true; } +// validate SAM header read group dictionary bool SamHeaderValidator::ValidateReadGroupDictionary(void) { - // TODO: warn/error if no read groups ? - bool isValid = true; // check for unique read group IDs & platform units @@ -289,6 +356,7 @@ bool SamHeaderValidator::ValidateReadGroupDictionary(void) { return isValid; } +// make sure RG IDs and platform units are unique bool SamHeaderValidator::ContainsUniqueIDsAndPlatformUnits(void) { bool isValid = true; @@ -341,6 +409,7 @@ bool SamHeaderValidator::ContainsUniqueIDsAndPlatformUnits(void) { return isValid; } +// validate SAM header read group entry bool SamHeaderValidator::ValidateReadGroup(const SamReadGroup& rg) { bool isValid = true; isValid &= CheckReadGroupID(rg.ID); @@ -348,6 +417,7 @@ bool SamHeaderValidator::ValidateReadGroup(const SamReadGroup& rg) { return isValid; } +// make sure RG ID exists bool SamHeaderValidator::CheckReadGroupID(const string& id) { // invalid if empty @@ -360,6 +430,7 @@ bool SamHeaderValidator::CheckReadGroupID(const string& id) { return true; } +// make sure RG sequencing tech is one of the accepted keywords bool SamHeaderValidator::CheckSequencingTechnology(const string& technology) { // if no technology provided, no problem, just return OK @@ -367,11 +438,13 @@ bool SamHeaderValidator::CheckSequencingTechnology(const string& technology) { return true; // if technology is valid keyword - if ( Is454(technology) || - IsHelicos(technology) || - IsIllumina(technology) || - IsPacBio(technology) || - IsSolid(technology) + if ( caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_CAPILLARY) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_HELICOS) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_IONTORRENT) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_LS454) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_PACBIO) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_SOLID) ) { return true; @@ -382,92 +455,70 @@ bool SamHeaderValidator::CheckSequencingTechnology(const string& technology) { return false; } -bool SamHeaderValidator::Is454(const string& technology) { - return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_454 || - technology == Constants::SAM_RG_SEQTECHNOLOGY_LS454_LOWER || - technology == Constants::SAM_RG_SEQTECHNOLOGY_LS454_UPPER - ); -} - -bool SamHeaderValidator::IsHelicos(const string& technology) { - return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_HELICOS_LOWER || - technology == Constants::SAM_RG_SEQTECHNOLOGY_HELICOS_UPPER - ); -} - -bool SamHeaderValidator::IsIllumina(const string& technology) { - return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA_LOWER || - technology == Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA_UPPER - ); -} - -bool SamHeaderValidator::IsPacBio(const string& technology) { - return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_PACBIO_LOWER || - technology == Constants::SAM_RG_SEQTECHNOLOGY_PACBIO_UPPER - ); -} - -bool SamHeaderValidator::IsSolid(const string& technology) { - return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_SOLID_LOWER || - technology == Constants::SAM_RG_SEQTECHNOLOGY_SOLID_UPPER - ); -} - -bool SamHeaderValidator::ValidateProgramData(void) { +// validate the SAM header "program chain" +bool SamHeaderValidator::ValidateProgramChain(void) { bool isValid = true; isValid &= ContainsUniqueProgramIds(); isValid &= ValidatePreviousProgramIds(); return isValid; } +// make sure all PG IDs are unique bool SamHeaderValidator::ContainsUniqueProgramIds(void) { - bool isValid = true; - // TODO: once we have ability to handle multiple @PG entries, - // check here for duplicate ID's - // but for now, just return true - return isValid; -} -bool SamHeaderValidator::ValidatePreviousProgramIds(void) { bool isValid = true; - // TODO: check that PP entries are valid later, after we get multiple @PG-entry handling - // just return true for now - return isValid; -} -void SamHeaderValidator::AddError(const string& message) { - m_errorMessages.push_back(ERROR_PREFIX + message + NEWLINE); -} + set programIds; + set::iterator pgIdIter; -void SamHeaderValidator::AddWarning(const string& message) { - m_warningMessages.push_back(WARN_PREFIX + message + NEWLINE); -} + // iterate over program records + const SamProgramChain& programs = m_header.Programs; + SamProgramConstIterator pgIter = programs.ConstBegin(); + SamProgramConstIterator pgEnd = programs.ConstEnd(); + for ( ; pgIter != pgEnd; ++pgIter ) { + const SamProgram& pg = (*pgIter); -void SamHeaderValidator::PrintErrorMessages(void) { + // lookup program ID + const string& pgId = pg.ID; + pgIdIter = programIds.find(pgId); - // skip if no error messages - if ( m_errorMessages.empty() ) return; + // error if found (duplicate entry) + if ( pgIdIter != programIds.end() ) { + AddError("Program ID (ID): " + pgId + " is not unique"); + isValid = false; + } - // print error header line - cerr << "* SAM header has " << m_errorMessages.size() << " errors:" << endl; + // otherwise ok, store ID + programIds.insert(pgId); + } - // print each error message - vector::const_iterator errorIter = m_errorMessages.begin(); - vector::const_iterator errorEnd = m_errorMessages.end(); - for ( ; errorIter != errorEnd; ++errorIter ) - cerr << (*errorIter); + // return validation state + return isValid; } -void SamHeaderValidator::PrintWarningMessages(void) { +// make sure that any PP tags present point to existing @PG IDs +bool SamHeaderValidator::ValidatePreviousProgramIds(void) { - // skip if no warning messages - if ( m_warningMessages.empty() ) return; + bool isValid = true; - // print warning header line - cerr << "* SAM header has " << m_warningMessages.size() << " warnings:" << endl; + // iterate over program records + const SamProgramChain& programs = m_header.Programs; + SamProgramConstIterator pgIter = programs.ConstBegin(); + SamProgramConstIterator pgEnd = programs.ConstEnd(); + for ( ; pgIter != pgEnd; ++pgIter ) { + const SamProgram& pg = (*pgIter); + + // ignore record for validation if PreviousProgramID is empty + const string& ppId = pg.PreviousProgramID; + if ( ppId.empty() ) + continue; + + // see if program "chain" contains an entry for ppId + if ( !programs.Contains(ppId) ) { + AddError("PreviousProgramID (PP): " + ppId + " is not a known ID"); + isValid = false; + } + } - // print each warning message - vector::const_iterator warnIter = m_warningMessages.begin(); - vector::const_iterator warnEnd = m_warningMessages.end(); - for ( ; warnIter != warnEnd; ++warnIter ) - cerr << (*warnIter); + // return validation state + return isValid; }