X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=src%2Fapi%2Finternal%2FSamHeaderValidator_p.cpp;h=c76fff95ba3a9fd2d8198abd1fd1cb035f7df91f;hb=af6a3d8491e485969d2df306e41cb9439dec4039;hp=131fc3dd62db967da57f85a66e8815ee44df33c7;hpb=0b8c7955dc2fed672eab69b3319e70b9f3a0ac27;p=bamtools.git diff --git a/src/api/internal/SamHeaderValidator_p.cpp b/src/api/internal/SamHeaderValidator_p.cpp index 131fc3d..c76fff9 100644 --- a/src/api/internal/SamHeaderValidator_p.cpp +++ b/src/api/internal/SamHeaderValidator_p.cpp @@ -1,25 +1,47 @@ // *************************************************************************** // SamHeaderValidator.cpp (c) 2010 Derek Barnett // Marth Lab, Department of Biology, Boston College -// All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 13 January 2011 (DB) +// Last modified: 14 October 2011 (DB) // --------------------------------------------------------------------------- // Provides functionality for validating SamHeader data // *************************************************************************** -#include -#include -#include -#include +#include "api/SamConstants.h" +#include "api/SamHeader.h" +#include "api/internal/SamHeaderValidator_p.h" +#include "api/internal/SamHeaderVersion_p.h" using namespace BamTools; using namespace BamTools::Internal; -#include +#include #include #include using namespace std; +// ------------------------ +// static utility methods +// ------------------------- + +static +bool caseInsensitiveCompare(const string& lhs, const string& rhs) { + + // can omit checking chars if lengths not equal + const int lhsLength = lhs.length(); + const int rhsLength = rhs.length(); + if ( lhsLength != rhsLength ) + return false; + + // do *basic* toupper checks on each string char's + for ( int i = 0; i < lhsLength; ++i ) { + if ( toupper( (int)lhs.at(i)) != toupper( (int)rhs.at(i)) ) + return false; + } + + // otherwise OK + return true; +} + // ------------------------------------------------------------------------ // Allow validation rules to vary, as needed, between SAM header versions // @@ -32,7 +54,10 @@ using namespace std; // // use rule introduced with version 2.0 static const SamHeaderVersion SAM_VERSION_1_0 = SamHeaderVersion(1,0); +static const SamHeaderVersion SAM_VERSION_1_1 = SamHeaderVersion(1,1); +static const SamHeaderVersion SAM_VERSION_1_2 = SamHeaderVersion(1,2); static const SamHeaderVersion SAM_VERSION_1_3 = SamHeaderVersion(1,3); +static const SamHeaderVersion SAM_VERSION_1_4 = SamHeaderVersion(1,4); // TODO: This functionality is currently unused. // Make validation "version-aware." @@ -49,25 +74,62 @@ SamHeaderValidator::SamHeaderValidator(const SamHeader& header) SamHeaderValidator::~SamHeaderValidator(void) { } -bool SamHeaderValidator::Validate(bool verbose) { +void SamHeaderValidator::AddError(const string& message) { + m_errorMessages.push_back(ERROR_PREFIX + message + NEWLINE); +} + +void SamHeaderValidator::AddWarning(const string& message) { + m_warningMessages.push_back(WARN_PREFIX + message + NEWLINE); +} + +void SamHeaderValidator::PrintErrorMessages(ostream& stream) { + + // skip if no error messages + if ( m_errorMessages.empty() ) + return; + + // print error header line + stream << "* SAM header has " << m_errorMessages.size() << " errors:" << endl; + + // print each error message + vector::const_iterator errorIter = m_errorMessages.begin(); + vector::const_iterator errorEnd = m_errorMessages.end(); + for ( ; errorIter != errorEnd; ++errorIter ) + stream << (*errorIter); +} + +void SamHeaderValidator::PrintMessages(ostream& stream) { + PrintErrorMessages(stream); + PrintWarningMessages(stream); +} + +void SamHeaderValidator::PrintWarningMessages(ostream& stream) { + + // skip if no warning messages + if ( m_warningMessages.empty() ) + return; + + // print warning header line + stream << "* SAM header has " << m_warningMessages.size() << " warnings:" << endl; + + // print each warning message + vector::const_iterator warnIter = m_warningMessages.begin(); + vector::const_iterator warnEnd = m_warningMessages.end(); + for ( ; warnIter != warnEnd; ++warnIter ) + stream << (*warnIter); +} - // validate header components +// entry point for validation +bool SamHeaderValidator::Validate(void) { bool isValid = true; isValid &= ValidateMetadata(); isValid &= ValidateSequenceDictionary(); isValid &= ValidateReadGroupDictionary(); - isValid &= ValidateProgramData(); - - // report errors if desired - if ( verbose ) { - PrintErrorMessages(); - PrintWarningMessages(); - } - - // return validation status + isValid &= ValidateProgramChain(); return isValid; } +// check all SAM header 'metadata' bool SamHeaderValidator::ValidateMetadata(void) { bool isValid = true; isValid &= ValidateVersion(); @@ -76,6 +138,7 @@ bool SamHeaderValidator::ValidateMetadata(void) { return isValid; } +// check SAM header version tag bool SamHeaderValidator::ValidateVersion(void) { const string& version = m_header.Version; @@ -120,6 +183,7 @@ bool SamHeaderValidator::ContainsOnlyDigits(const string& s) { return ( nonDigitPosition == string::npos ) ; } +// validate SAM header sort order tag bool SamHeaderValidator::ValidateSortOrder(void) { const string& sortOrder = m_header.SortOrder; @@ -135,36 +199,41 @@ bool SamHeaderValidator::ValidateSortOrder(void) { sortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME || sortOrder == Constants::SAM_HD_SORTORDER_UNSORTED ) - { return true; } + { + return true; + } // otherwise AddError("Invalid sort order (SO): " + sortOrder); return false; } +// validate SAM header group order tag bool SamHeaderValidator::ValidateGroupOrder(void) { const string& groupOrder = m_header.GroupOrder; // if no group order, no problem, just return OK - if ( groupOrder.empty() ) return true; + if ( groupOrder.empty() ) + return true; // if group order is valid keyword if ( groupOrder == Constants::SAM_HD_GROUPORDER_NONE || groupOrder == Constants::SAM_HD_GROUPORDER_QUERY || groupOrder == Constants::SAM_HD_GROUPORDER_REFERENCE ) - { return true; } + { + return true; + } // otherwise AddError("Invalid group order (GO): " + groupOrder); return false; } +// validate SAM header sequence dictionary bool SamHeaderValidator::ValidateSequenceDictionary(void) { - // TODO: warn/error if no sequences ? - bool isValid = true; // check for unique sequence names @@ -183,6 +252,7 @@ bool SamHeaderValidator::ValidateSequenceDictionary(void) { return isValid; } +// make sure all SQ names are unique bool SamHeaderValidator::ContainsUniqueSequenceNames(void) { bool isValid = true; @@ -195,9 +265,9 @@ bool SamHeaderValidator::ContainsUniqueSequenceNames(void) { SamSequenceConstIterator seqEnd = sequences.ConstEnd(); for ( ; seqIter != seqEnd; ++seqIter ) { const SamSequence& seq = (*seqIter); - const string& name = seq.Name; // lookup sequence name + const string& name = seq.Name; nameIter = sequenceNames.find(name); // error if found (duplicate entry) @@ -214,6 +284,7 @@ bool SamHeaderValidator::ContainsUniqueSequenceNames(void) { return isValid; } +// validate SAM header sequence entry bool SamHeaderValidator::ValidateSequence(const SamSequence& seq) { bool isValid = true; isValid &= CheckNameFormat(seq.Name); @@ -221,6 +292,7 @@ bool SamHeaderValidator::ValidateSequence(const SamSequence& seq) { return isValid; } +// check sequence name is valid format bool SamHeaderValidator::CheckNameFormat(const string& name) { // invalid if name is empty @@ -239,6 +311,7 @@ bool SamHeaderValidator::CheckNameFormat(const string& name) { return true; } +// check that sequence length is within accepted range bool SamHeaderValidator::CheckLengthInRange(const string& length) { // invalid if empty @@ -262,10 +335,9 @@ bool SamHeaderValidator::CheckLengthInRange(const string& length) { return true; } +// validate SAM header read group dictionary bool SamHeaderValidator::ValidateReadGroupDictionary(void) { - // TODO: warn/error if no read groups ? - bool isValid = true; // check for unique read group IDs & platform units @@ -284,6 +356,7 @@ bool SamHeaderValidator::ValidateReadGroupDictionary(void) { return isValid; } +// make sure RG IDs and platform units are unique bool SamHeaderValidator::ContainsUniqueIDsAndPlatformUnits(void) { bool isValid = true; @@ -336,6 +409,7 @@ bool SamHeaderValidator::ContainsUniqueIDsAndPlatformUnits(void) { return isValid; } +// validate SAM header read group entry bool SamHeaderValidator::ValidateReadGroup(const SamReadGroup& rg) { bool isValid = true; isValid &= CheckReadGroupID(rg.ID); @@ -343,6 +417,7 @@ bool SamHeaderValidator::ValidateReadGroup(const SamReadGroup& rg) { return isValid; } +// make sure RG ID exists bool SamHeaderValidator::CheckReadGroupID(const string& id) { // invalid if empty @@ -355,111 +430,95 @@ bool SamHeaderValidator::CheckReadGroupID(const string& id) { return true; } +// make sure RG sequencing tech is one of the accepted keywords bool SamHeaderValidator::CheckSequencingTechnology(const string& technology) { // if no technology provided, no problem, just return OK - if ( technology.empty() ) return true; + if ( technology.empty() ) + return true; // if technology is valid keyword - if ( Is454(technology) || - IsHelicos(technology) || - IsIllumina(technology) || - IsPacBio(technology) || - IsSolid(technology) + if ( caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_CAPILLARY) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_HELICOS) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_IONTORRENT) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_LS454) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_PACBIO) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_SOLID) ) - { return true; } + { + return true; + } // otherwise AddError("Invalid read group sequencing platform (PL): " + technology); return false; } -bool SamHeaderValidator::Is454(const string& technology) { - return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_454 || - technology == Constants::SAM_RG_SEQTECHNOLOGY_LS454_LOWER || - technology == Constants::SAM_RG_SEQTECHNOLOGY_LS454_UPPER - ); -} - -bool SamHeaderValidator::IsHelicos(const string& technology) { - return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_HELICOS_LOWER || - technology == Constants::SAM_RG_SEQTECHNOLOGY_HELICOS_UPPER - ); -} - -bool SamHeaderValidator::IsIllumina(const string& technology) { - return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA_LOWER || - technology == Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA_UPPER - ); -} - -bool SamHeaderValidator::IsPacBio(const string& technology) { - return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_PACBIO_LOWER || - technology == Constants::SAM_RG_SEQTECHNOLOGY_PACBIO_UPPER - ); -} - -bool SamHeaderValidator::IsSolid(const string& technology) { - return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_SOLID_LOWER || - technology == Constants::SAM_RG_SEQTECHNOLOGY_SOLID_UPPER - ); -} - -bool SamHeaderValidator::ValidateProgramData(void) { +// validate the SAM header "program chain" +bool SamHeaderValidator::ValidateProgramChain(void) { bool isValid = true; isValid &= ContainsUniqueProgramIds(); isValid &= ValidatePreviousProgramIds(); return isValid; } +// make sure all PG IDs are unique bool SamHeaderValidator::ContainsUniqueProgramIds(void) { - bool isValid = true; - // TODO: once we have ability to handle multiple @PG entries, - // check here for duplicate ID's - // but for now, just return true - return isValid; -} -bool SamHeaderValidator::ValidatePreviousProgramIds(void) { bool isValid = true; - // TODO: check that PP entries are valid later, after we get multiple @PG-entry handling - // just return true for now - return isValid; -} -void SamHeaderValidator::AddError(const string& message) { - m_errorMessages.push_back(ERROR_PREFIX + message + NEWLINE); -} + set programIds; + set::iterator pgIdIter; -void SamHeaderValidator::AddWarning(const string& message) { - m_warningMessages.push_back(WARN_PREFIX + message + NEWLINE); -} + // iterate over program records + const SamProgramChain& programs = m_header.Programs; + SamProgramConstIterator pgIter = programs.ConstBegin(); + SamProgramConstIterator pgEnd = programs.ConstEnd(); + for ( ; pgIter != pgEnd; ++pgIter ) { + const SamProgram& pg = (*pgIter); -void SamHeaderValidator::PrintErrorMessages(void) { + // lookup program ID + const string& pgId = pg.ID; + pgIdIter = programIds.find(pgId); - // skip if no error messages - if ( m_errorMessages.empty() ) return; + // error if found (duplicate entry) + if ( pgIdIter != programIds.end() ) { + AddError("Program ID (ID): " + pgId + " is not unique"); + isValid = false; + } - // print error header line - cerr << "* SAM header has " << m_errorMessages.size() << " errors:" << endl; + // otherwise ok, store ID + programIds.insert(pgId); + } - // print each error message - vector::const_iterator errorIter = m_errorMessages.begin(); - vector::const_iterator errorEnd = m_errorMessages.end(); - for ( ; errorIter != errorEnd; ++errorIter ) - cerr << (*errorIter); + // return validation state + return isValid; } -void SamHeaderValidator::PrintWarningMessages(void) { +// make sure that any PP tags present point to existing @PG IDs +bool SamHeaderValidator::ValidatePreviousProgramIds(void) { - // skip if no warning messages - if ( m_warningMessages.empty() ) return; + bool isValid = true; - // print warning header line - cerr << "* SAM header has " << m_warningMessages.size() << " warnings:" << endl; + // iterate over program records + const SamProgramChain& programs = m_header.Programs; + SamProgramConstIterator pgIter = programs.ConstBegin(); + SamProgramConstIterator pgEnd = programs.ConstEnd(); + for ( ; pgIter != pgEnd; ++pgIter ) { + const SamProgram& pg = (*pgIter); + + // ignore record for validation if PreviousProgramID is empty + const string& ppId = pg.PreviousProgramID; + if ( ppId.empty() ) + continue; + + // see if program "chain" contains an entry for ppId + if ( !programs.Contains(ppId) ) { + AddError("PreviousProgramID (PP): " + ppId + " is not a known ID"); + isValid = false; + } + } - // print each warning message - vector::const_iterator warnIter = m_warningMessages.begin(); - vector::const_iterator warnEnd = m_warningMessages.end(); - for ( ; warnIter != warnEnd; ++warnIter ) - cerr << (*warnIter); + // return validation state + return isValid; }