// ***************************************************************************
// SamHeaderValidator.cpp (c) 2010 Derek Barnett
// Marth Lab, Department of Biology, Boston College
-// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 13 January 2011 (DB)
+// Last modified: 14 October 2011 (DB)
// ---------------------------------------------------------------------------
// Provides functionality for validating SamHeader data
// ***************************************************************************
-#include <api/SamConstants.h>
-#include <api/SamHeader.h>
-#include <api/internal/SamHeaderValidator_p.h>
-#include <api/internal/SamHeaderVersion_p.h>
+#include "api/SamConstants.h"
+#include "api/SamHeader.h"
+#include "api/internal/SamHeaderValidator_p.h"
+#include "api/internal/SamHeaderVersion_p.h"
using namespace BamTools;
using namespace BamTools::Internal;
-#include <iostream>
+#include <cctype>
#include <set>
#include <sstream>
using namespace std;
+// ------------------------
+// static utility methods
+// -------------------------
+
+// returns true if lhs & rhs have the same length and every pair of chars
+// matches after a *basic* (single-byte) toupper conversion
+static
+bool caseInsensitiveCompare(const std::string& lhs, const std::string& rhs) {
+
+    // strings of different lengths can never match, so skip the per-char scan
+    const std::string::size_type length = lhs.length();
+    if ( length != rhs.length() )
+        return false;
+
+    // compare upper-cased chars; each char is converted to unsigned char first,
+    // since passing a negative value (other than EOF) to toupper is undefined
+    for ( std::string::size_type i = 0; i < length; ++i ) {
+        const int lhsUpper = std::toupper( static_cast<unsigned char>(lhs[i]) );
+        const int rhsUpper = std::toupper( static_cast<unsigned char>(rhs[i]) );
+        if ( lhsUpper != rhsUpper )
+            return false;
+    }
+
+    // every position matched
+    return true;
+}
+
// ------------------------------------------------------------------------
// Allow validation rules to vary, as needed, between SAM header versions
//
// // use rule introduced with version 2.0
static const SamHeaderVersion SAM_VERSION_1_0 = SamHeaderVersion(1,0);
+static const SamHeaderVersion SAM_VERSION_1_1 = SamHeaderVersion(1,1);
+static const SamHeaderVersion SAM_VERSION_1_2 = SamHeaderVersion(1,2);
static const SamHeaderVersion SAM_VERSION_1_3 = SamHeaderVersion(1,3);
+static const SamHeaderVersion SAM_VERSION_1_4 = SamHeaderVersion(1,4);
// TODO: This functionality is currently unused.
// Make validation "version-aware."
SamHeaderValidator::~SamHeaderValidator(void) { }
-bool SamHeaderValidator::Validate(bool verbose) {
+// records an error: the message is wrapped in ERROR_PREFIX & NEWLINE before storing
+void SamHeaderValidator::AddError(const string& message) {
+    const string formatted = ERROR_PREFIX + message + NEWLINE;
+    m_errorMessages.push_back(formatted);
+}
+
+// records a warning: the message is wrapped in WARN_PREFIX & NEWLINE before storing
+void SamHeaderValidator::AddWarning(const string& message) {
+    const string formatted = WARN_PREFIX + message + NEWLINE;
+    m_warningMessages.push_back(formatted);
+}
+
+// writes a summary line plus all stored error messages to 'stream'
+// (writes nothing at all if no errors have been recorded)
+void SamHeaderValidator::PrintErrorMessages(ostream& stream) {
+
+    // only report when at least one error is stored
+    if ( !m_errorMessages.empty() ) {
+
+        // summary line giving the error count
+        stream << "* SAM header has " << m_errorMessages.size() << " errors:" << endl;
+
+        // then each message, in container order, written exactly as stored
+        const vector<string>::size_type numErrors = m_errorMessages.size();
+        for ( vector<string>::size_type i = 0; i < numErrors; ++i )
+            stream << m_errorMessages[i];
+    }
+}
+
+// writes all stored messages to 'stream': errors first, then warnings
+void SamHeaderValidator::PrintMessages(ostream& stream) {
+ PrintErrorMessages(stream);
+ PrintWarningMessages(stream);
+}
+
+// writes a summary line plus all stored warning messages to 'stream'
+// (writes nothing at all if no warnings have been recorded)
+void SamHeaderValidator::PrintWarningMessages(ostream& stream) {
+
+    // only report when at least one warning is stored
+    if ( !m_warningMessages.empty() ) {
+
+        // summary line giving the warning count
+        stream << "* SAM header has " << m_warningMessages.size() << " warnings:" << endl;
+
+        // then each message, in container order, written exactly as stored
+        const vector<string>::size_type numWarnings = m_warningMessages.size();
+        for ( vector<string>::size_type i = 0; i < numWarnings; ++i )
+            stream << m_warningMessages[i];
+    }
+}
- // validate header components
+// entry point for validation
+bool SamHeaderValidator::Validate(void) {
bool isValid = true;
isValid &= ValidateMetadata();
isValid &= ValidateSequenceDictionary();
isValid &= ValidateReadGroupDictionary();
- isValid &= ValidateProgramData();
-
- // report errors if desired
- if ( verbose ) {
- PrintErrorMessages();
- PrintWarningMessages();
- }
-
- // return validation status
+ isValid &= ValidateProgramChain();
return isValid;
}
+// check all SAM header 'metadata'
bool SamHeaderValidator::ValidateMetadata(void) {
bool isValid = true;
isValid &= ValidateVersion();
return isValid;
}
+// check SAM header version tag
bool SamHeaderValidator::ValidateVersion(void) {
const string& version = m_header.Version;
return ( nonDigitPosition == string::npos ) ;
}
+// validate SAM header sort order tag
bool SamHeaderValidator::ValidateSortOrder(void) {
const string& sortOrder = m_header.SortOrder;
sortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME ||
sortOrder == Constants::SAM_HD_SORTORDER_UNSORTED
)
- { return true; }
+ {
+ return true;
+ }
// otherwise
AddError("Invalid sort order (SO): " + sortOrder);
return false;
}
+// validate SAM header group order tag
bool SamHeaderValidator::ValidateGroupOrder(void) {
const string& groupOrder = m_header.GroupOrder;
// if no group order, no problem, just return OK
- if ( groupOrder.empty() ) return true;
+ if ( groupOrder.empty() )
+ return true;
// if group order is valid keyword
if ( groupOrder == Constants::SAM_HD_GROUPORDER_NONE ||
groupOrder == Constants::SAM_HD_GROUPORDER_QUERY ||
groupOrder == Constants::SAM_HD_GROUPORDER_REFERENCE
)
- { return true; }
+ {
+ return true;
+ }
// otherwise
AddError("Invalid group order (GO): " + groupOrder);
return false;
}
+// validate SAM header sequence dictionary
bool SamHeaderValidator::ValidateSequenceDictionary(void) {
- // TODO: warn/error if no sequences ?
-
bool isValid = true;
// check for unique sequence names
return isValid;
}
+// make sure all SQ names are unique
bool SamHeaderValidator::ContainsUniqueSequenceNames(void) {
bool isValid = true;
SamSequenceConstIterator seqEnd = sequences.ConstEnd();
for ( ; seqIter != seqEnd; ++seqIter ) {
const SamSequence& seq = (*seqIter);
- const string& name = seq.Name;
// lookup sequence name
+ const string& name = seq.Name;
nameIter = sequenceNames.find(name);
// error if found (duplicate entry)
return isValid;
}
+// validate SAM header sequence entry
bool SamHeaderValidator::ValidateSequence(const SamSequence& seq) {
bool isValid = true;
isValid &= CheckNameFormat(seq.Name);
return isValid;
}
+// check sequence name is valid format
bool SamHeaderValidator::CheckNameFormat(const string& name) {
// invalid if name is empty
return true;
}
+// check that sequence length is within accepted range
bool SamHeaderValidator::CheckLengthInRange(const string& length) {
// invalid if empty
return true;
}
+// validate SAM header read group dictionary
bool SamHeaderValidator::ValidateReadGroupDictionary(void) {
- // TODO: warn/error if no read groups ?
-
bool isValid = true;
// check for unique read group IDs & platform units
return isValid;
}
+// make sure RG IDs and platform units are unique
bool SamHeaderValidator::ContainsUniqueIDsAndPlatformUnits(void) {
bool isValid = true;
return isValid;
}
+// validate SAM header read group entry
bool SamHeaderValidator::ValidateReadGroup(const SamReadGroup& rg) {
bool isValid = true;
isValid &= CheckReadGroupID(rg.ID);
return isValid;
}
+// make sure RG ID exists
bool SamHeaderValidator::CheckReadGroupID(const string& id) {
// invalid if empty
return true;
}
+// make sure RG sequencing tech is one of the accepted keywords
bool SamHeaderValidator::CheckSequencingTechnology(const string& technology) {
// if no technology provided, no problem, just return OK
- if ( technology.empty() ) return true;
+ if ( technology.empty() )
+ return true;
// if technology is valid keyword
- if ( Is454(technology) ||
- IsHelicos(technology) ||
- IsIllumina(technology) ||
- IsPacBio(technology) ||
- IsSolid(technology)
+ if ( caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_CAPILLARY) ||
+ caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_HELICOS) ||
+ caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA) ||
+ caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_IONTORRENT) ||
+ caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_LS454) ||
+ caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_PACBIO) ||
+ caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_SOLID)
)
- { return true; }
+ {
+ return true;
+ }
// otherwise
AddError("Invalid read group sequencing platform (PL): " + technology);
return false;
}
-bool SamHeaderValidator::Is454(const string& technology) {
- return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_454 ||
- technology == Constants::SAM_RG_SEQTECHNOLOGY_LS454_LOWER ||
- technology == Constants::SAM_RG_SEQTECHNOLOGY_LS454_UPPER
- );
-}
-
-bool SamHeaderValidator::IsHelicos(const string& technology) {
- return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_HELICOS_LOWER ||
- technology == Constants::SAM_RG_SEQTECHNOLOGY_HELICOS_UPPER
- );
-}
-
-bool SamHeaderValidator::IsIllumina(const string& technology) {
- return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA_LOWER ||
- technology == Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA_UPPER
- );
-}
-
-bool SamHeaderValidator::IsPacBio(const string& technology) {
- return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_PACBIO_LOWER ||
- technology == Constants::SAM_RG_SEQTECHNOLOGY_PACBIO_UPPER
- );
-}
-
-bool SamHeaderValidator::IsSolid(const string& technology) {
- return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_SOLID_LOWER ||
- technology == Constants::SAM_RG_SEQTECHNOLOGY_SOLID_UPPER
- );
-}
-
-bool SamHeaderValidator::ValidateProgramData(void) {
+// validate the SAM header "program chain"
+bool SamHeaderValidator::ValidateProgramChain(void) {
bool isValid = true;
isValid &= ContainsUniqueProgramIds();
isValid &= ValidatePreviousProgramIds();
return isValid;
}
+// make sure all PG IDs are unique
bool SamHeaderValidator::ContainsUniqueProgramIds(void) {
- bool isValid = true;
- // TODO: once we have ability to handle multiple @PG entries,
- // check here for duplicate ID's
- // but for now, just return true
- return isValid;
-}
-bool SamHeaderValidator::ValidatePreviousProgramIds(void) {
bool isValid = true;
- // TODO: check that PP entries are valid later, after we get multiple @PG-entry handling
- // just return true for now
- return isValid;
-}
-void SamHeaderValidator::AddError(const string& message) {
- m_errorMessages.push_back(ERROR_PREFIX + message + NEWLINE);
-}
+ set<string> programIds;
+ set<string>::iterator pgIdIter;
-void SamHeaderValidator::AddWarning(const string& message) {
- m_warningMessages.push_back(WARN_PREFIX + message + NEWLINE);
-}
+ // iterate over program records
+ const SamProgramChain& programs = m_header.Programs;
+ SamProgramConstIterator pgIter = programs.ConstBegin();
+ SamProgramConstIterator pgEnd = programs.ConstEnd();
+ for ( ; pgIter != pgEnd; ++pgIter ) {
+ const SamProgram& pg = (*pgIter);
-void SamHeaderValidator::PrintErrorMessages(void) {
+ // lookup program ID
+ const string& pgId = pg.ID;
+ pgIdIter = programIds.find(pgId);
- // skip if no error messages
- if ( m_errorMessages.empty() ) return;
+ // error if found (duplicate entry)
+ if ( pgIdIter != programIds.end() ) {
+ AddError("Program ID (ID): " + pgId + " is not unique");
+ isValid = false;
+ }
- // print error header line
- cerr << "* SAM header has " << m_errorMessages.size() << " errors:" << endl;
+ // store ID (set insert is a no-op when the ID is a duplicate)
+ programIds.insert(pgId);
+ }
- // print each error message
- vector<string>::const_iterator errorIter = m_errorMessages.begin();
- vector<string>::const_iterator errorEnd = m_errorMessages.end();
- for ( ; errorIter != errorEnd; ++errorIter )
- cerr << (*errorIter);
+ // return validation state
+ return isValid;
}
-void SamHeaderValidator::PrintWarningMessages(void) {
+// make sure that any PP tags present point to existing @PG IDs
+bool SamHeaderValidator::ValidatePreviousProgramIds(void) {
- // skip if no warning messages
- if ( m_warningMessages.empty() ) return;
+ bool isValid = true;
- // print warning header line
- cerr << "* SAM header has " << m_warningMessages.size() << " warnings:" << endl;
+ // iterate over program records
+ const SamProgramChain& programs = m_header.Programs;
+ SamProgramConstIterator pgIter = programs.ConstBegin();
+ SamProgramConstIterator pgEnd = programs.ConstEnd();
+ for ( ; pgIter != pgEnd; ++pgIter ) {
+ const SamProgram& pg = (*pgIter);
+
+ // ignore record for validation if PreviousProgramID is empty
+ const string& ppId = pg.PreviousProgramID;
+ if ( ppId.empty() )
+ continue;
+
+ // see if program "chain" contains an entry for ppId
+ if ( !programs.Contains(ppId) ) {
+ AddError("PreviousProgramID (PP): " + ppId + " is not a known ID");
+ isValid = false;
+ }
+ }
- // print each warning message
- vector<string>::const_iterator warnIter = m_warningMessages.begin();
- vector<string>::const_iterator warnEnd = m_warningMessages.end();
- for ( ; warnIter != warnEnd; ++warnIter )
- cerr << (*warnIter);
+ // return validation state
+ return isValid;
}