// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 23 December 2010 (DB)
+// Last modified: 18 April 2011 (DB)
// ---------------------------------------------------------------------------
// Provides functionality for validating SamHeader data
// ***************************************************************************
using namespace BamTools;
using namespace BamTools::Internal;
+#include <cctype>
#include <iostream>
#include <set>
#include <sstream>
-#include <vector>
using namespace std;
-// -------------------------------------------------------------------
-// Allow validation rules to vary between SAM header versions
+namespace BamTools {
+namespace Internal {
+
+// Returns true when lhs and rhs have the same length and every pair of
+// characters is equal after toupper() conversion; returns false otherwise.
+//
+// This is a *basic* per-char comparison: toupper() works on single chars,
+// so multi-byte encodings are not case-folded.
+bool caseInsensitiveCompare(const std::string& lhs, const std::string& rhs) {
+
+    // strings of different length can never match
+    const std::string::size_type length = lhs.length();
+    if ( length != rhs.length() )
+        return false;
+
+    // toupper() is only defined for EOF and values representable as
+    // unsigned char; plain char may be negative, so convert first
+    for ( std::string::size_type i = 0; i < length; ++i ) {
+        const int lhsChar = std::toupper( static_cast<unsigned char>(lhs[i]) );
+        const int rhsChar = std::toupper( static_cast<unsigned char>(rhs[i]) );
+        if ( lhsChar != rhsChar )
+            return false;
+    }
+
+    // every character matched
+    return true;
+}
+
+} // namespace Internal
+} // namespace BamTools
+
+// ------------------------------------------------------------------------
+// Allow validation rules to vary, as needed, between SAM header versions
//
// use SAM_VERSION_X_Y to tag important changes
//
// // use rule introduced with version 2.0
+// version tags for SAM header versions 1.0 through 1.4
+// (constructor arguments are presumably major, minor -- confirm against SamHeaderVersion)
static const SamHeaderVersion SAM_VERSION_1_0 = SamHeaderVersion(1,0);
+static const SamHeaderVersion SAM_VERSION_1_1 = SamHeaderVersion(1,1);
+static const SamHeaderVersion SAM_VERSION_1_2 = SamHeaderVersion(1,2);
static const SamHeaderVersion SAM_VERSION_1_3 = SamHeaderVersion(1,3);
+static const SamHeaderVersion SAM_VERSION_1_4 = SamHeaderVersion(1,4);
-// -----------------------------------------
-// SamHeaderValidatorPrivate implementation
-
-class SamHeaderValidator::SamHeaderValidatorPrivate {
-
- // ctor & dtor
- public:
- SamHeaderValidatorPrivate(const SamHeader& header);
- ~SamHeaderValidatorPrivate(void) { }
-
- // 'public' methods
- public:
- bool Validate(bool verbose);
-
- // internal validation methods
- private:
-
- // validate header metadata
- bool ValidateMetadata(void);
- bool ValidateVersion(void);
- bool ContainsOnlyDigits(const string& s);
- bool ValidateSortOrder(void);
- bool ValidateGroupOrder(void);
-
- // validate sequence dictionary
- bool ValidateSequenceDictionary(void);
- bool ContainsUniqueSequenceNames(void);
- bool CheckNameFormat(const string& name);
- bool ValidateSequence(const SamSequence& seq);
- bool CheckLengthInRange(const string& length);
-
- // validate read group dictionary
- bool ValidateReadGroupDictionary(void);
- bool ValidateReadGroup(const SamReadGroup& rg);
- bool ContainsUniqueIDsAndPlatformUnits(void);
- bool CheckReadGroupID(const string& id);
- bool CheckSequencingTechnology(const string& technology);
- bool Is454(const string& technology);
- bool IsHelicos(const string& technology);
- bool IsIllumina(const string& technology);
- bool IsPacBio(const string& technology);
- bool IsSolid(const string& technology);
-
- // validate program data
- bool ValidateProgramData(void);
- bool ContainsUniqueProgramIds(void);
- bool ValidatePreviousProgramIds(void);
-
- // error reporting
- private:
- void AddError(const string& message);
- void AddWarning(const string& message);
- void PrintErrorMessages(void);
- void PrintWarningMessages(void);
-
- // data members
- private:
- const SamHeader& m_header;
- const SamHeaderVersion m_version;
-
- bool m_isVerboseOutput;
- const string ERROR_PREFIX;
- const string WARN_PREFIX;
- const string NEWLINE;
- vector<string> m_errorMessages;
- vector<string> m_warningMessages;
-};
-
-SamHeaderValidator::SamHeaderValidatorPrivate::SamHeaderValidatorPrivate(const SamHeader& header)
+// TODO: This functionality is currently unused.
+// Make validation "version-aware."
+//
+// ------------------------------------------------------------------------
+
+// text fragments wrapped around each stored message:
+// AddError() uses ERROR_PREFIX, AddWarning() uses WARN_PREFIX, both append NEWLINE
+const string SamHeaderValidator::ERROR_PREFIX = "ERROR: ";
+const string SamHeaderValidator::WARN_PREFIX = "WARNING: ";
+const string SamHeaderValidator::NEWLINE = "\n";
+
+// Initializes m_header from the supplied header.
+// Nothing is checked here; validation only happens in Validate().
+SamHeaderValidator::SamHeaderValidator(const SamHeader& header)
: m_header(header)
- , m_version( header.Version )
- , m_isVerboseOutput(false)
- , ERROR_PREFIX("ERROR: ")
- , WARN_PREFIX("WARNING: ")
- , NEWLINE("\n")
{ }
-bool SamHeaderValidator::SamHeaderValidatorPrivate::Validate(bool verbose) {
+// empty destructor body: no explicit cleanup is performed here
+SamHeaderValidator::~SamHeaderValidator(void) { }
- // set error reporting mode
- m_isVerboseOutput = verbose;
+// Runs the metadata, sequence dictionary, read group dictionary and
+// program chain checks and returns true only if all of them pass.
+//
+// verbose: when true, the collected error and warning messages are printed
+// after all checks have run.
+bool SamHeaderValidator::Validate(bool verbose) {
// validate header components
+ // ('&=' rather than '&&': every check is called even after an earlier one failed)
bool isValid = true;
isValid &= ValidateMetadata();
isValid &= ValidateSequenceDictionary();
isValid &= ValidateReadGroupDictionary();
- isValid &= ValidateProgramData();
+ isValid &= ValidateProgramChain();
// report errors if desired
- if ( m_isVerboseOutput ) {
+ if ( verbose ) {
PrintErrorMessages();
PrintWarningMessages();
}
return isValid;
}
-bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateMetadata(void) {
+// Validates the header metadata: calls ValidateVersion() and
+// ValidateSortOrder() and returns true only if both succeed.
+// Both are always called, so each can record its own messages.
+bool SamHeaderValidator::ValidateMetadata(void) {
bool isValid = true;
isValid &= ValidateVersion();
isValid &= ValidateSortOrder();
return isValid;
}
-bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateVersion(void) {
+bool SamHeaderValidator::ValidateVersion(void) {
const string& version = m_header.Version;
}
+// Returns true when s contains no character outside Constants::SAM_DIGITS.
// assumes non-empty input string
+// (an empty string also returns true, because find_first_not_of() then reports npos)
-bool SamHeaderValidator::SamHeaderValidatorPrivate::ContainsOnlyDigits(const string& s) {
+bool SamHeaderValidator::ContainsOnlyDigits(const string& s) {
const size_t nonDigitPosition = s.find_first_not_of(Constants::SAM_DIGITS);
return ( nonDigitPosition == string::npos ) ;
}
-bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateSortOrder(void) {
+bool SamHeaderValidator::ValidateSortOrder(void) {
const string& sortOrder = m_header.SortOrder;
sortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME ||
sortOrder == Constants::SAM_HD_SORTORDER_UNSORTED
)
- { return true; }
+ {
+ return true;
+ }
// otherwise
AddError("Invalid sort order (SO): " + sortOrder);
return false;
}
-bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateGroupOrder(void) {
+// Checks the header's GroupOrder value.
+// Returns true when it is empty or equals one of the SAM_HD_GROUPORDER_*
+// keywords (exact, case-sensitive match); otherwise records an error and
+// returns false.
+// NOTE(review): no call to this method is visible in this view
+// (ValidateMetadata shows only the version and sort order checks) -- confirm it is wired in.
+bool SamHeaderValidator::ValidateGroupOrder(void) {
const string& groupOrder = m_header.GroupOrder;
// if no group order, no problem, just return OK
- if ( groupOrder.empty() ) return true;
+ if ( groupOrder.empty() )
+ return true;
// if group order is valid keyword
if ( groupOrder == Constants::SAM_HD_GROUPORDER_NONE ||
groupOrder == Constants::SAM_HD_GROUPORDER_QUERY ||
groupOrder == Constants::SAM_HD_GROUPORDER_REFERENCE
)
- { return true; }
+ {
+ return true;
+ }
// otherwise
AddError("Invalid group order (GO): " + groupOrder);
return false;
}
-bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateSequenceDictionary(void) {
-
- // TODO: warn/error if no sequences ?
+bool SamHeaderValidator::ValidateSequenceDictionary(void) {
bool isValid = true;
return isValid;
}
-bool SamHeaderValidator::SamHeaderValidatorPrivate::ContainsUniqueSequenceNames(void) {
+bool SamHeaderValidator::ContainsUniqueSequenceNames(void) {
bool isValid = true;
set<string> sequenceNames;
SamSequenceConstIterator seqEnd = sequences.ConstEnd();
for ( ; seqIter != seqEnd; ++seqIter ) {
const SamSequence& seq = (*seqIter);
- const string& name = seq.Name;
// lookup sequence name
+ const string& name = seq.Name;
nameIter = sequenceNames.find(name);
// error if found (duplicate entry)
return isValid;
}
-bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateSequence(const SamSequence& seq) {
+// Validates one sequence entry: its Name via CheckNameFormat() and its
+// Length via CheckLengthInRange(). Returns true only if both pass;
+// both checks are always called.
+bool SamHeaderValidator::ValidateSequence(const SamSequence& seq) {
bool isValid = true;
isValid &= CheckNameFormat(seq.Name);
isValid &= CheckLengthInRange(seq.Length);
return isValid;
}
-bool SamHeaderValidator::SamHeaderValidatorPrivate::CheckNameFormat(const string& name) {
+bool SamHeaderValidator::CheckNameFormat(const string& name) {
// invalid if name is empty
if ( name.empty() ) {
return true;
}
-bool SamHeaderValidator::SamHeaderValidatorPrivate::CheckLengthInRange(const string& length) {
+bool SamHeaderValidator::CheckLengthInRange(const string& length) {
// invalid if empty
if ( length.empty() ) {
return true;
}
-bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateReadGroupDictionary(void) {
-
- // TODO: warn/error if no read groups ?
+bool SamHeaderValidator::ValidateReadGroupDictionary(void) {
bool isValid = true;
return isValid;
}
-bool SamHeaderValidator::SamHeaderValidatorPrivate::ContainsUniqueIDsAndPlatformUnits(void) {
+bool SamHeaderValidator::ContainsUniqueIDsAndPlatformUnits(void) {
bool isValid = true;
set<string> readGroupIds;
return isValid;
}
-bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateReadGroup(const SamReadGroup& rg) {
+// Validates one read group entry: its ID via CheckReadGroupID() and its
+// SequencingTechnology via CheckSequencingTechnology(). Returns true only
+// if both pass; both checks are always called.
+bool SamHeaderValidator::ValidateReadGroup(const SamReadGroup& rg) {
bool isValid = true;
isValid &= CheckReadGroupID(rg.ID);
isValid &= CheckSequencingTechnology(rg.SequencingTechnology);
return isValid;
}
-bool SamHeaderValidator::SamHeaderValidatorPrivate::CheckReadGroupID(const string& id) {
+bool SamHeaderValidator::CheckReadGroupID(const string& id) {
// invalid if empty
if ( id.empty() ) {
return true;
}
-bool SamHeaderValidator::SamHeaderValidatorPrivate::CheckSequencingTechnology(const string& technology) {
+// Checks a read group's sequencing technology value.
+// Returns true when it is empty or matches, ignoring case, one of the
+// Constants::SAM_RG_SEQTECHNOLOGY_* keywords listed below; otherwise
+// records an error and returns false.
+bool SamHeaderValidator::CheckSequencingTechnology(const string& technology) {
// if no technology provided, no problem, just return OK
- if ( technology.empty() ) return true;
+ if ( technology.empty() )
+ return true;
// if technology is valid keyword
- if ( Is454(technology) ||
- IsHelicos(technology) ||
- IsIllumina(technology) ||
- IsPacBio(technology) ||
- IsSolid(technology)
+ if ( caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_CAPILLARY) ||
+ caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_HELICOS) ||
+ caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA) ||
+ caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_IONTORRENT) ||
+ caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_LS454) ||
+ caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_PACBIO) ||
+ caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_SOLID)
)
- { return true; }
+ {
+ return true;
+ }
// otherwise
AddError("Invalid read group sequencing platform (PL): " + technology);
return false;
}
-bool SamHeaderValidator::SamHeaderValidatorPrivate::Is454(const string& technology) {
- return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_454 ||
- technology == Constants::SAM_RG_SEQTECHNOLOGY_LS454_LOWER ||
- technology == Constants::SAM_RG_SEQTECHNOLOGY_LS454_UPPER
- );
-}
-
-bool SamHeaderValidator::SamHeaderValidatorPrivate::IsHelicos(const string& technology) {
- return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_HELICOS_LOWER ||
- technology == Constants::SAM_RG_SEQTECHNOLOGY_HELICOS_UPPER
- );
-}
-
-bool SamHeaderValidator::SamHeaderValidatorPrivate::IsIllumina(const string& technology) {
- return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA_LOWER ||
- technology == Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA_UPPER
- );
-}
-
-bool SamHeaderValidator::SamHeaderValidatorPrivate::IsPacBio(const string& technology) {
- return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_PACBIO_LOWER ||
- technology == Constants::SAM_RG_SEQTECHNOLOGY_PACBIO_UPPER
- );
-}
-
-bool SamHeaderValidator::SamHeaderValidatorPrivate::IsSolid(const string& technology) {
- return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_SOLID_LOWER ||
- technology == Constants::SAM_RG_SEQTECHNOLOGY_SOLID_UPPER
- );
-}
-
-bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateProgramData(void) {
+// Validates the program records: IDs must be unique and every non-empty
+// PreviousProgramID must name a known ID. Returns true only if both checks
+// pass; both are always called.
+bool SamHeaderValidator::ValidateProgramChain(void) {
bool isValid = true;
isValid &= ContainsUniqueProgramIds();
isValid &= ValidatePreviousProgramIds();
return isValid;
}
-bool SamHeaderValidator::SamHeaderValidatorPrivate::ContainsUniqueProgramIds(void) {
+// Checks that no two program records in m_header.Programs share an ID.
+// Records one error per repeated occurrence and returns false if any
+// duplicate was seen; returns true otherwise.
+bool SamHeaderValidator::ContainsUniqueProgramIds(void) {
+
bool isValid = true;
- // TODO: once we have ability to handle multiple @PG entries,
- // check here for duplicate ID's
- // but for now, just return true
+    // IDs encountered so far
+    set<string> seenIds;
+
+    // visit each program record
+    const SamProgramChain& chain = m_header.Programs;
+    SamProgramConstIterator chainEnd = chain.ConstEnd();
+    for ( SamProgramConstIterator recordIter = chain.ConstBegin(); recordIter != chainEnd; ++recordIter ) {
+        const string& currentId = (*recordIter).ID;
+
+        // an ID that is already stored is a duplicate
+        if ( seenIds.count(currentId) != 0 ) {
+            AddError("Program ID (ID): " + currentId + " is not unique");
+            isValid = false;
+        }
+
+        // remember the ID (no effect when it is already stored)
+        seenIds.insert(currentId);
+    }
+
+    // report overall result
return isValid;
}
-bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidatePreviousProgramIds(void) {
+// Checks that every non-empty PreviousProgramID in m_header.Programs is
+// known to the program chain (per its Contains() lookup).
+// Records one error per unknown reference and returns false if any was
+// found; returns true otherwise.
+bool SamHeaderValidator::ValidatePreviousProgramIds(void) {
+
bool isValid = true;
- // TODO: check that PP entries are valid later, after we get multiple @PG-entry handling
- // just return true for now
+
+    // visit each program record
+    const SamProgramChain& chain = m_header.Programs;
+    SamProgramConstIterator chainEnd = chain.ConstEnd();
+    for ( SamProgramConstIterator recordIter = chain.ConstBegin(); recordIter != chainEnd; ++recordIter ) {
+        const string& previousId = (*recordIter).PreviousProgramID;
+
+        // a record without a PP value has nothing to verify;
+        // otherwise the chain must contain an entry for the referenced ID
+        if ( !previousId.empty() && !chain.Contains(previousId) ) {
+            AddError("PreviousProgramID (PP): " + previousId + " is not a known ID");
+            isValid = false;
+        }
+    }
+
+    // report overall result
return isValid;
}
-void SamHeaderValidator::SamHeaderValidatorPrivate::AddError(const string& message) {
+// Appends message to m_errorMessages, wrapped as ERROR_PREFIX + message + NEWLINE.
+// Nothing is printed here.
+void SamHeaderValidator::AddError(const string& message) {
m_errorMessages.push_back(ERROR_PREFIX + message + NEWLINE);
}
-void SamHeaderValidator::SamHeaderValidatorPrivate::AddWarning(const string& message) {
+// Appends message to m_warningMessages, wrapped as WARN_PREFIX + message + NEWLINE.
+// Nothing is printed here.
+void SamHeaderValidator::AddWarning(const string& message) {
m_warningMessages.push_back(WARN_PREFIX + message + NEWLINE);
}
-void SamHeaderValidator::SamHeaderValidatorPrivate::PrintErrorMessages(void) {
+void SamHeaderValidator::PrintErrorMessages(void) {
// skip if no error messages
if ( m_errorMessages.empty() ) return;
cerr << (*errorIter);
}
-void SamHeaderValidator::SamHeaderValidatorPrivate::PrintWarningMessages(void) {
+void SamHeaderValidator::PrintWarningMessages(void) {
// skip if no warning messages
if ( m_warningMessages.empty() ) return;
for ( ; warnIter != warnEnd; ++warnIter )
cerr << (*warnIter);
}
-
-// -----------------------------------
-// SamHeaderValidator implementation
-
-SamHeaderValidator::SamHeaderValidator(const BamTools::SamHeader& header)
- : d( new SamHeaderValidatorPrivate(header) )
-{ }
-
-SamHeaderValidator::~SamHeaderValidator(void) {
- delete d;
- d = 0;
-}
-
-bool SamHeaderValidator::Validate(bool verbose) { return d->Validate(verbose); }