1 // ***************************************************************************
2 // SamHeaderValidator.cpp (c) 2010 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 21 March 2011 (DB)
7 // ---------------------------------------------------------------------------
8 // Provides functionality for validating SamHeader data
9 // ***************************************************************************
11 #include <api/SamConstants.h>
12 #include <api/SamHeader.h>
13 #include <api/internal/SamHeaderValidator_p.h>
14 #include <api/internal/SamHeaderVersion_p.h>
15 using namespace BamTools;
16 using namespace BamTools::Internal;
23 // ------------------------------------------------------------------------
24 // Allow validation rules to vary, as needed, between SAM header versions
26 // use SAM_VERSION_X_Y to tag important changes
28 // Together, they will allow for comparisons like:
29 // if ( m_version < SAM_VERSION_2_0 ) {
30 // // use some older rule
32 // // use rule introduced with version 2.0
34 static const SamHeaderVersion SAM_VERSION_1_0 = SamHeaderVersion(1,0);
35 static const SamHeaderVersion SAM_VERSION_1_3 = SamHeaderVersion(1,3);
37 // TODO: This functionality is currently unused.
38 // Make validation "version-aware."
40 // ------------------------------------------------------------------------
42 const string SamHeaderValidator::ERROR_PREFIX = "ERROR: ";
43 const string SamHeaderValidator::WARN_PREFIX = "WARNING: ";
44 const string SamHeaderValidator::NEWLINE = "\n";
46 SamHeaderValidator::SamHeaderValidator(const SamHeader& header)
50 SamHeaderValidator::~SamHeaderValidator(void) { }
52 bool SamHeaderValidator::Validate(bool verbose) {
54 // validate header components
56 isValid &= ValidateMetadata();
57 isValid &= ValidateSequenceDictionary();
58 isValid &= ValidateReadGroupDictionary();
59 isValid &= ValidateProgramData();
61 // report errors if desired
64 PrintWarningMessages();
67 // return validation status
71 bool SamHeaderValidator::ValidateMetadata(void) {
73 isValid &= ValidateVersion();
74 isValid &= ValidateSortOrder();
75 isValid &= ValidateGroupOrder();
79 bool SamHeaderValidator::ValidateVersion(void) {
81 const string& version = m_header.Version;
83 // warn if version not present
84 if ( version.empty() ) {
85 AddWarning("Version (VN) missing. Not required, but strongly recommended");
89 // invalid if version does not contain a period
90 const size_t periodFound = version.find(Constants::SAM_PERIOD);
91 if ( periodFound == string::npos ) {
92 AddError("Invalid version (VN) format: " + version);
96 // invalid if major version is empty or contains non-digits
97 const string majorVersion = version.substr(0, periodFound);
98 if ( majorVersion.empty() || !ContainsOnlyDigits(majorVersion) ) {
99 AddError("Invalid version (VN) format: " + version);
103 // invalid if major version is empty or contains non-digits
104 const string minorVersion = version.substr(periodFound + 1);
105 if ( minorVersion.empty() || !ContainsOnlyDigits(minorVersion) ) {
106 AddError("Invalid version (VN) format: " + version);
110 // TODO: check if version is not just syntactically OK,
111 // but is also a valid SAM version ( 1.0 .. CURRENT )
113 // all checked out this far, then version is OK
117 // assumes non-empty input string
118 bool SamHeaderValidator::ContainsOnlyDigits(const string& s) {
119 const size_t nonDigitPosition = s.find_first_not_of(Constants::SAM_DIGITS);
120 return ( nonDigitPosition == string::npos ) ;
123 bool SamHeaderValidator::ValidateSortOrder(void) {
125 const string& sortOrder = m_header.SortOrder;
127 // warn if sort order not present
128 if ( sortOrder.empty() ) {
129 AddWarning("Sort order (SO) missing. Not required, but strongly recommended");
133 // if sort order is valid keyword
134 if ( sortOrder == Constants::SAM_HD_SORTORDER_COORDINATE ||
135 sortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME ||
136 sortOrder == Constants::SAM_HD_SORTORDER_UNSORTED
143 AddError("Invalid sort order (SO): " + sortOrder);
147 bool SamHeaderValidator::ValidateGroupOrder(void) {
149 const string& groupOrder = m_header.GroupOrder;
151 // if no group order, no problem, just return OK
152 if ( groupOrder.empty() )
155 // if group order is valid keyword
156 if ( groupOrder == Constants::SAM_HD_GROUPORDER_NONE ||
157 groupOrder == Constants::SAM_HD_GROUPORDER_QUERY ||
158 groupOrder == Constants::SAM_HD_GROUPORDER_REFERENCE
165 AddError("Invalid group order (GO): " + groupOrder);
169 bool SamHeaderValidator::ValidateSequenceDictionary(void) {
171 // TODO: warn/error if no sequences ?
175 // check for unique sequence names
176 isValid &= ContainsUniqueSequenceNames();
178 // iterate over sequences
179 const SamSequenceDictionary& sequences = m_header.Sequences;
180 SamSequenceConstIterator seqIter = sequences.ConstBegin();
181 SamSequenceConstIterator seqEnd = sequences.ConstEnd();
182 for ( ; seqIter != seqEnd; ++seqIter ) {
183 const SamSequence& seq = (*seqIter);
184 isValid &= ValidateSequence(seq);
187 // return validation state
191 bool SamHeaderValidator::ContainsUniqueSequenceNames(void) {
194 set<string> sequenceNames;
195 set<string>::iterator nameIter;
197 // iterate over sequences
198 const SamSequenceDictionary& sequences = m_header.Sequences;
199 SamSequenceConstIterator seqIter = sequences.ConstBegin();
200 SamSequenceConstIterator seqEnd = sequences.ConstEnd();
201 for ( ; seqIter != seqEnd; ++seqIter ) {
202 const SamSequence& seq = (*seqIter);
203 const string& name = seq.Name;
205 // lookup sequence name
206 nameIter = sequenceNames.find(name);
208 // error if found (duplicate entry)
209 if ( nameIter != sequenceNames.end() ) {
210 AddError("Sequence name (SN): " + name + " is not unique");
214 // otherwise ok, store name
215 sequenceNames.insert(name);
218 // return validation state
222 bool SamHeaderValidator::ValidateSequence(const SamSequence& seq) {
224 isValid &= CheckNameFormat(seq.Name);
225 isValid &= CheckLengthInRange(seq.Length);
229 bool SamHeaderValidator::CheckNameFormat(const string& name) {
231 // invalid if name is empty
232 if ( name.empty() ) {
233 AddError("Sequence entry (@SQ) is missing SN tag");
237 // invalid if first character is a reserved char
238 const char firstChar = name.at(0);
239 if ( firstChar == Constants::SAM_EQUAL || firstChar == Constants::SAM_STAR ) {
240 AddError("Invalid sequence name (SN): " + name);
247 bool SamHeaderValidator::CheckLengthInRange(const string& length) {
250 if ( length.empty() ) {
251 AddError("Sequence entry (@SQ) is missing LN tag");
255 // convert string length to numeric
256 stringstream lengthStream(length);
257 unsigned int sequenceLength;
258 lengthStream >> sequenceLength;
260 // invalid if length outside accepted range
261 if ( sequenceLength < Constants::SAM_SQ_LENGTH_MIN || sequenceLength > Constants::SAM_SQ_LENGTH_MAX ) {
262 AddError("Sequence length (LN): " + length + " out of range");
270 bool SamHeaderValidator::ValidateReadGroupDictionary(void) {
272 // TODO: warn/error if no read groups ?
276 // check for unique read group IDs & platform units
277 isValid &= ContainsUniqueIDsAndPlatformUnits();
279 // iterate over read groups
280 const SamReadGroupDictionary& readGroups = m_header.ReadGroups;
281 SamReadGroupConstIterator rgIter = readGroups.ConstBegin();
282 SamReadGroupConstIterator rgEnd = readGroups.ConstEnd();
283 for ( ; rgIter != rgEnd; ++rgIter ) {
284 const SamReadGroup& rg = (*rgIter);
285 isValid &= ValidateReadGroup(rg);
288 // return validation state
292 bool SamHeaderValidator::ContainsUniqueIDsAndPlatformUnits(void) {
295 set<string> readGroupIds;
296 set<string> platformUnits;
297 set<string>::iterator idIter;
298 set<string>::iterator puIter;
300 // iterate over sequences
301 const SamReadGroupDictionary& readGroups = m_header.ReadGroups;
302 SamReadGroupConstIterator rgIter = readGroups.ConstBegin();
303 SamReadGroupConstIterator rgEnd = readGroups.ConstEnd();
304 for ( ; rgIter != rgEnd; ++rgIter ) {
305 const SamReadGroup& rg = (*rgIter);
307 // --------------------------------
308 // check for unique ID
310 // lookup read group ID
311 const string& id = rg.ID;
312 idIter = readGroupIds.find(id);
314 // error if found (duplicate entry)
315 if ( idIter != readGroupIds.end() ) {
316 AddError("Read group ID (ID): " + id + " is not unique");
320 // otherwise ok, store id
321 readGroupIds.insert(id);
323 // --------------------------------
324 // check for unique platform unit
326 // lookup platform unit
327 const string& pu = rg.PlatformUnit;
328 puIter = platformUnits.find(pu);
330 // error if found (duplicate entry)
331 if ( puIter != platformUnits.end() ) {
332 AddError("Platform unit (PU): " + pu + " is not unique");
336 // otherwise ok, store platform unit
337 platformUnits.insert(pu);
340 // return validation state
344 bool SamHeaderValidator::ValidateReadGroup(const SamReadGroup& rg) {
346 isValid &= CheckReadGroupID(rg.ID);
347 isValid &= CheckSequencingTechnology(rg.SequencingTechnology);
351 bool SamHeaderValidator::CheckReadGroupID(const string& id) {
355 AddError("Read group entry (@RG) is missing ID tag");
363 bool SamHeaderValidator::CheckSequencingTechnology(const string& technology) {
365 // if no technology provided, no problem, just return OK
366 if ( technology.empty() )
369 // if technology is valid keyword
370 if ( Is454(technology) ||
371 IsHelicos(technology) ||
372 IsIllumina(technology) ||
373 IsPacBio(technology) ||
381 AddError("Invalid read group sequencing platform (PL): " + technology);
385 bool SamHeaderValidator::Is454(const string& technology) {
386 return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_454 ||
387 technology == Constants::SAM_RG_SEQTECHNOLOGY_LS454_LOWER ||
388 technology == Constants::SAM_RG_SEQTECHNOLOGY_LS454_UPPER
392 bool SamHeaderValidator::IsHelicos(const string& technology) {
393 return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_HELICOS_LOWER ||
394 technology == Constants::SAM_RG_SEQTECHNOLOGY_HELICOS_UPPER
398 bool SamHeaderValidator::IsIllumina(const string& technology) {
399 return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA_LOWER ||
400 technology == Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA_UPPER
404 bool SamHeaderValidator::IsPacBio(const string& technology) {
405 return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_PACBIO_LOWER ||
406 technology == Constants::SAM_RG_SEQTECHNOLOGY_PACBIO_UPPER
410 bool SamHeaderValidator::IsSolid(const string& technology) {
411 return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_SOLID_LOWER ||
412 technology == Constants::SAM_RG_SEQTECHNOLOGY_SOLID_UPPER
416 bool SamHeaderValidator::ValidateProgramData(void) {
418 isValid &= ContainsUniqueProgramIds();
419 isValid &= ValidatePreviousProgramIds();
423 bool SamHeaderValidator::ContainsUniqueProgramIds(void) {
425 // TODO: once we have ability to handle multiple @PG entries,
426 // check here for duplicate ID's
427 // but for now, just return true
431 bool SamHeaderValidator::ValidatePreviousProgramIds(void) {
433 // TODO: check that PP entries are valid later, after we get multiple @PG-entry handling
434 // just return true for now
437 void SamHeaderValidator::AddError(const string& message) {
438 m_errorMessages.push_back(ERROR_PREFIX + message + NEWLINE);
441 void SamHeaderValidator::AddWarning(const string& message) {
442 m_warningMessages.push_back(WARN_PREFIX + message + NEWLINE);
445 void SamHeaderValidator::PrintErrorMessages(void) {
447 // skip if no error messages
448 if ( m_errorMessages.empty() ) return;
450 // print error header line
451 cerr << "* SAM header has " << m_errorMessages.size() << " errors:" << endl;
453 // print each error message
454 vector<string>::const_iterator errorIter = m_errorMessages.begin();
455 vector<string>::const_iterator errorEnd = m_errorMessages.end();
456 for ( ; errorIter != errorEnd; ++errorIter )
457 cerr << (*errorIter);
460 void SamHeaderValidator::PrintWarningMessages(void) {
462 // skip if no warning messages
463 if ( m_warningMessages.empty() ) return;
465 // print warning header line
466 cerr << "* SAM header has " << m_warningMessages.size() << " warnings:" << endl;
468 // print each warning message
469 vector<string>::const_iterator warnIter = m_warningMessages.begin();
470 vector<string>::const_iterator warnEnd = m_warningMessages.end();
471 for ( ; warnIter != warnEnd; ++warnIter )