1 // ***************************************************************************
2 // SamHeaderValidator.cpp (c) 2010 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 23 December 2010 (DB)
7 // ---------------------------------------------------------------------------
8 // Provides functionality for validating SamHeader data
9 // ***************************************************************************
11 #include <api/SamConstants.h>
12 #include <api/SamHeader.h>
13 #include <api/internal/SamHeaderValidator_p.h>
14 #include <api/internal/SamHeaderVersion_p.h>
15 using namespace BamTools;
16 using namespace BamTools::Internal;
24 // -------------------------------------------------------------------
25 // Allow validation rules to vary between SAM header versions
27 // use SAM_VERSION_X_Y to tag important changes
29 // Together, they will allow for comparisons like:
30 // if ( m_version < SAM_VERSION_2_0 ) {
31 // // use some older rule
33 // // use rule introduced with version 2.0
35 static const SamHeaderVersion SAM_VERSION_1_0 = SamHeaderVersion(1,0);
36 static const SamHeaderVersion SAM_VERSION_1_3 = SamHeaderVersion(1,3);
38 // -----------------------------------------
39 // SamHeaderValidatorPrivate implementation
41 class SamHeaderValidator::SamHeaderValidatorPrivate {
45 SamHeaderValidatorPrivate(const SamHeader& header);
46 ~SamHeaderValidatorPrivate(void) { }
50 bool Validate(bool verbose);
52 // internal validation methods
55 // validate header metadata
56 bool ValidateMetadata(void);
57 bool ValidateVersion(void);
58 bool ContainsOnlyDigits(const string& s);
59 bool ValidateSortOrder(void);
60 bool ValidateGroupOrder(void);
62 // validate sequence dictionary
63 bool ValidateSequenceDictionary(void);
64 bool ContainsUniqueSequenceNames(void);
65 bool CheckNameFormat(const string& name);
66 bool ValidateSequence(const SamSequence& seq);
67 bool CheckLengthInRange(const string& length);
69 // validate read group dictionary
70 bool ValidateReadGroupDictionary(void);
71 bool ValidateReadGroup(const SamReadGroup& rg);
72 bool ContainsUniqueIDsAndPlatformUnits(void);
73 bool CheckReadGroupID(const string& id);
74 bool CheckSequencingTechnology(const string& technology);
75 bool Is454(const string& technology);
76 bool IsHelicos(const string& technology);
77 bool IsIllumina(const string& technology);
78 bool IsPacBio(const string& technology);
79 bool IsSolid(const string& technology);
81 // validate program data
82 bool ValidateProgramData(void);
83 bool ContainsUniqueProgramIds(void);
84 bool ValidatePreviousProgramIds(void);
88 void AddError(const string& message);
89 void AddWarning(const string& message);
90 void PrintErrorMessages(void);
91 void PrintWarningMessages(void);
95 const SamHeader& m_header;
96 const SamHeaderVersion m_version;
98 bool m_isVerboseOutput;
99 const string ERROR_PREFIX;
100 const string WARN_PREFIX;
101 const string NEWLINE;
102 vector<string> m_errorMessages;
103 vector<string> m_warningMessages;
106 SamHeaderValidator::SamHeaderValidatorPrivate::SamHeaderValidatorPrivate(const SamHeader& header)
108 , m_version( header.Version )
109 , m_isVerboseOutput(false)
110 , ERROR_PREFIX("ERROR: ")
111 , WARN_PREFIX("WARNING: ")
115 bool SamHeaderValidator::SamHeaderValidatorPrivate::Validate(bool verbose) {
117 // set error reporting mode
118 m_isVerboseOutput = verbose;
120 // validate header components
122 isValid &= ValidateMetadata();
123 isValid &= ValidateSequenceDictionary();
124 isValid &= ValidateReadGroupDictionary();
125 isValid &= ValidateProgramData();
127 // report errors if desired
128 if ( m_isVerboseOutput ) {
129 PrintErrorMessages();
130 PrintWarningMessages();
133 // return validation status
137 bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateMetadata(void) {
139 isValid &= ValidateVersion();
140 isValid &= ValidateSortOrder();
141 isValid &= ValidateGroupOrder();
145 bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateVersion(void) {
147 const string& version = m_header.Version;
149 // warn if version not present
150 if ( version.empty() ) {
151 AddWarning("Version (VN) missing. Not required, but strongly recommended");
155 // invalid if version does not contain a period
156 const size_t periodFound = version.find(Constants::SAM_PERIOD);
157 if ( periodFound == string::npos ) {
158 AddError("Invalid version (VN) format: " + version);
162 // invalid if major version is empty or contains non-digits
163 const string majorVersion = version.substr(0, periodFound);
164 if ( majorVersion.empty() || !ContainsOnlyDigits(majorVersion) ) {
165 AddError("Invalid version (VN) format: " + version);
169 // invalid if major version is empty or contains non-digits
170 const string minorVersion = version.substr(periodFound + 1);
171 if ( minorVersion.empty() || !ContainsOnlyDigits(minorVersion) ) {
172 AddError("Invalid version (VN) format: " + version);
176 // TODO: check if version is not just syntactically OK,
177 // but is also a valid SAM version ( 1.0 .. CURRENT )
179 // all checked out this far, then version is OK
183 // assumes non-empty input string
184 bool SamHeaderValidator::SamHeaderValidatorPrivate::ContainsOnlyDigits(const string& s) {
185 const size_t nonDigitPosition = s.find_first_not_of(Constants::SAM_DIGITS);
186 return ( nonDigitPosition == string::npos ) ;
189 bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateSortOrder(void) {
191 const string& sortOrder = m_header.SortOrder;
193 // warn if sort order not present
194 if ( sortOrder.empty() ) {
195 AddWarning("Sort order (SO) missing. Not required, but strongly recommended");
199 // if sort order is valid keyword
200 if ( sortOrder == Constants::SAM_HD_SORTORDER_COORDINATE ||
201 sortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME ||
202 sortOrder == Constants::SAM_HD_SORTORDER_UNSORTED
207 AddError("Invalid sort order (SO): " + sortOrder);
211 bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateGroupOrder(void) {
213 const string& groupOrder = m_header.GroupOrder;
215 // if no group order, no problem, just return OK
216 if ( groupOrder.empty() ) return true;
218 // if group order is valid keyword
219 if ( groupOrder == Constants::SAM_HD_GROUPORDER_NONE ||
220 groupOrder == Constants::SAM_HD_GROUPORDER_QUERY ||
221 groupOrder == Constants::SAM_HD_GROUPORDER_REFERENCE
226 AddError("Invalid group order (GO): " + groupOrder);
230 bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateSequenceDictionary(void) {
232 // TODO: warn/error if no sequences ?
236 // check for unique sequence names
237 isValid &= ContainsUniqueSequenceNames();
239 // iterate over sequences
240 const SamSequenceDictionary& sequences = m_header.Sequences;
241 SamSequenceConstIterator seqIter = sequences.ConstBegin();
242 SamSequenceConstIterator seqEnd = sequences.ConstEnd();
243 for ( ; seqIter != seqEnd; ++seqIter ) {
244 const SamSequence& seq = (*seqIter);
245 isValid &= ValidateSequence(seq);
248 // return validation state
252 bool SamHeaderValidator::SamHeaderValidatorPrivate::ContainsUniqueSequenceNames(void) {
255 set<string> sequenceNames;
256 set<string>::iterator nameIter;
258 // iterate over sequences
259 const SamSequenceDictionary& sequences = m_header.Sequences;
260 SamSequenceConstIterator seqIter = sequences.ConstBegin();
261 SamSequenceConstIterator seqEnd = sequences.ConstEnd();
262 for ( ; seqIter != seqEnd; ++seqIter ) {
263 const SamSequence& seq = (*seqIter);
264 const string& name = seq.Name;
266 // lookup sequence name
267 nameIter = sequenceNames.find(name);
269 // error if found (duplicate entry)
270 if ( nameIter != sequenceNames.end() ) {
271 AddError("Sequence name (SN): " + name + " is not unique");
275 // otherwise ok, store name
276 sequenceNames.insert(name);
279 // return validation state
283 bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateSequence(const SamSequence& seq) {
285 isValid &= CheckNameFormat(seq.Name);
286 isValid &= CheckLengthInRange(seq.Length);
290 bool SamHeaderValidator::SamHeaderValidatorPrivate::CheckNameFormat(const string& name) {
292 // invalid if name is empty
293 if ( name.empty() ) {
294 AddError("Sequence entry (@SQ) is missing SN tag");
298 // invalid if first character is a reserved char
299 const char firstChar = name.at(0);
300 if ( firstChar == Constants::SAM_EQUAL || firstChar == Constants::SAM_STAR ) {
301 AddError("Invalid sequence name (SN): " + name);
308 bool SamHeaderValidator::SamHeaderValidatorPrivate::CheckLengthInRange(const string& length) {
311 if ( length.empty() ) {
312 AddError("Sequence entry (@SQ) is missing LN tag");
316 // convert string length to numeric
317 stringstream lengthStream(length);
318 unsigned int sequenceLength;
319 lengthStream >> sequenceLength;
321 // invalid if length outside accepted range
322 if ( sequenceLength < Constants::SAM_SQ_LENGTH_MIN || sequenceLength > Constants::SAM_SQ_LENGTH_MAX ) {
323 AddError("Sequence length (LN): " + length + " out of range");
331 bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateReadGroupDictionary(void) {
333 // TODO: warn/error if no read groups ?
337 // check for unique read group IDs & platform units
338 isValid &= ContainsUniqueIDsAndPlatformUnits();
340 // iterate over read groups
341 const SamReadGroupDictionary& readGroups = m_header.ReadGroups;
342 SamReadGroupConstIterator rgIter = readGroups.ConstBegin();
343 SamReadGroupConstIterator rgEnd = readGroups.ConstEnd();
344 for ( ; rgIter != rgEnd; ++rgIter ) {
345 const SamReadGroup& rg = (*rgIter);
346 isValid &= ValidateReadGroup(rg);
349 // return validation state
353 bool SamHeaderValidator::SamHeaderValidatorPrivate::ContainsUniqueIDsAndPlatformUnits(void) {
356 set<string> readGroupIds;
357 set<string> platformUnits;
358 set<string>::iterator idIter;
359 set<string>::iterator puIter;
361 // iterate over sequences
362 const SamReadGroupDictionary& readGroups = m_header.ReadGroups;
363 SamReadGroupConstIterator rgIter = readGroups.ConstBegin();
364 SamReadGroupConstIterator rgEnd = readGroups.ConstEnd();
365 for ( ; rgIter != rgEnd; ++rgIter ) {
366 const SamReadGroup& rg = (*rgIter);
368 // --------------------------------
369 // check for unique ID
371 // lookup read group ID
372 const string& id = rg.ID;
373 idIter = readGroupIds.find(id);
375 // error if found (duplicate entry)
376 if ( idIter != readGroupIds.end() ) {
377 AddError("Read group ID (ID): " + id + " is not unique");
381 // otherwise ok, store id
382 readGroupIds.insert(id);
384 // --------------------------------
385 // check for unique platform unit
387 // lookup platform unit
388 const string& pu = rg.PlatformUnit;
389 puIter = platformUnits.find(pu);
391 // error if found (duplicate entry)
392 if ( puIter != platformUnits.end() ) {
393 AddError("Platform unit (PU): " + pu + " is not unique");
397 // otherwise ok, store platform unit
398 platformUnits.insert(pu);
401 // return validation state
405 bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateReadGroup(const SamReadGroup& rg) {
407 isValid &= CheckReadGroupID(rg.ID);
408 isValid &= CheckSequencingTechnology(rg.SequencingTechnology);
412 bool SamHeaderValidator::SamHeaderValidatorPrivate::CheckReadGroupID(const string& id) {
416 AddError("Read group entry (@RG) is missing ID tag");
424 bool SamHeaderValidator::SamHeaderValidatorPrivate::CheckSequencingTechnology(const string& technology) {
426 // if no technology provided, no problem, just return OK
427 if ( technology.empty() ) return true;
429 // if technology is valid keyword
430 if ( Is454(technology) ||
431 IsHelicos(technology) ||
432 IsIllumina(technology) ||
433 IsPacBio(technology) ||
439 AddError("Invalid read group sequencing platform (PL): " + technology);
443 bool SamHeaderValidator::SamHeaderValidatorPrivate::Is454(const string& technology) {
444 return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_454 ||
445 technology == Constants::SAM_RG_SEQTECHNOLOGY_LS454_LOWER ||
446 technology == Constants::SAM_RG_SEQTECHNOLOGY_LS454_UPPER
450 bool SamHeaderValidator::SamHeaderValidatorPrivate::IsHelicos(const string& technology) {
451 return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_HELICOS_LOWER ||
452 technology == Constants::SAM_RG_SEQTECHNOLOGY_HELICOS_UPPER
456 bool SamHeaderValidator::SamHeaderValidatorPrivate::IsIllumina(const string& technology) {
457 return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA_LOWER ||
458 technology == Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA_UPPER
462 bool SamHeaderValidator::SamHeaderValidatorPrivate::IsPacBio(const string& technology) {
463 return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_PACBIO_LOWER ||
464 technology == Constants::SAM_RG_SEQTECHNOLOGY_PACBIO_UPPER
468 bool SamHeaderValidator::SamHeaderValidatorPrivate::IsSolid(const string& technology) {
469 return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_SOLID_LOWER ||
470 technology == Constants::SAM_RG_SEQTECHNOLOGY_SOLID_UPPER
474 bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidateProgramData(void) {
476 isValid &= ContainsUniqueProgramIds();
477 isValid &= ValidatePreviousProgramIds();
481 bool SamHeaderValidator::SamHeaderValidatorPrivate::ContainsUniqueProgramIds(void) {
483 // TODO: once we have ability to handle multiple @PG entries,
484 // check here for duplicate ID's
485 // but for now, just return true
489 bool SamHeaderValidator::SamHeaderValidatorPrivate::ValidatePreviousProgramIds(void) {
491 // TODO: check that PP entries are valid later, after we get multiple @PG-entry handling
492 // just return true for now
495 void SamHeaderValidator::SamHeaderValidatorPrivate::AddError(const string& message) {
496 m_errorMessages.push_back(ERROR_PREFIX + message + NEWLINE);
499 void SamHeaderValidator::SamHeaderValidatorPrivate::AddWarning(const string& message) {
500 m_warningMessages.push_back(WARN_PREFIX + message + NEWLINE);
503 void SamHeaderValidator::SamHeaderValidatorPrivate::PrintErrorMessages(void) {
505 // skip if no error messages
506 if ( m_errorMessages.empty() ) return;
508 // print error header line
509 cerr << "* SAM header has " << m_errorMessages.size() << " errors:" << endl;
511 // print each error message
512 vector<string>::const_iterator errorIter = m_errorMessages.begin();
513 vector<string>::const_iterator errorEnd = m_errorMessages.end();
514 for ( ; errorIter != errorEnd; ++errorIter )
515 cerr << (*errorIter);
518 void SamHeaderValidator::SamHeaderValidatorPrivate::PrintWarningMessages(void) {
520 // skip if no warning messages
521 if ( m_warningMessages.empty() ) return;
523 // print warning header line
524 cerr << "* SAM header has " << m_warningMessages.size() << " warnings:" << endl;
526 // print each warning message
527 vector<string>::const_iterator warnIter = m_warningMessages.begin();
528 vector<string>::const_iterator warnEnd = m_warningMessages.end();
529 for ( ; warnIter != warnEnd; ++warnIter )
533 // -----------------------------------
534 // SamHeaderValidator implementation
536 SamHeaderValidator::SamHeaderValidator(const BamTools::SamHeader& header)
537 : d( new SamHeaderValidatorPrivate(header) )
540 SamHeaderValidator::~SamHeaderValidator(void) {
545 bool SamHeaderValidator::Validate(bool verbose) { return d->Validate(verbose); }