1 // ***************************************************************************
2 // SamHeaderValidator.cpp (c) 2010 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 18 April 2011 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides functionality for validating SamHeader data
8 // ***************************************************************************
10 #include <api/SamConstants.h>
11 #include <api/SamHeader.h>
12 #include <api/internal/SamHeaderValidator_p.h>
13 #include <api/internal/SamHeaderVersion_p.h>
14 using namespace BamTools;
15 using namespace BamTools::Internal;
// Returns true if 'lhs' and 'rhs' have the same length and every pair of
// corresponding characters is equal after conversion with toupper();
// returns false otherwise.
bool caseInsensitiveCompare(const std::string& lhs, const std::string& rhs) {

    // can omit checking chars if lengths not equal
    // (compared as size_type, so very long strings are not narrowed to int)
    const std::string::size_type length = lhs.length();
    if ( length != rhs.length() )
        return false;

    // do *basic* toupper checks on each pair of chars
    //
    // N.B. - each char is converted to unsigned char first: passing a
    //        negative value (other than EOF) to toupper() is undefined
    //        behavior, which a plain (int) cast permits for bytes >= 0x80
    for ( std::string::size_type i = 0; i < length; ++i ) {
        const int lhsUpper = toupper( static_cast<unsigned char>(lhs[i]) );
        const int rhsUpper = toupper( static_cast<unsigned char>(rhs[i]) );
        if ( lhsUpper != rhsUpper )
            return false;
    }

    // otherwise OK
    return true;
}
44 } // namespace Internal
45 } // namespace BamTools
47 // ------------------------------------------------------------------------
48 // Allow validation rules to vary, as needed, between SAM header versions
50 // use SAM_VERSION_X_Y to tag important changes
// Together, they will allow for comparisons like:
// if ( m_version < SAM_VERSION_2_0 ) {
//     // use some older rule
// } else {
//     // use rule introduced with version 2.0
// }
58 static const SamHeaderVersion SAM_VERSION_1_0 = SamHeaderVersion(1,0);
59 static const SamHeaderVersion SAM_VERSION_1_1 = SamHeaderVersion(1,1);
60 static const SamHeaderVersion SAM_VERSION_1_2 = SamHeaderVersion(1,2);
61 static const SamHeaderVersion SAM_VERSION_1_3 = SamHeaderVersion(1,3);
62 static const SamHeaderVersion SAM_VERSION_1_4 = SamHeaderVersion(1,4);
64 // TODO: This functionality is currently unused.
65 // Make validation "version-aware."
67 // ------------------------------------------------------------------------
69 const string SamHeaderValidator::ERROR_PREFIX = "ERROR: ";
70 const string SamHeaderValidator::WARN_PREFIX = "WARNING: ";
71 const string SamHeaderValidator::NEWLINE = "\n";
73 SamHeaderValidator::SamHeaderValidator(const SamHeader& header)
77 SamHeaderValidator::~SamHeaderValidator(void) { }
79 bool SamHeaderValidator::Validate(bool verbose) {
81 // validate header components
83 isValid &= ValidateMetadata();
84 isValid &= ValidateSequenceDictionary();
85 isValid &= ValidateReadGroupDictionary();
86 isValid &= ValidateProgramChain();
88 // report errors if desired
91 PrintWarningMessages();
94 // return validation status
98 bool SamHeaderValidator::ValidateMetadata(void) {
100 isValid &= ValidateVersion();
101 isValid &= ValidateSortOrder();
102 isValid &= ValidateGroupOrder();
106 bool SamHeaderValidator::ValidateVersion(void) {
108 const string& version = m_header.Version;
110 // warn if version not present
111 if ( version.empty() ) {
112 AddWarning("Version (VN) missing. Not required, but strongly recommended");
116 // invalid if version does not contain a period
117 const size_t periodFound = version.find(Constants::SAM_PERIOD);
118 if ( periodFound == string::npos ) {
119 AddError("Invalid version (VN) format: " + version);
123 // invalid if major version is empty or contains non-digits
124 const string majorVersion = version.substr(0, periodFound);
125 if ( majorVersion.empty() || !ContainsOnlyDigits(majorVersion) ) {
126 AddError("Invalid version (VN) format: " + version);
130 // invalid if major version is empty or contains non-digits
131 const string minorVersion = version.substr(periodFound + 1);
132 if ( minorVersion.empty() || !ContainsOnlyDigits(minorVersion) ) {
133 AddError("Invalid version (VN) format: " + version);
137 // TODO: check if version is not just syntactically OK,
138 // but is also a valid SAM version ( 1.0 .. CURRENT )
140 // all checked out this far, then version is OK
144 // assumes non-empty input string
145 bool SamHeaderValidator::ContainsOnlyDigits(const string& s) {
146 const size_t nonDigitPosition = s.find_first_not_of(Constants::SAM_DIGITS);
147 return ( nonDigitPosition == string::npos ) ;
150 bool SamHeaderValidator::ValidateSortOrder(void) {
152 const string& sortOrder = m_header.SortOrder;
154 // warn if sort order not present
155 if ( sortOrder.empty() ) {
156 AddWarning("Sort order (SO) missing. Not required, but strongly recommended");
160 // if sort order is valid keyword
161 if ( sortOrder == Constants::SAM_HD_SORTORDER_COORDINATE ||
162 sortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME ||
163 sortOrder == Constants::SAM_HD_SORTORDER_UNSORTED
170 AddError("Invalid sort order (SO): " + sortOrder);
174 bool SamHeaderValidator::ValidateGroupOrder(void) {
176 const string& groupOrder = m_header.GroupOrder;
178 // if no group order, no problem, just return OK
179 if ( groupOrder.empty() )
182 // if group order is valid keyword
183 if ( groupOrder == Constants::SAM_HD_GROUPORDER_NONE ||
184 groupOrder == Constants::SAM_HD_GROUPORDER_QUERY ||
185 groupOrder == Constants::SAM_HD_GROUPORDER_REFERENCE
192 AddError("Invalid group order (GO): " + groupOrder);
196 bool SamHeaderValidator::ValidateSequenceDictionary(void) {
200 // check for unique sequence names
201 isValid &= ContainsUniqueSequenceNames();
203 // iterate over sequences
204 const SamSequenceDictionary& sequences = m_header.Sequences;
205 SamSequenceConstIterator seqIter = sequences.ConstBegin();
206 SamSequenceConstIterator seqEnd = sequences.ConstEnd();
207 for ( ; seqIter != seqEnd; ++seqIter ) {
208 const SamSequence& seq = (*seqIter);
209 isValid &= ValidateSequence(seq);
212 // return validation state
216 bool SamHeaderValidator::ContainsUniqueSequenceNames(void) {
219 set<string> sequenceNames;
220 set<string>::iterator nameIter;
222 // iterate over sequences
223 const SamSequenceDictionary& sequences = m_header.Sequences;
224 SamSequenceConstIterator seqIter = sequences.ConstBegin();
225 SamSequenceConstIterator seqEnd = sequences.ConstEnd();
226 for ( ; seqIter != seqEnd; ++seqIter ) {
227 const SamSequence& seq = (*seqIter);
229 // lookup sequence name
230 const string& name = seq.Name;
231 nameIter = sequenceNames.find(name);
233 // error if found (duplicate entry)
234 if ( nameIter != sequenceNames.end() ) {
235 AddError("Sequence name (SN): " + name + " is not unique");
239 // otherwise ok, store name
240 sequenceNames.insert(name);
243 // return validation state
247 bool SamHeaderValidator::ValidateSequence(const SamSequence& seq) {
249 isValid &= CheckNameFormat(seq.Name);
250 isValid &= CheckLengthInRange(seq.Length);
254 bool SamHeaderValidator::CheckNameFormat(const string& name) {
256 // invalid if name is empty
257 if ( name.empty() ) {
258 AddError("Sequence entry (@SQ) is missing SN tag");
262 // invalid if first character is a reserved char
263 const char firstChar = name.at(0);
264 if ( firstChar == Constants::SAM_EQUAL || firstChar == Constants::SAM_STAR ) {
265 AddError("Invalid sequence name (SN): " + name);
272 bool SamHeaderValidator::CheckLengthInRange(const string& length) {
275 if ( length.empty() ) {
276 AddError("Sequence entry (@SQ) is missing LN tag");
280 // convert string length to numeric
281 stringstream lengthStream(length);
282 unsigned int sequenceLength;
283 lengthStream >> sequenceLength;
285 // invalid if length outside accepted range
286 if ( sequenceLength < Constants::SAM_SQ_LENGTH_MIN || sequenceLength > Constants::SAM_SQ_LENGTH_MAX ) {
287 AddError("Sequence length (LN): " + length + " out of range");
295 bool SamHeaderValidator::ValidateReadGroupDictionary(void) {
299 // check for unique read group IDs & platform units
300 isValid &= ContainsUniqueIDsAndPlatformUnits();
302 // iterate over read groups
303 const SamReadGroupDictionary& readGroups = m_header.ReadGroups;
304 SamReadGroupConstIterator rgIter = readGroups.ConstBegin();
305 SamReadGroupConstIterator rgEnd = readGroups.ConstEnd();
306 for ( ; rgIter != rgEnd; ++rgIter ) {
307 const SamReadGroup& rg = (*rgIter);
308 isValid &= ValidateReadGroup(rg);
311 // return validation state
315 bool SamHeaderValidator::ContainsUniqueIDsAndPlatformUnits(void) {
318 set<string> readGroupIds;
319 set<string> platformUnits;
320 set<string>::iterator idIter;
321 set<string>::iterator puIter;
323 // iterate over sequences
324 const SamReadGroupDictionary& readGroups = m_header.ReadGroups;
325 SamReadGroupConstIterator rgIter = readGroups.ConstBegin();
326 SamReadGroupConstIterator rgEnd = readGroups.ConstEnd();
327 for ( ; rgIter != rgEnd; ++rgIter ) {
328 const SamReadGroup& rg = (*rgIter);
330 // --------------------------------
331 // check for unique ID
333 // lookup read group ID
334 const string& id = rg.ID;
335 idIter = readGroupIds.find(id);
337 // error if found (duplicate entry)
338 if ( idIter != readGroupIds.end() ) {
339 AddError("Read group ID (ID): " + id + " is not unique");
343 // otherwise ok, store id
344 readGroupIds.insert(id);
346 // --------------------------------
347 // check for unique platform unit
349 // lookup platform unit
350 const string& pu = rg.PlatformUnit;
351 puIter = platformUnits.find(pu);
353 // error if found (duplicate entry)
354 if ( puIter != platformUnits.end() ) {
355 AddError("Platform unit (PU): " + pu + " is not unique");
359 // otherwise ok, store platform unit
360 platformUnits.insert(pu);
363 // return validation state
367 bool SamHeaderValidator::ValidateReadGroup(const SamReadGroup& rg) {
369 isValid &= CheckReadGroupID(rg.ID);
370 isValid &= CheckSequencingTechnology(rg.SequencingTechnology);
374 bool SamHeaderValidator::CheckReadGroupID(const string& id) {
378 AddError("Read group entry (@RG) is missing ID tag");
386 bool SamHeaderValidator::CheckSequencingTechnology(const string& technology) {
388 // if no technology provided, no problem, just return OK
389 if ( technology.empty() )
392 // if technology is valid keyword
393 if ( caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_CAPILLARY) ||
394 caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_HELICOS) ||
395 caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA) ||
396 caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_IONTORRENT) ||
397 caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_LS454) ||
398 caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_PACBIO) ||
399 caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_SOLID)
406 AddError("Invalid read group sequencing platform (PL): " + technology);
410 bool SamHeaderValidator::ValidateProgramChain(void) {
412 isValid &= ContainsUniqueProgramIds();
413 isValid &= ValidatePreviousProgramIds();
417 bool SamHeaderValidator::ContainsUniqueProgramIds(void) {
420 set<string> programIds;
421 set<string>::iterator pgIdIter;
423 // iterate over program records
424 const SamProgramChain& programs = m_header.Programs;
425 SamProgramConstIterator pgIter = programs.ConstBegin();
426 SamProgramConstIterator pgEnd = programs.ConstEnd();
427 for ( ; pgIter != pgEnd; ++pgIter ) {
428 const SamProgram& pg = (*pgIter);
431 const string& pgId = pg.ID;
432 pgIdIter = programIds.find(pgId);
434 // error if found (duplicate entry)
435 if ( pgIdIter != programIds.end() ) {
436 AddError("Program ID (ID): " + pgId + " is not unique");
440 // otherwise ok, store ID
441 programIds.insert(pgId);
444 // return validation state
448 bool SamHeaderValidator::ValidatePreviousProgramIds(void) {
452 // iterate over program records
453 const SamProgramChain& programs = m_header.Programs;
454 SamProgramConstIterator pgIter = programs.ConstBegin();
455 SamProgramConstIterator pgEnd = programs.ConstEnd();
456 for ( ; pgIter != pgEnd; ++pgIter ) {
457 const SamProgram& pg = (*pgIter);
459 // ignore record for validation if PreviousProgramID is empty
460 const string& ppId = pg.PreviousProgramID;
464 // see if program "chain" contains an entry for ppId
465 if ( !programs.Contains(ppId) ) {
466 AddError("PreviousProgramID (PP): " + ppId + " is not a known ID");
471 // return validation state
474 void SamHeaderValidator::AddError(const string& message) {
475 m_errorMessages.push_back(ERROR_PREFIX + message + NEWLINE);
478 void SamHeaderValidator::AddWarning(const string& message) {
479 m_warningMessages.push_back(WARN_PREFIX + message + NEWLINE);
482 void SamHeaderValidator::PrintErrorMessages(void) {
484 // skip if no error messages
485 if ( m_errorMessages.empty() ) return;
487 // print error header line
488 cerr << "* SAM header has " << m_errorMessages.size() << " errors:" << endl;
490 // print each error message
491 vector<string>::const_iterator errorIter = m_errorMessages.begin();
492 vector<string>::const_iterator errorEnd = m_errorMessages.end();
493 for ( ; errorIter != errorEnd; ++errorIter )
494 cerr << (*errorIter);
497 void SamHeaderValidator::PrintWarningMessages(void) {
499 // skip if no warning messages
500 if ( m_warningMessages.empty() ) return;
502 // print warning header line
503 cerr << "* SAM header has " << m_warningMessages.size() << " warnings:" << endl;
505 // print each warning message
506 vector<string>::const_iterator warnIter = m_warningMessages.begin();
507 vector<string>::const_iterator warnEnd = m_warningMessages.end();
508 for ( ; warnIter != warnEnd; ++warnIter )