1 // ***************************************************************************
2 // SamHeaderValidator.cpp (c) 2010 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 25 October 2011 (DB)
6 // ---------------------------------------------------------------------------
7 // Provides functionality for validating SamHeader data
8 // ***************************************************************************
10 #include "api/SamConstants.h"
11 #include "api/SamHeader.h"
12 #include "api/internal/sam/SamHeaderValidator_p.h"
13 #include "api/internal/sam/SamHeaderVersion_p.h"
14 using namespace BamTools;
15 using namespace BamTools::Internal;
22 // ------------------------
23 // static utility methods
24 // -------------------------
27 bool caseInsensitiveCompare(const string& lhs, const string& rhs) {
29 // can omit checking chars if lengths not equal
30 const int lhsLength = lhs.length();
31 const int rhsLength = rhs.length();
32 if ( lhsLength != rhsLength )
35 // do *basic* toupper checks on each string char's
36 for ( int i = 0; i < lhsLength; ++i ) {
37 if ( toupper( (int)lhs.at(i)) != toupper( (int)rhs.at(i)) )
45 // ------------------------------------------------------------------------
46 // Allow validation rules to vary, as needed, between SAM header versions
48 // use SAM_VERSION_X_Y to tag important changes
50 // Together, they will allow for comparisons like:
51 // if ( m_version < SAM_VERSION_2_0 ) {
52 // // use some older rule
54 // // use rule introduced with version 2.0
56 static const SamHeaderVersion SAM_VERSION_1_0 = SamHeaderVersion(1,0);
57 static const SamHeaderVersion SAM_VERSION_1_1 = SamHeaderVersion(1,1);
58 static const SamHeaderVersion SAM_VERSION_1_2 = SamHeaderVersion(1,2);
59 static const SamHeaderVersion SAM_VERSION_1_3 = SamHeaderVersion(1,3);
60 static const SamHeaderVersion SAM_VERSION_1_4 = SamHeaderVersion(1,4);
62 // TODO: This functionality is currently unused.
63 // Make validation "version-aware."
65 // ------------------------------------------------------------------------
67 const string SamHeaderValidator::ERROR_PREFIX = "ERROR: ";
68 const string SamHeaderValidator::WARN_PREFIX = "WARNING: ";
69 const string SamHeaderValidator::NEWLINE = "\n";
71 SamHeaderValidator::SamHeaderValidator(const SamHeader& header)
75 SamHeaderValidator::~SamHeaderValidator(void) { }
77 void SamHeaderValidator::AddError(const string& message) {
78 m_errorMessages.push_back(ERROR_PREFIX + message + NEWLINE);
81 void SamHeaderValidator::AddWarning(const string& message) {
82 m_warningMessages.push_back(WARN_PREFIX + message + NEWLINE);
85 void SamHeaderValidator::PrintErrorMessages(ostream& stream) {
87 // skip if no error messages
88 if ( m_errorMessages.empty() )
91 // print error header line
92 stream << "* SAM header has " << m_errorMessages.size() << " errors:" << endl;
94 // print each error message
95 vector<string>::const_iterator errorIter = m_errorMessages.begin();
96 vector<string>::const_iterator errorEnd = m_errorMessages.end();
97 for ( ; errorIter != errorEnd; ++errorIter )
98 stream << (*errorIter);
101 void SamHeaderValidator::PrintMessages(ostream& stream) {
102 PrintErrorMessages(stream);
103 PrintWarningMessages(stream);
106 void SamHeaderValidator::PrintWarningMessages(ostream& stream) {
108 // skip if no warning messages
109 if ( m_warningMessages.empty() )
112 // print warning header line
113 stream << "* SAM header has " << m_warningMessages.size() << " warnings:" << endl;
115 // print each warning message
116 vector<string>::const_iterator warnIter = m_warningMessages.begin();
117 vector<string>::const_iterator warnEnd = m_warningMessages.end();
118 for ( ; warnIter != warnEnd; ++warnIter )
119 stream << (*warnIter);
122 // entry point for validation
123 bool SamHeaderValidator::Validate(void) {
125 isValid &= ValidateMetadata();
126 isValid &= ValidateSequenceDictionary();
127 isValid &= ValidateReadGroupDictionary();
128 isValid &= ValidateProgramChain();
132 // check all SAM header 'metadata'
133 bool SamHeaderValidator::ValidateMetadata(void) {
135 isValid &= ValidateVersion();
136 isValid &= ValidateSortOrder();
137 isValid &= ValidateGroupOrder();
141 // check SAM header version tag
142 bool SamHeaderValidator::ValidateVersion(void) {
144 const string& version = m_header.Version;
146 // warn if version not present
147 if ( version.empty() ) {
148 AddWarning("Version (VN) missing. Not required, but strongly recommended");
152 // invalid if version does not contain a period
153 const size_t periodFound = version.find(Constants::SAM_PERIOD);
154 if ( periodFound == string::npos ) {
155 AddError("Invalid version (VN) format: " + version);
159 // invalid if major version is empty or contains non-digits
160 const string majorVersion = version.substr(0, periodFound);
161 if ( majorVersion.empty() || !ContainsOnlyDigits(majorVersion) ) {
162 AddError("Invalid version (VN) format: " + version);
166 // invalid if major version is empty or contains non-digits
167 const string minorVersion = version.substr(periodFound + 1);
168 if ( minorVersion.empty() || !ContainsOnlyDigits(minorVersion) ) {
169 AddError("Invalid version (VN) format: " + version);
173 // TODO: check if version is not just syntactically OK,
174 // but is also a valid SAM version ( 1.0 .. CURRENT )
176 // all checked out this far, then version is OK
180 // assumes non-empty input string
181 bool SamHeaderValidator::ContainsOnlyDigits(const string& s) {
182 const size_t nonDigitPosition = s.find_first_not_of(Constants::SAM_DIGITS);
183 return ( nonDigitPosition == string::npos ) ;
186 // validate SAM header sort order tag
187 bool SamHeaderValidator::ValidateSortOrder(void) {
189 const string& sortOrder = m_header.SortOrder;
191 // warn if sort order not present
192 if ( sortOrder.empty() ) {
193 AddWarning("Sort order (SO) missing. Not required, but strongly recommended");
197 // if sort order is valid keyword
198 if ( sortOrder == Constants::SAM_HD_SORTORDER_COORDINATE ||
199 sortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME ||
200 sortOrder == Constants::SAM_HD_SORTORDER_UNSORTED
207 AddError("Invalid sort order (SO): " + sortOrder);
211 // validate SAM header group order tag
212 bool SamHeaderValidator::ValidateGroupOrder(void) {
214 const string& groupOrder = m_header.GroupOrder;
216 // if no group order, no problem, just return OK
217 if ( groupOrder.empty() )
220 // if group order is valid keyword
221 if ( groupOrder == Constants::SAM_HD_GROUPORDER_NONE ||
222 groupOrder == Constants::SAM_HD_GROUPORDER_QUERY ||
223 groupOrder == Constants::SAM_HD_GROUPORDER_REFERENCE
230 AddError("Invalid group order (GO): " + groupOrder);
234 // validate SAM header sequence dictionary
235 bool SamHeaderValidator::ValidateSequenceDictionary(void) {
239 // check for unique sequence names
240 isValid &= ContainsUniqueSequenceNames();
242 // iterate over sequences
243 const SamSequenceDictionary& sequences = m_header.Sequences;
244 SamSequenceConstIterator seqIter = sequences.ConstBegin();
245 SamSequenceConstIterator seqEnd = sequences.ConstEnd();
246 for ( ; seqIter != seqEnd; ++seqIter ) {
247 const SamSequence& seq = (*seqIter);
248 isValid &= ValidateSequence(seq);
251 // return validation state
255 // make sure all SQ names are unique
256 bool SamHeaderValidator::ContainsUniqueSequenceNames(void) {
259 set<string> sequenceNames;
260 set<string>::iterator nameIter;
262 // iterate over sequences
263 const SamSequenceDictionary& sequences = m_header.Sequences;
264 SamSequenceConstIterator seqIter = sequences.ConstBegin();
265 SamSequenceConstIterator seqEnd = sequences.ConstEnd();
266 for ( ; seqIter != seqEnd; ++seqIter ) {
267 const SamSequence& seq = (*seqIter);
269 // lookup sequence name
270 const string& name = seq.Name;
271 nameIter = sequenceNames.find(name);
273 // error if found (duplicate entry)
274 if ( nameIter != sequenceNames.end() ) {
275 AddError("Sequence name (SN): " + name + " is not unique");
279 // otherwise ok, store name
280 sequenceNames.insert(name);
283 // return validation state
287 // validate SAM header sequence entry
288 bool SamHeaderValidator::ValidateSequence(const SamSequence& seq) {
290 isValid &= CheckNameFormat(seq.Name);
291 isValid &= CheckLengthInRange(seq.Length);
295 // check sequence name is valid format
296 bool SamHeaderValidator::CheckNameFormat(const string& name) {
298 // invalid if name is empty
299 if ( name.empty() ) {
300 AddError("Sequence entry (@SQ) is missing SN tag");
304 // invalid if first character is a reserved char
305 const char firstChar = name.at(0);
306 if ( firstChar == Constants::SAM_EQUAL || firstChar == Constants::SAM_STAR ) {
307 AddError("Invalid sequence name (SN): " + name);
314 // check that sequence length is within accepted range
315 bool SamHeaderValidator::CheckLengthInRange(const string& length) {
318 if ( length.empty() ) {
319 AddError("Sequence entry (@SQ) is missing LN tag");
323 // convert string length to numeric
324 stringstream lengthStream(length);
325 unsigned int sequenceLength;
326 lengthStream >> sequenceLength;
328 // invalid if length outside accepted range
329 if ( sequenceLength < Constants::SAM_SQ_LENGTH_MIN || sequenceLength > Constants::SAM_SQ_LENGTH_MAX ) {
330 AddError("Sequence length (LN): " + length + " out of range");
338 // validate SAM header read group dictionary
339 bool SamHeaderValidator::ValidateReadGroupDictionary(void) {
343 // check for unique read group IDs & platform units
344 isValid &= ContainsUniqueIDsAndPlatformUnits();
346 // iterate over read groups
347 const SamReadGroupDictionary& readGroups = m_header.ReadGroups;
348 SamReadGroupConstIterator rgIter = readGroups.ConstBegin();
349 SamReadGroupConstIterator rgEnd = readGroups.ConstEnd();
350 for ( ; rgIter != rgEnd; ++rgIter ) {
351 const SamReadGroup& rg = (*rgIter);
352 isValid &= ValidateReadGroup(rg);
355 // return validation state
359 // make sure RG IDs and platform units are unique
360 bool SamHeaderValidator::ContainsUniqueIDsAndPlatformUnits(void) {
363 set<string> readGroupIds;
364 set<string> platformUnits;
365 set<string>::iterator idIter;
366 set<string>::iterator puIter;
368 // iterate over sequences
369 const SamReadGroupDictionary& readGroups = m_header.ReadGroups;
370 SamReadGroupConstIterator rgIter = readGroups.ConstBegin();
371 SamReadGroupConstIterator rgEnd = readGroups.ConstEnd();
372 for ( ; rgIter != rgEnd; ++rgIter ) {
373 const SamReadGroup& rg = (*rgIter);
375 // --------------------------------
376 // check for unique ID
378 // lookup read group ID
379 const string& id = rg.ID;
380 idIter = readGroupIds.find(id);
382 // error if found (duplicate entry)
383 if ( idIter != readGroupIds.end() ) {
384 AddError("Read group ID (ID): " + id + " is not unique");
388 // otherwise ok, store id
389 readGroupIds.insert(id);
391 // --------------------------------
392 // check for unique platform unit
394 // lookup platform unit
395 const string& pu = rg.PlatformUnit;
396 puIter = platformUnits.find(pu);
398 // error if found (duplicate entry)
399 if ( puIter != platformUnits.end() ) {
400 AddError("Platform unit (PU): " + pu + " is not unique");
404 // otherwise ok, store platform unit
405 platformUnits.insert(pu);
408 // return validation state
412 // validate SAM header read group entry
413 bool SamHeaderValidator::ValidateReadGroup(const SamReadGroup& rg) {
415 isValid &= CheckReadGroupID(rg.ID);
416 isValid &= CheckSequencingTechnology(rg.SequencingTechnology);
420 // make sure RG ID exists
421 bool SamHeaderValidator::CheckReadGroupID(const string& id) {
425 AddError("Read group entry (@RG) is missing ID tag");
433 // make sure RG sequencing tech is one of the accepted keywords
434 bool SamHeaderValidator::CheckSequencingTechnology(const string& technology) {
436 // if no technology provided, no problem, just return OK
437 if ( technology.empty() )
440 // if technology is valid keyword
441 if ( caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_CAPILLARY) ||
442 caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_HELICOS) ||
443 caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA) ||
444 caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_IONTORRENT) ||
445 caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_LS454) ||
446 caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_PACBIO) ||
447 caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_SOLID)
454 AddError("Invalid read group sequencing platform (PL): " + technology);
458 // validate the SAM header "program chain"
459 bool SamHeaderValidator::ValidateProgramChain(void) {
461 isValid &= ContainsUniqueProgramIds();
462 isValid &= ValidatePreviousProgramIds();
466 // make sure all PG IDs are unique
467 bool SamHeaderValidator::ContainsUniqueProgramIds(void) {
470 set<string> programIds;
471 set<string>::iterator pgIdIter;
473 // iterate over program records
474 const SamProgramChain& programs = m_header.Programs;
475 SamProgramConstIterator pgIter = programs.ConstBegin();
476 SamProgramConstIterator pgEnd = programs.ConstEnd();
477 for ( ; pgIter != pgEnd; ++pgIter ) {
478 const SamProgram& pg = (*pgIter);
481 const string& pgId = pg.ID;
482 pgIdIter = programIds.find(pgId);
484 // error if found (duplicate entry)
485 if ( pgIdIter != programIds.end() ) {
486 AddError("Program ID (ID): " + pgId + " is not unique");
490 // otherwise ok, store ID
491 programIds.insert(pgId);
494 // return validation state
498 // make sure that any PP tags present point to existing @PG IDs
499 bool SamHeaderValidator::ValidatePreviousProgramIds(void) {
503 // iterate over program records
504 const SamProgramChain& programs = m_header.Programs;
505 SamProgramConstIterator pgIter = programs.ConstBegin();
506 SamProgramConstIterator pgEnd = programs.ConstEnd();
507 for ( ; pgIter != pgEnd; ++pgIter ) {
508 const SamProgram& pg = (*pgIter);
510 // ignore record for validation if PreviousProgramID is empty
511 const string& ppId = pg.PreviousProgramID;
515 // see if program "chain" contains an entry for ppId
516 if ( !programs.Contains(ppId) ) {
517 AddError("PreviousProgramID (PP): " + ppId + " is not a known ID");
522 // return validation state