1 // ***************************************************************************
2 // SamHeaderValidator.cpp (c) 2010 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 18 April 2011 (DB)
7 // ---------------------------------------------------------------------------
8 // Provides functionality for validating SamHeader data
9 // ***************************************************************************
11 #include <api/SamConstants.h>
12 #include <api/SamHeader.h>
13 #include <api/internal/SamHeaderValidator_p.h>
14 #include <api/internal/SamHeaderVersion_p.h>
15 using namespace BamTools;
16 using namespace BamTools::Internal;
27 bool caseInsensitiveCompare(const string& lhs, const string& rhs) {
29 // can omit checking chars if lengths not equal
30 const int lhsLength = lhs.length();
31 const int rhsLength = rhs.length();
32 if ( lhsLength != rhsLength )
35 // do *basic* toupper checks on each string char's
36 for ( int i = 0; i < lhsLength; ++i ) {
37 if ( toupper( (int)lhs.at(i)) != toupper( (int)rhs.at(i)) )
45 } // namespace Internal
46 } // namespace BamTools
48 // ------------------------------------------------------------------------
49 // Allow validation rules to vary, as needed, between SAM header versions
51 // use SAM_VERSION_X_Y to tag important changes
53 // Together, they will allow for comparisons like:
54 // if ( m_version < SAM_VERSION_2_0 ) {
55 // // use some older rule
57 // // use rule introduced with version 2.0
59 static const SamHeaderVersion SAM_VERSION_1_0 = SamHeaderVersion(1,0);
60 static const SamHeaderVersion SAM_VERSION_1_1 = SamHeaderVersion(1,1);
61 static const SamHeaderVersion SAM_VERSION_1_2 = SamHeaderVersion(1,2);
62 static const SamHeaderVersion SAM_VERSION_1_3 = SamHeaderVersion(1,3);
63 static const SamHeaderVersion SAM_VERSION_1_4 = SamHeaderVersion(1,4);
65 // TODO: This functionality is currently unused.
66 // Make validation "version-aware."
68 // ------------------------------------------------------------------------
70 const string SamHeaderValidator::ERROR_PREFIX = "ERROR: ";
71 const string SamHeaderValidator::WARN_PREFIX = "WARNING: ";
72 const string SamHeaderValidator::NEWLINE = "\n";
74 SamHeaderValidator::SamHeaderValidator(const SamHeader& header)
78 SamHeaderValidator::~SamHeaderValidator(void) { }
80 bool SamHeaderValidator::Validate(bool verbose) {
82 // validate header components
84 isValid &= ValidateMetadata();
85 isValid &= ValidateSequenceDictionary();
86 isValid &= ValidateReadGroupDictionary();
87 isValid &= ValidateProgramChain();
89 // report errors if desired
92 PrintWarningMessages();
95 // return validation status
99 bool SamHeaderValidator::ValidateMetadata(void) {
101 isValid &= ValidateVersion();
102 isValid &= ValidateSortOrder();
103 isValid &= ValidateGroupOrder();
107 bool SamHeaderValidator::ValidateVersion(void) {
109 const string& version = m_header.Version;
111 // warn if version not present
112 if ( version.empty() ) {
113 AddWarning("Version (VN) missing. Not required, but strongly recommended");
117 // invalid if version does not contain a period
118 const size_t periodFound = version.find(Constants::SAM_PERIOD);
119 if ( periodFound == string::npos ) {
120 AddError("Invalid version (VN) format: " + version);
124 // invalid if major version is empty or contains non-digits
125 const string majorVersion = version.substr(0, periodFound);
126 if ( majorVersion.empty() || !ContainsOnlyDigits(majorVersion) ) {
127 AddError("Invalid version (VN) format: " + version);
131 // invalid if major version is empty or contains non-digits
132 const string minorVersion = version.substr(periodFound + 1);
133 if ( minorVersion.empty() || !ContainsOnlyDigits(minorVersion) ) {
134 AddError("Invalid version (VN) format: " + version);
138 // TODO: check if version is not just syntactically OK,
139 // but is also a valid SAM version ( 1.0 .. CURRENT )
141 // all checked out this far, then version is OK
145 // assumes non-empty input string
146 bool SamHeaderValidator::ContainsOnlyDigits(const string& s) {
147 const size_t nonDigitPosition = s.find_first_not_of(Constants::SAM_DIGITS);
148 return ( nonDigitPosition == string::npos ) ;
151 bool SamHeaderValidator::ValidateSortOrder(void) {
153 const string& sortOrder = m_header.SortOrder;
155 // warn if sort order not present
156 if ( sortOrder.empty() ) {
157 AddWarning("Sort order (SO) missing. Not required, but strongly recommended");
161 // if sort order is valid keyword
162 if ( sortOrder == Constants::SAM_HD_SORTORDER_COORDINATE ||
163 sortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME ||
164 sortOrder == Constants::SAM_HD_SORTORDER_UNSORTED
171 AddError("Invalid sort order (SO): " + sortOrder);
175 bool SamHeaderValidator::ValidateGroupOrder(void) {
177 const string& groupOrder = m_header.GroupOrder;
179 // if no group order, no problem, just return OK
180 if ( groupOrder.empty() )
183 // if group order is valid keyword
184 if ( groupOrder == Constants::SAM_HD_GROUPORDER_NONE ||
185 groupOrder == Constants::SAM_HD_GROUPORDER_QUERY ||
186 groupOrder == Constants::SAM_HD_GROUPORDER_REFERENCE
193 AddError("Invalid group order (GO): " + groupOrder);
197 bool SamHeaderValidator::ValidateSequenceDictionary(void) {
201 // check for unique sequence names
202 isValid &= ContainsUniqueSequenceNames();
204 // iterate over sequences
205 const SamSequenceDictionary& sequences = m_header.Sequences;
206 SamSequenceConstIterator seqIter = sequences.ConstBegin();
207 SamSequenceConstIterator seqEnd = sequences.ConstEnd();
208 for ( ; seqIter != seqEnd; ++seqIter ) {
209 const SamSequence& seq = (*seqIter);
210 isValid &= ValidateSequence(seq);
213 // return validation state
217 bool SamHeaderValidator::ContainsUniqueSequenceNames(void) {
220 set<string> sequenceNames;
221 set<string>::iterator nameIter;
223 // iterate over sequences
224 const SamSequenceDictionary& sequences = m_header.Sequences;
225 SamSequenceConstIterator seqIter = sequences.ConstBegin();
226 SamSequenceConstIterator seqEnd = sequences.ConstEnd();
227 for ( ; seqIter != seqEnd; ++seqIter ) {
228 const SamSequence& seq = (*seqIter);
230 // lookup sequence name
231 const string& name = seq.Name;
232 nameIter = sequenceNames.find(name);
234 // error if found (duplicate entry)
235 if ( nameIter != sequenceNames.end() ) {
236 AddError("Sequence name (SN): " + name + " is not unique");
240 // otherwise ok, store name
241 sequenceNames.insert(name);
244 // return validation state
248 bool SamHeaderValidator::ValidateSequence(const SamSequence& seq) {
250 isValid &= CheckNameFormat(seq.Name);
251 isValid &= CheckLengthInRange(seq.Length);
255 bool SamHeaderValidator::CheckNameFormat(const string& name) {
257 // invalid if name is empty
258 if ( name.empty() ) {
259 AddError("Sequence entry (@SQ) is missing SN tag");
263 // invalid if first character is a reserved char
264 const char firstChar = name.at(0);
265 if ( firstChar == Constants::SAM_EQUAL || firstChar == Constants::SAM_STAR ) {
266 AddError("Invalid sequence name (SN): " + name);
273 bool SamHeaderValidator::CheckLengthInRange(const string& length) {
276 if ( length.empty() ) {
277 AddError("Sequence entry (@SQ) is missing LN tag");
281 // convert string length to numeric
282 stringstream lengthStream(length);
283 unsigned int sequenceLength;
284 lengthStream >> sequenceLength;
286 // invalid if length outside accepted range
287 if ( sequenceLength < Constants::SAM_SQ_LENGTH_MIN || sequenceLength > Constants::SAM_SQ_LENGTH_MAX ) {
288 AddError("Sequence length (LN): " + length + " out of range");
296 bool SamHeaderValidator::ValidateReadGroupDictionary(void) {
300 // check for unique read group IDs & platform units
301 isValid &= ContainsUniqueIDsAndPlatformUnits();
303 // iterate over read groups
304 const SamReadGroupDictionary& readGroups = m_header.ReadGroups;
305 SamReadGroupConstIterator rgIter = readGroups.ConstBegin();
306 SamReadGroupConstIterator rgEnd = readGroups.ConstEnd();
307 for ( ; rgIter != rgEnd; ++rgIter ) {
308 const SamReadGroup& rg = (*rgIter);
309 isValid &= ValidateReadGroup(rg);
312 // return validation state
316 bool SamHeaderValidator::ContainsUniqueIDsAndPlatformUnits(void) {
319 set<string> readGroupIds;
320 set<string> platformUnits;
321 set<string>::iterator idIter;
322 set<string>::iterator puIter;
324 // iterate over sequences
325 const SamReadGroupDictionary& readGroups = m_header.ReadGroups;
326 SamReadGroupConstIterator rgIter = readGroups.ConstBegin();
327 SamReadGroupConstIterator rgEnd = readGroups.ConstEnd();
328 for ( ; rgIter != rgEnd; ++rgIter ) {
329 const SamReadGroup& rg = (*rgIter);
331 // --------------------------------
332 // check for unique ID
334 // lookup read group ID
335 const string& id = rg.ID;
336 idIter = readGroupIds.find(id);
338 // error if found (duplicate entry)
339 if ( idIter != readGroupIds.end() ) {
340 AddError("Read group ID (ID): " + id + " is not unique");
344 // otherwise ok, store id
345 readGroupIds.insert(id);
347 // --------------------------------
348 // check for unique platform unit
350 // lookup platform unit
351 const string& pu = rg.PlatformUnit;
352 puIter = platformUnits.find(pu);
354 // error if found (duplicate entry)
355 if ( puIter != platformUnits.end() ) {
356 AddError("Platform unit (PU): " + pu + " is not unique");
360 // otherwise ok, store platform unit
361 platformUnits.insert(pu);
364 // return validation state
368 bool SamHeaderValidator::ValidateReadGroup(const SamReadGroup& rg) {
370 isValid &= CheckReadGroupID(rg.ID);
371 isValid &= CheckSequencingTechnology(rg.SequencingTechnology);
375 bool SamHeaderValidator::CheckReadGroupID(const string& id) {
379 AddError("Read group entry (@RG) is missing ID tag");
387 bool SamHeaderValidator::CheckSequencingTechnology(const string& technology) {
389 // if no technology provided, no problem, just return OK
390 if ( technology.empty() )
393 // if technology is valid keyword
394 if ( caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_CAPILLARY) ||
395 caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_HELICOS) ||
396 caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA) ||
397 caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_IONTORRENT) ||
398 caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_LS454) ||
399 caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_PACBIO) ||
400 caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_SOLID)
407 AddError("Invalid read group sequencing platform (PL): " + technology);
411 bool SamHeaderValidator::ValidateProgramChain(void) {
413 isValid &= ContainsUniqueProgramIds();
414 isValid &= ValidatePreviousProgramIds();
418 bool SamHeaderValidator::ContainsUniqueProgramIds(void) {
421 set<string> programIds;
422 set<string>::iterator pgIdIter;
424 // iterate over program records
425 const SamProgramChain& programs = m_header.Programs;
426 SamProgramConstIterator pgIter = programs.ConstBegin();
427 SamProgramConstIterator pgEnd = programs.ConstEnd();
428 for ( ; pgIter != pgEnd; ++pgIter ) {
429 const SamProgram& pg = (*pgIter);
432 const string& pgId = pg.ID;
433 pgIdIter = programIds.find(pgId);
435 // error if found (duplicate entry)
436 if ( pgIdIter != programIds.end() ) {
437 AddError("Program ID (ID): " + pgId + " is not unique");
441 // otherwise ok, store ID
442 programIds.insert(pgId);
445 // return validation state
449 bool SamHeaderValidator::ValidatePreviousProgramIds(void) {
453 // iterate over program records
454 const SamProgramChain& programs = m_header.Programs;
455 SamProgramConstIterator pgIter = programs.ConstBegin();
456 SamProgramConstIterator pgEnd = programs.ConstEnd();
457 for ( ; pgIter != pgEnd; ++pgIter ) {
458 const SamProgram& pg = (*pgIter);
460 // ignore record for validation if PreviousProgramID is empty
461 const string& ppId = pg.PreviousProgramID;
465 // see if program "chain" contains an entry for ppId
466 if ( !programs.Contains(ppId) ) {
467 AddError("PreviousProgramID (PP): " + ppId + " is not a known ID");
472 // return validation state
475 void SamHeaderValidator::AddError(const string& message) {
476 m_errorMessages.push_back(ERROR_PREFIX + message + NEWLINE);
479 void SamHeaderValidator::AddWarning(const string& message) {
480 m_warningMessages.push_back(WARN_PREFIX + message + NEWLINE);
483 void SamHeaderValidator::PrintErrorMessages(void) {
485 // skip if no error messages
486 if ( m_errorMessages.empty() ) return;
488 // print error header line
489 cerr << "* SAM header has " << m_errorMessages.size() << " errors:" << endl;
491 // print each error message
492 vector<string>::const_iterator errorIter = m_errorMessages.begin();
493 vector<string>::const_iterator errorEnd = m_errorMessages.end();
494 for ( ; errorIter != errorEnd; ++errorIter )
495 cerr << (*errorIter);
498 void SamHeaderValidator::PrintWarningMessages(void) {
500 // skip if no warning messages
501 if ( m_warningMessages.empty() ) return;
503 // print warning header line
504 cerr << "* SAM header has " << m_warningMessages.size() << " warnings:" << endl;
506 // print each warning message
507 vector<string>::const_iterator warnIter = m_warningMessages.begin();
508 vector<string>::const_iterator warnEnd = m_warningMessages.end();
509 for ( ; warnIter != warnEnd; ++warnIter )