1 // ***************************************************************************
2 // SamHeaderValidator.cpp (c) 2010 Derek Barnett
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 13 January 2011 (DB)
7 // ---------------------------------------------------------------------------
8 // Provides functionality for validating SamHeader data
9 // ***************************************************************************
11 #include <api/SamConstants.h>
12 #include <api/SamHeader.h>
13 #include <api/internal/SamHeaderValidator_p.h>
14 #include <api/internal/SamHeaderVersion_p.h>
15 using namespace BamTools;
16 using namespace BamTools::Internal;
23 // ------------------------------------------------------------------------
24 // Allow validation rules to vary, as needed, between SAM header versions
26 // use SAM_VERSION_X_Y to tag important changes
28 // Together, they will allow for comparisons like:
29 // if ( m_version < SAM_VERSION_2_0 ) {
30 // // use some older rule
32 // // use rule introduced with version 2.0
34 static const SamHeaderVersion SAM_VERSION_1_0 = SamHeaderVersion(1,0);
35 static const SamHeaderVersion SAM_VERSION_1_3 = SamHeaderVersion(1,3);
37 // TODO: This functionality is currently unused.
38 // Make validation "version-aware."
40 // ------------------------------------------------------------------------
42 const string SamHeaderValidator::ERROR_PREFIX = "ERROR: ";
43 const string SamHeaderValidator::WARN_PREFIX = "WARNING: ";
44 const string SamHeaderValidator::NEWLINE = "\n";
46 SamHeaderValidator::SamHeaderValidator(const SamHeader& header)
50 SamHeaderValidator::~SamHeaderValidator(void) { }
52 bool SamHeaderValidator::Validate(bool verbose) {
54 // validate header components
56 isValid &= ValidateMetadata();
57 isValid &= ValidateSequenceDictionary();
58 isValid &= ValidateReadGroupDictionary();
59 isValid &= ValidateProgramData();
61 // report errors if desired
64 PrintWarningMessages();
67 // return validation status
71 bool SamHeaderValidator::ValidateMetadata(void) {
73 isValid &= ValidateVersion();
74 isValid &= ValidateSortOrder();
75 isValid &= ValidateGroupOrder();
79 bool SamHeaderValidator::ValidateVersion(void) {
81 const string& version = m_header.Version;
83 // warn if version not present
84 if ( version.empty() ) {
85 AddWarning("Version (VN) missing. Not required, but strongly recommended");
89 // invalid if version does not contain a period
90 const size_t periodFound = version.find(Constants::SAM_PERIOD);
91 if ( periodFound == string::npos ) {
92 AddError("Invalid version (VN) format: " + version);
96 // invalid if major version is empty or contains non-digits
97 const string majorVersion = version.substr(0, periodFound);
98 if ( majorVersion.empty() || !ContainsOnlyDigits(majorVersion) ) {
99 AddError("Invalid version (VN) format: " + version);
103 // invalid if major version is empty or contains non-digits
104 const string minorVersion = version.substr(periodFound + 1);
105 if ( minorVersion.empty() || !ContainsOnlyDigits(minorVersion) ) {
106 AddError("Invalid version (VN) format: " + version);
110 // TODO: check if version is not just syntactically OK,
111 // but is also a valid SAM version ( 1.0 .. CURRENT )
113 // all checked out this far, then version is OK
117 // assumes non-empty input string
118 bool SamHeaderValidator::ContainsOnlyDigits(const string& s) {
119 const size_t nonDigitPosition = s.find_first_not_of(Constants::SAM_DIGITS);
120 return ( nonDigitPosition == string::npos ) ;
123 bool SamHeaderValidator::ValidateSortOrder(void) {
125 const string& sortOrder = m_header.SortOrder;
127 // warn if sort order not present
128 if ( sortOrder.empty() ) {
129 AddWarning("Sort order (SO) missing. Not required, but strongly recommended");
133 // if sort order is valid keyword
134 if ( sortOrder == Constants::SAM_HD_SORTORDER_COORDINATE ||
135 sortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME ||
136 sortOrder == Constants::SAM_HD_SORTORDER_UNSORTED
141 AddError("Invalid sort order (SO): " + sortOrder);
145 bool SamHeaderValidator::ValidateGroupOrder(void) {
147 const string& groupOrder = m_header.GroupOrder;
149 // if no group order, no problem, just return OK
150 if ( groupOrder.empty() ) return true;
152 // if group order is valid keyword
153 if ( groupOrder == Constants::SAM_HD_GROUPORDER_NONE ||
154 groupOrder == Constants::SAM_HD_GROUPORDER_QUERY ||
155 groupOrder == Constants::SAM_HD_GROUPORDER_REFERENCE
160 AddError("Invalid group order (GO): " + groupOrder);
164 bool SamHeaderValidator::ValidateSequenceDictionary(void) {
166 // TODO: warn/error if no sequences ?
170 // check for unique sequence names
171 isValid &= ContainsUniqueSequenceNames();
173 // iterate over sequences
174 const SamSequenceDictionary& sequences = m_header.Sequences;
175 SamSequenceConstIterator seqIter = sequences.ConstBegin();
176 SamSequenceConstIterator seqEnd = sequences.ConstEnd();
177 for ( ; seqIter != seqEnd; ++seqIter ) {
178 const SamSequence& seq = (*seqIter);
179 isValid &= ValidateSequence(seq);
182 // return validation state
186 bool SamHeaderValidator::ContainsUniqueSequenceNames(void) {
189 set<string> sequenceNames;
190 set<string>::iterator nameIter;
192 // iterate over sequences
193 const SamSequenceDictionary& sequences = m_header.Sequences;
194 SamSequenceConstIterator seqIter = sequences.ConstBegin();
195 SamSequenceConstIterator seqEnd = sequences.ConstEnd();
196 for ( ; seqIter != seqEnd; ++seqIter ) {
197 const SamSequence& seq = (*seqIter);
198 const string& name = seq.Name;
200 // lookup sequence name
201 nameIter = sequenceNames.find(name);
203 // error if found (duplicate entry)
204 if ( nameIter != sequenceNames.end() ) {
205 AddError("Sequence name (SN): " + name + " is not unique");
209 // otherwise ok, store name
210 sequenceNames.insert(name);
213 // return validation state
217 bool SamHeaderValidator::ValidateSequence(const SamSequence& seq) {
219 isValid &= CheckNameFormat(seq.Name);
220 isValid &= CheckLengthInRange(seq.Length);
224 bool SamHeaderValidator::CheckNameFormat(const string& name) {
226 // invalid if name is empty
227 if ( name.empty() ) {
228 AddError("Sequence entry (@SQ) is missing SN tag");
232 // invalid if first character is a reserved char
233 const char firstChar = name.at(0);
234 if ( firstChar == Constants::SAM_EQUAL || firstChar == Constants::SAM_STAR ) {
235 AddError("Invalid sequence name (SN): " + name);
242 bool SamHeaderValidator::CheckLengthInRange(const string& length) {
245 if ( length.empty() ) {
246 AddError("Sequence entry (@SQ) is missing LN tag");
250 // convert string length to numeric
251 stringstream lengthStream(length);
252 unsigned int sequenceLength;
253 lengthStream >> sequenceLength;
255 // invalid if length outside accepted range
256 if ( sequenceLength < Constants::SAM_SQ_LENGTH_MIN || sequenceLength > Constants::SAM_SQ_LENGTH_MAX ) {
257 AddError("Sequence length (LN): " + length + " out of range");
265 bool SamHeaderValidator::ValidateReadGroupDictionary(void) {
267 // TODO: warn/error if no read groups ?
271 // check for unique read group IDs & platform units
272 isValid &= ContainsUniqueIDsAndPlatformUnits();
274 // iterate over read groups
275 const SamReadGroupDictionary& readGroups = m_header.ReadGroups;
276 SamReadGroupConstIterator rgIter = readGroups.ConstBegin();
277 SamReadGroupConstIterator rgEnd = readGroups.ConstEnd();
278 for ( ; rgIter != rgEnd; ++rgIter ) {
279 const SamReadGroup& rg = (*rgIter);
280 isValid &= ValidateReadGroup(rg);
283 // return validation state
287 bool SamHeaderValidator::ContainsUniqueIDsAndPlatformUnits(void) {
290 set<string> readGroupIds;
291 set<string> platformUnits;
292 set<string>::iterator idIter;
293 set<string>::iterator puIter;
295 // iterate over sequences
296 const SamReadGroupDictionary& readGroups = m_header.ReadGroups;
297 SamReadGroupConstIterator rgIter = readGroups.ConstBegin();
298 SamReadGroupConstIterator rgEnd = readGroups.ConstEnd();
299 for ( ; rgIter != rgEnd; ++rgIter ) {
300 const SamReadGroup& rg = (*rgIter);
302 // --------------------------------
303 // check for unique ID
305 // lookup read group ID
306 const string& id = rg.ID;
307 idIter = readGroupIds.find(id);
309 // error if found (duplicate entry)
310 if ( idIter != readGroupIds.end() ) {
311 AddError("Read group ID (ID): " + id + " is not unique");
315 // otherwise ok, store id
316 readGroupIds.insert(id);
318 // --------------------------------
319 // check for unique platform unit
321 // lookup platform unit
322 const string& pu = rg.PlatformUnit;
323 puIter = platformUnits.find(pu);
325 // error if found (duplicate entry)
326 if ( puIter != platformUnits.end() ) {
327 AddError("Platform unit (PU): " + pu + " is not unique");
331 // otherwise ok, store platform unit
332 platformUnits.insert(pu);
335 // return validation state
339 bool SamHeaderValidator::ValidateReadGroup(const SamReadGroup& rg) {
341 isValid &= CheckReadGroupID(rg.ID);
342 isValid &= CheckSequencingTechnology(rg.SequencingTechnology);
346 bool SamHeaderValidator::CheckReadGroupID(const string& id) {
350 AddError("Read group entry (@RG) is missing ID tag");
358 bool SamHeaderValidator::CheckSequencingTechnology(const string& technology) {
360 // if no technology provided, no problem, just return OK
361 if ( technology.empty() ) return true;
363 // if technology is valid keyword
364 if ( Is454(technology) ||
365 IsHelicos(technology) ||
366 IsIllumina(technology) ||
367 IsPacBio(technology) ||
373 AddError("Invalid read group sequencing platform (PL): " + technology);
377 bool SamHeaderValidator::Is454(const string& technology) {
378 return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_454 ||
379 technology == Constants::SAM_RG_SEQTECHNOLOGY_LS454_LOWER ||
380 technology == Constants::SAM_RG_SEQTECHNOLOGY_LS454_UPPER
384 bool SamHeaderValidator::IsHelicos(const string& technology) {
385 return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_HELICOS_LOWER ||
386 technology == Constants::SAM_RG_SEQTECHNOLOGY_HELICOS_UPPER
390 bool SamHeaderValidator::IsIllumina(const string& technology) {
391 return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA_LOWER ||
392 technology == Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA_UPPER
396 bool SamHeaderValidator::IsPacBio(const string& technology) {
397 return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_PACBIO_LOWER ||
398 technology == Constants::SAM_RG_SEQTECHNOLOGY_PACBIO_UPPER
402 bool SamHeaderValidator::IsSolid(const string& technology) {
403 return ( technology == Constants::SAM_RG_SEQTECHNOLOGY_SOLID_LOWER ||
404 technology == Constants::SAM_RG_SEQTECHNOLOGY_SOLID_UPPER
408 bool SamHeaderValidator::ValidateProgramData(void) {
410 isValid &= ContainsUniqueProgramIds();
411 isValid &= ValidatePreviousProgramIds();
415 bool SamHeaderValidator::ContainsUniqueProgramIds(void) {
417 // TODO: once we have ability to handle multiple @PG entries,
418 // check here for duplicate ID's
419 // but for now, just return true
423 bool SamHeaderValidator::ValidatePreviousProgramIds(void) {
425 // TODO: check that PP entries are valid later, after we get multiple @PG-entry handling
426 // just return true for now
429 void SamHeaderValidator::AddError(const string& message) {
430 m_errorMessages.push_back(ERROR_PREFIX + message + NEWLINE);
433 void SamHeaderValidator::AddWarning(const string& message) {
434 m_warningMessages.push_back(WARN_PREFIX + message + NEWLINE);
437 void SamHeaderValidator::PrintErrorMessages(void) {
439 // skip if no error messages
440 if ( m_errorMessages.empty() ) return;
442 // print error header line
443 cerr << "* SAM header has " << m_errorMessages.size() << " errors:" << endl;
445 // print each error message
446 vector<string>::const_iterator errorIter = m_errorMessages.begin();
447 vector<string>::const_iterator errorEnd = m_errorMessages.end();
448 for ( ; errorIter != errorEnd; ++errorIter )
449 cerr << (*errorIter);
452 void SamHeaderValidator::PrintWarningMessages(void) {
454 // skip if no warning messages
455 if ( m_warningMessages.empty() ) return;
457 // print warning header line
458 cerr << "* SAM header has " << m_warningMessages.size() << " warnings:" << endl;
460 // print each warning message
461 vector<string>::const_iterator warnIter = m_warningMessages.begin();
462 vector<string>::const_iterator warnEnd = m_warningMessages.end();
463 for ( ; warnIter != warnEnd; ++warnIter )