From: Derek Date: Wed, 22 Sep 2010 02:00:14 +0000 (-0400) Subject: Implemented 'compound rule' logic in the FilterTool's script support. X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=c66ba836246a1eada3557a5c8dcc9d3558ed093b;p=bamtools.git Implemented 'compound rule' logic in the FilterTool's script support. * Previously the script was limited to doing 'OR' comparisons on various property sets (what I call here filters). Now, by providing each filter with an id, you can use these id's to define a compound rule. * Documentation is severely lacking on this end at the moment, but I hope to have a good explanation up soon. I think this interface could provide a powerful flexibility in querying BAM files for very specific cases for further analyses. --- diff --git a/src/toolkit/bamtools_filter.cpp b/src/toolkit/bamtools_filter.cpp index 5c8f338..cb153cf 100644 --- a/src/toolkit/bamtools_filter.cpp +++ b/src/toolkit/bamtools_filter.cpp @@ -3,20 +3,16 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 31 August 2010 +// Last modified: 21 September 2010 // --------------------------------------------------------------------------- -// Filters a single BAM file (or filters multiple BAM files and merges) -// according to some user-specified criteria. +// Filters BAM file(s) according to some user-specified criteria. // *************************************************************************** -// std includes #include #include #include #include #include - -// BamTools includes #include "bamtools_filter.h" #include "bamtools_filter_engine.h" #include "bamtools_options.h" @@ -24,10 +20,7 @@ #include "BamReader.h" #include "BamMultiReader.h" #include "BamWriter.h" - -//JsonCPP includes #include "jsoncpp/json.h" - using namespace std; using namespace BamTools; using namespace Json; @@ -63,6 +56,62 @@ const string REFERENCE_PROPERTY = "reference"; const string TRUE_STR = "true"; const string FALSE_STR = "false"; +RefVector filterToolReferences; + +struct BamAlignmentChecker { + bool check(const PropertyFilter& filter, const BamAlignment& al) { + + bool keepAlignment = true; + const PropertyMap& properties = filter.Properties; + PropertyMap::const_iterator propertyIter = properties.begin(); + PropertyMap::const_iterator propertyEnd = properties.end(); + for ( ; propertyIter != propertyEnd; ++propertyIter ) { + + // check alignment data field depending on propertyName + const string& propertyName = (*propertyIter).first; + const PropertyFilterValue& valueFilter = (*propertyIter).second; + + if ( propertyName == ALIGNMENTFLAG_PROPERTY ) keepAlignment &= valueFilter.check(al.AlignmentFlag); + else if ( propertyName == INSERTSIZE_PROPERTY ) keepAlignment &= valueFilter.check(al.InsertSize); + else if ( propertyName == ISDUPLICATE_PROPERTY ) keepAlignment &= valueFilter.check(al.IsDuplicate()); + else if ( propertyName == ISFAILEDQC_PROPERTY ) keepAlignment &= valueFilter.check(al.IsFailedQC()); + else if ( propertyName == ISFIRSTMATE_PROPERTY ) keepAlignment &= valueFilter.check(al.IsFirstMate()); + else if ( propertyName == ISMAPPED_PROPERTY ) keepAlignment &= valueFilter.check(al.IsMapped()); + else if ( propertyName == ISMATEMAPPED_PROPERTY ) keepAlignment &= valueFilter.check(al.IsMateMapped()); + else if ( propertyName == ISMATEREVERSESTRAND_PROPERTY ) keepAlignment &= valueFilter.check(al.IsMateReverseStrand()); + else if ( propertyName == ISPAIRED_PROPERTY ) keepAlignment &= valueFilter.check(al.IsPaired()); + else if ( propertyName == ISPRIMARYALIGNMENT_PROPERTY ) keepAlignment &= valueFilter.check(al.IsPrimaryAlignment()); + else if ( propertyName == ISPROPERPAIR_PROPERTY ) keepAlignment &= valueFilter.check(al.IsProperPair()); + else if ( propertyName == ISREVERSESTRAND_PROPERTY ) keepAlignment &= valueFilter.check(al.IsReverseStrand()); + else if ( propertyName == ISSECONDMATE_PROPERTY ) keepAlignment &= valueFilter.check(al.IsSecondMate()); + else if ( propertyName == MAPQUALITY_PROPERTY ) keepAlignment &= valueFilter.check(al.MapQuality); + else if ( propertyName == MATEPOSITION_PROPERTY ) keepAlignment &= ( al.IsPaired() && al.IsMateMapped() && valueFilter.check(al.MateRefID) ); + else if ( propertyName == MATEREFERENCE_PROPERTY ) { + if ( !al.IsPaired() || !al.IsMateMapped() ) return false; + BAMTOOLS_ASSERT_MESSAGE( (al.MateRefID>=0 && (al.MateRefID<(int)filterToolReferences.size())), "Invalid MateRefID"); + const string& refName = filterToolReferences.at(al.MateRefID).RefName; + keepAlignment &= valueFilter.check(refName); + } + else if ( propertyName == NAME_PROPERTY ) keepAlignment &= valueFilter.check(al.Name); + else if ( propertyName == POSITION_PROPERTY ) keepAlignment &= valueFilter.check(al.Position); + else if ( propertyName == QUERYBASES_PROPERTY ) keepAlignment &= valueFilter.check(al.QueryBases); + else if ( propertyName == REFERENCE_PROPERTY ) { + BAMTOOLS_ASSERT_MESSAGE( (al.RefID>=0 && (al.RefID<(int)filterToolReferences.size())), "Invalid RefID"); + const string& refName = filterToolReferences.at(al.RefID).RefName; + keepAlignment &= valueFilter.check(refName); + } + else BAMTOOLS_ASSERT_UNREACHABLE; + + // if alignment fails at ANY point, just quit and return false + if ( !keepAlignment ) + return false; + } + + BAMTOOLS_ASSERT_MESSAGE( keepAlignment, "Error in BamAlignmentChecker... keepAlignment should be true here"); + return keepAlignment; + } +}; + } // namespace BamTools // --------------------------------------------- @@ -94,7 +143,7 @@ class FilterTool::FilterToolPrivate { private: vector m_propertyNames; FilterTool::FilterSettings* m_settings; - RefVector m_references; + FilterEngine m_filterEngine; }; // --------------------------------------------- @@ -288,7 +337,6 @@ FilterTool::FilterToolPrivate::~FilterToolPrivate(void) { } bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filterName, const map& propertyTokens) { - // dummy temp values for token parsing bool boolValue; int32_t int32Value; @@ -323,8 +371,8 @@ bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filt propertyName == ISSECONDMATE_PROPERTY ) { - FilterEngine::parseToken(token, boolValue, type); - FilterEngine::setProperty(filterName, propertyName, boolValue, type); + m_filterEngine.parseToken(token, boolValue, type); + m_filterEngine.setProperty(filterName, propertyName, boolValue, type); } // int32_t conversion @@ -333,22 +381,22 @@ bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filt propertyName == POSITION_PROPERTY ) { - FilterEngine::parseToken(token, int32Value, type); - FilterEngine::setProperty(filterName, propertyName, int32Value, type); + m_filterEngine.parseToken(token, int32Value, type); + m_filterEngine.setProperty(filterName, propertyName, int32Value, type); } // uint16_t conversion else if ( propertyName == MAPQUALITY_PROPERTY ) { - FilterEngine::parseToken(token, uint16Value, type); - FilterEngine::setProperty(filterName, propertyName, uint16Value, type); + m_filterEngine.parseToken(token, uint16Value, type); + m_filterEngine.setProperty(filterName, propertyName, uint16Value, type); } // uint32_t conversion else if ( propertyName == ALIGNMENTFLAG_PROPERTY ) { - FilterEngine::parseToken(token, uint32Value, type); - FilterEngine::setProperty(filterName, propertyName, uint32Value, type); + m_filterEngine.parseToken(token, uint32Value, type); + m_filterEngine.setProperty(filterName, propertyName, uint32Value, type); } // string conversion @@ -358,8 +406,8 @@ bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filt propertyName == REFERENCE_PROPERTY ) { - FilterEngine::parseToken(token, stringValue, type); - FilterEngine::setProperty(filterName, propertyName, stringValue, type); + m_filterEngine.parseToken(token, stringValue, type); + m_filterEngine.setProperty(filterName, propertyName, stringValue, type); } // else unknown property @@ -372,55 +420,7 @@ bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filt } bool FilterTool::FilterToolPrivate::CheckAlignment(const BamAlignment& al) { - - bool keepAlignment = true; - - // only consider properties that are actually enabled - // iterate over these enabled properties - const vector enabledProperties = FilterEngine::enabledPropertyNames(); - vector::const_iterator propIter = enabledProperties.begin(); - vector::const_iterator propEnd = enabledProperties.end(); - for ( ; propIter != propEnd; ++propIter ) { - - // check alignment data field depending on propertyName - const string& propertyName = (*propIter); - if ( propertyName == ALIGNMENTFLAG_PROPERTY ) keepAlignment &= FilterEngine::check(ALIGNMENTFLAG_PROPERTY, al.AlignmentFlag); - else if ( propertyName == INSERTSIZE_PROPERTY ) keepAlignment &= FilterEngine::check(INSERTSIZE_PROPERTY, al.InsertSize); - else if ( propertyName == ISDUPLICATE_PROPERTY ) keepAlignment &= FilterEngine::check(ISDUPLICATE_PROPERTY, al.IsDuplicate()); - else if ( propertyName == ISFAILEDQC_PROPERTY ) keepAlignment &= FilterEngine::check(ISFAILEDQC_PROPERTY, al.IsFailedQC()); - else if ( propertyName == ISFIRSTMATE_PROPERTY ) keepAlignment &= FilterEngine::check(ISFIRSTMATE_PROPERTY, al.IsFirstMate()); - else if ( propertyName == ISMAPPED_PROPERTY ) keepAlignment &= FilterEngine::check(ISMAPPED_PROPERTY, al.IsMapped()); - else if ( propertyName == ISMATEMAPPED_PROPERTY ) keepAlignment &= FilterEngine::check(ISMATEMAPPED_PROPERTY, al.IsMateMapped()); - else if ( propertyName == ISMATEREVERSESTRAND_PROPERTY ) keepAlignment &= FilterEngine::check(ISMATEREVERSESTRAND_PROPERTY, al.IsMateReverseStrand()); - else if ( propertyName == ISPAIRED_PROPERTY ) keepAlignment &= FilterEngine::check(ISPAIRED_PROPERTY, al.IsPaired()); - else if ( propertyName == ISPRIMARYALIGNMENT_PROPERTY ) keepAlignment &= FilterEngine::check(ISPRIMARYALIGNMENT_PROPERTY, al.IsPrimaryAlignment()); - else if ( propertyName == ISPROPERPAIR_PROPERTY ) keepAlignment &= FilterEngine::check(ISPROPERPAIR_PROPERTY, al.IsProperPair()); - else if ( propertyName == ISREVERSESTRAND_PROPERTY ) keepAlignment &= FilterEngine::check(ISREVERSESTRAND_PROPERTY, al.IsReverseStrand()); - else if ( propertyName == ISSECONDMATE_PROPERTY ) keepAlignment &= FilterEngine::check(ISSECONDMATE_PROPERTY, al.IsSecondMate()); - else if ( propertyName == MAPQUALITY_PROPERTY ) keepAlignment &= FilterEngine::check(MAPQUALITY_PROPERTY, al.MapQuality); - else if ( propertyName == MATEPOSITION_PROPERTY ) keepAlignment &= ( al.IsPaired() && al.IsMateMapped() && FilterEngine::check(MATEPOSITION_PROPERTY, al.MateRefID) ); - else if ( propertyName == MATEREFERENCE_PROPERTY ) { - if ( !al.IsPaired() || !al.IsMateMapped() ) return false; - BAMTOOLS_ASSERT_MESSAGE( (al.MateRefID>=0 && (al.MateRefID<(int)m_references.size())), "Invalid MateRefID"); - const string& refName = m_references.at(al.MateRefID).RefName; - keepAlignment &= FilterEngine::check(MATEREFERENCE_PROPERTY, refName); - } - else if ( propertyName == NAME_PROPERTY ) keepAlignment &= FilterEngine::check(NAME_PROPERTY, al.Name); - else if ( propertyName == POSITION_PROPERTY ) keepAlignment &= FilterEngine::check(POSITION_PROPERTY, al.Position); - else if ( propertyName == QUERYBASES_PROPERTY ) keepAlignment &= FilterEngine::check(QUERYBASES_PROPERTY, al.QueryBases); - else if ( propertyName == REFERENCE_PROPERTY ) { - BAMTOOLS_ASSERT_MESSAGE( (al.RefID>=0 && (al.RefID<(int)m_references.size())), "Invalid RefID"); - const string& refName = m_references.at(al.RefID).RefName; - keepAlignment &= FilterEngine::check(REFERENCE_PROPERTY, refName); - } - else BAMTOOLS_ASSERT_MESSAGE( false, "Unknown property"); - - // if alignment fails at ANY point, just quit and return false - if ( !keepAlignment ) return false; - } - - // return success (should still be true at this point) - return keepAlignment; + return m_filterEngine.check(al); } const string FilterTool::FilterToolPrivate::GetScriptContents(void) { @@ -483,18 +483,18 @@ void FilterTool::FilterToolPrivate::InitProperties(void) { m_propertyNames.push_back(QUERYBASES_PROPERTY); m_propertyNames.push_back(REFERENCE_PROPERTY); - // add vector contents to FilterEngine + // add vector contents to FilterEngine vector::const_iterator propertyNameIter = m_propertyNames.begin(); vector::const_iterator propertyNameEnd = m_propertyNames.end(); for ( ; propertyNameIter != propertyNameEnd; ++propertyNameIter ) - FilterEngine::addProperty((*propertyNameIter)); + m_filterEngine.addProperty((*propertyNameIter)); } bool FilterTool::FilterToolPrivate::ParseCommandLine(void) { // add a rule set to filter engine const string CMD = "COMMAND_LINE"; - FilterEngine::addFilter(CMD); + m_filterEngine.addFilter(CMD); // map property names to command line args map propertyTokens; @@ -541,7 +541,7 @@ bool FilterTool::FilterToolPrivate::ParseFilterObject(const string& filterName, } // add this filter to engin - FilterEngine::addFilter(filterName); + m_filterEngine.addFilter(filterName); // add token list to this filter return AddPropertyTokensToFilter(filterName, propertyTokens); @@ -595,14 +595,15 @@ bool FilterTool::FilterToolPrivate::ParseScript(void) { success &= ParseFilterObject(filterName, filter); } - // see if user defined "rule" for these filters + // see if user defined a "rule" for these filters + // otherwise, use filter engine's default rule behavior + string ruleString(""); const Json::Value rule = root["rule"]; - if ( !rule.isNull() ) { - cout << "found rule: " << rule.asString() << endl; - } else { - cout << "no rule found!" << endl; - } + if ( rule.isString() ) + ruleString = rule.asString(); + m_filterEngine.setRule(ruleString); + // return success/fail return success; } @@ -629,12 +630,12 @@ bool FilterTool::FilterToolPrivate::Run(void) { BamMultiReader reader; reader.Open(m_settings->InputFiles, false, true); const string headerText = reader.GetHeaderText(); - m_references = reader.GetReferenceData(); + filterToolReferences = reader.GetReferenceData(); // open writer BamWriter writer; bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression ); - writer.Open(m_settings->OutputFilename, headerText, m_references, writeUncompressed); + writer.Open(m_settings->OutputFilename, headerText, filterToolReferences, writeUncompressed); BamAlignment al; @@ -710,7 +711,7 @@ bool FilterTool::FilterToolPrivate::Run(void) { bool FilterTool::FilterToolPrivate::SetupFilters(void) { - // add known properties to FilterEngine + // add known properties to FilterEngine InitProperties(); // parse script for filter rules, if given diff --git a/src/utils/Makefile b/src/utils/Makefile index d56bfe0..6e88122 100644 --- a/src/utils/Makefile +++ b/src/utils/Makefile @@ -15,7 +15,6 @@ INCLUDES = -I$(API_DIR)/ # define our source and object files # ---------------------------------- SOURCES = bamtools_fasta.cpp \ - bamtools_filter_engine.cpp \ bamtools_options.cpp \ bamtools_pileup_engine.cpp \ bamtools_utilities.cpp diff --git a/src/utils/bamtools_filter_engine.cpp b/src/utils/bamtools_filter_engine.cpp deleted file mode 100644 index c9d0715..0000000 --- a/src/utils/bamtools_filter_engine.cpp +++ /dev/null @@ -1,101 +0,0 @@ -// *************************************************************************** -// bamtools_filter_engine.cpp (c) 2010 Derek Barnett, Erik Garrison -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 18 September 2010 -// --------------------------------------------------------------------------- -// -// *************************************************************************** - -#include -#include "bamtools_filter_engine.h" -using namespace std; -using namespace BamTools; - -// --------------------------------------------------------- -// FilterValue implementation - -// checks a string query against filter (value, compare type) -bool PropertyFilterValue::check(const string& query) const { - - // ensure filter value & query are same type - if ( !Value.is_type() ) { - cerr << "Cannot compare different types!" << endl; - return false; - } - - // localize string version of our filter value - const string& valueString = Value.get(); - - // string matching based on our filter type - switch ( Type ) { - case ( PropertyFilterValue::CONTAINS) : return ( query.find(valueString) != string::npos ); - case ( PropertyFilterValue::ENDS_WITH) : return ( query.find(valueString) == (query.length() - valueString.length()) ); - case ( PropertyFilterValue::EXACT) : return ( query == valueString ); - case ( PropertyFilterValue::GREATER_THAN) : return ( query > valueString ); - case ( PropertyFilterValue::GREATER_THAN_EQUAL) : return ( query >= valueString ); - case ( PropertyFilterValue::LESS_THAN) : return ( query < valueString ); - case ( PropertyFilterValue::LESS_THAN_EQUAL) : return ( query <= valueString ); - case ( PropertyFilterValue::NOT) : return ( query != valueString ); - case ( PropertyFilterValue::STARTS_WITH) : return ( query.find(valueString) == 0 ); - default : BAMTOOLS_ASSERT_UNREACHABLE; - } - return false; -} - -// --------------------------------------------------------- -// FilterEngine implementation - -// static FilterEngine data members -FilterMap FilterEngine::m_filters; -vector FilterEngine::m_properties; - -// creates a new filter set, returns true if created, false if error or already exists -bool FilterEngine::addFilter(const string& filterName) { - return (m_filters.insert(make_pair(filterName, PropertyFilter()))).second; -} - -// return list of current filter names -const vector FilterEngine::filterNames(void) { - vector names; - names.reserve(m_filters.size()); - FilterMap::const_iterator mapIter = m_filters.begin(); - FilterMap::const_iterator mapEnd = m_filters.end(); - for ( ; mapIter != mapEnd; ++mapIter ) - names.push_back( (*mapIter).first ); - return names; -} - -// add a new known property (& type) to engine -bool FilterEngine::addProperty(const string& propertyName) { - const vector propertyNames = allPropertyNames(); - bool found = binary_search( propertyNames.begin(), propertyNames.end(), propertyName ); - if ( found ) return false; - m_properties.push_back( Property(propertyName) ); - sort( m_properties.begin(), m_properties.end() ); - return true; -} - - -// returns list of all properties known by FilterEngine ( any created using addProperty() ) -const vector FilterEngine::allPropertyNames(void) { - vector names; - names.reserve(m_properties.size()); - vector::const_iterator propIter = m_properties.begin(); - vector::const_iterator propEnd = m_properties.end(); - for ( ; propIter != propEnd; ++propIter ) - names.push_back( (*propIter).Name ); - return names; -} - -// returns list of property names that are 'enabled' ( only those touched by setProperty() ) -const vector FilterEngine::enabledPropertyNames(void) { - vector names; - names.reserve(m_properties.size()); - vector::const_iterator propIter = m_properties.begin(); - vector::const_iterator propEnd = m_properties.end(); - for ( ; propIter != propEnd; ++propIter ) - if ( (*propIter).IsEnabled ) names.push_back( (*propIter).Name ); - return names; -} diff --git a/src/utils/bamtools_filter_engine.h b/src/utils/bamtools_filter_engine.h index 2297aea..924b043 100644 --- a/src/utils/bamtools_filter_engine.h +++ b/src/utils/bamtools_filter_engine.h @@ -3,8 +3,39 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 30 August 2010 +// Last modified: 21 September 2010 // --------------------------------------------------------------------------- +// Provides a generic filter engine based on filter-sets of properties, +// with possible "rules" (compound logical expressions) to create more complex +// queries on a data set. +// +// FilterEngine consists, most importantly, of : +// +// a list of possible properties (each tagged whether it has been 'enabled' as a filter) +// a map of filterName => propertySet +// queue for compound rule expression (i.e. "(filter1 AND filter2) OR !filter3" ) +// +// Each propertySet is a list of properties enabled for this particular filter object +// +// Implemented as a map of propertyNames to propertyFilterValue +// ( "property1" => pfv1 +// "property2" => pfv2 +// "property4" => pfv4 +// etc. ) +// +// Any properties that are 'possible', via FilterEngine::addProperty(), but not enabled +// via FilterEngine::setProperty() (in our example, say "property3"), evaluate to true +// for any query. Meaning that if a property is not set on this filter, we don't care +// about it here, so it passes though OK. +// +// A propertyFilterValue contains a value and comparison type +// +// ( pfv1: Value = 50, Type = GREATER_THAN_EQUAL +// pfv2: Value = "foo", Type = STARTS_WITH +// pfv4: Value = "bar", Type = CONTAINS +// etc. ) +// +// This allows for more complex queries (than simple isEqual?) against a variety of data types. // // *************************************************************************** @@ -12,156 +43,128 @@ #define BAMTOOLS_FILTER_ENGINE_H #include +#include #include +#include +#include +#include #include #include #include #include +#include "bamtools_filter_properties.h" +#include "bamtools_filter_ruleparser.h" #include "bamtools_utilities.h" -#include "bamtools_variant.h" namespace BamTools { -struct PropertyFilterValue { - - // define valid ValueCompareTypes - enum ValueCompareType { CONTAINS = 0 - , ENDS_WITH - , EXACT - , GREATER_THAN - , GREATER_THAN_EQUAL - , LESS_THAN - , LESS_THAN_EQUAL - , NOT - , STARTS_WITH - }; - - // ctor - PropertyFilterValue(const Variant& value = Variant(), - const ValueCompareType& type = PropertyFilterValue::EXACT) - : Value(value) - , Type(type) - { } - - // filter check methods - template - bool check(const T& query) const; - bool check(const std::string& query) const; - - // data members - Variant Value; - ValueCompareType Type; -}; - -inline -const std::string toString(const PropertyFilterValue::ValueCompareType& type) { - - switch ( type ) { - case ( PropertyFilterValue::CONTAINS ) : return std::string( "CONTAINS"); - case ( PropertyFilterValue::ENDS_WITH ) : return std::string( "ENDS_WITH"); - case ( PropertyFilterValue::EXACT ) : return std::string( "EXACT"); - case ( PropertyFilterValue::GREATER_THAN ) : return std::string( "GREATER_THAN"); - case ( PropertyFilterValue::GREATER_THAN_EQUAL ) : return std::string( "GREATER_THAN_EQUAL"); - case ( PropertyFilterValue::LESS_THAN ) : return std::string( "LESS_THAN"); - case ( PropertyFilterValue::LESS_THAN_EQUAL ) : return std::string( "LESS_THAN_EQUAL"); - case ( PropertyFilterValue::NOT ) : return std::string( "NOT"); - case ( PropertyFilterValue::STARTS_WITH ) : return std::string( "STARTS_WITH"); - default : BAMTOOLS_ASSERT_UNREACHABLE; - } - return std::string(); -} - -// property name => property filter value -// ('name' => ('SSR', STARTS_WITH), 'mapQuality' => (50, GREATER_THAN_EQUAL), etc...) -typedef std::map PropertyMap; - -struct PropertyFilter { - - // will be used more later - // if we implement a compound 'rules' system - i.e. "(filter1 AND filter2) OR filter 3" - enum FilterCompareType { AND = 0 - , EXACT - , NOT - , OR - }; - - // data members - PropertyMap Properties; - FilterCompareType Type; - - // ctor - PropertyFilter(void) : Type( PropertyFilter::EXACT ) { } - - // filter check methods - template - bool check(const std::string& propertyName, const T& query) const; +struct FilterCompareType { + enum Type { AND = 0 + , NOT + , OR + }; }; - -// filter name => properties -// ('filter1' => properties1, 'filter2' => properties2, etc...) -typedef std::map FilterMap; -// used to store properties known to engine & keep track of enabled state -struct Property { - std::string Name; - bool IsEnabled; - Property(const std::string& name, bool isEnabled = false) - : Name(name) - , IsEnabled(isEnabled) - { } -}; - -inline bool operator< (const Property& lhs, const Property& rhs) { return lhs.Name < rhs.Name; } -inline bool operator== (const Property& lhs, const Property& rhs) { return lhs.Name == rhs.Name; } +// ----------------------------------------------------------- +// FilterEngine +template class FilterEngine { + // ctor & dtor + public: + FilterEngine(void) + : m_ruleString("") + , m_isRuleQueueGenerated(false) + , m_defaultCompareType(FilterCompareType::OR) + , AND_OPERATOR("&") + , OR_OPERATOR("|") + , NOT_OPERATOR("!") + { } + + ~FilterEngine(void) { } + // 'filter set' methods public: // creates a new filter set, returns true if created, false if error or already exists - static bool addFilter(const std::string& filterName); + bool addFilter(const std::string& filterName); // return list of current filter names - static const std::vector filterNames(void); + const std::vector filterNames(void); // 'property' methods public: // add a new known property (& type) to engine - static bool addProperty(const std::string& propertyName); + bool addProperty(const std::string& propertyName); // sets property filter (value, type) for propertyName, on a particular filter set // setProperty("filter1", "mapQuality", 50, GREATER_THAN_EQUAL) template - static bool setProperty(const std::string& filterName, - const std::string& propertyName, - const T& value, - const PropertyFilterValue::ValueCompareType& type = PropertyFilterValue::EXACT); + bool setProperty(const std::string& filterName, + const std::string& propertyName, + const T& value, + const PropertyFilterValue::ValueCompareType& type = PropertyFilterValue::EXACT); // returns list of all properties known by FilterEngine ( any created using addProperty() ) - static const std::vector allPropertyNames(void); + const std::vector allPropertyNames(void); // returns list of property names that are 'enabled' ( only those touched by setProperty() ) - static const std::vector enabledPropertyNames(void); + const std::vector enabledPropertyNames(void); + // 'rule' methods + public: + + // sets comparison operator between filters if no rule string given + // default is to do an OR on each filter + void setDefaultCompareType(const FilterCompareType::Type& type = FilterCompareType::OR); + + // sets rule string for building expression queue + // if empty, creates + void setRule(const std::string& ruleString = ""); + // token parsing (for property filter generation) public: template - static bool parseToken(const std::string& token, T& value, PropertyFilterValue::ValueCompareType& type); + bool parseToken(const std::string& token, T& value, PropertyFilterValue::ValueCompareType& type); // query evaluation public: - // returns true if query passes all filters on 'propertyName' + // returns true if query passes all filters in FilterEngine template - static bool check(const std::string& propertyName, const T& query); + bool check(const T& query); + // internal rule-handling methods + private: + void buildDefaultRuleString(void); + void buildRuleQueue(void); + template + bool evaluateFilterRules(const T& query); + // data members private: // all 'filter sets' - static FilterMap m_filters; + FilterMap m_filters; // all known properties - static std::vector m_properties; + std::vector m_properties; + + // infix expression of filter-set comparison rules + std::string m_ruleString; + + // postfix expression of tokens (filterNames) and operators (as strings) + // if this is empty, uses m_compareType to build default expression queue + std::queue m_ruleQueue; + + // flag to test if the rule expression queue has been generated + bool m_isRuleQueueGenerated; + + // 'default' comparison operator between filters if no rule string given + // if this is changed, m_ruleString is used to build new m_ruleQueue + FilterCompareType::Type m_defaultCompareType; + + // client-specified checking type ( provides method: bool check(PropertyFilter, T object) ) + FilterChecker m_checker; // token-parsing constants static const int NOT_CHAR = (int)'!'; @@ -169,65 +172,192 @@ class FilterEngine { static const int GREATER_THAN_CHAR = (int)'>'; static const int LESS_THAN_CHAR = (int)'<'; static const int WILDCARD_CHAR = (int)'*'; + + // filter evaluation constants + const std::string AND_OPERATOR; + const std::string OR_OPERATOR; + const std::string NOT_OPERATOR; }; -// ------------------------------------------------------------------- -// template methods +// creates a new filter set, returns true if created, false if error or already exists +template +inline bool FilterEngine::addFilter(const std::string& filterName) { + return (m_filters.insert(std::make_pair(filterName, PropertyFilter()))).second; +} + +// add a new known property & type to engine +template +inline bool FilterEngine::addProperty(const std::string& propertyName) { + const std::vector propertyNames = allPropertyNames(); + bool found = std::binary_search( propertyNames.begin(), propertyNames.end(), propertyName ); + if ( found ) return false; + m_properties.push_back( Property(propertyName) ); + std::sort( m_properties.begin(), m_properties.end() ); + return true; +} + +// returns list of all properties known by FilterEngine +// ( any that were created using addProperty() ) +template +inline const std::vector FilterEngine::allPropertyNames(void) { + // set up stringlist + std::vector names; + names.reserve(m_properties.size()); + // iterate through all properties, appending to stringlist + std::vector::const_iterator propIter = m_properties.begin(); + std::vector::const_iterator propEnd = m_properties.end(); + for ( ; propIter != propEnd; ++propIter ) + names.push_back( (*propIter).Name ); + // return stringlist + return names; +} -// checks a query against a filter (value, compare type) -template -bool PropertyFilterValue::check(const T& query) const { +// builds a default rule string based on m_defaultCompareType +// used if user supplied an explicit rule string +template +inline void FilterEngine::buildDefaultRuleString(void) { - // ensure filter value & query are same type - if ( !Value.is_type() ) { - std::cerr << "Cannot compare different types!" << std::endl; - return false; + // set up temp string stream + std::stringstream ruleStream(""); + + // get first filterName + FilterMap::const_iterator mapIter = m_filters.begin(); + ruleStream << (*mapIter).first; + + // if there are more filters present + // iterate over remaining filters, appending compare operator and filter name + if ( m_filters.size() > 1 ) { + for ( ++mapIter ; mapIter != m_filters.end(); ++mapIter ) + ruleStream << ( (m_defaultCompareType == FilterCompareType::AND) ? " & " : " | " ) + << (*mapIter).first; } + + // set m_ruleString from temp stream + m_ruleString = ruleStream.str(); +} + +// build expression queue based on ruleString +template +inline void FilterEngine::buildRuleQueue(void) { + + // skip if no filters present + if ( m_filters.empty() ) return; + + // clear out any prior expression queue data + while ( !m_ruleQueue.empty() ) + m_ruleQueue.pop(); + + // create a rule string, if not provided + if ( m_ruleString.empty() ) + buildDefaultRuleString(); - // string matching - if ( Value.is_type() ) { - std::cerr << "Cannot compare different types - query is a string!" << std::endl; - return false; - } + // initialize RuleParser, run, and retrieve results + RuleParser ruleParser(m_ruleString); + ruleParser.parse(); + m_ruleQueue = ruleParser.results(); - // numeric matching based on our filter type - switch ( Type ) { - case ( PropertyFilterValue::EXACT) : return ( query == Value.get() ); - case ( PropertyFilterValue::GREATER_THAN) : return ( query > Value.get() ); - case ( PropertyFilterValue::GREATER_THAN_EQUAL) : return ( query >= Value.get() ); - case ( PropertyFilterValue::LESS_THAN) : return ( query < Value.get() ); - case ( PropertyFilterValue::LESS_THAN_EQUAL) : return ( query <= Value.get() ); - case ( PropertyFilterValue::NOT) : return ( query != Value.get() ); - default : BAMTOOLS_ASSERT_UNREACHABLE; - } - return false; + // set flag if rule queue contains any values + m_isRuleQueueGenerated = (!m_ruleQueue.empty()); } -template -bool PropertyFilter::check(const std::string& propertyName, const T& query) const { +// returns whether query value passes filter engine rules +template template +bool FilterEngine::check(const T& query) { - // if propertyName found for this filter, - PropertyMap::const_iterator propIter = Properties.find(propertyName); - if ( propIter != Properties.end() ) { - const PropertyFilterValue& filterValue = (*propIter).second; - - // check - switch ( Type ) { - case ( PropertyFilter::EXACT ) : return filterValue.check(query); - case ( PropertyFilter::NOT ) : return !filterValue.check(query); - case ( PropertyFilter::AND ) : - case ( PropertyFilter::OR ) : BAMTOOLS_ASSERT_MESSAGE(false, "Cannot use a binary compare operator on 1 value"); - default : BAMTOOLS_ASSERT_UNREACHABLE; - } - return false; // unreachable + // return result of querying against filter rules + return evaluateFilterRules(query); +} + +// returns list of property names that are 'enabled' ( only those touched by setProperty() ) +template +inline const std::vector FilterEngine::enabledPropertyNames(void) { + // initialize stringlist + std::vector names; + names.reserve(m_properties.size()); + // iterate over all properties, appending if enabled + std::vector::const_iterator propIter = m_properties.begin(); + std::vector::const_iterator propEnd = m_properties.end(); + for ( ; propIter != propEnd; ++propIter ) + if ( (*propIter).IsEnabled ) + names.push_back( (*propIter).Name ); + // return stringlist + return names; +} + +// evaluates postfix rule queue - with each filter as an operand, AND|OR|NOT as operators +template template +bool FilterEngine::evaluateFilterRules(const T& query) { + + // build ruleQueue if not done before + if ( !m_isRuleQueueGenerated ) + buildRuleQueue(); + + std::stack resultStack; + FilterMap::const_iterator filterIter; + FilterMap::const_iterator filterEnd = m_filters.end(); + std::queue ruleQueueCopy = m_ruleQueue; + while ( !ruleQueueCopy.empty() ) { + const std::string& token = ruleQueueCopy.front(); + + // token is NOT_OPERATOR + if ( token == FilterEngine::NOT_OPERATOR ) { + BAMTOOLS_ASSERT_MESSAGE( !resultStack.empty(), "Empty result stack - cannot apply operator: !" ); + resultStack.top() = !resultStack.top(); + } + + // token is AND_OPERATOR + else if ( token == FilterEngine::AND_OPERATOR ) { + BAMTOOLS_ASSERT_MESSAGE( resultStack.size() >= 2 , "Not enough operands - cannot apply operator: &" ); + bool topResult = resultStack.top(); + resultStack.pop(); + resultStack.top() &= topResult; + } + + // token is OR_OPERATOR + else if ( token == FilterEngine::OR_OPERATOR ) { + BAMTOOLS_ASSERT_MESSAGE( resultStack.size() >= 2 , "Not enough operands - cannot apply operator: |" ); + bool topResult = resultStack.top(); + resultStack.pop(); + resultStack.top() |= topResult; + } + + // token is an operand + else { + // look up PropertyFilter that matches this token + filterIter = m_filters.find(token); + BAMTOOLS_ASSERT_MESSAGE( (filterIter != filterEnd), "Filter mentioned in rule, not found in FilterEngine" ); + const PropertyFilter& filter = (*filterIter).second; + bool result = m_checker.check(filter, query); + resultStack.push( result ); + } + + // pop token from ruleQueue + ruleQueueCopy.pop(); } + + // return last result + BAMTOOLS_ASSERT_MESSAGE( resultStack.size() == 1, "Result stack should only have one value remaining - cannot return result" ); + return resultStack.top(); +} - // property unknown to this filter - else return true; +// return list of current filter names +template +inline const std::vector FilterEngine::filterNames(void) { + // initialize stringlist + std::vector names; + names.reserve(m_filters.size()); + // iterate over all filters, appending filter name + FilterMap::const_iterator mapIter = m_filters.begin(); + FilterMap::const_iterator mapEnd = m_filters.end(); + for ( ; mapIter != mapEnd; ++mapIter ) + names.push_back( (*mapIter).first ); + // return stringlist + return names; } -template -bool FilterEngine::parseToken(const std::string& token, T& value, PropertyFilterValue::ValueCompareType& type) { +// parse a filterValue token string that may contain comparison qualifiers (">50", "*SRR", etc.) +template template +bool FilterEngine::parseToken(const std::string& token, T& value, PropertyFilterValue::ValueCompareType& type) { // skip if token is empty if ( token.empty() ) return false; @@ -244,20 +374,19 @@ bool FilterEngine::parseToken(const std::string& token, T& value, PropertyFilter // more than one character, check for special chars else { const int firstChar = (int)token.at(0); - - switch ( (int)firstChar ) { + switch ( firstChar ) { - case ( (int)FilterEngine::NOT_CHAR ) : + case ( FilterEngine::NOT_CHAR ) : strippedToken = token.substr(1); type = PropertyFilterValue::NOT; break; - case ( (int)FilterEngine::GREATER_THAN_CHAR ) : + case ( FilterEngine::GREATER_THAN_CHAR ) : // check for '>=' case - if ( token.at(1) == FilterEngine::EQUAL_CHAR ) { + if ( token.at(1) == FilterEngine::EQUAL_CHAR ) { if ( token.length() == 2 ) return false; strippedToken = token.substr(2); type = PropertyFilterValue::GREATER_THAN_EQUAL; @@ -271,10 +400,10 @@ bool FilterEngine::parseToken(const std::string& token, T& value, PropertyFilter break; - case ( (int)FilterEngine::LESS_THAN_CHAR ) : + case ( FilterEngine::LESS_THAN_CHAR ) : // check for '<=' case - if ( token.at(1) == FilterEngine::EQUAL_CHAR ) { + if ( token.at(1) == FilterEngine::EQUAL_CHAR ) { if ( token.length() == 2 ) return false; strippedToken = token.substr(2); type = PropertyFilterValue::LESS_THAN_EQUAL; @@ -288,10 +417,10 @@ bool FilterEngine::parseToken(const std::string& token, T& value, PropertyFilter break; - case ( (int)FilterEngine::WILDCARD_CHAR ) : + case ( FilterEngine::WILDCARD_CHAR ) : // check for *str* case (CONTAINS) - if ( token.at( token.length() - 1 ) == FilterEngine::WILDCARD_CHAR ) { + if ( token.at( token.length() - 1 ) == FilterEngine::WILDCARD_CHAR ) { if ( token.length() == 2 ) return false; strippedToken = token.substr(1, token.length() - 2); type = PropertyFilterValue::CONTAINS; @@ -305,11 +434,10 @@ bool FilterEngine::parseToken(const std::string& token, T& value, PropertyFilter break; - default : // check for str* case (STARTS_WITH) - if ( token.at( token.length() - 1 ) == FilterEngine::WILDCARD_CHAR ) { + if ( token.at( token.length() - 1 ) == FilterEngine::WILDCARD_CHAR ) { if ( token.length() == 2 ) return false; strippedToken = token.substr(0, token.length() - 1); type = PropertyFilterValue::STARTS_WITH; @@ -348,13 +476,27 @@ bool FilterEngine::parseToken(const std::string& token, T& value, PropertyFilter return true; } +// sets comparison operator between filters if no rule string given +// default is to do an OR on each filter +template +inline void FilterEngine::setDefaultCompareType(const FilterCompareType::Type& type) { + // check for supported compare type + if ( type == FilterCompareType::AND || type == FilterCompareType::OR ) { + // if not the current compare type + if ( m_defaultCompareType != type ) { + m_defaultCompareType = type; + buildRuleQueue(); + } + } +} + // sets property filter (value, type) for propertyName, on a particular filter set // setProperty("filter1", "mapQuality", 50, GREATER_THAN_EQUAL) -template -bool FilterEngine::setProperty(const std::string& filterName, - const std::string& propertyName, - const T& value, - const PropertyFilterValue::ValueCompareType& type) +template template +bool FilterEngine::setProperty(const std::string& filterName, + const std::string& propertyName, + const T& value, + const PropertyFilterValue::ValueCompareType& type) { // lookup filter by name, return false if not found FilterMap::iterator filterIter = m_filters.find(filterName); @@ -394,37 +536,19 @@ bool FilterEngine::setProperty(const std::string& filterName, } // property already known, set as enabled - else - (*knownPropertyIter).IsEnabled = true; + else (*knownPropertyIter).IsEnabled = true; // return success return true; } -// returns false if query does not pass any filters on 'propertyName' -// returns true if property unknown (i.e. nothing has been set for this property... so query is considered to pass filter) -template -bool FilterEngine::check(const std::string& propertyName, const T& query) { - - // check enabled properties list - // return true if no properties enabled at all OR if property is unknown to FilterEngine - const std::vector enabledProperties = enabledPropertyNames(); - if ( enabledProperties.empty() ) return true; - const bool found = std::binary_search( enabledProperties.begin(), enabledProperties.end(), propertyName ); - if ( !found ) return true; - - // iterate over all filters in FilterEngine - FilterMap::const_iterator filterIter = m_filters.begin(); - FilterMap::const_iterator filterEnd = m_filters.end(); - for ( ; filterIter != filterEnd; ++filterIter ) { - - // check query against this filter - const PropertyFilter& filter = (*filterIter).second; - if ( filter.check(propertyName, query) ) return true; +// sets user-specified rule string & signals update of rule-expression queue +template +inline void FilterEngine::setRule(const std::string& ruleString) { + if ( m_ruleString != ruleString) { + m_ruleString = ruleString; + buildRuleQueue(); } - - // query passes none of the filters with current property enabled - return false; } } // namespace BamTools diff --git a/src/utils/bamtools_filter_properties.h b/src/utils/bamtools_filter_properties.h new file mode 100644 index 0000000..119da06 --- /dev/null +++ b/src/utils/bamtools_filter_properties.h @@ -0,0 +1,195 @@ +// *************************************************************************** +// bamtools_filter_properties.h (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 30 August 2010 +// --------------------------------------------------------------------------- +// Provides support data structures & methods for FilterEngine +// +// The FilterEngine consists, most importantly, of : +// +// a list of possible properties (each tagged whether it has been 'enabled' as a filter) +// a map of filterName => propertySet +// queue for compound rule expression (i.e. "(filter1 AND filter2) OR !filter3" ) +// +// Each propertySet is a list of properties enabled for this particular filter object +// +// Implemented as a map of propertyNames to propertyFilterValue +// ( "property1" => pfv1 +// "property2" => pfv2 +// "property4" => pfv4 +// etc. ) +// +// Any properties that are 'possible', via FilterEngine::addProperty(), but not enabled +// via FilterEngine::setProperty() (in our example, say "property3"), evaluate to true +// for any query. Meaning that if a property is not set on this filter, we don't care +// about it here, so it passes though OK. +// +// A propertyFilterValue contains a value and comparison type +// +// ( pfv1: Value = 50, Type = GREATER_THAN_EQUAL +// pfv2: Value = "foo", Type = STARTS_WITH +// pfv4: Value = "bar", Type = CONTAINS +// etc. ) +// +// This allows for more complex queries (than simple isEqual?) against a variety of data types. +// +// *************************************************************************** + +#ifndef BAMTOOLS_FILTER_PROPERTIES_H +#define BAMTOOLS_FILTER_PROPERTIES_H + +#include +#include +#include +#include "bamtools_utilities.h" +#include "bamtools_variant.h" + +namespace BamTools { + +// ---------------------------------------------------------- +// PropertyFilterValue + +struct PropertyFilterValue { + + // define valid ValueCompareTypes + enum ValueCompareType { CONTAINS = 0 + , ENDS_WITH + , EXACT + , GREATER_THAN + , GREATER_THAN_EQUAL + , LESS_THAN + , LESS_THAN_EQUAL + , NOT + , STARTS_WITH + }; + + // ctor + PropertyFilterValue(const Variant& value = Variant(), + const ValueCompareType& type = PropertyFilterValue::EXACT) + : Value(value) + , Type(type) + { } + + // filter check methods + template + bool check(const T& query) const; + bool check(const std::string& query) const; + + // data members + Variant Value; + ValueCompareType Type; +}; + +// checks a query against a filter (value, compare type) +template +bool PropertyFilterValue::check(const T& query) const { + + // ensure filter value & query are same type + if ( !Value.is_type() ) { + std::cerr << "Cannot compare different types!" << std::endl; + return false; + } + + // string matching + if ( Value.is_type() ) { + std::cerr << "Cannot compare different types - query is a string!" << std::endl; + return false; + } + + // numeric matching based on our filter type + switch ( Type ) { + case ( PropertyFilterValue::EXACT) : return ( query == Value.get() ); + case ( PropertyFilterValue::GREATER_THAN) : return ( query > Value.get() ); + case ( PropertyFilterValue::GREATER_THAN_EQUAL) : return ( query >= Value.get() ); + case ( PropertyFilterValue::LESS_THAN) : return ( query < Value.get() ); + case ( PropertyFilterValue::LESS_THAN_EQUAL) : return ( query <= Value.get() ); + case ( PropertyFilterValue::NOT) : return ( query != Value.get() ); + default : BAMTOOLS_ASSERT_UNREACHABLE; + } + return false; +} + +// checks a string query against filter (value, compare type) +inline +bool PropertyFilterValue::check(const std::string& query) const { + + // ensure filter value & query are same type + if ( !Value.is_type() ) { + std::cerr << "Cannot compare different types!" << std::endl; + return false; + } + + // localize string version of our filter value + const std::string& valueString = Value.get(); + + // string matching based on our filter type + switch ( Type ) { + case ( PropertyFilterValue::CONTAINS) : return ( query.find(valueString) != std::string::npos ); + case ( PropertyFilterValue::ENDS_WITH) : return ( query.find(valueString) == (query.length() - valueString.length()) ); + case ( PropertyFilterValue::EXACT) : return ( query == valueString ); + case ( PropertyFilterValue::GREATER_THAN) : return ( query > valueString ); + case ( PropertyFilterValue::GREATER_THAN_EQUAL) : return ( query >= valueString ); + case ( PropertyFilterValue::LESS_THAN) : return ( query < valueString ); + case ( PropertyFilterValue::LESS_THAN_EQUAL) : return ( query <= valueString ); + case ( PropertyFilterValue::NOT) : return ( query != valueString ); + case ( PropertyFilterValue::STARTS_WITH) : return ( query.find(valueString) == 0 ); + default : BAMTOOLS_ASSERT_UNREACHABLE; + } + return false; +} + +inline +const std::string toString(const PropertyFilterValue::ValueCompareType& type) { + + switch ( type ) { + case ( PropertyFilterValue::CONTAINS ) : return std::string( "CONTAINS"); + case ( PropertyFilterValue::ENDS_WITH ) : return std::string( "ENDS_WITH"); + case ( PropertyFilterValue::EXACT ) : return std::string( "EXACT"); + case ( PropertyFilterValue::GREATER_THAN ) : return std::string( "GREATER_THAN"); + case ( PropertyFilterValue::GREATER_THAN_EQUAL ) : return std::string( "GREATER_THAN_EQUAL"); + case ( PropertyFilterValue::LESS_THAN ) : return std::string( "LESS_THAN"); + case ( PropertyFilterValue::LESS_THAN_EQUAL ) : return std::string( "LESS_THAN_EQUAL"); + case ( PropertyFilterValue::NOT ) : return std::string( "NOT"); + case ( PropertyFilterValue::STARTS_WITH ) : return std::string( "STARTS_WITH"); + default : BAMTOOLS_ASSERT_UNREACHABLE; + } + return std::string(); +} + +// property name => property filter value +// ('name' => ('SSR', STARTS_WITH), 'mapQuality' => (50, GREATER_THAN_EQUAL), etc...) +typedef std::map PropertyMap; + +// ---------------------------------------------------------- +// PropertyFilter + +struct PropertyFilter { + // data members + PropertyMap Properties; +}; + +// filter name => properties +// ('filter1' => properties1, 'filter2' => properties2, etc...) +typedef std::map FilterMap; + +// ---------------------------------------------------------- +// Property + +// used to store properties known to engine & keep track of enabled state +struct Property { + std::string Name; + bool IsEnabled; + Property(const std::string& name, bool isEnabled = false) + : Name(name) + , IsEnabled(isEnabled) + { } +}; + +inline bool operator< (const Property& lhs, const Property& rhs) { return lhs.Name < rhs.Name; } +inline bool operator== (const Property& lhs, const Property& rhs) { return lhs.Name == rhs.Name; } + +} // namespace BamTools + +#endif // BAMTOOLS_FILTER_PROPERTIES_H diff --git a/src/utils/bamtools_filter_ruleparser.h b/src/utils/bamtools_filter_ruleparser.h new file mode 100644 index 0000000..7e3b1fd --- /dev/null +++ b/src/utils/bamtools_filter_ruleparser.h @@ -0,0 +1,318 @@ +// *************************************************************************** +// bamtools_filter_ruleparser.h (c) 2010 Derek Barnett, Erik Garrison +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 21 September 2010 +// --------------------------------------------------------------------------- +// Provides a compound rule parser for FilterEngine. +// *************************************************************************** + +#ifndef BAMTOOLS_FILTER_RULEPARSER_H +#define BAMTOOLS_FILTER_RULEPARSER_H + +#include +#include +#include +#include "bamtools_utilities.h" + +namespace BamTools { + +// ------------------------------------------- +// char constants + +const char LEFT_PARENTHESIS_CHAR = '('; +const char RIGHT_PARENTHESIS_CHAR = ')'; +const char AND_OPERATOR_CHAR = '&'; +const char OR_OPERATOR_CHAR = '|'; +const char NOT_OPERATOR_CHAR = '!'; +const char SPACE_CHAR = ' '; + +// ------------------------------------------- +// RuleToken implementation + +struct RuleToken { + + // enums + enum RuleTokenType { OPERAND = 0 + , AND_OPERATOR + , OR_OPERATOR + , NOT_OPERATOR + , LEFT_PARENTHESIS + , RIGHT_PARENTHESIS + }; + + // data members + RuleTokenType Type; + std::string Value; +}; + +inline int priority(const RuleToken& token) { + switch ( token.Type ) { + case ( RuleToken::NOT_OPERATOR ) : return 3; + case ( RuleToken::AND_OPERATOR ) : return 2; + case ( RuleToken::OR_OPERATOR ) : return 1; + case ( RuleToken::LEFT_PARENTHESIS ) : return 0; + case ( RuleToken::RIGHT_PARENTHESIS ) : return 0; + default: BAMTOOLS_ASSERT_UNREACHABLE; + } +} + +inline bool isRightAssociative(const RuleToken& token) { + return (token.Type == RuleToken::NOT_OPERATOR || + token.Type == RuleToken::LEFT_PARENTHESIS); +} + +inline bool isLeftAssociative(const RuleToken& token) { + return !isRightAssociative(token); +} + +inline bool isLeftParenthesis(const RuleToken& token) { + return ( token.Type == RuleToken::LEFT_PARENTHESIS ); +} + +inline bool isRightParenthesis(const RuleToken& token) { + return ( token.Type == RuleToken::RIGHT_PARENTHESIS ); +} + +inline bool isOperand(const RuleToken& token) { + return ( token.Type == RuleToken::OPERAND ); +} + +inline bool isOperator(const RuleToken& token) { + return ( token.Type == RuleToken::AND_OPERATOR || + token.Type == RuleToken::OR_OPERATOR || + token.Type == RuleToken::NOT_OPERATOR); +} + +// ------------------------------------------- +// RuleParser implementation + +class RuleParser { + + // ctor & dtor + public: + RuleParser(const std::string& ruleString) + : m_ruleString(ruleString) + { + // initialize char markers + m_begin = (char*)m_ruleString.c_str(); + m_end = m_begin + m_ruleString.length(); + ignoreQuotes(); + } + + ~RuleParser(void) { } + + // public interface + public: + void parse(void); + std::queue results(void) const { return m_ruleQueue; } + + // internal methods + private: + char getNextChar(void); + void ignoreQuotes(void); + bool readToken(RuleToken& token); + void skipSpaces(void); + + // data members + private: + std::string m_ruleString; + char* m_begin; + char* m_current; + char* m_end; + + std::queue m_ruleQueue; + std::stack m_operatorStack; +}; + +inline +char RuleParser::getNextChar(void) { + if ( m_current == m_end ) return 0; + return *m_current++; +} + +inline +void RuleParser::ignoreQuotes(void) { + if ( *m_begin == '\"' ) ++m_begin; + if ( *m_end == '\"' ) --m_end; +} + +inline +void RuleParser::parse(void) { + + // clear out any prior data + while ( !m_ruleQueue.empty() ) + m_ruleQueue.pop(); + + // skip if no rule to parse + if ( m_ruleString.empty() ) return; + + // start at beginning of ruleString + m_current = m_begin; + + // iterate through tokens in rule string + RuleToken token; + while ( readToken(token) ) { + + if ( token.Value.empty() ) break; + + // if token is an operand + if ( isOperand(token) ) + m_ruleQueue.push(token.Value); + + // if token is an operator + else if ( isOperator(token) ) { + + // pop any operators at top of stack with higher priority + while ( !m_operatorStack.empty() ) { + const RuleToken& opToken = m_operatorStack.top(); + if ( (isLeftAssociative(token) && (priority(token) <= priority(opToken))) || + (isRightAssociative(token) && (priority(token) < priority(opToken))) + ) + { + m_ruleQueue.push(opToken.Value); + m_operatorStack.pop(); + } + else break; + } + + // push current operator token onto stack + m_operatorStack.push(token); + } + + // if token is left parenthesis + else if ( isLeftParenthesis(token) ) + m_operatorStack.push(token); + + // if token is right parenthesis + else if ( isRightParenthesis(token) ) { + + bool foundLeftParenthesis = false; + + // push operators into rule queue until left parenthesis found + while ( !m_operatorStack.empty() && !foundLeftParenthesis ) { + const RuleToken& opToken = m_operatorStack.top(); + if ( !isLeftParenthesis(opToken) ) + m_ruleQueue.push(opToken.Value); + else + foundLeftParenthesis = true; + m_operatorStack.pop(); + } + + // no left parenthesis found, error + BAMTOOLS_ASSERT_MESSAGE( foundLeftParenthesis, "ERROR: Mismatched parenthesis in rule string.1"); + } + + // error: unknown operand + else BAMTOOLS_ASSERT_UNREACHABLE; + } + + // while there are still operators on stack + while ( !m_operatorStack.empty() ) { + const RuleToken& token = m_operatorStack.top(); + BAMTOOLS_ASSERT_MESSAGE( (!isLeftParenthesis(token) && !isRightParenthesis(token)), "ERROR: Mismatched parenthesis in rule string.2"); + m_ruleQueue.push(token.Value); + m_operatorStack.pop(); + } +} + +inline +bool RuleParser::readToken(RuleToken& token) { + + // skip any preceding whitespace + skipSpaces(); + if ( m_current == m_end ) return false; + + // clear out prior token value + token.Value.clear(); + + // read chars while still in token + char c = 1; + bool keepReading = true; + bool inOperandString = false; + while ( keepReading && (c != 0) ) { + + // get next char + c = getNextChar(); + switch (c) { + + // current char is '(' + case ( LEFT_PARENTHESIS_CHAR ) : + token.Type = RuleToken::LEFT_PARENTHESIS; + token.Value.append(1, LEFT_PARENTHESIS_CHAR); + keepReading = false; + break; + + // current char is ')' + case ( RIGHT_PARENTHESIS_CHAR ) : + if ( inOperandString ) + --m_current; + else { + token.Type = RuleToken::RIGHT_PARENTHESIS; + token.Value.append(1, RIGHT_PARENTHESIS_CHAR); + } + keepReading = false; + break; + + // current char is '&' + case ( AND_OPERATOR_CHAR ) : + if ( inOperandString ) + --m_current; + else { + token.Type = RuleToken::AND_OPERATOR; + token.Value.append(1, AND_OPERATOR_CHAR); + } + keepReading = false; + break; + + // current char is '|' + case ( OR_OPERATOR_CHAR ) : + if ( inOperandString ) + --m_current; + else { + token.Type = RuleToken::OR_OPERATOR; + token.Value.append(1, OR_OPERATOR_CHAR); + } + keepReading = false; + break; + + // current char is '!' + case ( NOT_OPERATOR_CHAR ) : + token.Type = RuleToken::NOT_OPERATOR; + token.Value.append(1, NOT_OPERATOR_CHAR); + keepReading = false; + break; + + // current char is ' ' + case ( SPACE_CHAR ) : + keepReading = false; + break; + + // current char is a true value token + default: + if ( c != 0 ) { + token.Type = RuleToken::OPERAND; + token.Value.append(1, c); + inOperandString = true; + keepReading = true; + } + } + } + + return true; +} + +inline +void RuleParser::skipSpaces(void) { + while ( m_current != m_end ) { + const char c = *m_current; + if ( c == ' ' || c == '\t' || c == '\r' || c == '\n') + ++m_current; + else break; + } +} + +} // namespace BamTools + +#endif // BAMTOOLS_FILTER_RULEPARSER_H \ No newline at end of file