From: Derek Date: Mon, 30 Aug 2010 22:53:59 +0000 (-0400) Subject: Posted implementation of FilterTool. X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=32ad5a7c3cf34ddc0f4d38b12fc5112386122e45;p=bamtools.git Posted implementation of FilterTool. --- diff --git a/src/toolkit/bamtools_filter.cpp b/src/toolkit/bamtools_filter.cpp index 2249022..6941da2 100644 --- a/src/toolkit/bamtools_filter.cpp +++ b/src/toolkit/bamtools_filter.cpp @@ -3,42 +3,207 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 1 June 2010 +// Last modified: 30 August 2010 // --------------------------------------------------------------------------- // Filters a single BAM file (or filters multiple BAM files and merges) // according to some user-specified criteria. // *************************************************************************** +// std includes +#include #include +#include #include #include +// BamTools includes #include "bamtools_filter.h" +#include "bamtools_filter_engine.h" #include "bamtools_options.h" +#include "bamtools_utilities.h" #include "BamReader.h" #include "BamMultiReader.h" +#include "BamWriter.h" + +//JsonCPP includes +#include "jsoncpp/json.h" using namespace std; using namespace BamTools; +using namespace Json; + +namespace BamTools { + +// ------------------------------- +// string literal constants + +// property names +const string ALIGNMENTFLAG_PROPERTY = "alignmentFlag"; +const string INSERTSIZE_PROPERTY = "insertSize"; +const string ISDUPLICATE_PROPERTY = "isDuplicate"; +const string ISFAILEDQC_PROPERTY = "isFailedQC"; +const string ISFIRSTMATE_PROPERTY = "isFirstMate"; +const string ISMAPPED_PROPERTY = "isMapped"; +const string ISMATEMAPPED_PROPERTY = "isMateMapped"; +const string ISMATEREVERSESTRAND_PROPERTY = "isMateReverseStrand"; +const string ISPAIRED_PROPERTY = "isPaired"; +const string ISPRIMARYALIGNMENT_PROPERTY = "isPrimaryAlignment"; +const string ISPROPERPAIR_PROPERTY = "isProperPair"; +const string ISREVERSESTRAND_PROPERTY = "isReverseStrand"; +const string ISSECONDMATE_PROPERTY = "isSecondMate"; +const string MAPQUALITY_PROPERTY = "mapQuality"; +const string MATEPOSITION_PROPERTY = "matePosition"; +const string MATEREFERENCE_PROPERTY = "mateReference"; +const string NAME_PROPERTY = "name"; +const string POSITION_PROPERTY = "position"; +const string QUERYBASES_PROPERTY = "queryBases"; +const string REFERENCE_PROPERTY = "reference"; + +// boolalpha +const string TRUE_STR = "true"; +const string FALSE_STR = "false"; + +} // namespace BamTools + +// --------------------------------------------- +// FilterToolPrivate declaration + +class FilterTool::FilterToolPrivate { + + // ctor & dtor + public: + FilterToolPrivate(FilterTool::FilterSettings* settings); + ~FilterToolPrivate(void); + + // 'public' interface + public: + bool Run(void); + + // internal methods + private: + bool AddPropertyTokensToFilter(const string& filterName, const map& propertyTokens); + bool CheckAlignment(const BamAlignment& al); + const string GetScriptContents(void); + void InitProperties(void); + bool ParseCommandLine(void); + bool ParseFilterObject(const string& filterName, const Json::Value& filterObject); + bool ParseScript(void); + bool SetupFilters(void); + + // data members + private: + vector m_propertyNames; + FilterTool::FilterSettings* m_settings; + RefVector m_references; +}; // --------------------------------------------- // FilterSettings implementation struct FilterTool::FilterSettings { + // ---------------------------------- + // IO opts + // flags bool HasInputBamFilename; bool HasOutputBamFilename; - + bool HasRegion; + bool HasScriptFilename; + // filenames vector InputFiles; string OutputFilename; + string Region; + string ScriptFilename; + + // ----------------------------------- + // General filter opts + + // flags + bool HasAlignmentFlagFilter; + bool HasInsertSizeFilter; + bool HasMapQualityFilter; + bool HasNameFilter; + bool HasQueryBasesFilter; + +// bool HasTagFilters; + + // filters + string AlignmentFlagFilter; + string InsertSizeFilter; + string NameFilter; + string MapQualityFilter; + string QueryBasesFilter; + +// vector TagFilters; + + // ----------------------------------- + // AlignmentFlag filter opts + + // flags + bool HasIsDuplicateFilter; + bool HasIsFailedQCFilter; + bool HasIsFirstMateFilter; + bool HasIsMappedFilter; + bool HasIsMateMappedFilter; + bool HasIsMateReverseStrandFilter; + bool HasIsPairedFilter; + bool HasIsPrimaryAlignmentFilter; + bool HasIsProperPairFilter; + bool HasIsReverseStrandFilter; + bool HasIsSecondMateFilter; + + // filters + string IsDuplicateFilter; + string IsFailedQCFilter; + string IsFirstMateFilter; + string IsMappedFilter; + string IsMateMappedFilter; + string IsMateReverseStrandFilter; + string IsPairedFilter; + string IsPrimaryAlignmentFilter; + string IsProperPairFilter; + string IsReverseStrandFilter; + string IsSecondMateFilter; + // --------------------------------- // constructor + FilterSettings(void) : HasInputBamFilename(false) , HasOutputBamFilename(false) + , HasRegion(false) + , HasScriptFilename(false) , OutputFilename(Options::StandardOut()) + , HasAlignmentFlagFilter(false) + , HasInsertSizeFilter(false) + , HasMapQualityFilter(false) + , HasNameFilter(false) + , HasQueryBasesFilter(false) +// , HasTagFilters(false) + , HasIsDuplicateFilter(false) + , HasIsFailedQCFilter(false) + , HasIsFirstMateFilter(false) + , HasIsMappedFilter(false) + , HasIsMateMappedFilter(false) + , HasIsMateReverseStrandFilter(false) + , HasIsPairedFilter(false) + , HasIsPrimaryAlignmentFilter(false) + , HasIsProperPairFilter(false) + , HasIsReverseStrandFilter(false) + , HasIsSecondMateFilter(false) + , IsDuplicateFilter(TRUE_STR) + , IsFailedQCFilter(TRUE_STR) + , IsFirstMateFilter(TRUE_STR) + , IsMappedFilter(TRUE_STR) + , IsMateMappedFilter(TRUE_STR) + , IsMateReverseStrandFilter(TRUE_STR) + , IsPairedFilter(TRUE_STR) + , IsPrimaryAlignmentFilter(TRUE_STR) + , IsProperPairFilter(TRUE_STR) + , IsReverseStrandFilter(TRUE_STR) + , IsSecondMateFilter(TRUE_STR) { } }; @@ -48,19 +213,46 @@ struct FilterTool::FilterSettings { FilterTool::FilterTool(void) : AbstractTool() , m_settings(new FilterSettings) + , m_impl(0) { // set program details - Options::SetProgramInfo("bamtools filter", "filters BAM file(s)", "-in [-in ... ] -out "); + Options::SetProgramInfo("bamtools filter", "filters BAM file(s)", "-in [-in ... ] -out -region [ [-script HasInputBamFilename, m_settings->InputFiles, IO_Opts, Options::StandardIn()); - Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts, Options::StandardOut()); + Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInputBamFilename, m_settings->InputFiles, IO_Opts, Options::StandardIn()); + Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts, Options::StandardOut()); + Options::AddValueOption("-region", "REGION", "only read data from this genomic region (see README for more details)", "", m_settings->HasRegion, m_settings->Region, IO_Opts); + Options::AddValueOption("-script", "filename", "the filter script file (see README for more details)", "", m_settings->HasScriptFilename, m_settings->ScriptFilename, IO_Opts); + + OptionGroup* FilterOpts = Options::CreateOptionGroup("General Filters"); + Options::AddValueOption("-alignmentFlag", "int", "keep reads with this *exact* alignment flag (for more detailed queries, see below)", "", m_settings->HasAlignmentFlagFilter, m_settings->AlignmentFlagFilter, FilterOpts); + Options::AddValueOption("-insertSize", "int", "keep reads with insert size that mathces pattern", "", m_settings->HasInsertSizeFilter, m_settings->InsertSizeFilter, FilterOpts); + Options::AddValueOption("-mapQuality", "[0-255]", "keep reads with map quality that matches pattern", "", m_settings->HasMapQualityFilter, m_settings->MapQualityFilter, FilterOpts); + Options::AddValueOption("-name", "string", "keep reads with name that matches pattern", "", m_settings->HasNameFilter, m_settings->NameFilter, FilterOpts); + Options::AddValueOption("-queryBases", "string", "keep reads with motif that mathces pattern", "", m_settings->HasQueryBasesFilter, m_settings->QueryBasesFilter, FilterOpts); + +// Options::AddValueOption("-tag", "TAG:VALUE", "keep reads with this key=>value pair. If multiple tags are given, reads must match all", "", m_settings->HasTagFilters, m_settings->TagFilters, FilterOpts); + + OptionGroup* AlignmentFlagOpts = Options::CreateOptionGroup("Alignment Flag Filters"); + Options::AddValueOption("-isDuplicate", "true/false", "keep only alignments that are marked as duplicate?", "", m_settings->HasIsDuplicateFilter, m_settings->IsDuplicateFilter, AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isFailedQC", "true/false", "keep only alignments that failed QC?", "", m_settings->HasIsFailedQCFilter, m_settings->IsFailedQCFilter, AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isFirstMate", "true/false", "keep only alignments marked as first mate?", "", m_settings->HasIsFirstMateFilter, m_settings->IsFirstMateFilter, AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isMapped", "true/false", "keep only alignments that were mapped?", "", m_settings->HasIsMappedFilter, m_settings->IsMappedFilter, AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isMateMapped", "true/false", "keep only alignments with mates that mapped", "", m_settings->HasIsMateMappedFilter, m_settings->IsMateMappedFilter, AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isMateReverseStrand", "true/false", "keep only alignments with mate on reverese strand?", "", m_settings->HasIsMateReverseStrandFilter, m_settings->IsMateReverseStrandFilter, AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isPaired", "true/false", "keep only alignments that were sequenced as paired?","", m_settings->HasIsPairedFilter, m_settings->IsPairedFilter, AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isPrimaryAlignment", "true/false", "keep only alignments marked as primary?", "", m_settings->HasIsPrimaryAlignmentFilter, m_settings->IsPrimaryAlignmentFilter, AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isProperPair", "true/false", "keep only alignments that passed PE resolution?", "", m_settings->HasIsProperPairFilter, m_settings->IsProperPairFilter, AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isReverseStrand", "true/false", "keep only alignments on reverse strand?", "", m_settings->HasIsReverseStrandFilter, m_settings->IsReverseStrandFilter, AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isSecondMate", "true/false", "keep only alignments marked as second mate?", "", m_settings->HasIsSecondMateFilter, m_settings->IsSecondMateFilter, AlignmentFlagOpts, TRUE_STR); } FilterTool::~FilterTool(void) { delete m_settings; m_settings = 0; + + delete m_impl; + m_impl = 0; } int FilterTool::Help(void) { @@ -72,18 +264,468 @@ int FilterTool::Run(int argc, char* argv[]) { // parse command line arguments Options::Parse(argc, argv, 1); + + // run internal FilterTool implementation, return success/fail + m_impl = new FilterToolPrivate(m_settings); + + if ( m_impl->Run() ) return 0; + else return 1; +} + +// --------------------------------------------- +// FilterToolPrivate implementation + +// constructor +FilterTool::FilterToolPrivate::FilterToolPrivate(FilterTool::FilterSettings* settings) + : m_settings(settings) +{ } + +// destructor +FilterTool::FilterToolPrivate::~FilterToolPrivate(void) { } + +bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filterName, const map& propertyTokens) { + + + // dummy temp values for token parsing + bool boolValue; + int32_t int32Value; + uint16_t uint16Value; + uint32_t uint32Value; + string stringValue; + PropertyFilterValue::ValueCompareType type; + + // iterate over property token map + map::const_iterator mapIter = propertyTokens.begin(); + map::const_iterator mapEnd = propertyTokens.end(); + for ( ; mapIter != mapEnd; ++mapIter ) { + + const string& propertyName = (*mapIter).first; + const string& token = (*mapIter).second; + + // ------------------------------ + // convert token to value & compare type + // then add to filter engine + + // bool conversion + if ( propertyName == ISDUPLICATE_PROPERTY || + propertyName == ISFAILEDQC_PROPERTY || + propertyName == ISFIRSTMATE_PROPERTY || + propertyName == ISMAPPED_PROPERTY || + propertyName == ISMATEMAPPED_PROPERTY || + propertyName == ISMATEREVERSESTRAND_PROPERTY || + propertyName == ISPAIRED_PROPERTY || + propertyName == ISPRIMARYALIGNMENT_PROPERTY || + propertyName == ISPROPERPAIR_PROPERTY || + propertyName == ISREVERSESTRAND_PROPERTY || + propertyName == ISSECONDMATE_PROPERTY + ) + { + FilterEngine::parseToken(token, boolValue, type); + FilterEngine::setProperty(filterName, propertyName, boolValue, type); + } + + // int32_t conversion + else if ( propertyName == INSERTSIZE_PROPERTY || + propertyName == MATEPOSITION_PROPERTY || + propertyName == POSITION_PROPERTY + ) + { + FilterEngine::parseToken(token, int32Value, type); + FilterEngine::setProperty(filterName, propertyName, int32Value, type); + } + + // uint16_t conversion + else if ( propertyName == MAPQUALITY_PROPERTY ) + { + FilterEngine::parseToken(token, uint16Value, type); + FilterEngine::setProperty(filterName, propertyName, uint16Value, type); + } + + // uint32_t conversion + else if ( propertyName == ALIGNMENTFLAG_PROPERTY ) + { + FilterEngine::parseToken(token, uint32Value, type); + FilterEngine::setProperty(filterName, propertyName, uint32Value, type); + } + + // string conversion + else if ( propertyName == MATEREFERENCE_PROPERTY || + propertyName == NAME_PROPERTY || + propertyName == QUERYBASES_PROPERTY || + propertyName == REFERENCE_PROPERTY + ) + { + FilterEngine::parseToken(token, stringValue, type); + FilterEngine::setProperty(filterName, propertyName, stringValue, type); + } + + // else unknown property + else { + cerr << "Unknown property: " << propertyName << "!" << endl; + return false; + } + } + return true; +} + +bool FilterTool::FilterToolPrivate::CheckAlignment(const BamAlignment& al) { + + bool keepAlignment = true; + + // only consider properties that are actually enabled + // iterate over these enabled properties + const vector enabledProperties = FilterEngine::enabledPropertyNames(); + vector::const_iterator propIter = enabledProperties.begin(); + vector::const_iterator propEnd = enabledProperties.end(); + for ( ; propIter != propEnd; ++propIter ) { + + // check alignment data field depending on propertyName + const string& propertyName = (*propIter); + if ( propertyName == ALIGNMENTFLAG_PROPERTY ) keepAlignment &= FilterEngine::check(ALIGNMENTFLAG_PROPERTY, al.AlignmentFlag); + else if ( propertyName == INSERTSIZE_PROPERTY ) keepAlignment &= FilterEngine::check(INSERTSIZE_PROPERTY, al.InsertSize); + else if ( propertyName == ISDUPLICATE_PROPERTY ) keepAlignment &= FilterEngine::check(ISDUPLICATE_PROPERTY, al.IsDuplicate()); + else if ( propertyName == ISFAILEDQC_PROPERTY ) keepAlignment &= FilterEngine::check(ISFAILEDQC_PROPERTY, al.IsFailedQC()); + else if ( propertyName == ISFIRSTMATE_PROPERTY ) keepAlignment &= FilterEngine::check(ISFIRSTMATE_PROPERTY, al.IsFirstMate()); + else if ( propertyName == ISMAPPED_PROPERTY ) keepAlignment &= FilterEngine::check(ISMAPPED_PROPERTY, al.IsMapped()); + else if ( propertyName == ISMATEMAPPED_PROPERTY ) keepAlignment &= FilterEngine::check(ISMATEMAPPED_PROPERTY, al.IsMateMapped()); + else if ( propertyName == ISMATEREVERSESTRAND_PROPERTY ) keepAlignment &= FilterEngine::check(ISMATEREVERSESTRAND_PROPERTY, al.IsMateReverseStrand()); + else if ( propertyName == ISPAIRED_PROPERTY ) keepAlignment &= FilterEngine::check(ISPAIRED_PROPERTY, al.IsPaired()); + else if ( propertyName == ISPRIMARYALIGNMENT_PROPERTY ) keepAlignment &= FilterEngine::check(ISPRIMARYALIGNMENT_PROPERTY, al.IsPrimaryAlignment()); + else if ( propertyName == ISPROPERPAIR_PROPERTY ) keepAlignment &= FilterEngine::check(ISPROPERPAIR_PROPERTY, al.IsProperPair()); + else if ( propertyName == ISREVERSESTRAND_PROPERTY ) keepAlignment &= FilterEngine::check(ISREVERSESTRAND_PROPERTY, al.IsReverseStrand()); + else if ( propertyName == ISSECONDMATE_PROPERTY ) keepAlignment &= FilterEngine::check(ISSECONDMATE_PROPERTY, al.IsSecondMate()); + else if ( propertyName == MAPQUALITY_PROPERTY ) keepAlignment &= FilterEngine::check(MAPQUALITY_PROPERTY, al.MapQuality); + else if ( propertyName == MATEPOSITION_PROPERTY ) keepAlignment &= ( al.IsPaired() && al.IsMateMapped() && FilterEngine::check(MATEPOSITION_PROPERTY, al.MateRefID) ); + else if ( propertyName == MATEREFERENCE_PROPERTY ) { + if ( !al.IsPaired() || !al.IsMateMapped() ) return false; + BAMTOOLS_ASSERT_MESSAGE( (al.MateRefID>=0 && (al.MateRefID<(int)m_references.size())), "Invalid MateRefID"); + const string& refName = m_references.at(al.MateRefID).RefName; + keepAlignment &= FilterEngine::check(MATEREFERENCE_PROPERTY, refName); + } + else if ( propertyName == NAME_PROPERTY ) keepAlignment &= FilterEngine::check(NAME_PROPERTY, al.Name); + else if ( propertyName == POSITION_PROPERTY ) keepAlignment &= FilterEngine::check(POSITION_PROPERTY, al.Position); + else if ( propertyName == QUERYBASES_PROPERTY ) keepAlignment &= FilterEngine::check(QUERYBASES_PROPERTY, al.QueryBases); + else if ( propertyName == REFERENCE_PROPERTY ) { + BAMTOOLS_ASSERT_MESSAGE( (al.RefID>=0 && (al.RefID<(int)m_references.size())), "Invalid RefID"); + const string& refName = m_references.at(al.RefID).RefName; + keepAlignment &= FilterEngine::check(REFERENCE_PROPERTY, refName); + } + else BAMTOOLS_ASSERT_MESSAGE( false, "Unknown property"); + + // if alignment fails at ANY point, just quit and return false + if ( !keepAlignment ) return false; + } + + // return success (should still be true at this point) + return keepAlignment; +} + +const string FilterTool::FilterToolPrivate::GetScriptContents(void) { + + // open script for reading + FILE* inFile = fopen(m_settings->ScriptFilename.c_str(), "rb"); + if ( !inFile ) { + cerr << "FilterTool error: Could not open script: " << m_settings->ScriptFilename << " for reading" << endl; + return false; + } + + // read in entire script contents + char buffer[1024]; + ostringstream docStream(""); + while ( true ) { + + // peek ahead, make sure there is data available + char ch = fgetc(inFile); + ungetc(ch, inFile); + if( feof(inFile) ) break; + + // read next block of data + if ( fgets(buffer, 1024, inFile) == 0 ) { + cerr << "FilterTool error : could not read from script" << endl; + return false; + } + + docStream << buffer; + } + + // close script file + fclose(inFile); + + // import buffer contents to document, return + string document = docStream.str(); + return document; +} + +void FilterTool::FilterToolPrivate::InitProperties(void) { + + // store property names in vector + m_propertyNames.push_back(ALIGNMENTFLAG_PROPERTY); + m_propertyNames.push_back(INSERTSIZE_PROPERTY); + m_propertyNames.push_back(ISDUPLICATE_PROPERTY); + m_propertyNames.push_back(ISFAILEDQC_PROPERTY); + m_propertyNames.push_back(ISFIRSTMATE_PROPERTY); + m_propertyNames.push_back(ISMAPPED_PROPERTY); + m_propertyNames.push_back(ISMATEMAPPED_PROPERTY); + m_propertyNames.push_back(ISMATEREVERSESTRAND_PROPERTY); + m_propertyNames.push_back(ISPAIRED_PROPERTY); + m_propertyNames.push_back(ISPRIMARYALIGNMENT_PROPERTY); + m_propertyNames.push_back(ISPROPERPAIR_PROPERTY); + m_propertyNames.push_back(ISREVERSESTRAND_PROPERTY); + m_propertyNames.push_back(ISSECONDMATE_PROPERTY); + m_propertyNames.push_back(MAPQUALITY_PROPERTY); + m_propertyNames.push_back(MATEPOSITION_PROPERTY); + m_propertyNames.push_back(MATEREFERENCE_PROPERTY); + m_propertyNames.push_back(NAME_PROPERTY); + m_propertyNames.push_back(POSITION_PROPERTY); + m_propertyNames.push_back(QUERYBASES_PROPERTY); + m_propertyNames.push_back(REFERENCE_PROPERTY); + + // add vector contents to FilterEngine + vector::const_iterator propertyNameIter = m_propertyNames.begin(); + vector::const_iterator propertyNameEnd = m_propertyNames.end(); + for ( ; propertyNameIter != propertyNameEnd; ++propertyNameIter ) + FilterEngine::addProperty((*propertyNameIter)); +} + +bool FilterTool::FilterToolPrivate::ParseCommandLine(void) { + + // add a rule set to filter engine + const string CMD = "COMMAND_LINE"; + FilterEngine::addFilter(CMD); + + // map property names to command line args + map propertyTokens; + if ( m_settings->HasAlignmentFlagFilter ) propertyTokens.insert( make_pair(ALIGNMENTFLAG_PROPERTY, m_settings->AlignmentFlagFilter) ); + if ( m_settings->HasInsertSizeFilter ) propertyTokens.insert( make_pair(INSERTSIZE_PROPERTY, m_settings->InsertSizeFilter) ); + if ( m_settings->HasIsDuplicateFilter ) propertyTokens.insert( make_pair(ISDUPLICATE_PROPERTY, m_settings->IsDuplicateFilter) ); + if ( m_settings->HasIsFailedQCFilter ) propertyTokens.insert( make_pair(ISFAILEDQC_PROPERTY, m_settings->IsFailedQCFilter) ); + if ( m_settings->HasIsFirstMateFilter ) propertyTokens.insert( make_pair(ISFIRSTMATE_PROPERTY, m_settings->IsFirstMateFilter) ); + if ( m_settings->HasIsMappedFilter ) propertyTokens.insert( make_pair(ISMAPPED_PROPERTY, m_settings->IsMappedFilter) ); + if ( m_settings->HasIsMateMappedFilter ) propertyTokens.insert( make_pair(ISMATEMAPPED_PROPERTY, m_settings->IsMateMappedFilter) ); + if ( m_settings->HasIsMateReverseStrandFilter ) propertyTokens.insert( make_pair(ISMATEREVERSESTRAND_PROPERTY, m_settings->IsMateReverseStrandFilter) ); + if ( m_settings->HasIsPairedFilter ) propertyTokens.insert( make_pair(ISPAIRED_PROPERTY, m_settings->IsPairedFilter) ); + if ( m_settings->HasIsPrimaryAlignmentFilter ) propertyTokens.insert( make_pair(ISPRIMARYALIGNMENT_PROPERTY, m_settings->IsPrimaryAlignmentFilter) ); + if ( m_settings->HasIsProperPairFilter ) propertyTokens.insert( make_pair(ISPROPERPAIR_PROPERTY, m_settings->IsProperPairFilter) ); + if ( m_settings->HasIsReverseStrandFilter ) propertyTokens.insert( make_pair(ISREVERSESTRAND_PROPERTY, m_settings->IsReverseStrandFilter) ); + if ( m_settings->HasIsSecondMateFilter ) propertyTokens.insert( make_pair(ISSECONDMATE_PROPERTY, m_settings->IsSecondMateFilter) ); + if ( m_settings->HasMapQualityFilter ) propertyTokens.insert( make_pair(MAPQUALITY_PROPERTY, m_settings->MapQualityFilter) ); + if ( m_settings->HasNameFilter ) propertyTokens.insert( make_pair(NAME_PROPERTY, m_settings->NameFilter) ); + if ( m_settings->HasQueryBasesFilter ) propertyTokens.insert( make_pair(QUERYBASES_PROPERTY, m_settings->QueryBasesFilter) ); + + // send add these properties to filter set "COMMAND_LINE" + return AddPropertyTokensToFilter(CMD, propertyTokens); +} + +bool FilterTool::FilterToolPrivate::ParseFilterObject(const string& filterName, const Json::Value& filterObject) { + + // filter object parsing variables + Json::Value null(Json::nullValue); + Json::Value propertyValue; + + // store results + map propertyTokens; + + // iterate over known properties + vector::const_iterator propertyNameIter = m_propertyNames.begin(); + vector::const_iterator propertyNameEnd = m_propertyNames.end(); + for ( ; propertyNameIter != propertyNameEnd; ++propertyNameIter ) { + const string& propertyName = (*propertyNameIter); + + // if property defined in filter, add to token list + propertyValue = filterObject.get(propertyName, null); + if ( propertyValue != null ) + propertyTokens.insert( make_pair(propertyName, propertyValue.asString()) ); + } + + // add this filter to engin + FilterEngine::addFilter(filterName); + // add token list to this filter + return AddPropertyTokensToFilter(filterName, propertyTokens); +} + +bool FilterTool::FilterToolPrivate::ParseScript(void) { + + // read in script contents from file + const string document = GetScriptContents(); + + // set up JsonCPP reader and attempt to parse script + Json::Value root; + Json::Reader reader; + if ( !reader.parse(document, root) ) { + // use built-in error reporting mechanism to alert user what was wrong with the script + cerr << "Failed to parse configuration\n" << reader.getFormatedErrorMessages(); + return false; + } + + // see if root object contains multiple filters + const Json::Value filters = root["filters"]; + if ( !filters.isNull() ) { + + bool success = true; + + // iterate over any filters found + int filterIndex = 0; + Json::Value::const_iterator filtersIter = filters.begin(); + Json::Value::const_iterator filtersEnd = filters.end(); + for ( ; filtersIter != filtersEnd; ++filtersIter, ++filterIndex ) { + Json::Value filter = (*filtersIter); + + // convert filter index to string + stringstream convert; + convert << filterIndex; + const string filterName = convert.str(); + + // create & parse filter + success &= ParseFilterObject(filterName, filter); + + } + return success; + } + + // otherwise, root is the only filter (just contains properties) + // create & parse filter named "ROOT" + else return ParseFilterObject("ROOT", root); +} + + +bool FilterTool::FilterToolPrivate::Run(void) { + // set to default input if none provided if ( !m_settings->HasInputBamFilename ) m_settings->InputFiles.push_back(Options::StandardIn()); - // open files + // initialize defined properties & user-specified filters + // quit if failed + if ( !SetupFilters() ) return 1; + + // open reader without index BamMultiReader reader; - reader.Open(m_settings->InputFiles, false); - - // do filtering + reader.Open(m_settings->InputFiles, false, true); + const string headerText = reader.GetHeaderText(); + m_references = reader.GetReferenceData(); + + // open writer + BamWriter writer; + writer.Open(m_settings->OutputFilename, headerText, m_references); + + // set up error handling + ostringstream errorStream(""); + bool foundError(false); + // if no REGION specified, run filter on entire file contents + if ( !m_settings->HasRegion ) { + BamAlignment al; + while ( reader.GetNextAlignment(al) ) { + // perform main filter check + if ( CheckAlignment(al) ) + writer.SaveAlignment(al); + } + } + + // REGION specified + else { + + // attempt to parse string into BamRegion struct + BamRegion region; + if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) { + + // check if there are index files *.bai/*.bti corresponding to the input files + bool hasDefaultIndex = false; + bool hasBamtoolsIndex = false; + bool hasNoIndex = false; + int defaultIndexCount = 0; + int bamtoolsIndexCount = 0; + for (vector::const_iterator f = m_settings->InputFiles.begin(); f != m_settings->InputFiles.end(); ++f) { + + if ( Utilities::FileExists(*f + ".bai") ) { + hasDefaultIndex = true; + ++defaultIndexCount; + } + + if ( Utilities::FileExists(*f + ".bti") ) { + hasBamtoolsIndex = true; + ++bamtoolsIndexCount; + } + + if ( !hasDefaultIndex && !hasBamtoolsIndex ) { + hasNoIndex = true; + cerr << "*WARNING - could not find index file for " << *f + << ", parsing whole file(s) to get alignment counts for target region" + << " (could be slow)" << endl; + break; + } + } + + // determine if index file types are heterogeneous + bool hasDifferentIndexTypes = false; + if ( defaultIndexCount > 0 && bamtoolsIndexCount > 0 ) { + hasDifferentIndexTypes = true; + cerr << "*WARNING - different index file formats found" + << ", parsing whole file(s) to get alignment counts for target region" + << " (could be slow)" << endl; + } + + // if any input file has no index, or if input files use different index formats + // can't use BamMultiReader to jump directly (**for now**) + if ( hasNoIndex || hasDifferentIndexTypes ) { + + // read through sequentially, but onlt perform filter on reads overlapping REGION + BamAlignment al; + while( reader.GetNextAlignment(al) ) { + if ( (al.RefID >= region.LeftRefID) && ( (al.Position + al.Length) >= region.LeftPosition ) && + (al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) ) + { + // perform main filter check + if ( CheckAlignment(al) ) + writer.SaveAlignment(al); + } + } + } + + // has index file for each input file (and same format) + else { + + // this is kind of a hack...? + BamMultiReader reader; + reader.Open(m_settings->InputFiles, true, true, hasDefaultIndex ); + + if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) { + foundError = true; + errorStream << "Could not set BamReader region to REGION: " << m_settings->Region << endl; + } else { + + // filter only alignments from specified region + BamAlignment al; + while ( reader.GetNextAlignment(al) ) { + // perform main filter check + if ( CheckAlignment(al) ) + writer.SaveAlignment(al); + } + } + } + + } else { + foundError = true; + errorStream << "Could not parse REGION: " << m_settings->Region << endl; + errorStream << "Be sure REGION is in valid format (see README) and that coordinates are valid for selected references" << endl; + } + } + // clean up & exit reader.Close(); + writer.Close(); return 0; -} \ No newline at end of file +} + +bool FilterTool::FilterToolPrivate::SetupFilters(void) { + + // add known properties to FilterEngine + InitProperties(); + + // parse script for filter rules, if given + if ( m_settings->HasScriptFilename ) return ParseScript(); + + // otherwise check command line for filters + else return ParseCommandLine(); +} diff --git a/src/toolkit/bamtools_filter.h b/src/toolkit/bamtools_filter.h index fe8728b..2abb0e7 100644 --- a/src/toolkit/bamtools_filter.h +++ b/src/toolkit/bamtools_filter.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 2 June 2010 +// Last modified: 28 August 2010 // --------------------------------------------------------------------------- // Filters a single BAM file (or filters multiple BAM files and merges) // according to some user-specified criteria. @@ -29,6 +29,9 @@ class FilterTool : public AbstractTool { private: struct FilterSettings; FilterSettings* m_settings; + + struct FilterToolPrivate; + FilterToolPrivate* m_impl; }; } // namespace BamTools diff --git a/src/utils/bamtools_filter_engine.cpp b/src/utils/bamtools_filter_engine.cpp index 0689ebe..3ad9430 100644 --- a/src/utils/bamtools_filter_engine.cpp +++ b/src/utils/bamtools_filter_engine.cpp @@ -8,6 +8,7 @@ // // *************************************************************************** +#include #include "bamtools_filter_engine.h" #include "BamAux.h" using namespace std; @@ -27,7 +28,7 @@ bool PropertyFilterValue::check(const string& query) const { // localize string version of our filter value const string& valueString = Value.get(); - + // string matching based on our filter type switch ( Type ) { case ( PropertyFilterValue::CONTAINS) : return ( query.find(valueString) != string::npos ); @@ -44,22 +45,6 @@ bool PropertyFilterValue::check(const string& query) const { return false; } -// -------------------------------------------------------- -// PropertyFilter implementation -PropertyFilter::PropertyFilter(void) - : Type(PropertyFilter::EXACT) - , LeftChild(0) - , RightChild(0) -{ } - -PropertyFilter::~PropertyFilter(void) { - delete LeftChild; - LeftChild = 0; - - delete RightChild; - RightChild = 0; -} - // --------------------------------------------------------- // FilterEngine implementation @@ -115,87 +100,3 @@ const vector FilterEngine::enabledPropertyNames(void) { if ( (*propIter).IsEnabled ) names.push_back( (*propIter).Name ); return names; } - -// ================================================================ -// DEBUGGING - -void FilterEngine::print(void) { - cout << endl; - printAllProperties(); - printEnabledProperties(); - printFilters(); -} - -void FilterEngine::printAllProperties(void) { - - cout << "=======================================" << endl; - cout << "All Properties: " << endl; - cout << endl; - - const vector propertyNames = allPropertyNames(); - vector::const_iterator nameIter = propertyNames.begin(); - vector::const_iterator nameEnd = propertyNames.end(); - for ( ; nameIter != nameEnd; ++nameIter ) - cout << (*nameIter) << endl; - cout << endl; -} - -void FilterEngine::printEnabledProperties(void) { - - cout << "=======================================" << endl; - cout << "Enabled Properties: " << endl; - cout << endl; - - const vector propertyNames = enabledPropertyNames(); - vector::const_iterator nameIter = propertyNames.begin(); - vector::const_iterator nameEnd = propertyNames.end(); - for ( ; nameIter != nameEnd; ++nameIter ) - cout << (*nameIter) << endl; - cout << endl; -} - -void FilterEngine::printFilters(void) { - - cout << "=======================================" << endl; - cout << "Current Filters: " << endl; - cout << endl; - - // iterate over all filters in FilterEngine - FilterMap::const_iterator filterIter = m_filters.begin(); - FilterMap::const_iterator filterEnd = m_filters.end(); - for ( ; filterIter != filterEnd; ++filterIter ) { - cout << "Filter Name: " << (*filterIter).first << endl; - - // see if filter set has this property enabled - const PropertyFilter& filter = (*filterIter).second; - PropertyMap::const_iterator propIter = filter.Properties.begin(); - PropertyMap::const_iterator propEnd = filter.Properties.end(); - for ( ; propIter != propEnd; ++propIter ) { - - cout << " - " << (*propIter).first << " : "; - const PropertyFilterValue& filterValue = (*propIter).second; - - if ( filterValue.Value.is_type() ) cout << "\t" << boolalpha << filterValue.Value.get(); - else if ( filterValue.Value.is_type() ) cout << "\t" << filterValue.Value.get(); - else if ( filterValue.Value.is_type() ) cout << "\t" << filterValue.Value.get(); - else if ( filterValue.Value.is_type() ) cout << "\t" << filterValue.Value.get(); - else if ( filterValue.Value.is_type() ) cout << "\t" << filterValue.Value.get(); - else if ( filterValue.Value.is_type() ) cout << "\t" << filterValue.Value.get(); - else cout << "** UNKNOWN VALUE TYPE!! **"; - - switch( filterValue.Type ) { - case (PropertyFilterValue::CONTAINS) : cout << " (contains)"; break; - case (PropertyFilterValue::ENDS_WITH) : cout << " (ends_with)"; break; - case (PropertyFilterValue::EXACT) : cout << " (exact)"; break; - case (PropertyFilterValue::GREATER_THAN) : cout << " (greater_than)"; break; - case (PropertyFilterValue::GREATER_THAN_EQUAL) : cout << " (greater_than_equal)"; break; - case (PropertyFilterValue::LESS_THAN) : cout << " (less_than)"; break; - case (PropertyFilterValue::LESS_THAN_EQUAL) : cout << " (less_than_equal)"; break; - case (PropertyFilterValue::NOT) : cout << " (not)"; break; - case (PropertyFilterValue::STARTS_WITH) : cout << " (starts_with)"; break; - default : cout << " : ** UNKNOWN COMPARE TYPE!! **"; - } - cout << endl; - } - } -} diff --git a/src/utils/bamtools_filter_engine.h b/src/utils/bamtools_filter_engine.h index c01ab01..2297aea 100644 --- a/src/utils/bamtools_filter_engine.h +++ b/src/utils/bamtools_filter_engine.h @@ -12,7 +12,6 @@ #define BAMTOOLS_FILTER_ENGINE_H #include -#include #include #include #include @@ -78,6 +77,8 @@ typedef std::map PropertyMap; struct PropertyFilter { + // will be used more later + // if we implement a compound 'rules' system - i.e. "(filter1 AND filter2) OR filter 3" enum FilterCompareType { AND = 0 , EXACT , NOT @@ -87,12 +88,9 @@ struct PropertyFilter { // data members PropertyMap Properties; FilterCompareType Type; - PropertyFilter* LeftChild; - PropertyFilter* RightChild; - - // ctor & dtor - PropertyFilter(void); - ~PropertyFilter(void); + + // ctor + PropertyFilter(void) : Type( PropertyFilter::EXACT ) { } // filter check methods template @@ -156,13 +154,6 @@ class FilterEngine { // returns true if query passes all filters on 'propertyName' template static bool check(const std::string& propertyName, const T& query); - - // debugging - public: - static void print(void); - static void printAllProperties(void); - static void printEnabledProperties(void); - static void printFilters(void); // data members private: @@ -215,48 +206,24 @@ bool PropertyFilterValue::check(const T& query) const { template bool PropertyFilter::check(const std::string& propertyName, const T& query) const { - // if this filter is a 'leaf' filter - if ( (LeftChild == 0 ) && ( RightChild == 0 ) ) { - - // if propertyName found for this filter, - PropertyMap::const_iterator propIter = Properties.find(propertyName); - if ( propIter != Properties.end() ) { - const PropertyFilterValue& filterValue = (*propIter).second; - - // check - switch ( Type ) { - case ( PropertyFilter::EXACT ) : return filterValue.check(query); - case ( PropertyFilter::NOT ) : return !filterValue.check(query); - case ( PropertyFilter::AND ) : - case ( PropertyFilter::OR ) : BAMTOOLS_ASSERT_MESSAGE(false, "Cannot use a binary compare operator on 1 value"); - default : BAMTOOLS_ASSERT_UNREACHABLE; - } - return false; // unreachable - } - - // property unknown to this filter - else return true; - } - - // if this filter is a parent filter (contains valid left & right children) - else if ( LeftChild && RightChild ) { + // if propertyName found for this filter, + PropertyMap::const_iterator propIter = Properties.find(propertyName); + if ( propIter != Properties.end() ) { + const PropertyFilterValue& filterValue = (*propIter).second; - // return result of children using this filter's compare type (AND, OR) - switch ( Type ) { - case ( PropertyFilter::AND ) : return LeftChild->check(propertyName, query) && RightChild->check(propertyName, query); - case ( PropertyFilter::OR ) : return LeftChild->check(propertyName, query) || RightChild->check(propertyName, query); - case ( PropertyFilter::EXACT) : - case ( PropertyFilter::NOT) : BAMTOOLS_ASSERT_MESSAGE(false, "Cannot use a unary compare operator on 2 values"); - default : BAMTOOLS_ASSERT_UNREACHABLE; - } - return false; // unreachable - } - - // otherwise filter contains one child... invalid state - else { - BAMTOOLS_ASSERT_MESSAGE(false, "PropertyFilter needs both children to do a binary compare"); - return false; + // check + switch ( Type ) { + case ( PropertyFilter::EXACT ) : return filterValue.check(query); + case ( PropertyFilter::NOT ) : return !filterValue.check(query); + case ( PropertyFilter::AND ) : + case ( PropertyFilter::OR ) : BAMTOOLS_ASSERT_MESSAGE(false, "Cannot use a binary compare operator on 1 value"); + default : BAMTOOLS_ASSERT_UNREACHABLE; + } + return false; // unreachable } + + // property unknown to this filter + else return true; } template @@ -453,22 +420,11 @@ bool FilterEngine::check(const std::string& propertyName, const T& query) { // check query against this filter const PropertyFilter& filter = (*filterIter).second; - if ( !filter.check(propertyName, query) ) return false; - - -// // see if filter set has this property enabled -// const PropertyFilter& filter = (*filterIter).second; -// PropertyMap::const_iterator propIter = filter.Properties.find(propertyName); -// if ( propIter != filter.Properties.end() ) { -// -// // if so, check query against filter, return false if check fails -// const PropertyFilterValue& propertyFilter = (*propIter).second; -// if ( !propertyFilter.check(query) ) return false; -// } + if ( filter.check(propertyName, query) ) return true; } - // query passes all filters with property enabled - return true; + // query passes none of the filters with current property enabled + return false; } } // namespace BamTools