]> git.donarmstrong.com Git - bamtools.git/commitdiff
Posted implementation of FilterTool.
authorDerek <derekwbarnett@gmail.com>
Mon, 30 Aug 2010 22:53:59 +0000 (18:53 -0400)
committerDerek <derekwbarnett@gmail.com>
Mon, 30 Aug 2010 22:53:59 +0000 (18:53 -0400)
src/toolkit/bamtools_filter.cpp
src/toolkit/bamtools_filter.h
src/utils/bamtools_filter_engine.cpp
src/utils/bamtools_filter_engine.h

index 2249022edfb896d5a289a4a1a4ed3cd6bdba9fee..6941da2d56d7719d2950f7fa8cf7250f9f30e35f 100644 (file)
 // Marth Lab, Department of Biology, Boston College
 // All rights reserved.
 // ---------------------------------------------------------------------------
-// Last modified: 1 June 2010
+// Last modified: 30 August 2010
 // ---------------------------------------------------------------------------
 // Filters a single BAM file (or filters multiple BAM files and merges) 
 // according to some user-specified criteria.
 // ***************************************************************************
 
+// std includes
+#include <cstdio>
 #include <iostream>
+#include <sstream>
 #include <string>
 #include <vector>
 
+// BamTools includes
 #include "bamtools_filter.h"
+#include "bamtools_filter_engine.h"
 #include "bamtools_options.h"
+#include "bamtools_utilities.h"
 #include "BamReader.h"
 #include "BamMultiReader.h"
+#include "BamWriter.h"
+
+//JsonCPP includes
+#include "jsoncpp/json.h"
 
 using namespace std;
 using namespace BamTools; 
+using namespace Json;
+  
+namespace BamTools {
+  
+// -------------------------------  
+// string literal constants  
+
+// property names
+const string ALIGNMENTFLAG_PROPERTY       = "alignmentFlag";
+const string INSERTSIZE_PROPERTY          = "insertSize";
+const string ISDUPLICATE_PROPERTY         = "isDuplicate";
+const string ISFAILEDQC_PROPERTY          = "isFailedQC";
+const string ISFIRSTMATE_PROPERTY         = "isFirstMate";
+const string ISMAPPED_PROPERTY            = "isMapped";
+const string ISMATEMAPPED_PROPERTY        = "isMateMapped";
+const string ISMATEREVERSESTRAND_PROPERTY = "isMateReverseStrand";
+const string ISPAIRED_PROPERTY            = "isPaired";
+const string ISPRIMARYALIGNMENT_PROPERTY  = "isPrimaryAlignment";
+const string ISPROPERPAIR_PROPERTY        = "isProperPair";
+const string ISREVERSESTRAND_PROPERTY     = "isReverseStrand";
+const string ISSECONDMATE_PROPERTY        = "isSecondMate";
+const string MAPQUALITY_PROPERTY          = "mapQuality";
+const string MATEPOSITION_PROPERTY        = "matePosition";
+const string MATEREFERENCE_PROPERTY       = "mateReference";
+const string NAME_PROPERTY                = "name";
+const string POSITION_PROPERTY            = "position";
+const string QUERYBASES_PROPERTY          = "queryBases";
+const string REFERENCE_PROPERTY           = "reference";
+
+// boolalpha
+const string TRUE_STR  = "true";
+const string FALSE_STR = "false";
+    
+} // namespace BamTools
+  
+// ---------------------------------------------
+// FilterToolPrivate declaration
+
+class FilterTool::FilterToolPrivate {
+      
+    // ctor & dtor
+    public:
+        FilterToolPrivate(FilterTool::FilterSettings* settings);
+        ~FilterToolPrivate(void);  
+        
+    // 'public' interface
+    public:
+        bool Run(void);
+        
+    // internal methods
+    private:
+        bool AddPropertyTokensToFilter(const string& filterName, const map<string, string>& propertyTokens);
+        bool CheckAlignment(const BamAlignment& al);
+        const string GetScriptContents(void);
+        void InitProperties(void);
+        bool ParseCommandLine(void);
+        bool ParseFilterObject(const string& filterName, const Json::Value& filterObject);
+        bool ParseScript(void);
+        bool SetupFilters(void);
+        
+    // data members
+    private:
+        vector<string> m_propertyNames;
+        FilterTool::FilterSettings* m_settings;
+        RefVector m_references;
+};
   
 // ---------------------------------------------
 // FilterSettings implementation
 
 struct FilterTool::FilterSettings {
 
+    // ----------------------------------
+    // IO opts
+  
     // flags
     bool HasInputBamFilename;
     bool HasOutputBamFilename;
-
+    bool HasRegion;
+    bool HasScriptFilename;
+    
     // filenames
     vector<string> InputFiles;
     string OutputFilename;
+    string Region;
+    string ScriptFilename;
+    
+    // -----------------------------------
+    // General filter opts
+    
+    // flags
+    bool HasAlignmentFlagFilter;
+    bool HasInsertSizeFilter;
+    bool HasMapQualityFilter;
+    bool HasNameFilter;
+    bool HasQueryBasesFilter;
+    
+//     bool HasTagFilters;
+
+    // filters
+    string AlignmentFlagFilter;
+    string InsertSizeFilter;
+    string NameFilter;
+    string MapQualityFilter;
+    string QueryBasesFilter;
+    
+//     vector<string> TagFilters;
+
+    // -----------------------------------
+    // AlignmentFlag filter opts
+    
+    // flags
+    bool HasIsDuplicateFilter;
+    bool HasIsFailedQCFilter;
+    bool HasIsFirstMateFilter;
+    bool HasIsMappedFilter;
+    bool HasIsMateMappedFilter;
+    bool HasIsMateReverseStrandFilter;
+    bool HasIsPairedFilter;
+    bool HasIsPrimaryAlignmentFilter;
+    bool HasIsProperPairFilter;
+    bool HasIsReverseStrandFilter;
+    bool HasIsSecondMateFilter;
+    
+    // filters
+    string IsDuplicateFilter;
+    string IsFailedQCFilter;
+    string IsFirstMateFilter;
+    string IsMappedFilter;
+    string IsMateMappedFilter;
+    string IsMateReverseStrandFilter;
+    string IsPairedFilter;
+    string IsPrimaryAlignmentFilter;
+    string IsProperPairFilter;
+    string IsReverseStrandFilter;
+    string IsSecondMateFilter;
     
+    // ---------------------------------
     // constructor
+    
     FilterSettings(void)
         : HasInputBamFilename(false)
         , HasOutputBamFilename(false)
+        , HasRegion(false)
+        , HasScriptFilename(false)
         , OutputFilename(Options::StandardOut())
+        , HasAlignmentFlagFilter(false)
+        , HasInsertSizeFilter(false)
+        , HasMapQualityFilter(false)
+        , HasNameFilter(false)
+        , HasQueryBasesFilter(false)
+//         , HasTagFilters(false)
+        , HasIsDuplicateFilter(false)
+        , HasIsFailedQCFilter(false)
+        , HasIsFirstMateFilter(false)
+        , HasIsMappedFilter(false)
+        , HasIsMateMappedFilter(false)
+        , HasIsMateReverseStrandFilter(false)
+        , HasIsPairedFilter(false)
+        , HasIsPrimaryAlignmentFilter(false)
+        , HasIsProperPairFilter(false)
+        , HasIsReverseStrandFilter(false)
+        , HasIsSecondMateFilter(false)
+        , IsDuplicateFilter(TRUE_STR)
+        , IsFailedQCFilter(TRUE_STR)
+        , IsFirstMateFilter(TRUE_STR)
+        , IsMappedFilter(TRUE_STR)
+        , IsMateMappedFilter(TRUE_STR)
+        , IsMateReverseStrandFilter(TRUE_STR)
+        , IsPairedFilter(TRUE_STR)
+        , IsPrimaryAlignmentFilter(TRUE_STR)
+        , IsProperPairFilter(TRUE_STR)
+        , IsReverseStrandFilter(TRUE_STR)
+        , IsSecondMateFilter(TRUE_STR)
     { }
 };  
 
@@ -48,19 +213,46 @@ struct FilterTool::FilterSettings {
 FilterTool::FilterTool(void)
     : AbstractTool()
     , m_settings(new FilterSettings)
+    , m_impl(0)
 {
     // set program details
-    Options::SetProgramInfo("bamtools filter", "filters BAM file(s)", "-in <filename> [-in <filename> ... ] -out <filename> ");
+    Options::SetProgramInfo("bamtools filter", "filters BAM file(s)", "-in <filename> [-in <filename> ... ] -out <filename> -region <REGION> [ [-script <filename] | [filterOptions] ]");
     
-    // set up options 
     OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
-    Options::AddValueOption("-in",  "BAM filename", "the input BAM file(s)", "", m_settings->HasInputBamFilename,  m_settings->InputFiles,     IO_Opts, Options::StandardIn());
-    Options::AddValueOption("-out", "BAM filename", "the output BAM file",   "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
+    Options::AddValueOption("-in",     "BAM filename", "the input BAM file(s)", "", m_settings->HasInputBamFilename,  m_settings->InputFiles,     IO_Opts, Options::StandardIn());
+    Options::AddValueOption("-out",    "BAM filename", "the output BAM file",   "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
+    Options::AddValueOption("-region", "REGION",       "only read data from this genomic region (see README for more details)", "", m_settings->HasRegion, m_settings->Region, IO_Opts);
+    Options::AddValueOption("-script", "filename",     "the filter script file (see README for more details)", "", m_settings->HasScriptFilename, m_settings->ScriptFilename, IO_Opts);
+    
+    OptionGroup* FilterOpts = Options::CreateOptionGroup("General Filters");
+    Options::AddValueOption("-alignmentFlag", "int",     "keep reads with this *exact* alignment flag (for more detailed queries, see below)", "", m_settings->HasAlignmentFlagFilter, m_settings->AlignmentFlagFilter, FilterOpts);
+    Options::AddValueOption("-insertSize",    "int",     "keep reads with insert size that mathces pattern",             "", m_settings->HasInsertSizeFilter, m_settings->InsertSizeFilter, FilterOpts);
+    Options::AddValueOption("-mapQuality",    "[0-255]", "keep reads with map quality that matches pattern",             "", m_settings->HasMapQualityFilter, m_settings->MapQualityFilter, FilterOpts);
+    Options::AddValueOption("-name",          "string",  "keep reads with name that matches pattern",                    "", m_settings->HasNameFilter,       m_settings->NameFilter,       FilterOpts);
+    Options::AddValueOption("-queryBases",    "string",  "keep reads with motif that mathces pattern",                   "", m_settings->HasQueryBasesFilter, m_settings->QueryBasesFilter, FilterOpts);
+    
+//     Options::AddValueOption("-tag", "TAG:VALUE", "keep reads with this key=>value pair. If multiple tags are given, reads must match all", "", m_settings->HasTagFilters, m_settings->TagFilters, FilterOpts);
+    
+    OptionGroup* AlignmentFlagOpts = Options::CreateOptionGroup("Alignment Flag Filters");
+    Options::AddValueOption("-isDuplicate",         "true/false", "keep only alignments that are marked as duplicate?", "", m_settings->HasIsDuplicateFilter,         m_settings->IsDuplicateFilter,         AlignmentFlagOpts, TRUE_STR);
+    Options::AddValueOption("-isFailedQC",          "true/false", "keep only alignments that failed QC?",               "", m_settings->HasIsFailedQCFilter,          m_settings->IsFailedQCFilter,          AlignmentFlagOpts, TRUE_STR);
+    Options::AddValueOption("-isFirstMate",         "true/false", "keep only alignments marked as first mate?",         "", m_settings->HasIsFirstMateFilter,         m_settings->IsFirstMateFilter,         AlignmentFlagOpts, TRUE_STR);
+    Options::AddValueOption("-isMapped",            "true/false", "keep only alignments that were mapped?",             "", m_settings->HasIsMappedFilter,            m_settings->IsMappedFilter,            AlignmentFlagOpts, TRUE_STR);
+    Options::AddValueOption("-isMateMapped",        "true/false", "keep only alignments with mates that mapped",        "", m_settings->HasIsMateMappedFilter,        m_settings->IsMateMappedFilter,        AlignmentFlagOpts, TRUE_STR);
+    Options::AddValueOption("-isMateReverseStrand", "true/false", "keep only alignments with mate on reverese strand?", "", m_settings->HasIsMateReverseStrandFilter, m_settings->IsMateReverseStrandFilter, AlignmentFlagOpts, TRUE_STR);
+    Options::AddValueOption("-isPaired",            "true/false", "keep only alignments that were sequenced as paired?","", m_settings->HasIsPairedFilter,            m_settings->IsPairedFilter,            AlignmentFlagOpts, TRUE_STR);
+    Options::AddValueOption("-isPrimaryAlignment",  "true/false", "keep only alignments marked as primary?",            "", m_settings->HasIsPrimaryAlignmentFilter,  m_settings->IsPrimaryAlignmentFilter,  AlignmentFlagOpts, TRUE_STR);
+    Options::AddValueOption("-isProperPair",        "true/false", "keep only alignments that passed PE resolution?",    "", m_settings->HasIsProperPairFilter,        m_settings->IsProperPairFilter,        AlignmentFlagOpts, TRUE_STR);
+    Options::AddValueOption("-isReverseStrand",     "true/false", "keep only alignments on reverse strand?",            "", m_settings->HasIsReverseStrandFilter,     m_settings->IsReverseStrandFilter,     AlignmentFlagOpts, TRUE_STR);
+    Options::AddValueOption("-isSecondMate",        "true/false", "keep only alignments marked as second mate?",        "", m_settings->HasIsSecondMateFilter,        m_settings->IsSecondMateFilter,        AlignmentFlagOpts, TRUE_STR);
 }
 
 FilterTool::~FilterTool(void) {
     delete m_settings;
     m_settings = 0;
+    
+    delete m_impl;
+    m_impl = 0;
 }
 
 int FilterTool::Help(void) {
@@ -72,18 +264,468 @@ int FilterTool::Run(int argc, char* argv[]) {
   
     // parse command line arguments
     Options::Parse(argc, argv, 1);
+    
+    // run internal FilterTool implementation, return success/fail
+    m_impl = new FilterToolPrivate(m_settings);
+    
+    if ( m_impl->Run() ) return 0;
+    else return 1;
+}
+// ---------------------------------------------
+// FilterToolPrivate implementation
+  
+// constructor  
+FilterTool::FilterToolPrivate::FilterToolPrivate(FilterTool::FilterSettings* settings)
+    : m_settings(settings)
+{ }  
+  
+// destructor
+FilterTool::FilterToolPrivate::~FilterToolPrivate(void) { }
+
+bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filterName, const map<string, string>& propertyTokens) {
+
+  
+    // dummy temp values for token parsing
+    bool boolValue;
+    int32_t int32Value;
+    uint16_t uint16Value;
+    uint32_t uint32Value;
+    string stringValue;
+    PropertyFilterValue::ValueCompareType type;
+  
+    // iterate over property token map
+    map<string, string>::const_iterator mapIter = propertyTokens.begin();
+    map<string, string>::const_iterator mapEnd  = propertyTokens.end();
+    for ( ; mapIter != mapEnd; ++mapIter ) {
+      
+        const string& propertyName = (*mapIter).first;
+        const string& token        = (*mapIter).second;
+        
+        // ------------------------------
+        // convert token to value & compare type 
+        // then add to filter engine
+        
+        // bool conversion
+        if ( propertyName == ISDUPLICATE_PROPERTY ||
+             propertyName == ISFAILEDQC_PROPERTY ||
+             propertyName == ISFIRSTMATE_PROPERTY ||
+             propertyName == ISMAPPED_PROPERTY ||
+             propertyName == ISMATEMAPPED_PROPERTY ||
+             propertyName == ISMATEREVERSESTRAND_PROPERTY ||
+             propertyName == ISPAIRED_PROPERTY ||
+             propertyName == ISPRIMARYALIGNMENT_PROPERTY ||
+             propertyName == ISPROPERPAIR_PROPERTY ||
+             propertyName == ISREVERSESTRAND_PROPERTY ||
+             propertyName == ISSECONDMATE_PROPERTY
+           ) 
+        {
+            FilterEngine::parseToken(token, boolValue, type);
+            FilterEngine::setProperty(filterName, propertyName, boolValue, type);
+        }
+        
+        // int32_t conversion
+        else if ( propertyName == INSERTSIZE_PROPERTY ||
+                  propertyName == MATEPOSITION_PROPERTY ||
+                  propertyName == POSITION_PROPERTY 
+                ) 
+        {
+            FilterEngine::parseToken(token, int32Value, type);
+            FilterEngine::setProperty(filterName, propertyName, int32Value, type);
+        }
+        
+        // uint16_t conversion
+        else if ( propertyName == MAPQUALITY_PROPERTY )
+        {
+            FilterEngine::parseToken(token, uint16Value, type);
+            FilterEngine::setProperty(filterName, propertyName, uint16Value, type);
+        }
+        
+        // uint32_t conversion
+        else if ( propertyName == ALIGNMENTFLAG_PROPERTY )
+        {
+            FilterEngine::parseToken(token, uint32Value, type);
+            FilterEngine::setProperty(filterName, propertyName, uint32Value, type);
+        }
+        
+        // string conversion
+        else if ( propertyName == MATEREFERENCE_PROPERTY ||
+                  propertyName == NAME_PROPERTY ||
+                  propertyName == QUERYBASES_PROPERTY ||
+                  propertyName == REFERENCE_PROPERTY
+                ) 
+        {
+            FilterEngine::parseToken(token, stringValue, type);
+            FilterEngine::setProperty(filterName, propertyName, stringValue, type);
+        }
+      
+        // else unknown property 
+        else {
+            cerr << "Unknown property: " << propertyName << "!" << endl;
+            return false;
+        }
+    }
+    return true;
+}
+
+bool FilterTool::FilterToolPrivate::CheckAlignment(const BamAlignment& al) {
+
+    bool keepAlignment = true;
+  
+    // only consider properties that are actually enabled
+    // iterate over these enabled properties
+    const vector<string> enabledProperties = FilterEngine::enabledPropertyNames();
+    vector<string>::const_iterator propIter = enabledProperties.begin();
+    vector<string>::const_iterator propEnd  = enabledProperties.end();
+    for ( ; propIter != propEnd; ++propIter ) {
+     
+        // check alignment data field depending on propertyName
+        const string& propertyName = (*propIter);
+        if      ( propertyName == ALIGNMENTFLAG_PROPERTY )        keepAlignment &= FilterEngine::check(ALIGNMENTFLAG_PROPERTY,       al.AlignmentFlag);
+        else if ( propertyName == INSERTSIZE_PROPERTY )           keepAlignment &= FilterEngine::check(INSERTSIZE_PROPERTY,          al.InsertSize);
+        else if ( propertyName == ISDUPLICATE_PROPERTY )          keepAlignment &= FilterEngine::check(ISDUPLICATE_PROPERTY,         al.IsDuplicate());
+        else if ( propertyName == ISFAILEDQC_PROPERTY )           keepAlignment &= FilterEngine::check(ISFAILEDQC_PROPERTY,          al.IsFailedQC());
+        else if ( propertyName == ISFIRSTMATE_PROPERTY )          keepAlignment &= FilterEngine::check(ISFIRSTMATE_PROPERTY,         al.IsFirstMate());
+        else if ( propertyName == ISMAPPED_PROPERTY )             keepAlignment &= FilterEngine::check(ISMAPPED_PROPERTY,            al.IsMapped());
+        else if ( propertyName == ISMATEMAPPED_PROPERTY )         keepAlignment &= FilterEngine::check(ISMATEMAPPED_PROPERTY,        al.IsMateMapped());
+        else if ( propertyName == ISMATEREVERSESTRAND_PROPERTY )  keepAlignment &= FilterEngine::check(ISMATEREVERSESTRAND_PROPERTY, al.IsMateReverseStrand());
+        else if ( propertyName == ISPAIRED_PROPERTY )             keepAlignment &= FilterEngine::check(ISPAIRED_PROPERTY,            al.IsPaired());
+        else if ( propertyName == ISPRIMARYALIGNMENT_PROPERTY )   keepAlignment &= FilterEngine::check(ISPRIMARYALIGNMENT_PROPERTY,  al.IsPrimaryAlignment());
+        else if ( propertyName == ISPROPERPAIR_PROPERTY )         keepAlignment &= FilterEngine::check(ISPROPERPAIR_PROPERTY,        al.IsProperPair());
+        else if ( propertyName == ISREVERSESTRAND_PROPERTY )      keepAlignment &= FilterEngine::check(ISREVERSESTRAND_PROPERTY,     al.IsReverseStrand());
+        else if ( propertyName == ISSECONDMATE_PROPERTY )         keepAlignment &= FilterEngine::check(ISSECONDMATE_PROPERTY,        al.IsSecondMate());
+        else if ( propertyName == MAPQUALITY_PROPERTY )           keepAlignment &= FilterEngine::check(MAPQUALITY_PROPERTY,          al.MapQuality);
+        else if ( propertyName == MATEPOSITION_PROPERTY )         keepAlignment &= ( al.IsPaired() && al.IsMateMapped() && FilterEngine::check(MATEPOSITION_PROPERTY, al.MateRefID) );
+        else if ( propertyName == MATEREFERENCE_PROPERTY ) {
+            if ( !al.IsPaired() || !al.IsMateMapped() ) return false;
+            BAMTOOLS_ASSERT_MESSAGE( (al.MateRefID>=0 && (al.MateRefID<(int)m_references.size())), "Invalid MateRefID");
+            const string& refName = m_references.at(al.MateRefID).RefName;
+            keepAlignment &= FilterEngine::check(MATEREFERENCE_PROPERTY, refName);
+        }
+        else if ( propertyName == NAME_PROPERTY )                 keepAlignment &= FilterEngine::check(NAME_PROPERTY,                al.Name);
+        else if ( propertyName == POSITION_PROPERTY )             keepAlignment &= FilterEngine::check(POSITION_PROPERTY,            al.Position);
+        else if ( propertyName == QUERYBASES_PROPERTY )           keepAlignment &= FilterEngine::check(QUERYBASES_PROPERTY,          al.QueryBases);
+        else if ( propertyName == REFERENCE_PROPERTY ) {
+            BAMTOOLS_ASSERT_MESSAGE( (al.RefID>=0 && (al.RefID<(int)m_references.size())), "Invalid RefID");
+            const string& refName = m_references.at(al.RefID).RefName;
+            keepAlignment &= FilterEngine::check(REFERENCE_PROPERTY, refName);
+        }
+        else BAMTOOLS_ASSERT_MESSAGE( false, "Unknown property");
+        
+        // if alignment fails at ANY point, just quit and return false
+        if ( !keepAlignment ) return false;
+    }
+  
+    // return success (should still be true at this point)
+    return keepAlignment;
+}
+
+const string FilterTool::FilterToolPrivate::GetScriptContents(void) {
+  
+    // open script for reading
+    FILE* inFile = fopen(m_settings->ScriptFilename.c_str(), "rb");
+    if ( !inFile ) {
+        cerr << "FilterTool error: Could not open script: " << m_settings->ScriptFilename << " for reading" << endl;
+        return false;
+    }
+    
+    // read in entire script contents  
+    char buffer[1024];
+    ostringstream docStream("");
+    while ( true ) {
+        
+        // peek ahead, make sure there is data available
+        char ch = fgetc(inFile);
+        ungetc(ch, inFile);
+        if( feof(inFile) ) break;       
+        
+        // read next block of data
+        if ( fgets(buffer, 1024, inFile) == 0 ) {
+            cerr << "FilterTool error : could not read from script" << endl;
+            return false;
+        }
+        
+        docStream << buffer;
+    }
+    
+    // close script file
+    fclose(inFile);
+    
+    // import buffer contents to document, return
+    string document = docStream.str();
+    return document;
+}
+
+void FilterTool::FilterToolPrivate::InitProperties(void) {
+  
+    // store property names in vector 
+    m_propertyNames.push_back(ALIGNMENTFLAG_PROPERTY);
+    m_propertyNames.push_back(INSERTSIZE_PROPERTY);
+    m_propertyNames.push_back(ISDUPLICATE_PROPERTY);
+    m_propertyNames.push_back(ISFAILEDQC_PROPERTY);
+    m_propertyNames.push_back(ISFIRSTMATE_PROPERTY);
+    m_propertyNames.push_back(ISMAPPED_PROPERTY);
+    m_propertyNames.push_back(ISMATEMAPPED_PROPERTY);
+    m_propertyNames.push_back(ISMATEREVERSESTRAND_PROPERTY);
+    m_propertyNames.push_back(ISPAIRED_PROPERTY);
+    m_propertyNames.push_back(ISPRIMARYALIGNMENT_PROPERTY);
+    m_propertyNames.push_back(ISPROPERPAIR_PROPERTY);
+    m_propertyNames.push_back(ISREVERSESTRAND_PROPERTY);
+    m_propertyNames.push_back(ISSECONDMATE_PROPERTY);
+    m_propertyNames.push_back(MAPQUALITY_PROPERTY);
+    m_propertyNames.push_back(MATEPOSITION_PROPERTY);
+    m_propertyNames.push_back(MATEREFERENCE_PROPERTY);
+    m_propertyNames.push_back(NAME_PROPERTY);
+    m_propertyNames.push_back(POSITION_PROPERTY);
+    m_propertyNames.push_back(QUERYBASES_PROPERTY);
+    m_propertyNames.push_back(REFERENCE_PROPERTY);
+    
+    // add vector contents to FilterEngine
+    vector<string>::const_iterator propertyNameIter = m_propertyNames.begin();
+    vector<string>::const_iterator propertyNameEnd  = m_propertyNames.end();
+    for ( ; propertyNameIter != propertyNameEnd; ++propertyNameIter )
+        FilterEngine::addProperty((*propertyNameIter));
+}
+
+bool FilterTool::FilterToolPrivate::ParseCommandLine(void) {
+  
+    // add a rule set to filter engine
+    const string CMD = "COMMAND_LINE";
+    FilterEngine::addFilter(CMD);
+
+    // map property names to command line args
+    map<string, string> propertyTokens;
+    if ( m_settings->HasAlignmentFlagFilter )       propertyTokens.insert( make_pair(ALIGNMENTFLAG_PROPERTY,       m_settings->AlignmentFlagFilter) );
+    if ( m_settings->HasInsertSizeFilter )          propertyTokens.insert( make_pair(INSERTSIZE_PROPERTY,          m_settings->InsertSizeFilter) );
+    if ( m_settings->HasIsDuplicateFilter )         propertyTokens.insert( make_pair(ISDUPLICATE_PROPERTY,         m_settings->IsDuplicateFilter) );
+    if ( m_settings->HasIsFailedQCFilter )          propertyTokens.insert( make_pair(ISFAILEDQC_PROPERTY,          m_settings->IsFailedQCFilter) );
+    if ( m_settings->HasIsFirstMateFilter )         propertyTokens.insert( make_pair(ISFIRSTMATE_PROPERTY,         m_settings->IsFirstMateFilter) );
+    if ( m_settings->HasIsMappedFilter )            propertyTokens.insert( make_pair(ISMAPPED_PROPERTY,            m_settings->IsMappedFilter) );
+    if ( m_settings->HasIsMateMappedFilter )        propertyTokens.insert( make_pair(ISMATEMAPPED_PROPERTY,        m_settings->IsMateMappedFilter) );
+    if ( m_settings->HasIsMateReverseStrandFilter ) propertyTokens.insert( make_pair(ISMATEREVERSESTRAND_PROPERTY, m_settings->IsMateReverseStrandFilter) );
+    if ( m_settings->HasIsPairedFilter )            propertyTokens.insert( make_pair(ISPAIRED_PROPERTY,            m_settings->IsPairedFilter) );
+    if ( m_settings->HasIsPrimaryAlignmentFilter )  propertyTokens.insert( make_pair(ISPRIMARYALIGNMENT_PROPERTY,  m_settings->IsPrimaryAlignmentFilter) );
+    if ( m_settings->HasIsProperPairFilter )        propertyTokens.insert( make_pair(ISPROPERPAIR_PROPERTY,        m_settings->IsProperPairFilter) );
+    if ( m_settings->HasIsReverseStrandFilter )     propertyTokens.insert( make_pair(ISREVERSESTRAND_PROPERTY,     m_settings->IsReverseStrandFilter) );
+    if ( m_settings->HasIsSecondMateFilter )        propertyTokens.insert( make_pair(ISSECONDMATE_PROPERTY,        m_settings->IsSecondMateFilter) );
+    if ( m_settings->HasMapQualityFilter )          propertyTokens.insert( make_pair(MAPQUALITY_PROPERTY,          m_settings->MapQualityFilter) );
+    if ( m_settings->HasNameFilter )                propertyTokens.insert( make_pair(NAME_PROPERTY,                m_settings->NameFilter) );
+    if ( m_settings->HasQueryBasesFilter )          propertyTokens.insert( make_pair(QUERYBASES_PROPERTY,          m_settings->QueryBasesFilter) );
+
+    // send add these properties to filter set "COMMAND_LINE"
+    return AddPropertyTokensToFilter(CMD, propertyTokens);
+}
+
+bool FilterTool::FilterToolPrivate::ParseFilterObject(const string& filterName, const Json::Value& filterObject) {
+  
+    // filter object parsing variables
+    Json::Value null(Json::nullValue);
+    Json::Value propertyValue;
+    
+    // store results
+    map<string, string> propertyTokens;
+    
+    // iterate over known properties
+    vector<string>::const_iterator propertyNameIter = m_propertyNames.begin();
+    vector<string>::const_iterator propertyNameEnd  = m_propertyNames.end();
+    for ( ; propertyNameIter != propertyNameEnd; ++propertyNameIter ) {
+        const string& propertyName = (*propertyNameIter);
+        
+        // if property defined in filter, add to token list
+        propertyValue = filterObject.get(propertyName, null);
+        if ( propertyValue != null ) 
+            propertyTokens.insert( make_pair(propertyName, propertyValue.asString()) );
+    }
+  
+    // add this filter to engin
+    FilterEngine::addFilter(filterName);
   
+    // add token list to this filter
+    return AddPropertyTokensToFilter(filterName, propertyTokens);
+}
+
+bool FilterTool::FilterToolPrivate::ParseScript(void) {
+  
+    // read in script contents from file
+    const string document = GetScriptContents();
+    
+    // set up JsonCPP reader and attempt to parse script
+    Json::Value root;
+    Json::Reader reader;
+    if ( !reader.parse(document, root) ) {
+        // use built-in error reporting mechanism to alert user what was wrong with the script
+        cerr  << "Failed to parse configuration\n" << reader.getFormatedErrorMessages();
+        return false;     
+    }
+
+    // see if root object contains multiple filters
+    const Json::Value filters = root["filters"];
+    if ( !filters.isNull() ) {
+      
+        bool success = true;
+      
+        // iterate over any filters found
+        int filterIndex = 0;
+        Json::Value::const_iterator filtersIter = filters.begin();
+        Json::Value::const_iterator filtersEnd  = filters.end();
+        for ( ; filtersIter != filtersEnd; ++filtersIter, ++filterIndex ) {
+            Json::Value filter = (*filtersIter);
+            
+            // convert filter index to string
+            stringstream convert;
+            convert << filterIndex;
+            const string filterName = convert.str();
+            
+            // create & parse filter 
+            success &= ParseFilterObject(filterName, filter);
+            
+        }
+        return success;
+    } 
+    
+    // otherwise, root is the only filter (just contains properties)
+    // create & parse filter named "ROOT"
+    else return ParseFilterObject("ROOT", root);
+}
+
+
+bool FilterTool::FilterToolPrivate::Run(void) {
+    
     // set to default input if none provided
     if ( !m_settings->HasInputBamFilename ) 
         m_settings->InputFiles.push_back(Options::StandardIn());
     
-    // open files
+    // initialize defined properties & user-specified filters
+    // quit if failed
+    if ( !SetupFilters() ) return 1;
+
+    // open reader without index
     BamMultiReader reader;
-    reader.Open(m_settings->InputFiles, false);
-        
-    // do filtering
+    reader.Open(m_settings->InputFiles, false, true);
+    const string headerText = reader.GetHeaderText();
+    m_references = reader.GetReferenceData();
+    
+    // open writer
+    BamWriter writer;
+    writer.Open(m_settings->OutputFilename, headerText, m_references);
+    
+    // set up error handling
+    ostringstream errorStream("");
+    bool foundError(false);
     
+    // if no REGION specified, run filter on entire file contents
+    if ( !m_settings->HasRegion ) {
+        BamAlignment al;
+        while ( reader.GetNextAlignment(al) ) {
+            // perform main filter check
+            if ( CheckAlignment(al) ) 
+                writer.SaveAlignment(al);
+        }
+    }
+    
+    // REGION specified
+    else {
+      
+        // attempt to parse string into BamRegion struct
+        BamRegion region;
+        if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
+
+            // check if there are index files *.bai/*.bti corresponding to the input files
+            bool hasDefaultIndex   = false;
+            bool hasBamtoolsIndex  = false;
+            bool hasNoIndex        = false;
+            int defaultIndexCount   = 0;
+            int bamtoolsIndexCount = 0;
+            for (vector<string>::const_iterator f = m_settings->InputFiles.begin(); f != m_settings->InputFiles.end(); ++f) {
+              
+                if ( Utilities::FileExists(*f + ".bai") ) {
+                    hasDefaultIndex = true;
+                    ++defaultIndexCount;
+                }       
+                
+                if ( Utilities::FileExists(*f + ".bti") ) {
+                    hasBamtoolsIndex = true;
+                    ++bamtoolsIndexCount;
+                }
+                  
+                if ( !hasDefaultIndex && !hasBamtoolsIndex ) {
+                    hasNoIndex = true;
+                    cerr << "*WARNING - could not find index file for " << *f  
+                         << ", parsing whole file(s) to get alignment counts for target region" 
+                         << " (could be slow)" << endl;
+                    break;
+                }
+            }
+            
+            // determine if index file types are heterogeneous
+            bool hasDifferentIndexTypes = false;
+            if ( defaultIndexCount > 0 && bamtoolsIndexCount > 0 ) {
+                hasDifferentIndexTypes = true;
+                cerr << "*WARNING - different index file formats found"  
+                         << ", parsing whole file(s) to get alignment counts for target region" 
+                         << " (could be slow)" << endl;
+            }
+            
+            // if any input file has no index, or if input files use different index formats
+            // can't use BamMultiReader to jump directly (**for now**)
+            if ( hasNoIndex || hasDifferentIndexTypes ) {
+                
+                // read through sequentially, but onlt perform filter on reads overlapping REGION
+                BamAlignment al;
+                while( reader.GetNextAlignment(al) ) {
+                    if ( (al.RefID >= region.LeftRefID)  && ( (al.Position + al.Length) >= region.LeftPosition ) &&
+                         (al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) ) 
+                    {
+                        // perform main filter check
+                        if ( CheckAlignment(al) ) 
+                            writer.SaveAlignment(al);
+                    }
+                }
+            }
+            
+            // has index file for each input file (and same format)
+            else {
+              
+                // this is kind of a hack...?
+                BamMultiReader reader;
+                reader.Open(m_settings->InputFiles, true, true, hasDefaultIndex );
+              
+                if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) {
+                   foundError = true;
+                   errorStream << "Could not set BamReader region to REGION: " << m_settings->Region << endl;
+                } else {
+                  
+                    // filter only alignments from specified region
+                    BamAlignment al;
+                    while ( reader.GetNextAlignment(al) ) {
+                        // perform main filter check
+                        if ( CheckAlignment(al) ) 
+                            writer.SaveAlignment(al);
+                    }
+                }
+            }
+            
+        } else {
+            foundError = true;
+            errorStream << "Could not parse REGION: " << m_settings->Region << endl;
+            errorStream << "Be sure REGION is in valid format (see README) and that coordinates are valid for selected references" << endl;
+        }
+    }
+
     // clean up & exit
     reader.Close();
+    writer.Close();
     return 0;
-}
\ No newline at end of file
+}
+
+bool FilterTool::FilterToolPrivate::SetupFilters(void) {
+  
+    // add known properties to FilterEngine
+    InitProperties();
+    
+    // parse script for filter rules, if given
+    if ( m_settings->HasScriptFilename ) return ParseScript();
+    
+    // otherwise check command line for filters
+    else return ParseCommandLine();
+}
index fe8728b04aeda2862d04854986d6801837be24c4..2abb0e7b5735137cc0e621af09003cfe86436e18 100644 (file)
@@ -3,7 +3,7 @@
 // Marth Lab, Department of Biology, Boston College
 // All rights reserved.
 // ---------------------------------------------------------------------------
-// Last modified: 2 June 2010
+// Last modified: 28 August 2010
 // ---------------------------------------------------------------------------
 // Filters a single BAM file (or filters multiple BAM files and merges) 
 // according to some user-specified criteria.
@@ -29,6 +29,9 @@ class FilterTool : public AbstractTool {
     private:
         struct FilterSettings;
         FilterSettings* m_settings;
+        
+        struct FilterToolPrivate;
+        FilterToolPrivate* m_impl;
 };
   
 } // namespace BamTools
index 0689ebe684685c0dfef6c74f21f673ccba8ca8d8..3ad943082a0175efc4a2dc1560288b8be32ff95d 100644 (file)
@@ -8,6 +8,7 @@
 // 
 // ***************************************************************************
 
+#include <iostream>
 #include "bamtools_filter_engine.h"
 #include "BamAux.h"
 using namespace std;
@@ -27,7 +28,7 @@ bool PropertyFilterValue::check(const string& query) const {
   
     // localize string version of our filter value
     const string& valueString = Value.get<string>();
-
+    
     // string matching based on our filter type
     switch ( Type ) {
         case ( PropertyFilterValue::CONTAINS)           : return ( query.find(valueString) != string::npos );
@@ -44,22 +45,6 @@ bool PropertyFilterValue::check(const string& query) const {
     return false;
 }
 
-// --------------------------------------------------------
-// PropertyFilter implementation
-PropertyFilter::PropertyFilter(void)
-    : Type(PropertyFilter::EXACT)
-    , LeftChild(0)
-    , RightChild(0)
-{ }
-
-PropertyFilter::~PropertyFilter(void) {
-    delete LeftChild;
-    LeftChild = 0;
-  
-    delete RightChild;
-    RightChild = 0;
-}
-
 // ---------------------------------------------------------
 // FilterEngine implementation
 
@@ -115,87 +100,3 @@ const vector<string> FilterEngine::enabledPropertyNames(void) {
         if ( (*propIter).IsEnabled ) names.push_back( (*propIter).Name );    
     return names;
 }
-
-// ================================================================
-// DEBUGGING
-
-void FilterEngine::print(void) {
-    cout << endl;
-    printAllProperties();
-    printEnabledProperties();
-    printFilters();
-}
-
-void FilterEngine::printAllProperties(void) {
-    
-    cout << "=======================================" << endl;
-    cout << "All Properties: " << endl;
-    cout << endl;
-  
-    const vector<string> propertyNames = allPropertyNames();
-    vector<string>::const_iterator nameIter = propertyNames.begin();
-    vector<string>::const_iterator nameEnd  = propertyNames.end();
-    for ( ; nameIter != nameEnd; ++nameIter )
-        cout << (*nameIter) << endl;
-    cout << endl;
-}
-
-void FilterEngine::printEnabledProperties(void) {
-    
-    cout << "=======================================" << endl;
-    cout << "Enabled Properties: " << endl;
-    cout << endl;
-  
-    const vector<string> propertyNames = enabledPropertyNames();
-    vector<string>::const_iterator nameIter = propertyNames.begin();
-    vector<string>::const_iterator nameEnd  = propertyNames.end();
-    for ( ; nameIter != nameEnd; ++nameIter )
-        cout << (*nameIter) << endl;
-    cout << endl;
-}
-
-void FilterEngine::printFilters(void) {
-  
-    cout << "=======================================" << endl;
-    cout << "Current Filters: " << endl;
-    cout << endl;
-    
-    // iterate over all filters in FilterEngine
-    FilterMap::const_iterator filterIter = m_filters.begin();
-    FilterMap::const_iterator filterEnd  = m_filters.end();
-    for ( ; filterIter != filterEnd; ++filterIter ) {
-        cout << "Filter Name: " << (*filterIter).first << endl;
-      
-        // see if filter set has this property enabled
-        const PropertyFilter& filter = (*filterIter).second;
-        PropertyMap::const_iterator propIter = filter.Properties.begin();
-        PropertyMap::const_iterator propEnd  = filter.Properties.end();
-        for ( ; propIter != propEnd; ++propIter ) {
-          
-            cout << " - " << (*propIter).first << " : ";
-            const PropertyFilterValue& filterValue = (*propIter).second;
-             
-            if ( filterValue.Value.is_type<bool>() )              cout << "\t" << boolalpha << filterValue.Value.get<bool>();
-            else if ( filterValue.Value.is_type<int>() )          cout << "\t" << filterValue.Value.get<int>();
-            else if ( filterValue.Value.is_type<unsigned int>() ) cout << "\t" << filterValue.Value.get<unsigned int>();
-            else if ( filterValue.Value.is_type<unsigned short>() ) cout << "\t" << filterValue.Value.get<unsigned short>();
-            else if ( filterValue.Value.is_type<float>() )        cout << "\t" << filterValue.Value.get<float>();
-            else if ( filterValue.Value.is_type<string>() )  cout << "\t" << filterValue.Value.get<string>();
-            else cout << "** UNKNOWN VALUE TYPE!! **";
-                
-            switch( filterValue.Type ) {
-                case (PropertyFilterValue::CONTAINS)           : cout << " (contains)"; break;
-                case (PropertyFilterValue::ENDS_WITH)          : cout << " (ends_with)"; break;
-                case (PropertyFilterValue::EXACT)              : cout << " (exact)"; break;
-                case (PropertyFilterValue::GREATER_THAN)       : cout << " (greater_than)"; break;
-                case (PropertyFilterValue::GREATER_THAN_EQUAL) : cout << " (greater_than_equal)"; break;
-                case (PropertyFilterValue::LESS_THAN)          : cout << " (less_than)"; break;
-                case (PropertyFilterValue::LESS_THAN_EQUAL)    : cout << " (less_than_equal)"; break;
-                case (PropertyFilterValue::NOT)                : cout << " (not)"; break;
-                case (PropertyFilterValue::STARTS_WITH)        : cout << " (starts_with)"; break;
-                default : cout << " : ** UNKNOWN COMPARE TYPE!! **";
-            }
-            cout << endl;
-        }
-    }
-}
index c01ab017924e859e12f2313014dbc9cd11917f46..2297aea32736ac3b80630b26a5af553fe77a9308 100644 (file)
@@ -12,7 +12,6 @@
 #define BAMTOOLS_FILTER_ENGINE_H
 
 #include <algorithm>
-#include <iostream>
 #include <map>
 #include <sstream>
 #include <string>
@@ -78,6 +77,8 @@ typedef std::map<std::string, PropertyFilterValue> PropertyMap;
 
 struct PropertyFilter {  
   
+    // will be used more later
+    // if we implement a compound 'rules' system  - i.e. "(filter1 AND filter2) OR filter 3"
     enum FilterCompareType { AND = 0
                            , EXACT
                            , NOT
@@ -87,12 +88,9 @@ struct PropertyFilter {
     // data members
     PropertyMap Properties;
     FilterCompareType Type; 
-    PropertyFilter* LeftChild;
-    PropertyFilter* RightChild;
-    
-    // ctor & dtor
-    PropertyFilter(void);
-    ~PropertyFilter(void);
+
+    // ctor
+    PropertyFilter(void) : Type( PropertyFilter::EXACT ) { }
     
     // filter check methods      
     template<typename T>
@@ -156,13 +154,6 @@ class FilterEngine {
         // returns true if query passes all filters on 'propertyName'
         template<typename T>
         static bool check(const std::string& propertyName, const T& query);
-        
-    // debugging
-    public:
-        static void print(void);
-        static void printAllProperties(void);
-        static void printEnabledProperties(void);
-        static void printFilters(void);
 
     // data members
     private:
@@ -215,48 +206,24 @@ bool PropertyFilterValue::check(const T& query) const {
 template<typename T>
 bool PropertyFilter::check(const std::string& propertyName, const T& query) const {
   
-    // if this filter is a 'leaf' filter
-    if ( (LeftChild == 0 ) && ( RightChild == 0 ) ) {
-        
-        // if propertyName found for this filter, 
-        PropertyMap::const_iterator propIter = Properties.find(propertyName);
-        if ( propIter != Properties.end() ) {
-          const PropertyFilterValue& filterValue = (*propIter).second;
-          
-          // check 
-          switch ( Type ) {
-              case ( PropertyFilter::EXACT ) : return filterValue.check(query);
-              case ( PropertyFilter::NOT )   : return !filterValue.check(query);
-              case ( PropertyFilter::AND )   :
-              case ( PropertyFilter::OR )    : BAMTOOLS_ASSERT_MESSAGE(false, "Cannot use a binary compare operator on 1 value");
-              default : BAMTOOLS_ASSERT_UNREACHABLE;
-          }
-          return false; // unreachable
-        }
-        
-        // property unknown to this filter
-        else return true;
-    }
-  
-    // if this filter is a parent filter (contains valid left & right children)
-    else if ( LeftChild && RightChild ) {
+    // if propertyName found for this filter, 
+    PropertyMap::const_iterator propIter = Properties.find(propertyName);
+    if ( propIter != Properties.end() ) {
+      const PropertyFilterValue& filterValue = (*propIter).second;
       
-        // return result of children using this filter's compare type (AND, OR)
-        switch ( Type ) {
-            case ( PropertyFilter::AND )  : return LeftChild->check(propertyName, query) && RightChild->check(propertyName, query);
-            case ( PropertyFilter::OR )   : return LeftChild->check(propertyName, query) || RightChild->check(propertyName, query);
-            case ( PropertyFilter::EXACT) : 
-            case ( PropertyFilter::NOT)   : BAMTOOLS_ASSERT_MESSAGE(false, "Cannot use a unary compare operator on 2 values");
-            default : BAMTOOLS_ASSERT_UNREACHABLE;
-        }
-        return false; // unreachable
-    }
-  
-    // otherwise filter contains one child... invalid state
-    else {
-        BAMTOOLS_ASSERT_MESSAGE(false, "PropertyFilter needs both children to do a binary compare");
-        return false;
+      // check 
+      switch ( Type ) {
+          case ( PropertyFilter::EXACT ) : return filterValue.check(query);
+          case ( PropertyFilter::NOT )   : return !filterValue.check(query);
+          case ( PropertyFilter::AND )   :
+          case ( PropertyFilter::OR )    : BAMTOOLS_ASSERT_MESSAGE(false, "Cannot use a binary compare operator on 1 value");
+          default : BAMTOOLS_ASSERT_UNREACHABLE;
+      }
+      return false; // unreachable
     }
+
+    // property unknown to this filter
+    else return true;
 }
 
 template<typename T>
@@ -453,22 +420,11 @@ bool FilterEngine::check(const std::string& propertyName, const T& query) {
       
         // check query against this filter
         const PropertyFilter& filter = (*filterIter).second;
-        if ( !filter.check(propertyName, query) ) return false;
-      
-      
-//         // see if filter set has this property enabled
-//         const PropertyFilter& filter = (*filterIter).second;
-//         PropertyMap::const_iterator propIter = filter.Properties.find(propertyName);
-//         if ( propIter != filter.Properties.end() ) {
-//         
-//           // if so, check query against filter, return false if check fails
-//           const PropertyFilterValue& propertyFilter = (*propIter).second;
-//           if ( !propertyFilter.check(query) ) return false;
-//         }
+        if ( filter.check(propertyName, query) ) return true;
     }
  
-    // query passes all filters with property enabled
-    return true;
+    // query passes none of the filters with current property enabled
+    return false;
 }
 
 } // namespace BamTools