]> git.donarmstrong.com Git - bamtools.git/commitdiff
Added tag support in filter tool.
authorderek <derek@donatello.(none)>
Mon, 4 Oct 2010 17:49:10 +0000 (13:49 -0400)
committerderek <derek@donatello.(none)>
Mon, 4 Oct 2010 17:49:10 +0000 (13:49 -0400)
src/toolkit/bamtools_filter.cpp
src/utils/bamtools_filter_engine.h

index 59e89dd9e59d7c1f7bf3d3d8c94d9b8fa85763bf..f5232ec0824a94a2120c0fc47a23f27c6b4108f0 100644 (file)
@@ -3,7 +3,7 @@
 // Marth Lab, Department of Biology, Boston College
 // All rights reserved.
 // ---------------------------------------------------------------------------
-// Last modified: 21 September 2010
+// Last modified: 4 October 2010
 // ---------------------------------------------------------------------------
 // Filters BAM file(s) according to some user-specified criteria.
 // ***************************************************************************
@@ -32,6 +32,7 @@ namespace BamTools {
 
 // property names
 const string ALIGNMENTFLAG_PROPERTY       = "alignmentFlag";
+const string CIGAR_PROPERTY               = "cigar";
 const string INSERTSIZE_PROPERTY          = "insertSize";
 const string ISDUPLICATE_PROPERTY         = "isDuplicate";
 const string ISFAILEDQC_PROPERTY          = "isFailedQC";
@@ -51,7 +52,7 @@ const string NAME_PROPERTY                = "name";
 const string POSITION_PROPERTY            = "position";
 const string QUERYBASES_PROPERTY          = "queryBases";
 const string REFERENCE_PROPERTY           = "reference";
-const string CIGAR_PROPERTY               = "cigar";
+const string TAG_PROPERTY                 = "tag";
 
 // boolalpha
 const string TRUE_STR  = "true";
@@ -72,7 +73,21 @@ struct BamAlignmentChecker {
             const string& propertyName = (*propertyIter).first;
             const PropertyFilterValue& valueFilter = (*propertyIter).second;
             
-            if      ( propertyName == ALIGNMENTFLAG_PROPERTY )        keepAlignment &= valueFilter.check(al.AlignmentFlag);
+            if      ( propertyName == ALIGNMENTFLAG_PROPERTY )  keepAlignment &= valueFilter.check(al.AlignmentFlag);
+           else if ( propertyName == CIGAR_PROPERTY ) {
+                stringstream cigarSs;
+                const vector<CigarOp>& cigarData = al.CigarData;
+                if ( !cigarData.empty() ) {
+                    vector<CigarOp>::const_iterator cigarBegin = cigarData.begin();
+                    vector<CigarOp>::const_iterator cigarIter = cigarBegin;
+                    vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
+                    for ( ; cigarIter != cigarEnd; ++cigarIter ) {
+                        const CigarOp& op = (*cigarIter);
+                        cigarSs << op.Length << op.Type;
+                    }
+                    keepAlignment &= valueFilter.check(cigarSs.str());
+                }
+            }
             else if ( propertyName == INSERTSIZE_PROPERTY )           keepAlignment &= valueFilter.check(al.InsertSize);
             else if ( propertyName == ISDUPLICATE_PROPERTY )          keepAlignment &= valueFilter.check(al.IsDuplicate());
             else if ( propertyName == ISFAILEDQC_PROPERTY )           keepAlignment &= valueFilter.check(al.IsFailedQC());
@@ -101,31 +116,108 @@ struct BamAlignmentChecker {
                 const string& refName = filterToolReferences.at(al.RefID).RefName;
                 keepAlignment &= valueFilter.check(refName);
             }
-            else if ( propertyName == CIGAR_PROPERTY ) {
-                stringstream cigarSs;
-                const vector<CigarOp>& cigarData = al.CigarData;
-                if ( !cigarData.empty() ) {
-                    vector<CigarOp>::const_iterator cigarBegin = cigarData.begin();
-                    vector<CigarOp>::const_iterator cigarIter = cigarBegin;
-                    vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
-                    for ( ; cigarIter != cigarEnd; ++cigarIter ) {
-                        const CigarOp& op = (*cigarIter);
-                        cigarSs << op.Length << op.Type;
-                    }
-                    keepAlignment &= valueFilter.check(cigarSs.str());
-                }
-            }
+            else if ( propertyName == TAG_PROPERTY ) keepAlignment &= checkAlignmentTag(valueFilter, al);
             else BAMTOOLS_ASSERT_UNREACHABLE;
             
             // if alignment fails at ANY point, just quit and return false
-            if ( !keepAlignment ) 
-                return false;
+            if ( !keepAlignment ) return false;
         }
       
         BAMTOOLS_ASSERT_MESSAGE( keepAlignment, "Error in BamAlignmentChecker... keepAlignment should be true here");
         return keepAlignment;
     }
-};
+    
+    bool checkAlignmentTag(const PropertyFilterValue& valueFilter, const BamAlignment& al) {
+     
+       // ensure filter contains string data
+       Variant entireTagFilter = valueFilter.Value;
+       if ( !entireTagFilter.is_type<string>() ) return false;
+       
+       // localize string from variant
+       const string& entireTagFilterString = entireTagFilter.get<string>();
+       
+       // ensure we have at least "XX:x"
+       if ( entireTagFilterString.length() < 4 ) return false;
+       
+       // get tagName & lookup in alignment
+       // if found, set tagType to tag type character
+       // if not found, return false
+       const string& tagName = entireTagFilterString.substr(0,2);
+       char tagType = '\0';
+       if ( !al.GetTagType(tagName, tagType) ) return false;
+         
+       // remove tagName & ":" from beginning tagFilter
+       string tagFilterString = entireTagFilterString.substr(3);
+         
+       // switch on tag type to set tag query value & parse filter token
+       int32_t  intFilterValue,    intQueryValue;
+       uint32_t uintFilterValue,   uintQueryValue;
+       float    realFilterValue,   realQueryValue;
+       string   stringFilterValue, stringQueryValue;
+       
+       PropertyFilterValue tagFilter;
+       PropertyFilterValue::ValueCompareType compareType;
+       bool keepAlignment = false;
+       switch (tagType) {
+            
+           // signed int tag type
+           case 'c' :
+            case 's' : 
+            case 'i' :
+               if ( al.GetTag(tagName, intQueryValue) ) {
+                   if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, intFilterValue, compareType) ) {
+                       tagFilter.Value = intFilterValue;
+                       tagFilter.Type  = compareType;
+                       keepAlignment   = tagFilter.check(intQueryValue);
+                   }
+               }
+               break;
+               
+           // unsigned int tag type
+            case 'C' :
+            case 'S' :
+            case 'I' : 
+               if ( al.GetTag(tagName, uintQueryValue) ) {
+                   if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, uintFilterValue, compareType) ) {
+                       tagFilter.Value = uintFilterValue;
+                       tagFilter.Type  = compareType;
+                       keepAlignment   = tagFilter.check(uintQueryValue);
+                   }
+               }
+               break;
+               
+           // 'real' tag type
+            case 'f' :
+               if ( al.GetTag(tagName, realQueryValue) ) {
+                   if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, realFilterValue, compareType) ) {
+                       tagFilter.Value = realFilterValue;
+                       tagFilter.Type  = compareType;
+                       keepAlignment   = tagFilter.check(realQueryValue);
+                   }
+               }
+                break;
+               
+           // string tag type
+            case 'A':
+            case 'Z':
+            case 'H':        
+                if ( al.GetTag(tagName, stringQueryValue) ) {
+                   if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, stringFilterValue, compareType) ) {
+                       tagFilter.Value = stringFilterValue;
+                       tagFilter.Type  = compareType;
+                       keepAlignment   = tagFilter.check(stringQueryValue);
+                   }
+               }
+               break;
+               
+           // unknown tag type
+           default : 
+               keepAlignment = false;
+       }
+       
+       return keepAlignment;
+    }
+};    
     
 } // namespace BamTools
   
@@ -191,8 +283,7 @@ struct FilterTool::FilterSettings {
     bool HasMapQualityFilter;
     bool HasNameFilter;
     bool HasQueryBasesFilter;
-    
-//     bool HasTagFilters;
+    bool HasTagFilter; //(s)
 
     // filters
     string AlignmentFlagFilter;
@@ -200,8 +291,7 @@ struct FilterTool::FilterSettings {
     string NameFilter;
     string MapQualityFilter;
     string QueryBasesFilter;
-    
-//     vector<string> TagFilters;
+    string TagFilter;  // support multiple ?
 
     // -----------------------------------
     // AlignmentFlag filter opts
@@ -247,7 +337,7 @@ struct FilterTool::FilterSettings {
         , HasMapQualityFilter(false)
         , HasNameFilter(false)
         , HasQueryBasesFilter(false)
-//         , HasTagFilters(false)
+        , HasTagFilter(false)
         , HasIsDuplicateFilter(false)
         , HasIsFailedQCFilter(false)
         , HasIsFirstMateFilter(false)
@@ -292,13 +382,12 @@ FilterTool::FilterTool(void)
     Options::AddOption("-forceCompression", "if results are sent to stdout (like when piping to another tool), default behavior is to leave output uncompressed. Use this flag to override and force compression", m_settings->IsForceCompression, IO_Opts);
     
     OptionGroup* FilterOpts = Options::CreateOptionGroup("General Filters");
-    Options::AddValueOption("-alignmentFlag", "int",     "keep reads with this *exact* alignment flag (for more detailed queries, see below)", "", m_settings->HasAlignmentFlagFilter, m_settings->AlignmentFlagFilter, FilterOpts);
-    Options::AddValueOption("-insertSize",    "int",     "keep reads with insert size that mathces pattern",             "", m_settings->HasInsertSizeFilter, m_settings->InsertSizeFilter, FilterOpts);
-    Options::AddValueOption("-mapQuality",    "[0-255]", "keep reads with map quality that matches pattern",             "", m_settings->HasMapQualityFilter, m_settings->MapQualityFilter, FilterOpts);
-    Options::AddValueOption("-name",          "string",  "keep reads with name that matches pattern",                    "", m_settings->HasNameFilter,       m_settings->NameFilter,       FilterOpts);
-    Options::AddValueOption("-queryBases",    "string",  "keep reads with motif that mathces pattern",                   "", m_settings->HasQueryBasesFilter, m_settings->QueryBasesFilter, FilterOpts);
-    
-//     Options::AddValueOption("-tag", "TAG:VALUE", "keep reads with this key=>value pair. If multiple tags are given, reads must match all", "", m_settings->HasTagFilters, m_settings->TagFilters, FilterOpts);
+    Options::AddValueOption("-alignmentFlag", "int",        "keep reads with this *exact* alignment flag (for more detailed queries, see below)", "", m_settings->HasAlignmentFlagFilter, m_settings->AlignmentFlagFilter, FilterOpts);
+    Options::AddValueOption("-insertSize",    "int",        "keep reads with insert size that mathces pattern",             "", m_settings->HasInsertSizeFilter, m_settings->InsertSizeFilter, FilterOpts);
+    Options::AddValueOption("-mapQuality",    "[0-255]",    "keep reads with map quality that matches pattern",             "", m_settings->HasMapQualityFilter, m_settings->MapQualityFilter, FilterOpts);
+    Options::AddValueOption("-name",          "string",     "keep reads with name that matches pattern",                    "", m_settings->HasNameFilter,       m_settings->NameFilter,       FilterOpts);
+    Options::AddValueOption("-queryBases",    "string",     "keep reads with motif that mathces pattern",                   "", m_settings->HasQueryBasesFilter, m_settings->QueryBasesFilter, FilterOpts);
+    Options::AddValueOption("-tag",            "TAG:VALUE", "keep reads with this key=>value pair",                         "", m_settings->HasTagFilter,        m_settings->TagFilter,        FilterOpts);
     
     OptionGroup* AlignmentFlagOpts = Options::CreateOptionGroup("Alignment Flag Filters");
     Options::AddValueOption("-isDuplicate",         "true/false", "keep only alignments that are marked as duplicate?", "", m_settings->HasIsDuplicateFilter,         m_settings->IsDuplicateFilter,         AlignmentFlagOpts, TRUE_STR);
@@ -386,7 +475,7 @@ bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filt
              propertyName == ISSECONDMATE_PROPERTY
            ) 
         {
-            m_filterEngine.parseToken(token, boolValue, type);
+            FilterEngine<BamAlignmentChecker>::parseToken(token, boolValue, type);
             m_filterEngine.setProperty(filterName, propertyName, boolValue, type);
         }
         
@@ -396,36 +485,42 @@ bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filt
                   propertyName == POSITION_PROPERTY 
                 ) 
         {
-            m_filterEngine.parseToken(token, int32Value, type);
+            FilterEngine<BamAlignmentChecker>::parseToken(token, int32Value, type);
             m_filterEngine.setProperty(filterName, propertyName, int32Value, type);
         }
         
         // uint16_t conversion
         else if ( propertyName == MAPQUALITY_PROPERTY )
         {
-            m_filterEngine.parseToken(token, uint16Value, type);
+            FilterEngine<BamAlignmentChecker>::parseToken(token, uint16Value, type);
             m_filterEngine.setProperty(filterName, propertyName, uint16Value, type);
         }
         
         // uint32_t conversion
         else if ( propertyName == ALIGNMENTFLAG_PROPERTY )
         {
-            m_filterEngine.parseToken(token, uint32Value, type);
+            FilterEngine<BamAlignmentChecker>::parseToken(token, uint32Value, type);
             m_filterEngine.setProperty(filterName, propertyName, uint32Value, type);
         }
         
         // string conversion
-        else if ( propertyName == MATEREFERENCE_PROPERTY ||
+        else if ( propertyName == CIGAR_PROPERTY || 
+                 propertyName == MATEREFERENCE_PROPERTY ||
                   propertyName == NAME_PROPERTY ||
                   propertyName == QUERYBASES_PROPERTY ||
-                  propertyName == REFERENCE_PROPERTY ||
-                  propertyName == CIGAR_PROPERTY
+                  propertyName == REFERENCE_PROPERTY 
                 ) 
         {
-            m_filterEngine.parseToken(token, stringValue, type);
+            FilterEngine<BamAlignmentChecker>::parseToken(token, stringValue, type);
             m_filterEngine.setProperty(filterName, propertyName, stringValue, type);
         }
       
+       else if (propertyName == TAG_PROPERTY ) {
+           // this will be stored directly as the TAG:VALUE token 
+           // (VALUE may contain compare ops, will be parsed out later)
+           m_filterEngine.setProperty(filterName, propertyName, token, PropertyFilterValue::EXACT);
+       }
+      
         // else unknown property 
         else {
             cerr << "Unknown property: " << propertyName << "!" << endl;
@@ -479,6 +574,7 @@ void FilterTool::FilterToolPrivate::InitProperties(void) {
   
     // store property names in vector 
     m_propertyNames.push_back(ALIGNMENTFLAG_PROPERTY);
+    m_propertyNames.push_back(CIGAR_PROPERTY);
     m_propertyNames.push_back(INSERTSIZE_PROPERTY);
     m_propertyNames.push_back(ISDUPLICATE_PROPERTY);
     m_propertyNames.push_back(ISFAILEDQC_PROPERTY);
@@ -498,7 +594,7 @@ void FilterTool::FilterToolPrivate::InitProperties(void) {
     m_propertyNames.push_back(POSITION_PROPERTY);
     m_propertyNames.push_back(QUERYBASES_PROPERTY);
     m_propertyNames.push_back(REFERENCE_PROPERTY);
-    m_propertyNames.push_back(CIGAR_PROPERTY);
+    m_propertyNames.push_back(TAG_PROPERTY);
     
     // add vector contents to FilterEngine<BamAlignmentChecker>
     vector<string>::const_iterator propertyNameIter = m_propertyNames.begin();
@@ -531,7 +627,8 @@ bool FilterTool::FilterToolPrivate::ParseCommandLine(void) {
     if ( m_settings->HasMapQualityFilter )          propertyTokens.insert( make_pair(MAPQUALITY_PROPERTY,          m_settings->MapQualityFilter) );
     if ( m_settings->HasNameFilter )                propertyTokens.insert( make_pair(NAME_PROPERTY,                m_settings->NameFilter) );
     if ( m_settings->HasQueryBasesFilter )          propertyTokens.insert( make_pair(QUERYBASES_PROPERTY,          m_settings->QueryBasesFilter) );
-
+    if ( m_settings->HasTagFilter )                 propertyTokens.insert( make_pair(TAG_PROPERTY,                 m_settings->TagFilter) );
+    
     // send add these properties to filter set "COMMAND_LINE"
     return AddPropertyTokensToFilter(CMD, propertyTokens);
 }
@@ -597,9 +694,8 @@ bool FilterTool::FilterToolPrivate::ParseScript(void) {
             
             // if id tag supplied
             const Json::Value id = filter["id"];
-            if ( !id.isNull() ) {
+            if ( !id.isNull() ) 
                 filterName = id.asString();
-            } 
             
             // use array index 
             else {
@@ -632,20 +728,19 @@ bool FilterTool::FilterToolPrivate::ParseScript(void) {
     return success;
 }
 
-
 bool FilterTool::FilterToolPrivate::Run(void) {
     
     // set to default input if none provided
     if ( !m_settings->HasInputBamFilename ) 
         m_settings->InputFiles.push_back(Options::StandardIn());
-    
+
     // initialize defined properties & user-specified filters
     // quit if failed
     if ( !SetupFilters() ) return 1;
 
     // open reader without index
     BamMultiReader reader;
-    if (!reader.Open(m_settings->InputFiles, false, true)) {
+    if ( !reader.Open(m_settings->InputFiles, false, true) ) {
         cerr << "Could not open input files for reading." << endl;
         return false;
     }
@@ -655,7 +750,7 @@ bool FilterTool::FilterToolPrivate::Run(void) {
     // open writer
     BamWriter writer;
     bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression );
-    if (!writer.Open(m_settings->OutputFilename, headerText, filterToolReferences, writeUncompressed)) {
+    if ( !writer.Open(m_settings->OutputFilename, headerText, filterToolReferences, writeUncompressed) ) {
         cerr << "Could not open " << m_settings->OutputFilename << " for writing." << endl;
         return false;
     }
@@ -706,9 +801,9 @@ bool FilterTool::FilterToolPrivate::Run(void) {
             // no index data available, we have to iterate through until we
             // find overlapping alignments
             else {
-                while( reader.GetNextAlignmentCore(al) ) {
-                    if ( (al.RefID >= region.LeftRefID)  && ( (al.Position + al.Length) >= region.LeftPosition ) &&
-                          (al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) ) 
+                while ( reader.GetNextAlignmentCore(al) ) {
+                    if ( (al.RefID >= region.LeftRefID)  && ((al.Position + al.Length) >= region.LeftPosition) &&
+                         (al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) ) 
                     {
                         if ( CheckAlignment(al) ) 
                             writer.SaveAlignment(al);
index 924b043119b7cd3b96e6826aa18371ca7cb5a28b..cc9312081ab03d98c8b2644c9e12060a1fe41f51 100644 (file)
@@ -3,7 +3,7 @@
 // Marth Lab, Department of Biology, Boston College
 // All rights reserved.
 // ---------------------------------------------------------------------------
-// Last modified: 21 September 2010
+// Last modified: 29 September 2010
 // ---------------------------------------------------------------------------
 // Provides a generic filter engine based on filter-sets of properties,
 // with possible "rules" (compound logical expressions) to create more complex
@@ -126,7 +126,7 @@ class FilterEngine {
     // token parsing (for property filter generation)
     public:
         template<typename T>
-        bool parseToken(const std::string& token, T& value, PropertyFilterValue::ValueCompareType& type);
+        static bool parseToken(const std::string& token, T& value, PropertyFilterValue::ValueCompareType& type);
         
     // query evaluation
     public:
@@ -377,10 +377,8 @@ bool FilterEngine<FilterChecker>::parseToken(const std::string& token, T& value,
         switch ( firstChar ) {
           
             case ( FilterEngine<FilterChecker>::NOT_CHAR ) :
-              
                 strippedToken = token.substr(1);       
                 type = PropertyFilterValue::NOT;
-                
                 break;
                 
             case ( FilterEngine<FilterChecker>::GREATER_THAN_CHAR ) :
@@ -435,7 +433,6 @@ bool FilterEngine<FilterChecker>::parseToken(const std::string& token, T& value,
                 break;
                
             default :
-              
                 // check for str* case (STARTS_WITH)
                 if ( token.at( token.length() - 1 ) == FilterEngine<FilterChecker>::WILDCARD_CHAR ) {
                     if ( token.length() == 2 ) return false;