X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=src%2Ftoolkit%2Fbamtools_filter.cpp;h=2f172422e9cc68a633a4a6f4c8c31d8ac0d4edad;hb=75af0fbc1c88cb67f2a91f79de53b0ec7a7211c3;hp=59e89dd9e59d7c1f7bf3d3d8c94d9b8fa85763bf;hpb=8807d4776b73d173b10cf6fb77ccbf485d270597;p=bamtools.git diff --git a/src/toolkit/bamtools_filter.cpp b/src/toolkit/bamtools_filter.cpp index 59e89dd..2f17242 100644 --- a/src/toolkit/bamtools_filter.cpp +++ b/src/toolkit/bamtools_filter.cpp @@ -1,29 +1,31 @@ // *************************************************************************** // bamtools_filter.cpp (c) 2010 Derek Barnett, Erik Garrison // Marth Lab, Department of Biology, Boston College -// All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 21 September 2010 +// Last modified: 3 May 2013 // --------------------------------------------------------------------------- -// Filters BAM file(s) according to some user-specified criteria. +// Filters BAM file(s) according to some user-specified criteria // *************************************************************************** +#include "bamtools_filter.h" + +#include +#include +#include +#include +#include +using namespace BamTools; + +#include +using namespace Json; + #include +#include #include #include #include #include -#include "bamtools_filter.h" -#include "bamtools_filter_engine.h" -#include "bamtools_options.h" -#include "bamtools_utilities.h" -#include "BamReader.h" -#include "BamMultiReader.h" -#include "BamWriter.h" -#include "jsoncpp/json.h" using namespace std; -using namespace BamTools; -using namespace Json; namespace BamTools { @@ -32,6 +34,7 @@ namespace BamTools { // property names const string ALIGNMENTFLAG_PROPERTY = "alignmentFlag"; +const string CIGAR_PROPERTY = "cigar"; const string INSERTSIZE_PROPERTY = "insertSize"; const string ISDUPLICATE_PROPERTY = "isDuplicate"; const string ISFAILEDQC_PROPERTY = "isFailedQC"; @@ -44,6 +47,8 @@ const string ISPRIMARYALIGNMENT_PROPERTY = "isPrimaryAlignment"; const string ISPROPERPAIR_PROPERTY = "isProperPair"; const string ISREVERSESTRAND_PROPERTY = "isReverseStrand"; const string ISSECONDMATE_PROPERTY = "isSecondMate"; +const string ISSINGLETON_PROPERTY = "isSingleton"; +const string LENGTH_PROPERTY = "length"; const string MAPQUALITY_PROPERTY = "mapQuality"; const string MATEPOSITION_PROPERTY = "matePosition"; const string MATEREFERENCE_PROPERTY = "mateReference"; @@ -51,7 +56,7 @@ const string NAME_PROPERTY = "name"; const string POSITION_PROPERTY = "position"; const string QUERYBASES_PROPERTY = "queryBases"; const string REFERENCE_PROPERTY = "reference"; -const string CIGAR_PROPERTY = "cigar"; +const string TAG_PROPERTY = "tag"; // boolalpha const string TRUE_STR = "true"; @@ -72,7 +77,21 @@ struct BamAlignmentChecker { const string& propertyName = (*propertyIter).first; const PropertyFilterValue& valueFilter = (*propertyIter).second; - if ( propertyName == ALIGNMENTFLAG_PROPERTY ) keepAlignment &= valueFilter.check(al.AlignmentFlag); + if ( propertyName == ALIGNMENTFLAG_PROPERTY ) keepAlignment &= valueFilter.check(al.AlignmentFlag); + else if ( propertyName == CIGAR_PROPERTY ) { + stringstream cigarSs; + const vector& cigarData = al.CigarData; + if ( !cigarData.empty() ) { + vector::const_iterator cigarBegin = cigarData.begin(); + vector::const_iterator cigarIter = cigarBegin; + vector::const_iterator cigarEnd = cigarData.end(); + for ( ; cigarIter != cigarEnd; ++cigarIter ) { + const CigarOp& op = (*cigarIter); + cigarSs << op.Length << op.Type; + } + keepAlignment &= valueFilter.check(cigarSs.str()); + } + } else if ( propertyName == INSERTSIZE_PROPERTY ) keepAlignment &= valueFilter.check(al.InsertSize); else if ( propertyName == ISDUPLICATE_PROPERTY ) keepAlignment &= valueFilter.check(al.IsDuplicate()); else if ( propertyName == ISFAILEDQC_PROPERTY ) keepAlignment &= valueFilter.check(al.IsFailedQC()); @@ -85,6 +104,11 @@ struct BamAlignmentChecker { else if ( propertyName == ISPROPERPAIR_PROPERTY ) keepAlignment &= valueFilter.check(al.IsProperPair()); else if ( propertyName == ISREVERSESTRAND_PROPERTY ) keepAlignment &= valueFilter.check(al.IsReverseStrand()); else if ( propertyName == ISSECONDMATE_PROPERTY ) keepAlignment &= valueFilter.check(al.IsSecondMate()); + else if ( propertyName == ISSINGLETON_PROPERTY ) { + const bool isSingleton = al.IsPaired() && al.IsMapped() && !al.IsMateMapped(); + keepAlignment &= valueFilter.check(isSingleton); + } + else if ( propertyName == LENGTH_PROPERTY ) keepAlignment &= valueFilter.check(al.Length); else if ( propertyName == MAPQUALITY_PROPERTY ) keepAlignment &= valueFilter.check(al.MapQuality); else if ( propertyName == MATEPOSITION_PROPERTY ) keepAlignment &= ( al.IsPaired() && al.IsMateMapped() && valueFilter.check(al.MateRefID) ); else if ( propertyName == MATEREFERENCE_PROPERTY ) { @@ -101,65 +125,122 @@ struct BamAlignmentChecker { const string& refName = filterToolReferences.at(al.RefID).RefName; keepAlignment &= valueFilter.check(refName); } - else if ( propertyName == CIGAR_PROPERTY ) { - stringstream cigarSs; - const vector& cigarData = al.CigarData; - if ( !cigarData.empty() ) { - vector::const_iterator cigarBegin = cigarData.begin(); - vector::const_iterator cigarIter = cigarBegin; - vector::const_iterator cigarEnd = cigarData.end(); - for ( ; cigarIter != cigarEnd; ++cigarIter ) { - const CigarOp& op = (*cigarIter); - cigarSs << op.Length << op.Type; - } - keepAlignment &= valueFilter.check(cigarSs.str()); - } - } + else if ( propertyName == TAG_PROPERTY ) keepAlignment &= checkAlignmentTag(valueFilter, al); else BAMTOOLS_ASSERT_UNREACHABLE; // if alignment fails at ANY point, just quit and return false - if ( !keepAlignment ) - return false; + if ( !keepAlignment ) return false; } BAMTOOLS_ASSERT_MESSAGE( keepAlignment, "Error in BamAlignmentChecker... keepAlignment should be true here"); return keepAlignment; } -}; -} // namespace BamTools - -// --------------------------------------------- -// FilterToolPrivate declaration + bool checkAlignmentTag(const PropertyFilterValue& valueFilter, const BamAlignment& al) { + + // ensure filter contains string data + Variant entireTagFilter = valueFilter.Value; + if ( !entireTagFilter.is_type() ) return false; -class FilterTool::FilterToolPrivate { - - // ctor & dtor - public: - FilterToolPrivate(FilterTool::FilterSettings* settings); - ~FilterToolPrivate(void); - - // 'public' interface - public: - bool Run(void); - - // internal methods - private: - bool AddPropertyTokensToFilter(const string& filterName, const map& propertyTokens); - bool CheckAlignment(const BamAlignment& al); - const string GetScriptContents(void); - void InitProperties(void); - bool ParseCommandLine(void); - bool ParseFilterObject(const string& filterName, const Json::Value& filterObject); - bool ParseScript(void); - bool SetupFilters(void); - - // data members - private: - vector m_propertyNames; - FilterTool::FilterSettings* m_settings; - FilterEngine m_filterEngine; -}; + // localize string from variant + const string& entireTagFilterString = entireTagFilter.get(); + + // ensure we have at least "XX:x" + if ( entireTagFilterString.length() < 4 ) return false; + + // get tagName & lookup in alignment + // if found, set tagType to tag type character + // if not found, return false + const string& tagName = entireTagFilterString.substr(0,2); + char tagType = '\0'; + if ( !al.GetTagType(tagName, tagType) ) return false; + + // remove tagName & ":" from beginning tagFilter + string tagFilterString = entireTagFilterString.substr(3); + + // switch on tag type to set tag query value & parse filter token + int8_t asciiFilterValue, asciiQueryValue; + int32_t intFilterValue, intQueryValue; + uint32_t uintFilterValue, uintQueryValue; + float realFilterValue, realQueryValue; + string stringFilterValue, stringQueryValue; + + PropertyFilterValue tagFilter; + PropertyFilterValue::ValueCompareType compareType; + bool keepAlignment = false; + switch (tagType) { + + // ASCII tag type + case 'A': + if ( al.GetTag(tagName, asciiQueryValue) ) { + if ( FilterEngine::parseToken(tagFilterString, asciiFilterValue, compareType) ) { + tagFilter.Value = asciiFilterValue; + tagFilter.Type = compareType; + keepAlignment = tagFilter.check(asciiQueryValue); + } + } + break; + + // signed int tag type + case 'c' : + case 's' : + case 'i' : + if ( al.GetTag(tagName, intQueryValue) ) { + if ( FilterEngine::parseToken(tagFilterString, intFilterValue, compareType) ) { + tagFilter.Value = intFilterValue; + tagFilter.Type = compareType; + keepAlignment = tagFilter.check(intQueryValue); + } + } + break; + + // unsigned int tag type + case 'C' : + case 'S' : + case 'I' : + if ( al.GetTag(tagName, uintQueryValue) ) { + if ( FilterEngine::parseToken(tagFilterString, uintFilterValue, compareType) ) { + tagFilter.Value = uintFilterValue; + tagFilter.Type = compareType; + keepAlignment = tagFilter.check(uintQueryValue); + } + } + break; + + // 'real' tag type + case 'f' : + if ( al.GetTag(tagName, realQueryValue) ) { + if ( FilterEngine::parseToken(tagFilterString, realFilterValue, compareType) ) { + tagFilter.Value = realFilterValue; + tagFilter.Type = compareType; + keepAlignment = tagFilter.check(realQueryValue); + } + } + break; + + // string tag type + + case 'Z': + case 'H': + if ( al.GetTag(tagName, stringQueryValue) ) { + if ( FilterEngine::parseToken(tagFilterString, stringFilterValue, compareType) ) { + tagFilter.Value = stringFilterValue; + tagFilter.Type = compareType; + keepAlignment = tagFilter.check(stringQueryValue); + } + } + break; + + // unknown tag type + default : + keepAlignment = false; + } + + return keepAlignment; + } +}; + +} // namespace BamTools // --------------------------------------------- // FilterSettings implementation @@ -168,44 +249,46 @@ struct FilterTool::FilterSettings { // ---------------------------------- // IO opts - + // flags - bool HasInputBamFilename; - bool HasOutputBamFilename; + bool HasInput; + bool HasInputFilelist; + bool HasOutput; bool HasRegion; - bool HasScriptFilename; + bool HasScript; bool IsForceCompression; - + // filenames vector InputFiles; + string InputFilelist; string OutputFilename; string Region; string ScriptFilename; - + // ----------------------------------- // General filter opts - + // flags bool HasAlignmentFlagFilter; bool HasInsertSizeFilter; + bool HasLengthFilter; bool HasMapQualityFilter; bool HasNameFilter; bool HasQueryBasesFilter; - -// bool HasTagFilters; + bool HasTagFilter; //(s) // filters string AlignmentFlagFilter; string InsertSizeFilter; - string NameFilter; + string LengthFilter; string MapQualityFilter; + string NameFilter; string QueryBasesFilter; - -// vector TagFilters; + string TagFilter; // support multiple ? // ----------------------------------- // AlignmentFlag filter opts - + // flags bool HasIsDuplicateFilter; bool HasIsFailedQCFilter; @@ -218,7 +301,8 @@ struct FilterTool::FilterSettings { bool HasIsProperPairFilter; bool HasIsReverseStrandFilter; bool HasIsSecondMateFilter; - + bool HasIsSingletonFilter; + // filters string IsDuplicateFilter; string IsFailedQCFilter; @@ -231,23 +315,26 @@ struct FilterTool::FilterSettings { string IsProperPairFilter; string IsReverseStrandFilter; string IsSecondMateFilter; - + string IsSingletonFilter; + // --------------------------------- // constructor - + FilterSettings(void) - : HasInputBamFilename(false) - , HasOutputBamFilename(false) + : HasInput(false) + , HasInputFilelist(false) + , HasOutput(false) , HasRegion(false) - , HasScriptFilename(false) + , HasScript(false) , IsForceCompression(false) , OutputFilename(Options::StandardOut()) , HasAlignmentFlagFilter(false) , HasInsertSizeFilter(false) + , HasLengthFilter(false) , HasMapQualityFilter(false) , HasNameFilter(false) , HasQueryBasesFilter(false) -// , HasTagFilters(false) + , HasTagFilter(false) , HasIsDuplicateFilter(false) , HasIsFailedQCFilter(false) , HasIsFirstMateFilter(false) @@ -259,6 +346,7 @@ struct FilterTool::FilterSettings { , HasIsProperPairFilter(false) , HasIsReverseStrandFilter(false) , HasIsSecondMateFilter(false) + , HasIsSingletonFilter(false) , IsDuplicateFilter(TRUE_STR) , IsFailedQCFilter(TRUE_STR) , IsFirstMateFilter(TRUE_STR) @@ -270,74 +358,41 @@ struct FilterTool::FilterSettings { , IsProperPairFilter(TRUE_STR) , IsReverseStrandFilter(TRUE_STR) , IsSecondMateFilter(TRUE_STR) + , IsSingletonFilter(TRUE_STR) { } -}; +}; // --------------------------------------------- -// FilterTool implementation - -FilterTool::FilterTool(void) - : AbstractTool() - , m_settings(new FilterSettings) - , m_impl(0) -{ - // set program details - Options::SetProgramInfo("bamtools filter", "filters BAM file(s)", "-in [-in ... ] -out -region [ [-script HasInputBamFilename, m_settings->InputFiles, IO_Opts, Options::StandardIn()); - Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts, Options::StandardOut()); - Options::AddValueOption("-region", "REGION", "only read data from this genomic region (see README for more details)", "", m_settings->HasRegion, m_settings->Region, IO_Opts); - Options::AddValueOption("-script", "filename", "the filter script file (see README for more details)", "", m_settings->HasScriptFilename, m_settings->ScriptFilename, IO_Opts); - Options::AddOption("-forceCompression", "if results are sent to stdout (like when piping to another tool), default behavior is to leave output uncompressed. Use this flag to override and force compression", m_settings->IsForceCompression, IO_Opts); - - OptionGroup* FilterOpts = Options::CreateOptionGroup("General Filters"); - Options::AddValueOption("-alignmentFlag", "int", "keep reads with this *exact* alignment flag (for more detailed queries, see below)", "", m_settings->HasAlignmentFlagFilter, m_settings->AlignmentFlagFilter, FilterOpts); - Options::AddValueOption("-insertSize", "int", "keep reads with insert size that mathces pattern", "", m_settings->HasInsertSizeFilter, m_settings->InsertSizeFilter, FilterOpts); - Options::AddValueOption("-mapQuality", "[0-255]", "keep reads with map quality that matches pattern", "", m_settings->HasMapQualityFilter, m_settings->MapQualityFilter, FilterOpts); - Options::AddValueOption("-name", "string", "keep reads with name that matches pattern", "", m_settings->HasNameFilter, m_settings->NameFilter, FilterOpts); - Options::AddValueOption("-queryBases", "string", "keep reads with motif that mathces pattern", "", m_settings->HasQueryBasesFilter, m_settings->QueryBasesFilter, FilterOpts); - -// Options::AddValueOption("-tag", "TAG:VALUE", "keep reads with this key=>value pair. If multiple tags are given, reads must match all", "", m_settings->HasTagFilters, m_settings->TagFilters, FilterOpts); - - OptionGroup* AlignmentFlagOpts = Options::CreateOptionGroup("Alignment Flag Filters"); - Options::AddValueOption("-isDuplicate", "true/false", "keep only alignments that are marked as duplicate?", "", m_settings->HasIsDuplicateFilter, m_settings->IsDuplicateFilter, AlignmentFlagOpts, TRUE_STR); - Options::AddValueOption("-isFailedQC", "true/false", "keep only alignments that failed QC?", "", m_settings->HasIsFailedQCFilter, m_settings->IsFailedQCFilter, AlignmentFlagOpts, TRUE_STR); - Options::AddValueOption("-isFirstMate", "true/false", "keep only alignments marked as first mate?", "", m_settings->HasIsFirstMateFilter, m_settings->IsFirstMateFilter, AlignmentFlagOpts, TRUE_STR); - Options::AddValueOption("-isMapped", "true/false", "keep only alignments that were mapped?", "", m_settings->HasIsMappedFilter, m_settings->IsMappedFilter, AlignmentFlagOpts, TRUE_STR); - Options::AddValueOption("-isMateMapped", "true/false", "keep only alignments with mates that mapped", "", m_settings->HasIsMateMappedFilter, m_settings->IsMateMappedFilter, AlignmentFlagOpts, TRUE_STR); - Options::AddValueOption("-isMateReverseStrand", "true/false", "keep only alignments with mate on reverese strand?", "", m_settings->HasIsMateReverseStrandFilter, m_settings->IsMateReverseStrandFilter, AlignmentFlagOpts, TRUE_STR); - Options::AddValueOption("-isPaired", "true/false", "keep only alignments that were sequenced as paired?","", m_settings->HasIsPairedFilter, m_settings->IsPairedFilter, AlignmentFlagOpts, TRUE_STR); - Options::AddValueOption("-isPrimaryAlignment", "true/false", "keep only alignments marked as primary?", "", m_settings->HasIsPrimaryAlignmentFilter, m_settings->IsPrimaryAlignmentFilter, AlignmentFlagOpts, TRUE_STR); - Options::AddValueOption("-isProperPair", "true/false", "keep only alignments that passed PE resolution?", "", m_settings->HasIsProperPairFilter, m_settings->IsProperPairFilter, AlignmentFlagOpts, TRUE_STR); - Options::AddValueOption("-isReverseStrand", "true/false", "keep only alignments on reverse strand?", "", m_settings->HasIsReverseStrandFilter, m_settings->IsReverseStrandFilter, AlignmentFlagOpts, TRUE_STR); - Options::AddValueOption("-isSecondMate", "true/false", "keep only alignments marked as second mate?", "", m_settings->HasIsSecondMateFilter, m_settings->IsSecondMateFilter, AlignmentFlagOpts, TRUE_STR); -} - -FilterTool::~FilterTool(void) { - delete m_settings; - m_settings = 0; - - delete m_impl; - m_impl = 0; -} - -int FilterTool::Help(void) { - Options::DisplayHelp(); - return 0; -} +// FilterToolPrivate declaration -int FilterTool::Run(int argc, char* argv[]) { - - // parse command line arguments - Options::Parse(argc, argv, 1); - - // run internal FilterTool implementation, return success/fail - m_impl = new FilterToolPrivate(m_settings); - - if ( m_impl->Run() ) return 0; - else return 1; -} +class FilterTool::FilterToolPrivate { + + // ctor & dtor + public: + FilterToolPrivate(FilterTool::FilterSettings* settings); + ~FilterToolPrivate(void); + + // 'public' interface + public: + bool Run(void); + + // internal methods + private: + bool AddPropertyTokensToFilter(const string& filterName, const map& propertyTokens); + bool CheckAlignment(const BamAlignment& al); + const string GetScriptContents(void); + void InitProperties(void); + bool ParseCommandLine(void); + bool ParseFilterObject(const string& filterName, const Json::Value& filterObject); + bool ParseScript(void); + bool SetupFilters(void); + + // data members + private: + vector m_propertyNames; + FilterTool::FilterSettings* m_settings; + FilterEngine m_filterEngine; +}; // --------------------------------------------- // FilterToolPrivate implementation @@ -350,8 +405,10 @@ FilterTool::FilterToolPrivate::FilterToolPrivate(FilterTool::FilterSettings* set // destructor FilterTool::FilterToolPrivate::~FilterToolPrivate(void) { } -bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filterName, const map& propertyTokens) { - +bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filterName, + const map& propertyTokens) +{ // dummy temp values for token parsing bool boolValue; int32_t int32Value; @@ -383,52 +440,60 @@ bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filt propertyName == ISPRIMARYALIGNMENT_PROPERTY || propertyName == ISPROPERPAIR_PROPERTY || propertyName == ISREVERSESTRAND_PROPERTY || - propertyName == ISSECONDMATE_PROPERTY + propertyName == ISSECONDMATE_PROPERTY || + propertyName == ISSINGLETON_PROPERTY ) { - m_filterEngine.parseToken(token, boolValue, type); + FilterEngine::parseToken(token, boolValue, type); m_filterEngine.setProperty(filterName, propertyName, boolValue, type); } // int32_t conversion else if ( propertyName == INSERTSIZE_PROPERTY || + propertyName == LENGTH_PROPERTY || propertyName == MATEPOSITION_PROPERTY || propertyName == POSITION_PROPERTY ) { - m_filterEngine.parseToken(token, int32Value, type); + FilterEngine::parseToken(token, int32Value, type); m_filterEngine.setProperty(filterName, propertyName, int32Value, type); } // uint16_t conversion else if ( propertyName == MAPQUALITY_PROPERTY ) { - m_filterEngine.parseToken(token, uint16Value, type); + FilterEngine::parseToken(token, uint16Value, type); m_filterEngine.setProperty(filterName, propertyName, uint16Value, type); } // uint32_t conversion else if ( propertyName == ALIGNMENTFLAG_PROPERTY ) { - m_filterEngine.parseToken(token, uint32Value, type); + FilterEngine::parseToken(token, uint32Value, type); m_filterEngine.setProperty(filterName, propertyName, uint32Value, type); } // string conversion - else if ( propertyName == MATEREFERENCE_PROPERTY || + else if ( propertyName == CIGAR_PROPERTY || + propertyName == MATEREFERENCE_PROPERTY || propertyName == NAME_PROPERTY || propertyName == QUERYBASES_PROPERTY || - propertyName == REFERENCE_PROPERTY || - propertyName == CIGAR_PROPERTY + propertyName == REFERENCE_PROPERTY ) { - m_filterEngine.parseToken(token, stringValue, type); + FilterEngine::parseToken(token, stringValue, type); m_filterEngine.setProperty(filterName, propertyName, stringValue, type); } + else if ( propertyName == TAG_PROPERTY ) { + // this will be stored directly as the TAG:VALUE token + // (VALUE may contain compare ops, will be parsed out later) + m_filterEngine.setProperty(filterName, propertyName, token, PropertyFilterValue::EXACT); + } + // else unknown property else { - cerr << "Unknown property: " << propertyName << "!" << endl; + cerr << "bamtools filter ERROR: unknown property - " << propertyName << endl; return false; } } @@ -444,8 +509,9 @@ const string FilterTool::FilterToolPrivate::GetScriptContents(void) { // open script for reading FILE* inFile = fopen(m_settings->ScriptFilename.c_str(), "rb"); if ( !inFile ) { - cerr << "FilterTool error: Could not open script: " << m_settings->ScriptFilename << " for reading" << endl; - return false; + cerr << "bamtools filter ERROR: could not open script: " + << m_settings->ScriptFilename << " for reading" << endl; + return string(); } // read in entire script contents @@ -456,12 +522,13 @@ const string FilterTool::FilterToolPrivate::GetScriptContents(void) { // peek ahead, make sure there is data available char ch = fgetc(inFile); ungetc(ch, inFile); - if( feof(inFile) ) break; + if( feof(inFile) ) + break; // read next block of data if ( fgets(buffer, 1024, inFile) == 0 ) { - cerr << "FilterTool error : could not read from script" << endl; - return false; + cerr << "bamtools filter ERROR: could not read script contents" << endl; + return string(); } docStream << buffer; @@ -471,14 +538,14 @@ const string FilterTool::FilterToolPrivate::GetScriptContents(void) { fclose(inFile); // import buffer contents to document, return - string document = docStream.str(); - return document; + return docStream.str(); } void FilterTool::FilterToolPrivate::InitProperties(void) { // store property names in vector m_propertyNames.push_back(ALIGNMENTFLAG_PROPERTY); + m_propertyNames.push_back(CIGAR_PROPERTY); m_propertyNames.push_back(INSERTSIZE_PROPERTY); m_propertyNames.push_back(ISDUPLICATE_PROPERTY); m_propertyNames.push_back(ISFAILEDQC_PROPERTY); @@ -491,6 +558,8 @@ void FilterTool::FilterToolPrivate::InitProperties(void) { m_propertyNames.push_back(ISPROPERPAIR_PROPERTY); m_propertyNames.push_back(ISREVERSESTRAND_PROPERTY); m_propertyNames.push_back(ISSECONDMATE_PROPERTY); + m_propertyNames.push_back(ISSINGLETON_PROPERTY); + m_propertyNames.push_back(LENGTH_PROPERTY); m_propertyNames.push_back(MAPQUALITY_PROPERTY); m_propertyNames.push_back(MATEPOSITION_PROPERTY); m_propertyNames.push_back(MATEREFERENCE_PROPERTY); @@ -498,7 +567,7 @@ void FilterTool::FilterToolPrivate::InitProperties(void) { m_propertyNames.push_back(POSITION_PROPERTY); m_propertyNames.push_back(QUERYBASES_PROPERTY); m_propertyNames.push_back(REFERENCE_PROPERTY); - m_propertyNames.push_back(CIGAR_PROPERTY); + m_propertyNames.push_back(TAG_PROPERTY); // add vector contents to FilterEngine vector::const_iterator propertyNameIter = m_propertyNames.begin(); @@ -528,10 +597,13 @@ bool FilterTool::FilterToolPrivate::ParseCommandLine(void) { if ( m_settings->HasIsProperPairFilter ) propertyTokens.insert( make_pair(ISPROPERPAIR_PROPERTY, m_settings->IsProperPairFilter) ); if ( m_settings->HasIsReverseStrandFilter ) propertyTokens.insert( make_pair(ISREVERSESTRAND_PROPERTY, m_settings->IsReverseStrandFilter) ); if ( m_settings->HasIsSecondMateFilter ) propertyTokens.insert( make_pair(ISSECONDMATE_PROPERTY, m_settings->IsSecondMateFilter) ); + if ( m_settings->HasIsSingletonFilter ) propertyTokens.insert( make_pair(ISSINGLETON_PROPERTY, m_settings->IsSingletonFilter) ); + if ( m_settings->HasLengthFilter ) propertyTokens.insert( make_pair(LENGTH_PROPERTY, m_settings->LengthFilter) ); if ( m_settings->HasMapQualityFilter ) propertyTokens.insert( make_pair(MAPQUALITY_PROPERTY, m_settings->MapQualityFilter) ); if ( m_settings->HasNameFilter ) propertyTokens.insert( make_pair(NAME_PROPERTY, m_settings->NameFilter) ); if ( m_settings->HasQueryBasesFilter ) propertyTokens.insert( make_pair(QUERYBASES_PROPERTY, m_settings->QueryBasesFilter) ); - + if ( m_settings->HasTagFilter ) propertyTokens.insert( make_pair(TAG_PROPERTY, m_settings->TagFilter) ); + // send add these properties to filter set "COMMAND_LINE" return AddPropertyTokensToFilter(CMD, propertyTokens); } @@ -574,7 +646,8 @@ bool FilterTool::FilterToolPrivate::ParseScript(void) { Json::Reader reader; if ( !reader.parse(document, root) ) { // use built-in error reporting mechanism to alert user what was wrong with the script - cerr << "Failed to parse configuration\n" << reader.getFormatedErrorMessages(); + cerr << "bamtools filter ERROR: failed to parse script - see error message(s) below" << endl + << reader.getFormatedErrorMessages(); return false; } @@ -597,9 +670,8 @@ bool FilterTool::FilterToolPrivate::ParseScript(void) { // if id tag supplied const Json::Value id = filter["id"]; - if ( !id.isNull() ) { + if ( !id.isNull() ) filterName = id.asString(); - } // use array index else { @@ -632,37 +704,59 @@ bool FilterTool::FilterToolPrivate::ParseScript(void) { return success; } - bool FilterTool::FilterToolPrivate::Run(void) { // set to default input if none provided - if ( !m_settings->HasInputBamFilename ) + if ( !m_settings->HasInput && !m_settings->HasInputFilelist ) m_settings->InputFiles.push_back(Options::StandardIn()); - + + // add files in the filelist to the input file list + if ( m_settings->HasInputFilelist ) { + + ifstream filelist(m_settings->InputFilelist.c_str(), ios::in); + if ( !filelist.is_open() ) { + cerr << "bamtools filter ERROR: could not open input BAM file list... Aborting." << endl; + return false; + } + + string line; + while ( getline(filelist, line) ) + m_settings->InputFiles.push_back(line); + } + // initialize defined properties & user-specified filters // quit if failed - if ( !SetupFilters() ) return 1; + if ( !SetupFilters() ) + return false; // open reader without index BamMultiReader reader; - if (!reader.Open(m_settings->InputFiles, false, true)) { - cerr << "Could not open input files for reading." << endl; + if ( !reader.Open(m_settings->InputFiles) ) { + cerr << "bamtools filter ERROR: could not open input files for reading." << endl; return false; } + + // retrieve reader header & reference data const string headerText = reader.GetHeaderText(); filterToolReferences = reader.GetReferenceData(); - // open writer + // determine compression mode for BamWriter + bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && + !m_settings->IsForceCompression ); + BamWriter::CompressionMode compressionMode = BamWriter::Compressed; + if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed; + + // open BamWriter BamWriter writer; - bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression ); - if (!writer.Open(m_settings->OutputFilename, headerText, filterToolReferences, writeUncompressed)) { - cerr << "Could not open " << m_settings->OutputFilename << " for writing." << endl; + writer.SetCompressionMode(compressionMode); + if ( !writer.Open(m_settings->OutputFilename, headerText, filterToolReferences) ) { + cerr << "bamtools filter ERROR: could not open " << m_settings->OutputFilename << " for writing." << endl; + reader.Close(); return false; } - - BamAlignment al; - + // if no region specified, filter entire file + BamAlignment al; if ( !m_settings->HasRegion ) { while ( reader.GetNextAlignment(al) ) { if ( CheckAlignment(al) ) @@ -677,38 +771,31 @@ bool FilterTool::FilterToolPrivate::Run(void) { BamRegion region; if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) { - // attempt to re-open reader with index files - reader.Close(); - bool openedOK = reader.Open(m_settings->InputFiles, true, true ); - - // if error - if ( !openedOK ) { - cerr << "ERROR: Could not open input BAM file(s)... Aborting." << endl; - return 1; - } - - // if index data available, we can use SetRegion - if ( reader.IsIndexLoaded() ) { - + // attempt to find index files + reader.LocateIndexes(); + + // if index data available for all BAM files, we can use SetRegion + if ( reader.HasIndexes() ) { + // attempt to use SetRegion(), if failed report error if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) { - cerr << "ERROR: Region requested, but could not set BamReader region to REGION: " << m_settings->Region << " Aborting." << endl; + cerr << "bamtools filter ERROR: set region failed. Check that REGION describes a valid range" << endl; reader.Close(); - return 1; + return false; } // everything checks out, just iterate through specified region, filtering alignments - while ( reader.GetNextAlignmentCore(al) ) + while ( reader.GetNextAlignment(al) ) if ( CheckAlignment(al) ) writer.SaveAlignment(al); - } + } // no index data available, we have to iterate through until we // find overlapping alignments else { - while( reader.GetNextAlignmentCore(al) ) { - if ( (al.RefID >= region.LeftRefID) && ( (al.Position + al.Length) >= region.LeftPosition ) && - (al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) ) + while ( reader.GetNextAlignment(al) ) { + if ( (al.RefID >= region.LeftRefID) && ((al.Position + al.Length) >= region.LeftPosition) && + (al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) ) { if ( CheckAlignment(al) ) writer.SaveAlignment(al); @@ -719,27 +806,150 @@ bool FilterTool::FilterToolPrivate::Run(void) { // error parsing REGION string else { - cerr << "ERROR: Could not parse REGION - " << m_settings->Region << endl; - cerr << "Be sure REGION is in valid format (see README) and that coordinates are valid for selected references" << endl; + cerr << "bamtools filter ERROR: could not parse REGION: " << m_settings->Region << endl; + cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid" + << endl; reader.Close(); - return 1; + return false; } } // clean up & exit reader.Close(); writer.Close(); - return 0; + return true; } bool FilterTool::FilterToolPrivate::SetupFilters(void) { - // add known properties to FilterEngine + // set up filter engine with supported properties InitProperties(); // parse script for filter rules, if given - if ( m_settings->HasScriptFilename ) return ParseScript(); + if ( m_settings->HasScript ) + return ParseScript(); // otherwise check command line for filters else return ParseCommandLine(); } + +// --------------------------------------------- +// FilterTool implementation + +FilterTool::FilterTool(void) + : AbstractTool() + , m_settings(new FilterSettings) + , m_impl(0) +{ + // ---------------------------------- + // set program details + + const string usage = "[-in -in ... | -list ] " + "[-out | [-forceCompression]] [-region ] " + "[ [-script HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn()); + Options::AddValueOption("-list", "filename", listDesc, "", m_settings->HasInputFilelist, m_settings->InputFilelist, IO_Opts); + Options::AddValueOption("-out", "BAM filename", outDesc, "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, Options::StandardOut()); + Options::AddValueOption("-region", "REGION", regionDesc, "", m_settings->HasRegion, m_settings->Region, IO_Opts); + Options::AddValueOption("-script", "filename", scriptDesc, "", m_settings->HasScript, m_settings->ScriptFilename, IO_Opts); + Options::AddOption("-forceCompression",forceDesc, m_settings->IsForceCompression, IO_Opts); + + // ---------------------------------- + // general filter options + + OptionGroup* FilterOpts = Options::CreateOptionGroup("General Filters"); + + const string flagDesc = "keep reads with this *exact* alignment flag (for more detailed queries, see below)"; + const string insertDesc = "keep reads with insert size that matches pattern"; + const string lengthDesc = "keep reads with length that matches pattern"; + const string mapQualDesc = "keep reads with map quality that matches pattern"; + const string nameDesc = "keep reads with name that matches pattern"; + const string queryDesc = "keep reads with motif that matches pattern"; + const string tagDesc = "keep reads with this key=>value pair"; + + Options::AddValueOption("-alignmentFlag", "int", flagDesc, "", m_settings->HasAlignmentFlagFilter, m_settings->AlignmentFlagFilter, FilterOpts); + Options::AddValueOption("-insertSize", "int", insertDesc, "", m_settings->HasInsertSizeFilter, m_settings->InsertSizeFilter, FilterOpts); + Options::AddValueOption("-length", "int", lengthDesc, "", m_settings->HasLengthFilter, m_settings->LengthFilter, FilterOpts); + Options::AddValueOption("-mapQuality", "[0-255]", mapQualDesc, "", m_settings->HasMapQualityFilter, m_settings->MapQualityFilter, FilterOpts); + Options::AddValueOption("-name", "string", nameDesc, "", m_settings->HasNameFilter, m_settings->NameFilter, FilterOpts); + Options::AddValueOption("-queryBases", "string", queryDesc, "", m_settings->HasQueryBasesFilter, m_settings->QueryBasesFilter, FilterOpts); + Options::AddValueOption("-tag", "TAG:VALUE", tagDesc, "", m_settings->HasTagFilter, m_settings->TagFilter, FilterOpts); + + // ---------------------------------- + // alignment flag filter options + + OptionGroup* AlignmentFlagOpts = Options::CreateOptionGroup("Alignment Flag Filters"); + + const string boolArg = "true/false"; + const string isDupDesc = "keep only alignments that are marked as duplicate?"; + const string isFailQcDesc = "keep only alignments that failed QC?"; + const string isFirstMateDesc = "keep only alignments marked as first mate?"; + const string isMappedDesc = "keep only alignments that were mapped?"; + const string isMateMappedDesc = "keep only alignments with mates that mapped"; + const string isMateReverseDesc = "keep only alignments with mate on reverese strand?"; + const string isPairedDesc = "keep only alignments that were sequenced as paired?"; + const string isPrimaryDesc = "keep only alignments marked as primary?"; + const string isProperPairDesc = "keep only alignments that passed PE resolution?"; + const string isReverseDesc = "keep only alignments on reverse strand?"; + const string isSecondMateDesc = "keep only alignments marked as second mate?"; + const string isSingletonDesc = "keep only singletons"; + + Options::AddValueOption("-isDuplicate", boolArg, isDupDesc, "", m_settings->HasIsDuplicateFilter, m_settings->IsDuplicateFilter, AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isFailedQC", boolArg, isFailQcDesc, "", m_settings->HasIsFailedQCFilter, m_settings->IsFailedQCFilter, AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isFirstMate", boolArg, isFirstMateDesc, "", m_settings->HasIsFirstMateFilter, m_settings->IsFirstMateFilter, AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isMapped", boolArg, isMappedDesc, "", m_settings->HasIsMappedFilter, m_settings->IsMappedFilter, AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isMateMapped", boolArg, isMateMappedDesc, "", m_settings->HasIsMateMappedFilter, m_settings->IsMateMappedFilter, AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isMateReverseStrand", boolArg, isMateReverseDesc, "", m_settings->HasIsMateReverseStrandFilter, m_settings->IsMateReverseStrandFilter, AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isPaired", boolArg, isPairedDesc, "", m_settings->HasIsPairedFilter, m_settings->IsPairedFilter, AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isPrimaryAlignment", boolArg, isPrimaryDesc, "", m_settings->HasIsPrimaryAlignmentFilter, m_settings->IsPrimaryAlignmentFilter, AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isProperPair", boolArg, isProperPairDesc, "", m_settings->HasIsProperPairFilter, m_settings->IsProperPairFilter, AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isReverseStrand", boolArg, isReverseDesc, "", m_settings->HasIsReverseStrandFilter, m_settings->IsReverseStrandFilter, AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isSecondMate", boolArg, isSecondMateDesc, "", m_settings->HasIsSecondMateFilter, m_settings->IsSecondMateFilter, AlignmentFlagOpts, TRUE_STR); + Options::AddValueOption("-isSingleton", boolArg, isSingletonDesc, "", m_settings->HasIsSingletonFilter, m_settings->IsSingletonFilter, AlignmentFlagOpts, TRUE_STR); +} + +FilterTool::~FilterTool(void) { + + delete m_settings; + m_settings = 0; + + delete m_impl; + m_impl = 0; +} + +int FilterTool::Help(void) { + Options::DisplayHelp(); + return 0; +} + +int FilterTool::Run(int argc, char* argv[]) { + + // parse command line arguments + Options::Parse(argc, argv, 1); + + // initialize FilterTool with settings + m_impl = new FilterToolPrivate(m_settings); + + // run FilterTool, return success/fail + if ( m_impl->Run() ) + return 0; + else + return 1; +}