X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=src%2Ftoolkit%2Fbamtools_filter.cpp;h=023fbc99d053f15ef217d9903cba7714255b340f;hb=8c80d760637f8df39262683cd2570f0589423d36;hp=d2de3b495aac1d588a29fad914b7e7ecdb30d9dc;hpb=c5363861478f495b41a2ad99028326f85feeb905;p=bamtools.git diff --git a/src/toolkit/bamtools_filter.cpp b/src/toolkit/bamtools_filter.cpp index d2de3b4..023fbc9 100644 --- a/src/toolkit/bamtools_filter.cpp +++ b/src/toolkit/bamtools_filter.cpp @@ -3,27 +3,29 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 17 November 2010 +// Last modified: 21 March 2011 // --------------------------------------------------------------------------- // Filters BAM file(s) according to some user-specified criteria. // *************************************************************************** +#include "bamtools_filter.h" + +#include +#include +#include +#include +#include +using namespace BamTools; + +#include +using namespace Json; + #include #include #include #include #include -#include "bamtools_filter.h" -#include "bamtools_filter_engine.h" -#include "bamtools_options.h" -#include "bamtools_utilities.h" -#include "BamReader.h" -#include "BamMultiReader.h" -#include "BamWriter.h" -#include "jsoncpp/json.h" using namespace std; -using namespace BamTools; -using namespace Json; namespace BamTools { @@ -74,7 +76,7 @@ struct BamAlignmentChecker { const PropertyFilterValue& valueFilter = (*propertyIter).second; if ( propertyName == ALIGNMENTFLAG_PROPERTY ) keepAlignment &= valueFilter.check(al.AlignmentFlag); - else if ( propertyName == CIGAR_PROPERTY ) { + else if ( propertyName == CIGAR_PROPERTY ) { stringstream cigarSs; const vector& cigarData = al.CigarData; if ( !cigarData.empty() ) { @@ -129,93 +131,93 @@ struct BamAlignmentChecker { bool checkAlignmentTag(const PropertyFilterValue& valueFilter, const BamAlignment& al) { - // ensure filter contains string data - Variant entireTagFilter = valueFilter.Value; - if ( !entireTagFilter.is_type() ) return false; - - // localize string from variant - const string& entireTagFilterString = entireTagFilter.get(); - - // ensure we have at least "XX:x" - if ( entireTagFilterString.length() < 4 ) return false; - - // get tagName & lookup in alignment - // if found, set tagType to tag type character - // if not found, return false - const string& tagName = entireTagFilterString.substr(0,2); - char tagType = '\0'; - if ( !al.GetTagType(tagName, tagType) ) return false; - - // remove tagName & ":" from beginning tagFilter - string tagFilterString = entireTagFilterString.substr(3); - - // switch on tag type to set tag query value & parse filter token - int32_t intFilterValue, intQueryValue; - uint32_t uintFilterValue, uintQueryValue; - float realFilterValue, realQueryValue; - string stringFilterValue, stringQueryValue; - - PropertyFilterValue tagFilter; - PropertyFilterValue::ValueCompareType compareType; - bool keepAlignment = false; - switch (tagType) { - - // signed int tag type - case 'c' : - case 's' : + // ensure filter contains string data + Variant entireTagFilter = valueFilter.Value; + if ( !entireTagFilter.is_type() ) return false; + + // localize string from variant + const string& entireTagFilterString = entireTagFilter.get(); + + // ensure we have at least "XX:x" + if ( entireTagFilterString.length() < 4 ) return false; + + // get tagName & lookup in alignment + // if found, set tagType to tag type character + // if not found, return false + const string& tagName = entireTagFilterString.substr(0,2); + char tagType = '\0'; + if ( !al.GetTagType(tagName, tagType) ) return false; + + // remove tagName & ":" from beginning tagFilter + string tagFilterString = entireTagFilterString.substr(3); + + // switch on tag type to set tag query value & parse filter token + int32_t intFilterValue, intQueryValue; + uint32_t uintFilterValue, uintQueryValue; + float realFilterValue, realQueryValue; + string stringFilterValue, stringQueryValue; + + PropertyFilterValue tagFilter; + PropertyFilterValue::ValueCompareType compareType; + bool keepAlignment = false; + switch (tagType) { + + // signed int tag type + case 'c' : + case 's' : case 'i' : - if ( al.GetTag(tagName, intQueryValue) ) { - if ( FilterEngine::parseToken(tagFilterString, intFilterValue, compareType) ) { - tagFilter.Value = intFilterValue; - tagFilter.Type = compareType; - keepAlignment = tagFilter.check(intQueryValue); - } - } - break; - - // unsigned int tag type + if ( al.GetTag(tagName, intQueryValue) ) { + if ( FilterEngine::parseToken(tagFilterString, intFilterValue, compareType) ) { + tagFilter.Value = intFilterValue; + tagFilter.Type = compareType; + keepAlignment = tagFilter.check(intQueryValue); + } + } + break; + + // unsigned int tag type case 'C' : case 'S' : - case 'I' : - if ( al.GetTag(tagName, uintQueryValue) ) { - if ( FilterEngine::parseToken(tagFilterString, uintFilterValue, compareType) ) { - tagFilter.Value = uintFilterValue; - tagFilter.Type = compareType; - keepAlignment = tagFilter.check(uintQueryValue); - } - } - break; - - // 'real' tag type + case 'I' : + if ( al.GetTag(tagName, uintQueryValue) ) { + if ( FilterEngine::parseToken(tagFilterString, uintFilterValue, compareType) ) { + tagFilter.Value = uintFilterValue; + tagFilter.Type = compareType; + keepAlignment = tagFilter.check(uintQueryValue); + } + } + break; + + // 'real' tag type case 'f' : - if ( al.GetTag(tagName, realQueryValue) ) { - if ( FilterEngine::parseToken(tagFilterString, realFilterValue, compareType) ) { - tagFilter.Value = realFilterValue; - tagFilter.Type = compareType; - keepAlignment = tagFilter.check(realQueryValue); - } - } + if ( al.GetTag(tagName, realQueryValue) ) { + if ( FilterEngine::parseToken(tagFilterString, realFilterValue, compareType) ) { + tagFilter.Value = realFilterValue; + tagFilter.Type = compareType; + keepAlignment = tagFilter.check(realQueryValue); + } + } break; - - // string tag type + + // string tag type case 'A': case 'Z': - case 'H': + case 'H': if ( al.GetTag(tagName, stringQueryValue) ) { - if ( FilterEngine::parseToken(tagFilterString, stringFilterValue, compareType) ) { - tagFilter.Value = stringFilterValue; - tagFilter.Type = compareType; - keepAlignment = tagFilter.check(stringQueryValue); - } - } - break; - - // unknown tag type - default : - keepAlignment = false; - } - - return keepAlignment; + if ( FilterEngine::parseToken(tagFilterString, stringFilterValue, compareType) ) { + tagFilter.Value = stringFilterValue; + tagFilter.Type = compareType; + keepAlignment = tagFilter.check(stringQueryValue); + } + } + break; + + // unknown tag type + default : + keepAlignment = false; + } + + return keepAlignment; } }; @@ -377,8 +379,8 @@ FilterTool::FilterTool(void) OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInputBamFilename, m_settings->InputFiles, IO_Opts, Options::StandardIn()); Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts, Options::StandardOut()); - Options::AddValueOption("-region", "REGION", "only read data from this genomic region (see README for more details)", "", m_settings->HasRegion, m_settings->Region, IO_Opts); - Options::AddValueOption("-script", "filename", "the filter script file (see README for more details)", "", m_settings->HasScriptFilename, m_settings->ScriptFilename, IO_Opts); + Options::AddValueOption("-region", "REGION", "only read data from this genomic region (see documentation for more details)", "", m_settings->HasRegion, m_settings->Region, IO_Opts); + Options::AddValueOption("-script", "filename", "the filter script file (see documentation for more details)", "", m_settings->HasScriptFilename, m_settings->ScriptFilename, IO_Opts); Options::AddOption("-forceCompression", "if results are sent to stdout (like when piping to another tool), default behavior is to leave output uncompressed. Use this flag to override and force compression", m_settings->IsForceCompression, IO_Opts); OptionGroup* FilterOpts = Options::CreateOptionGroup("General Filters"); @@ -387,7 +389,7 @@ FilterTool::FilterTool(void) Options::AddValueOption("-mapQuality", "[0-255]", "keep reads with map quality that matches pattern", "", m_settings->HasMapQualityFilter, m_settings->MapQualityFilter, FilterOpts); Options::AddValueOption("-name", "string", "keep reads with name that matches pattern", "", m_settings->HasNameFilter, m_settings->NameFilter, FilterOpts); Options::AddValueOption("-queryBases", "string", "keep reads with motif that mathces pattern", "", m_settings->HasQueryBasesFilter, m_settings->QueryBasesFilter, FilterOpts); - Options::AddValueOption("-tag", "TAG:VALUE", "keep reads with this key=>value pair", "", m_settings->HasTagFilter, m_settings->TagFilter, FilterOpts); + Options::AddValueOption("-tag", "TAG:VALUE", "keep reads with this key=>value pair", "", m_settings->HasTagFilter, m_settings->TagFilter, FilterOpts); OptionGroup* AlignmentFlagOpts = Options::CreateOptionGroup("Alignment Flag Filters"); Options::AddValueOption("-isDuplicate", "true/false", "keep only alignments that are marked as duplicate?", "", m_settings->HasIsDuplicateFilter, m_settings->IsDuplicateFilter, AlignmentFlagOpts, TRUE_STR); @@ -439,8 +441,10 @@ FilterTool::FilterToolPrivate::FilterToolPrivate(FilterTool::FilterSettings* set // destructor FilterTool::FilterToolPrivate::~FilterToolPrivate(void) { } -bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filterName, const map& propertyTokens) { - +bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filterName, + const map& propertyTokens) +{ // dummy temp values for token parsing bool boolValue; int32_t int32Value; @@ -515,7 +519,7 @@ bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filt m_filterEngine.setProperty(filterName, propertyName, stringValue, type); } - else if (propertyName == TAG_PROPERTY ) { + else if ( propertyName == TAG_PROPERTY ) { // this will be stored directly as the TAG:VALUE token // (VALUE may contain compare ops, will be parsed out later) m_filterEngine.setProperty(filterName, propertyName, token, PropertyFilterValue::EXACT); @@ -523,7 +527,7 @@ bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filt // else unknown property else { - cerr << "Unknown property: " << propertyName << "!" << endl; + cerr << "bamtools filter ERROR: unknown property - " << propertyName << endl; return false; } } @@ -539,7 +543,8 @@ const string FilterTool::FilterToolPrivate::GetScriptContents(void) { // open script for reading FILE* inFile = fopen(m_settings->ScriptFilename.c_str(), "rb"); if ( !inFile ) { - cerr << "FilterTool error: Could not open script: " << m_settings->ScriptFilename << " for reading" << endl; + cerr << "bamtools filter ERROR: could not open script: " + << m_settings->ScriptFilename << " for reading" << endl; return false; } @@ -555,7 +560,7 @@ const string FilterTool::FilterToolPrivate::GetScriptContents(void) { // read next block of data if ( fgets(buffer, 1024, inFile) == 0 ) { - cerr << "FilterTool error : could not read from script" << endl; + cerr << "bamtools filter ERROR: could not read script contents" << endl; return false; } @@ -671,7 +676,8 @@ bool FilterTool::FilterToolPrivate::ParseScript(void) { Json::Reader reader; if ( !reader.parse(document, root) ) { // use built-in error reporting mechanism to alert user what was wrong with the script - cerr << "Failed to parse configuration\n" << reader.getFormatedErrorMessages(); + cerr << "bamtools filter ERROR: failed to parse script - see error message(s) below" << endl + << reader.getFormatedErrorMessages(); return false; } @@ -736,28 +742,36 @@ bool FilterTool::FilterToolPrivate::Run(void) { // initialize defined properties & user-specified filters // quit if failed - if ( !SetupFilters() ) return 1; + if ( !SetupFilters() ) return false; // open reader without index BamMultiReader reader; - if ( !reader.Open(m_settings->InputFiles, false, true) ) { - cerr << "Could not open input files for reading." << endl; + if ( !reader.Open(m_settings->InputFiles) ) { + cerr << "bamtools filter ERROR: could not open input files for reading." << endl; return false; } + + // retrieve reader header & reference data const string headerText = reader.GetHeaderText(); filterToolReferences = reader.GetReferenceData(); - // open writer + // determine compression mode for BamWriter + bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && + !m_settings->IsForceCompression ); + BamWriter::CompressionMode compressionMode = BamWriter::Compressed; + if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed; + + // open BamWriter BamWriter writer; - bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression ); - if ( !writer.Open(m_settings->OutputFilename, headerText, filterToolReferences, writeUncompressed) ) { - cerr << "Could not open " << m_settings->OutputFilename << " for writing." << endl; + writer.SetCompressionMode(compressionMode); + if ( !writer.Open(m_settings->OutputFilename, headerText, filterToolReferences) ) { + cerr << "bamtools filter ERROR: could not open " << m_settings->OutputFilename << " for writing." << endl; + reader.Close(); return false; } - - BamAlignment al; - + // if no region specified, filter entire file + BamAlignment al; if ( !m_settings->HasRegion ) { while ( reader.GetNextAlignment(al) ) { if ( CheckAlignment(al) ) @@ -772,36 +786,29 @@ bool FilterTool::FilterToolPrivate::Run(void) { BamRegion region; if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) { - // attempt to re-open reader with index files - reader.Close(); - bool openedOK = reader.Open(m_settings->InputFiles, true, true ); - - // if error - if ( !openedOK ) { - cerr << "ERROR: Could not open input BAM file(s)... Aborting." << endl; - return 1; - } - - // if index data available, we can use SetRegion - if ( reader.IsIndexLoaded() ) { - + // attempt to find index files + reader.LocateIndexes(); + + // if index data available for all BAM files, we can use SetRegion + if ( reader.HasIndexes() ) { + // attempt to use SetRegion(), if failed report error if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) { - cerr << "ERROR: Region requested, but could not set BamReader region to REGION: " << m_settings->Region << " Aborting." << endl; + cerr << "bamtools filter ERROR: set region failed. Check that REGION describes a valid range" << endl; reader.Close(); - return 1; + return false; } // everything checks out, just iterate through specified region, filtering alignments - while ( reader.GetNextAlignment(al) ) + while ( reader.GetNextAlignment(al) ) if ( CheckAlignment(al) ) writer.SaveAlignment(al); - } + } // no index data available, we have to iterate through until we // find overlapping alignments else { - while ( reader.GetNextAlignment(al) ) { + while ( reader.GetNextAlignment(al) ) { if ( (al.RefID >= region.LeftRefID) && ((al.Position + al.Length) >= region.LeftPosition) && (al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) ) { @@ -814,26 +821,28 @@ bool FilterTool::FilterToolPrivate::Run(void) { // error parsing REGION string else { - cerr << "ERROR: Could not parse REGION - " << m_settings->Region << endl; - cerr << "Be sure REGION is in valid format (see README) and that coordinates are valid for selected references" << endl; + cerr << "bamtools filter ERROR: could not parse REGION: " << m_settings->Region << endl; + cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid" + << endl; reader.Close(); - return 1; + return false; } } // clean up & exit reader.Close(); writer.Close(); - return 0; + return true; } bool FilterTool::FilterToolPrivate::SetupFilters(void) { - // add known properties to FilterEngine + // set up filter engine with supported properties InitProperties(); // parse script for filter rules, if given - if ( m_settings->HasScriptFilename ) return ParseScript(); + if ( m_settings->HasScriptFilename ) + return ParseScript(); // otherwise check command line for filters else return ParseCommandLine();