// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
-// Last modified: 4 October 2010
+// Last modified: 21 March 2011
// ---------------------------------------------------------------------------
// Filters BAM file(s) according to some user-specified criteria.
// ***************************************************************************
+#include "bamtools_filter.h"
+
+#include <api/BamMultiReader.h>
+#include <api/BamWriter.h>
+#include <utils/bamtools_filter_engine.h>
+#include <utils/bamtools_options.h>
+#include <utils/bamtools_utilities.h>
+using namespace BamTools;
+
+#include <jsoncpp/json.h>
+using namespace Json;
+
#include <cstdio>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
-#include "bamtools_filter.h"
-#include "bamtools_filter_engine.h"
-#include "bamtools_options.h"
-#include "bamtools_utilities.h"
-#include "BamReader.h"
-#include "BamMultiReader.h"
-#include "BamWriter.h"
-#include "jsoncpp/json.h"
using namespace std;
-using namespace BamTools;
-using namespace Json;
namespace BamTools {
const PropertyFilterValue& valueFilter = (*propertyIter).second;
if ( propertyName == ALIGNMENTFLAG_PROPERTY ) keepAlignment &= valueFilter.check(al.AlignmentFlag);
- else if ( propertyName == CIGAR_PROPERTY ) {
+ else if ( propertyName == CIGAR_PROPERTY ) {
stringstream cigarSs;
const vector<CigarOp>& cigarData = al.CigarData;
if ( !cigarData.empty() ) {
bool checkAlignmentTag(const PropertyFilterValue& valueFilter, const BamAlignment& al) {
- // ensure filter contains string data
- Variant entireTagFilter = valueFilter.Value;
- if ( !entireTagFilter.is_type<string>() ) return false;
-
- // localize string from variant
- const string& entireTagFilterString = entireTagFilter.get<string>();
-
- // ensure we have at least "XX:x"
- if ( entireTagFilterString.length() < 4 ) return false;
-
- // get tagName & lookup in alignment
- // if found, set tagType to tag type character
- // if not found, return false
- const string& tagName = entireTagFilterString.substr(0,2);
- char tagType = '\0';
- if ( !al.GetTagType(tagName, tagType) ) return false;
-
- // remove tagName & ":" from beginning tagFilter
- string tagFilterString = entireTagFilterString.substr(3);
-
- // switch on tag type to set tag query value & parse filter token
- int32_t intFilterValue, intQueryValue;
- uint32_t uintFilterValue, uintQueryValue;
- float realFilterValue, realQueryValue;
- string stringFilterValue, stringQueryValue;
-
- PropertyFilterValue tagFilter;
- PropertyFilterValue::ValueCompareType compareType;
- bool keepAlignment = false;
- switch (tagType) {
-
- // signed int tag type
- case 'c' :
- case 's' :
+ // ensure filter contains string data
+ Variant entireTagFilter = valueFilter.Value;
+ if ( !entireTagFilter.is_type<string>() ) return false;
+
+ // localize string from variant
+ const string& entireTagFilterString = entireTagFilter.get<string>();
+
+ // ensure we have at least "XX:x"
+ if ( entireTagFilterString.length() < 4 ) return false;
+
+ // get tagName & lookup in alignment
+ // if found, set tagType to tag type character
+ // if not found, return false
+ const string& tagName = entireTagFilterString.substr(0,2);
+ char tagType = '\0';
+ if ( !al.GetTagType(tagName, tagType) ) return false;
+
+ // remove tagName & ":" from beginning tagFilter
+ string tagFilterString = entireTagFilterString.substr(3);
+
+ // switch on tag type to set tag query value & parse filter token
+ int32_t intFilterValue, intQueryValue;
+ uint32_t uintFilterValue, uintQueryValue;
+ float realFilterValue, realQueryValue;
+ string stringFilterValue, stringQueryValue;
+
+ PropertyFilterValue tagFilter;
+ PropertyFilterValue::ValueCompareType compareType;
+ bool keepAlignment = false;
+ switch (tagType) {
+
+ // signed int tag type
+ case 'c' :
+ case 's' :
case 'i' :
- if ( al.GetTag(tagName, intQueryValue) ) {
- if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, intFilterValue, compareType) ) {
- tagFilter.Value = intFilterValue;
- tagFilter.Type = compareType;
- keepAlignment = tagFilter.check(intQueryValue);
- }
- }
- break;
-
- // unsigned int tag type
+ if ( al.GetTag(tagName, intQueryValue) ) {
+ if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, intFilterValue, compareType) ) {
+ tagFilter.Value = intFilterValue;
+ tagFilter.Type = compareType;
+ keepAlignment = tagFilter.check(intQueryValue);
+ }
+ }
+ break;
+
+ // unsigned int tag type
case 'C' :
case 'S' :
- case 'I' :
- if ( al.GetTag(tagName, uintQueryValue) ) {
- if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, uintFilterValue, compareType) ) {
- tagFilter.Value = uintFilterValue;
- tagFilter.Type = compareType;
- keepAlignment = tagFilter.check(uintQueryValue);
- }
- }
- break;
-
- // 'real' tag type
+ case 'I' :
+ if ( al.GetTag(tagName, uintQueryValue) ) {
+ if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, uintFilterValue, compareType) ) {
+ tagFilter.Value = uintFilterValue;
+ tagFilter.Type = compareType;
+ keepAlignment = tagFilter.check(uintQueryValue);
+ }
+ }
+ break;
+
+ // 'real' tag type
case 'f' :
- if ( al.GetTag(tagName, realQueryValue) ) {
- if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, realFilterValue, compareType) ) {
- tagFilter.Value = realFilterValue;
- tagFilter.Type = compareType;
- keepAlignment = tagFilter.check(realQueryValue);
- }
- }
+ if ( al.GetTag(tagName, realQueryValue) ) {
+ if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, realFilterValue, compareType) ) {
+ tagFilter.Value = realFilterValue;
+ tagFilter.Type = compareType;
+ keepAlignment = tagFilter.check(realQueryValue);
+ }
+ }
break;
-
- // string tag type
+
+ // string tag type
case 'A':
case 'Z':
- case 'H':
+ case 'H':
if ( al.GetTag(tagName, stringQueryValue) ) {
- if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, stringFilterValue, compareType) ) {
- tagFilter.Value = stringFilterValue;
- tagFilter.Type = compareType;
- keepAlignment = tagFilter.check(stringQueryValue);
- }
- }
- break;
-
- // unknown tag type
- default :
- keepAlignment = false;
- }
-
- return keepAlignment;
+ if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, stringFilterValue, compareType) ) {
+ tagFilter.Value = stringFilterValue;
+ tagFilter.Type = compareType;
+ keepAlignment = tagFilter.check(stringQueryValue);
+ }
+ }
+ break;
+
+ // unknown tag type
+ default :
+ keepAlignment = false;
+ }
+
+ return keepAlignment;
}
};
OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInputBamFilename, m_settings->InputFiles, IO_Opts, Options::StandardIn());
Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
- Options::AddValueOption("-region", "REGION", "only read data from this genomic region (see README for more details)", "", m_settings->HasRegion, m_settings->Region, IO_Opts);
- Options::AddValueOption("-script", "filename", "the filter script file (see README for more details)", "", m_settings->HasScriptFilename, m_settings->ScriptFilename, IO_Opts);
+ Options::AddValueOption("-region", "REGION", "only read data from this genomic region (see documentation for more details)", "", m_settings->HasRegion, m_settings->Region, IO_Opts);
+ Options::AddValueOption("-script", "filename", "the filter script file (see documentation for more details)", "", m_settings->HasScriptFilename, m_settings->ScriptFilename, IO_Opts);
Options::AddOption("-forceCompression", "if results are sent to stdout (like when piping to another tool), default behavior is to leave output uncompressed. Use this flag to override and force compression", m_settings->IsForceCompression, IO_Opts);
OptionGroup* FilterOpts = Options::CreateOptionGroup("General Filters");
Options::AddValueOption("-mapQuality", "[0-255]", "keep reads with map quality that matches pattern", "", m_settings->HasMapQualityFilter, m_settings->MapQualityFilter, FilterOpts);
Options::AddValueOption("-name", "string", "keep reads with name that matches pattern", "", m_settings->HasNameFilter, m_settings->NameFilter, FilterOpts);
Options::AddValueOption("-queryBases", "string", "keep reads with motif that mathces pattern", "", m_settings->HasQueryBasesFilter, m_settings->QueryBasesFilter, FilterOpts);
- Options::AddValueOption("-tag", "TAG:VALUE", "keep reads with this key=>value pair", "", m_settings->HasTagFilter, m_settings->TagFilter, FilterOpts);
+ Options::AddValueOption("-tag", "TAG:VALUE", "keep reads with this key=>value pair", "", m_settings->HasTagFilter, m_settings->TagFilter, FilterOpts);
OptionGroup* AlignmentFlagOpts = Options::CreateOptionGroup("Alignment Flag Filters");
Options::AddValueOption("-isDuplicate", "true/false", "keep only alignments that are marked as duplicate?", "", m_settings->HasIsDuplicateFilter, m_settings->IsDuplicateFilter, AlignmentFlagOpts, TRUE_STR);
// destructor
FilterTool::FilterToolPrivate::~FilterToolPrivate(void) { }
-bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filterName, const map<string, string>& propertyTokens) {
-
+bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filterName,
+ const map<string,
+ string>& propertyTokens)
+{
// dummy temp values for token parsing
bool boolValue;
int32_t int32Value;
m_filterEngine.setProperty(filterName, propertyName, stringValue, type);
}
- else if (propertyName == TAG_PROPERTY ) {
+ else if ( propertyName == TAG_PROPERTY ) {
// this will be stored directly as the TAG:VALUE token
// (VALUE may contain compare ops, will be parsed out later)
m_filterEngine.setProperty(filterName, propertyName, token, PropertyFilterValue::EXACT);
// else unknown property
else {
- cerr << "Unknown property: " << propertyName << "!" << endl;
+ cerr << "bamtools filter ERROR: unknown property - " << propertyName << endl;
return false;
}
}
// open script for reading
FILE* inFile = fopen(m_settings->ScriptFilename.c_str(), "rb");
if ( !inFile ) {
- cerr << "FilterTool error: Could not open script: " << m_settings->ScriptFilename << " for reading" << endl;
+ cerr << "bamtools filter ERROR: could not open script: "
+ << m_settings->ScriptFilename << " for reading" << endl;
return false;
}
// read next block of data
if ( fgets(buffer, 1024, inFile) == 0 ) {
- cerr << "FilterTool error : could not read from script" << endl;
+ cerr << "bamtools filter ERROR: could not read script contents" << endl;
return false;
}
Json::Reader reader;
if ( !reader.parse(document, root) ) {
// use built-in error reporting mechanism to alert user what was wrong with the script
- cerr << "Failed to parse configuration\n" << reader.getFormatedErrorMessages();
+ cerr << "bamtools filter ERROR: failed to parse script - see error message(s) below" << endl
+ << reader.getFormatedErrorMessages();
return false;
}
// initialize defined properties & user-specified filters
// quit if failed
- if ( !SetupFilters() ) return 1;
+ if ( !SetupFilters() ) return false;
// open reader without index
BamMultiReader reader;
- if ( !reader.Open(m_settings->InputFiles, false, true) ) {
- cerr << "Could not open input files for reading." << endl;
+ if ( !reader.Open(m_settings->InputFiles) ) {
+ cerr << "bamtools filter ERROR: could not open input files for reading." << endl;
return false;
}
+
+ // retrieve reader header & reference data
const string headerText = reader.GetHeaderText();
filterToolReferences = reader.GetReferenceData();
- // open writer
+ // determine compression mode for BamWriter
+ bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() &&
+ !m_settings->IsForceCompression );
+ BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
+ if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed;
+
+ // open BamWriter
BamWriter writer;
- bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression );
- if ( !writer.Open(m_settings->OutputFilename, headerText, filterToolReferences, writeUncompressed) ) {
- cerr << "Could not open " << m_settings->OutputFilename << " for writing." << endl;
+ writer.SetCompressionMode(compressionMode);
+ if ( !writer.Open(m_settings->OutputFilename, headerText, filterToolReferences) ) {
+ cerr << "bamtools filter ERROR: could not open " << m_settings->OutputFilename << " for writing." << endl;
+ reader.Close();
return false;
}
-
- BamAlignment al;
-
+
// if no region specified, filter entire file
+ BamAlignment al;
if ( !m_settings->HasRegion ) {
while ( reader.GetNextAlignment(al) ) {
if ( CheckAlignment(al) )
BamRegion region;
if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
- // attempt to re-open reader with index files
- reader.Close();
- bool openedOK = reader.Open(m_settings->InputFiles, true, true );
-
- // if error
- if ( !openedOK ) {
- cerr << "ERROR: Could not open input BAM file(s)... Aborting." << endl;
- return 1;
- }
-
- // if index data available, we can use SetRegion
- if ( reader.IsIndexLoaded() ) {
-
+ // attempt to find index files
+ reader.LocateIndexes();
+
+ // if index data available for all BAM files, we can use SetRegion
+ if ( reader.HasIndexes() ) {
+
// attempt to use SetRegion(), if failed report error
if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) {
- cerr << "ERROR: Region requested, but could not set BamReader region to REGION: " << m_settings->Region << " Aborting." << endl;
+ cerr << "bamtools filter ERROR: set region failed. Check that REGION describes a valid range" << endl;
reader.Close();
- return 1;
+ return false;
}
// everything checks out, just iterate through specified region, filtering alignments
- while ( reader.GetNextAlignmentCore(al) )
+ while ( reader.GetNextAlignment(al) )
if ( CheckAlignment(al) )
writer.SaveAlignment(al);
- }
+ }
// no index data available, we have to iterate through until we
// find overlapping alignments
else {
- while ( reader.GetNextAlignmentCore(al) ) {
+ while ( reader.GetNextAlignment(al) ) {
if ( (al.RefID >= region.LeftRefID) && ((al.Position + al.Length) >= region.LeftPosition) &&
(al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) )
{
// error parsing REGION string
else {
- cerr << "ERROR: Could not parse REGION - " << m_settings->Region << endl;
- cerr << "Be sure REGION is in valid format (see README) and that coordinates are valid for selected references" << endl;
+ cerr << "bamtools filter ERROR: could not parse REGION: " << m_settings->Region << endl;
+ cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid"
+ << endl;
reader.Close();
- return 1;
+ return false;
}
}
// clean up & exit
reader.Close();
writer.Close();
- return 0;
+ return true;
}
bool FilterTool::FilterToolPrivate::SetupFilters(void) {
- // add known properties to FilterEngine<BamAlignmentChecker>
+ // set up filter engine with supported properties
InitProperties();
// parse script for filter rules, if given
- if ( m_settings->HasScriptFilename ) return ParseScript();
+ if ( m_settings->HasScriptFilename )
+ return ParseScript();
// otherwise check command line for filters
else return ParseCommandLine();