1 // ***************************************************************************
2 // bamtools_filter.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 30 August 2010
7 // ---------------------------------------------------------------------------
8 // Filters a single BAM file (or filters multiple BAM files and merges)
9 // according to some user-specified criteria.
10 // ***************************************************************************
20 #include "bamtools_filter.h"
21 #include "bamtools_filter_engine.h"
22 #include "bamtools_options.h"
23 #include "bamtools_utilities.h"
24 #include "BamReader.h"
25 #include "BamMultiReader.h"
26 #include "BamWriter.h"
29 #include "jsoncpp/json.h"
32 using namespace BamTools;
37 // -------------------------------
38 // string literal constants
41 const string ALIGNMENTFLAG_PROPERTY = "alignmentFlag";
42 const string INSERTSIZE_PROPERTY = "insertSize";
43 const string ISDUPLICATE_PROPERTY = "isDuplicate";
44 const string ISFAILEDQC_PROPERTY = "isFailedQC";
45 const string ISFIRSTMATE_PROPERTY = "isFirstMate";
46 const string ISMAPPED_PROPERTY = "isMapped";
47 const string ISMATEMAPPED_PROPERTY = "isMateMapped";
48 const string ISMATEREVERSESTRAND_PROPERTY = "isMateReverseStrand";
49 const string ISPAIRED_PROPERTY = "isPaired";
50 const string ISPRIMARYALIGNMENT_PROPERTY = "isPrimaryAlignment";
51 const string ISPROPERPAIR_PROPERTY = "isProperPair";
52 const string ISREVERSESTRAND_PROPERTY = "isReverseStrand";
53 const string ISSECONDMATE_PROPERTY = "isSecondMate";
54 const string MAPQUALITY_PROPERTY = "mapQuality";
55 const string MATEPOSITION_PROPERTY = "matePosition";
56 const string MATEREFERENCE_PROPERTY = "mateReference";
57 const string NAME_PROPERTY = "name";
58 const string POSITION_PROPERTY = "position";
59 const string QUERYBASES_PROPERTY = "queryBases";
60 const string REFERENCE_PROPERTY = "reference";
63 const string TRUE_STR = "true";
64 const string FALSE_STR = "false";
66 } // namespace BamTools
68 // ---------------------------------------------
69 // FilterToolPrivate declaration
71 class FilterTool::FilterToolPrivate {
75 FilterToolPrivate(FilterTool::FilterSettings* settings);
76 ~FilterToolPrivate(void);
84 bool AddPropertyTokensToFilter(const string& filterName, const map<string, string>& propertyTokens);
85 bool CheckAlignment(const BamAlignment& al);
86 const string GetScriptContents(void);
87 void InitProperties(void);
88 bool ParseCommandLine(void);
89 bool ParseFilterObject(const string& filterName, const Json::Value& filterObject);
90 bool ParseScript(void);
91 bool SetupFilters(void);
95 vector<string> m_propertyNames;
96 FilterTool::FilterSettings* m_settings;
97 RefVector m_references;
100 // ---------------------------------------------
101 // FilterSettings implementation
103 struct FilterTool::FilterSettings {
105 // ----------------------------------
109 bool HasInputBamFilename;
110 bool HasOutputBamFilename;
112 bool HasScriptFilename;
115 vector<string> InputFiles;
116 string OutputFilename;
118 string ScriptFilename;
120 // -----------------------------------
121 // General filter opts
124 bool HasAlignmentFlagFilter;
125 bool HasInsertSizeFilter;
126 bool HasMapQualityFilter;
128 bool HasQueryBasesFilter;
130 // bool HasTagFilters;
133 string AlignmentFlagFilter;
134 string InsertSizeFilter;
136 string MapQualityFilter;
137 string QueryBasesFilter;
139 // vector<string> TagFilters;
141 // -----------------------------------
142 // AlignmentFlag filter opts
145 bool HasIsDuplicateFilter;
146 bool HasIsFailedQCFilter;
147 bool HasIsFirstMateFilter;
148 bool HasIsMappedFilter;
149 bool HasIsMateMappedFilter;
150 bool HasIsMateReverseStrandFilter;
151 bool HasIsPairedFilter;
152 bool HasIsPrimaryAlignmentFilter;
153 bool HasIsProperPairFilter;
154 bool HasIsReverseStrandFilter;
155 bool HasIsSecondMateFilter;
158 string IsDuplicateFilter;
159 string IsFailedQCFilter;
160 string IsFirstMateFilter;
161 string IsMappedFilter;
162 string IsMateMappedFilter;
163 string IsMateReverseStrandFilter;
164 string IsPairedFilter;
165 string IsPrimaryAlignmentFilter;
166 string IsProperPairFilter;
167 string IsReverseStrandFilter;
168 string IsSecondMateFilter;
170 // ---------------------------------
174 : HasInputBamFilename(false)
175 , HasOutputBamFilename(false)
177 , HasScriptFilename(false)
178 , OutputFilename(Options::StandardOut())
179 , HasAlignmentFlagFilter(false)
180 , HasInsertSizeFilter(false)
181 , HasMapQualityFilter(false)
182 , HasNameFilter(false)
183 , HasQueryBasesFilter(false)
184 // , HasTagFilters(false)
185 , HasIsDuplicateFilter(false)
186 , HasIsFailedQCFilter(false)
187 , HasIsFirstMateFilter(false)
188 , HasIsMappedFilter(false)
189 , HasIsMateMappedFilter(false)
190 , HasIsMateReverseStrandFilter(false)
191 , HasIsPairedFilter(false)
192 , HasIsPrimaryAlignmentFilter(false)
193 , HasIsProperPairFilter(false)
194 , HasIsReverseStrandFilter(false)
195 , HasIsSecondMateFilter(false)
196 , IsDuplicateFilter(TRUE_STR)
197 , IsFailedQCFilter(TRUE_STR)
198 , IsFirstMateFilter(TRUE_STR)
199 , IsMappedFilter(TRUE_STR)
200 , IsMateMappedFilter(TRUE_STR)
201 , IsMateReverseStrandFilter(TRUE_STR)
202 , IsPairedFilter(TRUE_STR)
203 , IsPrimaryAlignmentFilter(TRUE_STR)
204 , IsProperPairFilter(TRUE_STR)
205 , IsReverseStrandFilter(TRUE_STR)
206 , IsSecondMateFilter(TRUE_STR)
210 // ---------------------------------------------
211 // FilterTool implementation
213 FilterTool::FilterTool(void)
215 , m_settings(new FilterSettings)
218 // set program details
219 Options::SetProgramInfo("bamtools filter", "filters BAM file(s)", "-in <filename> [-in <filename> ... ] -out <filename> -region <REGION> [ [-script <filename] | [filterOptions] ]");
221 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
222 Options::AddValueOption("-in", "BAM filename", "the input BAM file(s)", "", m_settings->HasInputBamFilename, m_settings->InputFiles, IO_Opts, Options::StandardIn());
223 Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
224 Options::AddValueOption("-region", "REGION", "only read data from this genomic region (see README for more details)", "", m_settings->HasRegion, m_settings->Region, IO_Opts);
225 Options::AddValueOption("-script", "filename", "the filter script file (see README for more details)", "", m_settings->HasScriptFilename, m_settings->ScriptFilename, IO_Opts);
227 OptionGroup* FilterOpts = Options::CreateOptionGroup("General Filters");
228 Options::AddValueOption("-alignmentFlag", "int", "keep reads with this *exact* alignment flag (for more detailed queries, see below)", "", m_settings->HasAlignmentFlagFilter, m_settings->AlignmentFlagFilter, FilterOpts);
229 Options::AddValueOption("-insertSize", "int", "keep reads with insert size that mathces pattern", "", m_settings->HasInsertSizeFilter, m_settings->InsertSizeFilter, FilterOpts);
230 Options::AddValueOption("-mapQuality", "[0-255]", "keep reads with map quality that matches pattern", "", m_settings->HasMapQualityFilter, m_settings->MapQualityFilter, FilterOpts);
231 Options::AddValueOption("-name", "string", "keep reads with name that matches pattern", "", m_settings->HasNameFilter, m_settings->NameFilter, FilterOpts);
232 Options::AddValueOption("-queryBases", "string", "keep reads with motif that mathces pattern", "", m_settings->HasQueryBasesFilter, m_settings->QueryBasesFilter, FilterOpts);
234 // Options::AddValueOption("-tag", "TAG:VALUE", "keep reads with this key=>value pair. If multiple tags are given, reads must match all", "", m_settings->HasTagFilters, m_settings->TagFilters, FilterOpts);
236 OptionGroup* AlignmentFlagOpts = Options::CreateOptionGroup("Alignment Flag Filters");
237 Options::AddValueOption("-isDuplicate", "true/false", "keep only alignments that are marked as duplicate?", "", m_settings->HasIsDuplicateFilter, m_settings->IsDuplicateFilter, AlignmentFlagOpts, TRUE_STR);
238 Options::AddValueOption("-isFailedQC", "true/false", "keep only alignments that failed QC?", "", m_settings->HasIsFailedQCFilter, m_settings->IsFailedQCFilter, AlignmentFlagOpts, TRUE_STR);
239 Options::AddValueOption("-isFirstMate", "true/false", "keep only alignments marked as first mate?", "", m_settings->HasIsFirstMateFilter, m_settings->IsFirstMateFilter, AlignmentFlagOpts, TRUE_STR);
240 Options::AddValueOption("-isMapped", "true/false", "keep only alignments that were mapped?", "", m_settings->HasIsMappedFilter, m_settings->IsMappedFilter, AlignmentFlagOpts, TRUE_STR);
241 Options::AddValueOption("-isMateMapped", "true/false", "keep only alignments with mates that mapped", "", m_settings->HasIsMateMappedFilter, m_settings->IsMateMappedFilter, AlignmentFlagOpts, TRUE_STR);
242 Options::AddValueOption("-isMateReverseStrand", "true/false", "keep only alignments with mate on reverese strand?", "", m_settings->HasIsMateReverseStrandFilter, m_settings->IsMateReverseStrandFilter, AlignmentFlagOpts, TRUE_STR);
243 Options::AddValueOption("-isPaired", "true/false", "keep only alignments that were sequenced as paired?","", m_settings->HasIsPairedFilter, m_settings->IsPairedFilter, AlignmentFlagOpts, TRUE_STR);
244 Options::AddValueOption("-isPrimaryAlignment", "true/false", "keep only alignments marked as primary?", "", m_settings->HasIsPrimaryAlignmentFilter, m_settings->IsPrimaryAlignmentFilter, AlignmentFlagOpts, TRUE_STR);
245 Options::AddValueOption("-isProperPair", "true/false", "keep only alignments that passed PE resolution?", "", m_settings->HasIsProperPairFilter, m_settings->IsProperPairFilter, AlignmentFlagOpts, TRUE_STR);
246 Options::AddValueOption("-isReverseStrand", "true/false", "keep only alignments on reverse strand?", "", m_settings->HasIsReverseStrandFilter, m_settings->IsReverseStrandFilter, AlignmentFlagOpts, TRUE_STR);
247 Options::AddValueOption("-isSecondMate", "true/false", "keep only alignments marked as second mate?", "", m_settings->HasIsSecondMateFilter, m_settings->IsSecondMateFilter, AlignmentFlagOpts, TRUE_STR);
250 FilterTool::~FilterTool(void) {
258 int FilterTool::Help(void) {
259 Options::DisplayHelp();
263 int FilterTool::Run(int argc, char* argv[]) {
265 // parse command line arguments
266 Options::Parse(argc, argv, 1);
268 // run internal FilterTool implementation, return success/fail
269 m_impl = new FilterToolPrivate(m_settings);
271 if ( m_impl->Run() ) return 0;
275 // ---------------------------------------------
276 // FilterToolPrivate implementation
279 FilterTool::FilterToolPrivate::FilterToolPrivate(FilterTool::FilterSettings* settings)
280 : m_settings(settings)
284 FilterTool::FilterToolPrivate::~FilterToolPrivate(void) { }
286 bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filterName, const map<string, string>& propertyTokens) {
289 // dummy temp values for token parsing
292 uint16_t uint16Value;
293 uint32_t uint32Value;
295 PropertyFilterValue::ValueCompareType type;
297 // iterate over property token map
298 map<string, string>::const_iterator mapIter = propertyTokens.begin();
299 map<string, string>::const_iterator mapEnd = propertyTokens.end();
300 for ( ; mapIter != mapEnd; ++mapIter ) {
302 const string& propertyName = (*mapIter).first;
303 const string& token = (*mapIter).second;
305 // ------------------------------
306 // convert token to value & compare type
307 // then add to filter engine
310 if ( propertyName == ISDUPLICATE_PROPERTY ||
311 propertyName == ISFAILEDQC_PROPERTY ||
312 propertyName == ISFIRSTMATE_PROPERTY ||
313 propertyName == ISMAPPED_PROPERTY ||
314 propertyName == ISMATEMAPPED_PROPERTY ||
315 propertyName == ISMATEREVERSESTRAND_PROPERTY ||
316 propertyName == ISPAIRED_PROPERTY ||
317 propertyName == ISPRIMARYALIGNMENT_PROPERTY ||
318 propertyName == ISPROPERPAIR_PROPERTY ||
319 propertyName == ISREVERSESTRAND_PROPERTY ||
320 propertyName == ISSECONDMATE_PROPERTY
323 FilterEngine::parseToken(token, boolValue, type);
324 FilterEngine::setProperty(filterName, propertyName, boolValue, type);
327 // int32_t conversion
328 else if ( propertyName == INSERTSIZE_PROPERTY ||
329 propertyName == MATEPOSITION_PROPERTY ||
330 propertyName == POSITION_PROPERTY
333 FilterEngine::parseToken(token, int32Value, type);
334 FilterEngine::setProperty(filterName, propertyName, int32Value, type);
337 // uint16_t conversion
338 else if ( propertyName == MAPQUALITY_PROPERTY )
340 FilterEngine::parseToken(token, uint16Value, type);
341 FilterEngine::setProperty(filterName, propertyName, uint16Value, type);
344 // uint32_t conversion
345 else if ( propertyName == ALIGNMENTFLAG_PROPERTY )
347 FilterEngine::parseToken(token, uint32Value, type);
348 FilterEngine::setProperty(filterName, propertyName, uint32Value, type);
352 else if ( propertyName == MATEREFERENCE_PROPERTY ||
353 propertyName == NAME_PROPERTY ||
354 propertyName == QUERYBASES_PROPERTY ||
355 propertyName == REFERENCE_PROPERTY
358 FilterEngine::parseToken(token, stringValue, type);
359 FilterEngine::setProperty(filterName, propertyName, stringValue, type);
362 // else unknown property
364 cerr << "Unknown property: " << propertyName << "!" << endl;
371 bool FilterTool::FilterToolPrivate::CheckAlignment(const BamAlignment& al) {
373 bool keepAlignment = true;
375 // only consider properties that are actually enabled
376 // iterate over these enabled properties
377 const vector<string> enabledProperties = FilterEngine::enabledPropertyNames();
378 vector<string>::const_iterator propIter = enabledProperties.begin();
379 vector<string>::const_iterator propEnd = enabledProperties.end();
380 for ( ; propIter != propEnd; ++propIter ) {
382 // check alignment data field depending on propertyName
383 const string& propertyName = (*propIter);
384 if ( propertyName == ALIGNMENTFLAG_PROPERTY ) keepAlignment &= FilterEngine::check(ALIGNMENTFLAG_PROPERTY, al.AlignmentFlag);
385 else if ( propertyName == INSERTSIZE_PROPERTY ) keepAlignment &= FilterEngine::check(INSERTSIZE_PROPERTY, al.InsertSize);
386 else if ( propertyName == ISDUPLICATE_PROPERTY ) keepAlignment &= FilterEngine::check(ISDUPLICATE_PROPERTY, al.IsDuplicate());
387 else if ( propertyName == ISFAILEDQC_PROPERTY ) keepAlignment &= FilterEngine::check(ISFAILEDQC_PROPERTY, al.IsFailedQC());
388 else if ( propertyName == ISFIRSTMATE_PROPERTY ) keepAlignment &= FilterEngine::check(ISFIRSTMATE_PROPERTY, al.IsFirstMate());
389 else if ( propertyName == ISMAPPED_PROPERTY ) keepAlignment &= FilterEngine::check(ISMAPPED_PROPERTY, al.IsMapped());
390 else if ( propertyName == ISMATEMAPPED_PROPERTY ) keepAlignment &= FilterEngine::check(ISMATEMAPPED_PROPERTY, al.IsMateMapped());
391 else if ( propertyName == ISMATEREVERSESTRAND_PROPERTY ) keepAlignment &= FilterEngine::check(ISMATEREVERSESTRAND_PROPERTY, al.IsMateReverseStrand());
392 else if ( propertyName == ISPAIRED_PROPERTY ) keepAlignment &= FilterEngine::check(ISPAIRED_PROPERTY, al.IsPaired());
393 else if ( propertyName == ISPRIMARYALIGNMENT_PROPERTY ) keepAlignment &= FilterEngine::check(ISPRIMARYALIGNMENT_PROPERTY, al.IsPrimaryAlignment());
394 else if ( propertyName == ISPROPERPAIR_PROPERTY ) keepAlignment &= FilterEngine::check(ISPROPERPAIR_PROPERTY, al.IsProperPair());
395 else if ( propertyName == ISREVERSESTRAND_PROPERTY ) keepAlignment &= FilterEngine::check(ISREVERSESTRAND_PROPERTY, al.IsReverseStrand());
396 else if ( propertyName == ISSECONDMATE_PROPERTY ) keepAlignment &= FilterEngine::check(ISSECONDMATE_PROPERTY, al.IsSecondMate());
397 else if ( propertyName == MAPQUALITY_PROPERTY ) keepAlignment &= FilterEngine::check(MAPQUALITY_PROPERTY, al.MapQuality);
398 else if ( propertyName == MATEPOSITION_PROPERTY ) keepAlignment &= ( al.IsPaired() && al.IsMateMapped() && FilterEngine::check(MATEPOSITION_PROPERTY, al.MateRefID) );
399 else if ( propertyName == MATEREFERENCE_PROPERTY ) {
400 if ( !al.IsPaired() || !al.IsMateMapped() ) return false;
401 BAMTOOLS_ASSERT_MESSAGE( (al.MateRefID>=0 && (al.MateRefID<(int)m_references.size())), "Invalid MateRefID");
402 const string& refName = m_references.at(al.MateRefID).RefName;
403 keepAlignment &= FilterEngine::check(MATEREFERENCE_PROPERTY, refName);
405 else if ( propertyName == NAME_PROPERTY ) keepAlignment &= FilterEngine::check(NAME_PROPERTY, al.Name);
406 else if ( propertyName == POSITION_PROPERTY ) keepAlignment &= FilterEngine::check(POSITION_PROPERTY, al.Position);
407 else if ( propertyName == QUERYBASES_PROPERTY ) keepAlignment &= FilterEngine::check(QUERYBASES_PROPERTY, al.QueryBases);
408 else if ( propertyName == REFERENCE_PROPERTY ) {
409 BAMTOOLS_ASSERT_MESSAGE( (al.RefID>=0 && (al.RefID<(int)m_references.size())), "Invalid RefID");
410 const string& refName = m_references.at(al.RefID).RefName;
411 keepAlignment &= FilterEngine::check(REFERENCE_PROPERTY, refName);
413 else BAMTOOLS_ASSERT_MESSAGE( false, "Unknown property");
415 // if alignment fails at ANY point, just quit and return false
416 if ( !keepAlignment ) return false;
419 // return success (should still be true at this point)
420 return keepAlignment;
423 const string FilterTool::FilterToolPrivate::GetScriptContents(void) {
425 // open script for reading
426 FILE* inFile = fopen(m_settings->ScriptFilename.c_str(), "rb");
428 cerr << "FilterTool error: Could not open script: " << m_settings->ScriptFilename << " for reading" << endl;
432 // read in entire script contents
434 ostringstream docStream("");
437 // peek ahead, make sure there is data available
438 char ch = fgetc(inFile);
440 if( feof(inFile) ) break;
442 // read next block of data
443 if ( fgets(buffer, 1024, inFile) == 0 ) {
444 cerr << "FilterTool error : could not read from script" << endl;
454 // import buffer contents to document, return
455 string document = docStream.str();
459 void FilterTool::FilterToolPrivate::InitProperties(void) {
461 // store property names in vector
462 m_propertyNames.push_back(ALIGNMENTFLAG_PROPERTY);
463 m_propertyNames.push_back(INSERTSIZE_PROPERTY);
464 m_propertyNames.push_back(ISDUPLICATE_PROPERTY);
465 m_propertyNames.push_back(ISFAILEDQC_PROPERTY);
466 m_propertyNames.push_back(ISFIRSTMATE_PROPERTY);
467 m_propertyNames.push_back(ISMAPPED_PROPERTY);
468 m_propertyNames.push_back(ISMATEMAPPED_PROPERTY);
469 m_propertyNames.push_back(ISMATEREVERSESTRAND_PROPERTY);
470 m_propertyNames.push_back(ISPAIRED_PROPERTY);
471 m_propertyNames.push_back(ISPRIMARYALIGNMENT_PROPERTY);
472 m_propertyNames.push_back(ISPROPERPAIR_PROPERTY);
473 m_propertyNames.push_back(ISREVERSESTRAND_PROPERTY);
474 m_propertyNames.push_back(ISSECONDMATE_PROPERTY);
475 m_propertyNames.push_back(MAPQUALITY_PROPERTY);
476 m_propertyNames.push_back(MATEPOSITION_PROPERTY);
477 m_propertyNames.push_back(MATEREFERENCE_PROPERTY);
478 m_propertyNames.push_back(NAME_PROPERTY);
479 m_propertyNames.push_back(POSITION_PROPERTY);
480 m_propertyNames.push_back(QUERYBASES_PROPERTY);
481 m_propertyNames.push_back(REFERENCE_PROPERTY);
483 // add vector contents to FilterEngine
484 vector<string>::const_iterator propertyNameIter = m_propertyNames.begin();
485 vector<string>::const_iterator propertyNameEnd = m_propertyNames.end();
486 for ( ; propertyNameIter != propertyNameEnd; ++propertyNameIter )
487 FilterEngine::addProperty((*propertyNameIter));
490 bool FilterTool::FilterToolPrivate::ParseCommandLine(void) {
492 // add a rule set to filter engine
493 const string CMD = "COMMAND_LINE";
494 FilterEngine::addFilter(CMD);
496 // map property names to command line args
497 map<string, string> propertyTokens;
498 if ( m_settings->HasAlignmentFlagFilter ) propertyTokens.insert( make_pair(ALIGNMENTFLAG_PROPERTY, m_settings->AlignmentFlagFilter) );
499 if ( m_settings->HasInsertSizeFilter ) propertyTokens.insert( make_pair(INSERTSIZE_PROPERTY, m_settings->InsertSizeFilter) );
500 if ( m_settings->HasIsDuplicateFilter ) propertyTokens.insert( make_pair(ISDUPLICATE_PROPERTY, m_settings->IsDuplicateFilter) );
501 if ( m_settings->HasIsFailedQCFilter ) propertyTokens.insert( make_pair(ISFAILEDQC_PROPERTY, m_settings->IsFailedQCFilter) );
502 if ( m_settings->HasIsFirstMateFilter ) propertyTokens.insert( make_pair(ISFIRSTMATE_PROPERTY, m_settings->IsFirstMateFilter) );
503 if ( m_settings->HasIsMappedFilter ) propertyTokens.insert( make_pair(ISMAPPED_PROPERTY, m_settings->IsMappedFilter) );
504 if ( m_settings->HasIsMateMappedFilter ) propertyTokens.insert( make_pair(ISMATEMAPPED_PROPERTY, m_settings->IsMateMappedFilter) );
505 if ( m_settings->HasIsMateReverseStrandFilter ) propertyTokens.insert( make_pair(ISMATEREVERSESTRAND_PROPERTY, m_settings->IsMateReverseStrandFilter) );
506 if ( m_settings->HasIsPairedFilter ) propertyTokens.insert( make_pair(ISPAIRED_PROPERTY, m_settings->IsPairedFilter) );
507 if ( m_settings->HasIsPrimaryAlignmentFilter ) propertyTokens.insert( make_pair(ISPRIMARYALIGNMENT_PROPERTY, m_settings->IsPrimaryAlignmentFilter) );
508 if ( m_settings->HasIsProperPairFilter ) propertyTokens.insert( make_pair(ISPROPERPAIR_PROPERTY, m_settings->IsProperPairFilter) );
509 if ( m_settings->HasIsReverseStrandFilter ) propertyTokens.insert( make_pair(ISREVERSESTRAND_PROPERTY, m_settings->IsReverseStrandFilter) );
510 if ( m_settings->HasIsSecondMateFilter ) propertyTokens.insert( make_pair(ISSECONDMATE_PROPERTY, m_settings->IsSecondMateFilter) );
511 if ( m_settings->HasMapQualityFilter ) propertyTokens.insert( make_pair(MAPQUALITY_PROPERTY, m_settings->MapQualityFilter) );
512 if ( m_settings->HasNameFilter ) propertyTokens.insert( make_pair(NAME_PROPERTY, m_settings->NameFilter) );
513 if ( m_settings->HasQueryBasesFilter ) propertyTokens.insert( make_pair(QUERYBASES_PROPERTY, m_settings->QueryBasesFilter) );
515 // send add these properties to filter set "COMMAND_LINE"
516 return AddPropertyTokensToFilter(CMD, propertyTokens);
519 bool FilterTool::FilterToolPrivate::ParseFilterObject(const string& filterName, const Json::Value& filterObject) {
521 // filter object parsing variables
522 Json::Value null(Json::nullValue);
523 Json::Value propertyValue;
526 map<string, string> propertyTokens;
528 // iterate over known properties
529 vector<string>::const_iterator propertyNameIter = m_propertyNames.begin();
530 vector<string>::const_iterator propertyNameEnd = m_propertyNames.end();
531 for ( ; propertyNameIter != propertyNameEnd; ++propertyNameIter ) {
532 const string& propertyName = (*propertyNameIter);
534 // if property defined in filter, add to token list
535 propertyValue = filterObject.get(propertyName, null);
536 if ( propertyValue != null )
537 propertyTokens.insert( make_pair(propertyName, propertyValue.asString()) );
540 // add this filter to engin
541 FilterEngine::addFilter(filterName);
543 // add token list to this filter
544 return AddPropertyTokensToFilter(filterName, propertyTokens);
547 bool FilterTool::FilterToolPrivate::ParseScript(void) {
549 // read in script contents from file
550 const string document = GetScriptContents();
552 // set up JsonCPP reader and attempt to parse script
555 if ( !reader.parse(document, root) ) {
556 // use built-in error reporting mechanism to alert user what was wrong with the script
557 cerr << "Failed to parse configuration\n" << reader.getFormatedErrorMessages();
561 // see if root object contains multiple filters
562 const Json::Value filters = root["filters"];
563 if ( !filters.isNull() ) {
567 // iterate over any filters found
569 Json::Value::const_iterator filtersIter = filters.begin();
570 Json::Value::const_iterator filtersEnd = filters.end();
571 for ( ; filtersIter != filtersEnd; ++filtersIter, ++filterIndex ) {
572 Json::Value filter = (*filtersIter);
574 // convert filter index to string
575 stringstream convert;
576 convert << filterIndex;
577 const string filterName = convert.str();
579 // create & parse filter
580 success &= ParseFilterObject(filterName, filter);
586 // otherwise, root is the only filter (just contains properties)
587 // create & parse filter named "ROOT"
588 else return ParseFilterObject("ROOT", root);
592 bool FilterTool::FilterToolPrivate::Run(void) {
594 // set to default input if none provided
595 if ( !m_settings->HasInputBamFilename )
596 m_settings->InputFiles.push_back(Options::StandardIn());
598 // initialize defined properties & user-specified filters
600 if ( !SetupFilters() ) return 1;
602 // open reader without index
603 BamMultiReader reader;
604 reader.Open(m_settings->InputFiles, false, true);
605 const string headerText = reader.GetHeaderText();
606 m_references = reader.GetReferenceData();
610 writer.Open(m_settings->OutputFilename, headerText, m_references);
612 // set up error handling
613 ostringstream errorStream("");
614 bool foundError(false);
616 // if no REGION specified, run filter on entire file contents
617 if ( !m_settings->HasRegion ) {
619 while ( reader.GetNextAlignment(al) ) {
620 // perform main filter check
621 if ( CheckAlignment(al) )
622 writer.SaveAlignment(al);
629 // attempt to parse string into BamRegion struct
631 if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
633 // check if there are index files *.bai/*.bti corresponding to the input files
634 bool hasDefaultIndex = false;
635 bool hasBamtoolsIndex = false;
636 bool hasNoIndex = false;
637 int defaultIndexCount = 0;
638 int bamtoolsIndexCount = 0;
639 for (vector<string>::const_iterator f = m_settings->InputFiles.begin(); f != m_settings->InputFiles.end(); ++f) {
641 if ( Utilities::FileExists(*f + ".bai") ) {
642 hasDefaultIndex = true;
646 if ( Utilities::FileExists(*f + ".bti") ) {
647 hasBamtoolsIndex = true;
648 ++bamtoolsIndexCount;
651 if ( !hasDefaultIndex && !hasBamtoolsIndex ) {
653 cerr << "*WARNING - could not find index file for " << *f
654 << ", parsing whole file(s) to get alignment counts for target region"
655 << " (could be slow)" << endl;
660 // determine if index file types are heterogeneous
661 bool hasDifferentIndexTypes = false;
662 if ( defaultIndexCount > 0 && bamtoolsIndexCount > 0 ) {
663 hasDifferentIndexTypes = true;
664 cerr << "*WARNING - different index file formats found"
665 << ", parsing whole file(s) to get alignment counts for target region"
666 << " (could be slow)" << endl;
669 // if any input file has no index, or if input files use different index formats
670 // can't use BamMultiReader to jump directly (**for now**)
671 if ( hasNoIndex || hasDifferentIndexTypes ) {
673 // read through sequentially, but onlt perform filter on reads overlapping REGION
675 while( reader.GetNextAlignment(al) ) {
676 if ( (al.RefID >= region.LeftRefID) && ( (al.Position + al.Length) >= region.LeftPosition ) &&
677 (al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) )
679 // perform main filter check
680 if ( CheckAlignment(al) )
681 writer.SaveAlignment(al);
686 // has index file for each input file (and same format)
689 // this is kind of a hack...?
690 BamMultiReader reader;
691 reader.Open(m_settings->InputFiles, true, true, hasDefaultIndex );
693 if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) {
695 errorStream << "Could not set BamReader region to REGION: " << m_settings->Region << endl;
698 // filter only alignments from specified region
700 while ( reader.GetNextAlignment(al) ) {
701 // perform main filter check
702 if ( CheckAlignment(al) )
703 writer.SaveAlignment(al);
710 errorStream << "Could not parse REGION: " << m_settings->Region << endl;
711 errorStream << "Be sure REGION is in valid format (see README) and that coordinates are valid for selected references" << endl;
721 bool FilterTool::FilterToolPrivate::SetupFilters(void) {
723 // add known properties to FilterEngine
726 // parse script for filter rules, if given
727 if ( m_settings->HasScriptFilename ) return ParseScript();
729 // otherwise check command line for filters
730 else return ParseCommandLine();