1 // ***************************************************************************
2 // bamtools_filter.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 3 May 2013
6 // ---------------------------------------------------------------------------
7 // Filters BAM file(s) according to some user-specified criteria
8 // ***************************************************************************
10 #include "bamtools_filter.h"
12 #include <api/BamMultiReader.h>
13 #include <api/BamWriter.h>
14 #include <utils/bamtools_filter_engine.h>
15 #include <utils/bamtools_options.h>
16 #include <utils/bamtools_utilities.h>
17 using namespace BamTools;
19 #include <jsoncpp/json.h>
32 // -------------------------------
33 // string literal constants
36 const string ALIGNMENTFLAG_PROPERTY = "alignmentFlag";
37 const string CIGAR_PROPERTY = "cigar";
38 const string INSERTSIZE_PROPERTY = "insertSize";
39 const string ISDUPLICATE_PROPERTY = "isDuplicate";
40 const string ISFAILEDQC_PROPERTY = "isFailedQC";
41 const string ISFIRSTMATE_PROPERTY = "isFirstMate";
42 const string ISMAPPED_PROPERTY = "isMapped";
43 const string ISMATEMAPPED_PROPERTY = "isMateMapped";
44 const string ISMATEREVERSESTRAND_PROPERTY = "isMateReverseStrand";
45 const string ISPAIRED_PROPERTY = "isPaired";
46 const string ISPRIMARYALIGNMENT_PROPERTY = "isPrimaryAlignment";
47 const string ISPROPERPAIR_PROPERTY = "isProperPair";
48 const string ISREVERSESTRAND_PROPERTY = "isReverseStrand";
49 const string ISSECONDMATE_PROPERTY = "isSecondMate";
50 const string ISSINGLETON_PROPERTY = "isSingleton";
51 const string LENGTH_PROPERTY = "length";
52 const string MAPQUALITY_PROPERTY = "mapQuality";
53 const string MATEPOSITION_PROPERTY = "matePosition";
54 const string MATEREFERENCE_PROPERTY = "mateReference";
55 const string NAME_PROPERTY = "name";
56 const string POSITION_PROPERTY = "position";
57 const string QUERYBASES_PROPERTY = "queryBases";
58 const string REFERENCE_PROPERTY = "reference";
59 const string TAG_PROPERTY = "tag";
62 const string TRUE_STR = "true";
63 const string FALSE_STR = "false";
65 RefVector filterToolReferences;
67 struct BamAlignmentChecker {
68 bool check(const PropertyFilter& filter, const BamAlignment& al) {
70 bool keepAlignment = true;
71 const PropertyMap& properties = filter.Properties;
72 PropertyMap::const_iterator propertyIter = properties.begin();
73 PropertyMap::const_iterator propertyEnd = properties.end();
74 for ( ; propertyIter != propertyEnd; ++propertyIter ) {
76 // check alignment data field depending on propertyName
77 const string& propertyName = (*propertyIter).first;
78 const PropertyFilterValue& valueFilter = (*propertyIter).second;
80 if ( propertyName == ALIGNMENTFLAG_PROPERTY ) keepAlignment &= valueFilter.check(al.AlignmentFlag);
81 else if ( propertyName == CIGAR_PROPERTY ) {
83 const vector<CigarOp>& cigarData = al.CigarData;
84 if ( !cigarData.empty() ) {
85 vector<CigarOp>::const_iterator cigarBegin = cigarData.begin();
86 vector<CigarOp>::const_iterator cigarIter = cigarBegin;
87 vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
88 for ( ; cigarIter != cigarEnd; ++cigarIter ) {
89 const CigarOp& op = (*cigarIter);
90 cigarSs << op.Length << op.Type;
92 keepAlignment &= valueFilter.check(cigarSs.str());
95 else if ( propertyName == INSERTSIZE_PROPERTY ) keepAlignment &= valueFilter.check(al.InsertSize);
96 else if ( propertyName == ISDUPLICATE_PROPERTY ) keepAlignment &= valueFilter.check(al.IsDuplicate());
97 else if ( propertyName == ISFAILEDQC_PROPERTY ) keepAlignment &= valueFilter.check(al.IsFailedQC());
98 else if ( propertyName == ISFIRSTMATE_PROPERTY ) keepAlignment &= valueFilter.check(al.IsFirstMate());
99 else if ( propertyName == ISMAPPED_PROPERTY ) keepAlignment &= valueFilter.check(al.IsMapped());
100 else if ( propertyName == ISMATEMAPPED_PROPERTY ) keepAlignment &= valueFilter.check(al.IsMateMapped());
101 else if ( propertyName == ISMATEREVERSESTRAND_PROPERTY ) keepAlignment &= valueFilter.check(al.IsMateReverseStrand());
102 else if ( propertyName == ISPAIRED_PROPERTY ) keepAlignment &= valueFilter.check(al.IsPaired());
103 else if ( propertyName == ISPRIMARYALIGNMENT_PROPERTY ) keepAlignment &= valueFilter.check(al.IsPrimaryAlignment());
104 else if ( propertyName == ISPROPERPAIR_PROPERTY ) keepAlignment &= valueFilter.check(al.IsProperPair());
105 else if ( propertyName == ISREVERSESTRAND_PROPERTY ) keepAlignment &= valueFilter.check(al.IsReverseStrand());
106 else if ( propertyName == ISSECONDMATE_PROPERTY ) keepAlignment &= valueFilter.check(al.IsSecondMate());
107 else if ( propertyName == ISSINGLETON_PROPERTY ) {
108 const bool isSingleton = al.IsPaired() && al.IsMapped() && !al.IsMateMapped();
109 keepAlignment &= valueFilter.check(isSingleton);
111 else if ( propertyName == LENGTH_PROPERTY ) keepAlignment &= valueFilter.check(al.Length);
112 else if ( propertyName == MAPQUALITY_PROPERTY ) keepAlignment &= valueFilter.check(al.MapQuality);
113 else if ( propertyName == MATEPOSITION_PROPERTY ) keepAlignment &= ( al.IsPaired() && al.IsMateMapped() && valueFilter.check(al.MateRefID) );
114 else if ( propertyName == MATEREFERENCE_PROPERTY ) {
115 if ( !al.IsPaired() || !al.IsMateMapped() ) return false;
116 BAMTOOLS_ASSERT_MESSAGE( (al.MateRefID>=0 && (al.MateRefID<(int)filterToolReferences.size())), "Invalid MateRefID");
117 const string& refName = filterToolReferences.at(al.MateRefID).RefName;
118 keepAlignment &= valueFilter.check(refName);
120 else if ( propertyName == NAME_PROPERTY ) keepAlignment &= valueFilter.check(al.Name);
121 else if ( propertyName == POSITION_PROPERTY ) keepAlignment &= valueFilter.check(al.Position);
122 else if ( propertyName == QUERYBASES_PROPERTY ) keepAlignment &= valueFilter.check(al.QueryBases);
123 else if ( propertyName == REFERENCE_PROPERTY ) {
124 BAMTOOLS_ASSERT_MESSAGE( (al.RefID>=0 && (al.RefID<(int)filterToolReferences.size())), "Invalid RefID");
125 const string& refName = filterToolReferences.at(al.RefID).RefName;
126 keepAlignment &= valueFilter.check(refName);
128 else if ( propertyName == TAG_PROPERTY ) keepAlignment &= checkAlignmentTag(valueFilter, al);
129 else BAMTOOLS_ASSERT_UNREACHABLE;
131 // if alignment fails at ANY point, just quit and return false
132 if ( !keepAlignment ) return false;
135 BAMTOOLS_ASSERT_MESSAGE( keepAlignment, "Error in BamAlignmentChecker... keepAlignment should be true here");
136 return keepAlignment;
139 bool checkAlignmentTag(const PropertyFilterValue& valueFilter, const BamAlignment& al) {
141 // ensure filter contains string data
142 Variant entireTagFilter = valueFilter.Value;
143 if ( !entireTagFilter.is_type<string>() ) return false;
145 // localize string from variant
146 const string& entireTagFilterString = entireTagFilter.get<string>();
148 // ensure we have at least "XX:x"
149 if ( entireTagFilterString.length() < 4 ) return false;
151 // get tagName & lookup in alignment
152 // if found, set tagType to tag type character
153 // if not found, return false
154 const string& tagName = entireTagFilterString.substr(0,2);
156 if ( !al.GetTagType(tagName, tagType) ) return false;
158 // remove tagName & ":" from beginning tagFilter
159 string tagFilterString = entireTagFilterString.substr(3);
161 // switch on tag type to set tag query value & parse filter token
162 int8_t asciiFilterValue, asciiQueryValue;
163 int32_t intFilterValue, intQueryValue;
164 uint32_t uintFilterValue, uintQueryValue;
165 float realFilterValue, realQueryValue;
166 string stringFilterValue, stringQueryValue;
168 PropertyFilterValue tagFilter;
169 PropertyFilterValue::ValueCompareType compareType;
170 bool keepAlignment = false;
175 if ( al.GetTag(tagName, asciiQueryValue) ) {
176 if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, asciiFilterValue, compareType) ) {
177 tagFilter.Value = asciiFilterValue;
178 tagFilter.Type = compareType;
179 keepAlignment = tagFilter.check(asciiQueryValue);
184 // signed int tag type
188 if ( al.GetTag(tagName, intQueryValue) ) {
189 if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, intFilterValue, compareType) ) {
190 tagFilter.Value = intFilterValue;
191 tagFilter.Type = compareType;
192 keepAlignment = tagFilter.check(intQueryValue);
197 // unsigned int tag type
201 if ( al.GetTag(tagName, uintQueryValue) ) {
202 if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, uintFilterValue, compareType) ) {
203 tagFilter.Value = uintFilterValue;
204 tagFilter.Type = compareType;
205 keepAlignment = tagFilter.check(uintQueryValue);
212 if ( al.GetTag(tagName, realQueryValue) ) {
213 if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, realFilterValue, compareType) ) {
214 tagFilter.Value = realFilterValue;
215 tagFilter.Type = compareType;
216 keepAlignment = tagFilter.check(realQueryValue);
225 if ( al.GetTag(tagName, stringQueryValue) ) {
226 if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, stringFilterValue, compareType) ) {
227 tagFilter.Value = stringFilterValue;
228 tagFilter.Type = compareType;
229 keepAlignment = tagFilter.check(stringQueryValue);
236 keepAlignment = false;
239 return keepAlignment;
243 } // namespace BamTools
245 // ---------------------------------------------
246 // FilterSettings implementation
248 struct FilterTool::FilterSettings {
250 // ----------------------------------
255 bool HasInputFilelist;
259 bool IsForceCompression;
262 vector<string> InputFiles;
263 string InputFilelist;
264 string OutputFilename;
266 string ScriptFilename;
268 // -----------------------------------
269 // General filter opts
272 bool HasAlignmentFlagFilter;
273 bool HasInsertSizeFilter;
274 bool HasLengthFilter;
275 bool HasMapQualityFilter;
277 bool HasQueryBasesFilter;
278 bool HasTagFilter; //(s)
281 string AlignmentFlagFilter;
282 string InsertSizeFilter;
284 string MapQualityFilter;
286 string QueryBasesFilter;
287 string TagFilter; // support multiple ?
289 // -----------------------------------
290 // AlignmentFlag filter opts
293 bool HasIsDuplicateFilter;
294 bool HasIsFailedQCFilter;
295 bool HasIsFirstMateFilter;
296 bool HasIsMappedFilter;
297 bool HasIsMateMappedFilter;
298 bool HasIsMateReverseStrandFilter;
299 bool HasIsPairedFilter;
300 bool HasIsPrimaryAlignmentFilter;
301 bool HasIsProperPairFilter;
302 bool HasIsReverseStrandFilter;
303 bool HasIsSecondMateFilter;
304 bool HasIsSingletonFilter;
307 string IsDuplicateFilter;
308 string IsFailedQCFilter;
309 string IsFirstMateFilter;
310 string IsMappedFilter;
311 string IsMateMappedFilter;
312 string IsMateReverseStrandFilter;
313 string IsPairedFilter;
314 string IsPrimaryAlignmentFilter;
315 string IsProperPairFilter;
316 string IsReverseStrandFilter;
317 string IsSecondMateFilter;
318 string IsSingletonFilter;
320 // ---------------------------------
325 , HasInputFilelist(false)
329 , IsForceCompression(false)
330 , OutputFilename(Options::StandardOut())
331 , HasAlignmentFlagFilter(false)
332 , HasInsertSizeFilter(false)
333 , HasLengthFilter(false)
334 , HasMapQualityFilter(false)
335 , HasNameFilter(false)
336 , HasQueryBasesFilter(false)
337 , HasTagFilter(false)
338 , HasIsDuplicateFilter(false)
339 , HasIsFailedQCFilter(false)
340 , HasIsFirstMateFilter(false)
341 , HasIsMappedFilter(false)
342 , HasIsMateMappedFilter(false)
343 , HasIsMateReverseStrandFilter(false)
344 , HasIsPairedFilter(false)
345 , HasIsPrimaryAlignmentFilter(false)
346 , HasIsProperPairFilter(false)
347 , HasIsReverseStrandFilter(false)
348 , HasIsSecondMateFilter(false)
349 , HasIsSingletonFilter(false)
350 , IsDuplicateFilter(TRUE_STR)
351 , IsFailedQCFilter(TRUE_STR)
352 , IsFirstMateFilter(TRUE_STR)
353 , IsMappedFilter(TRUE_STR)
354 , IsMateMappedFilter(TRUE_STR)
355 , IsMateReverseStrandFilter(TRUE_STR)
356 , IsPairedFilter(TRUE_STR)
357 , IsPrimaryAlignmentFilter(TRUE_STR)
358 , IsProperPairFilter(TRUE_STR)
359 , IsReverseStrandFilter(TRUE_STR)
360 , IsSecondMateFilter(TRUE_STR)
361 , IsSingletonFilter(TRUE_STR)
365 // ---------------------------------------------
366 // FilterToolPrivate declaration
368 class FilterTool::FilterToolPrivate {
372 FilterToolPrivate(FilterTool::FilterSettings* settings);
373 ~FilterToolPrivate(void);
375 // 'public' interface
381 bool AddPropertyTokensToFilter(const string& filterName, const map<string, string>& propertyTokens);
382 bool CheckAlignment(const BamAlignment& al);
383 const string GetScriptContents(void);
384 void InitProperties(void);
385 bool ParseCommandLine(void);
386 bool ParseFilterObject(const string& filterName, const Json::Value& filterObject);
387 bool ParseScript(void);
388 bool SetupFilters(void);
392 vector<string> m_propertyNames;
393 FilterTool::FilterSettings* m_settings;
394 FilterEngine<BamAlignmentChecker> m_filterEngine;
397 // ---------------------------------------------
398 // FilterToolPrivate implementation
401 FilterTool::FilterToolPrivate::FilterToolPrivate(FilterTool::FilterSettings* settings)
402 : m_settings(settings)
406 FilterTool::FilterToolPrivate::~FilterToolPrivate(void) { }
408 bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filterName,
410 string>& propertyTokens)
412 // dummy temp values for token parsing
415 uint16_t uint16Value;
416 uint32_t uint32Value;
418 PropertyFilterValue::ValueCompareType type;
420 // iterate over property token map
421 map<string, string>::const_iterator mapIter = propertyTokens.begin();
422 map<string, string>::const_iterator mapEnd = propertyTokens.end();
423 for ( ; mapIter != mapEnd; ++mapIter ) {
425 const string& propertyName = (*mapIter).first;
426 const string& token = (*mapIter).second;
428 // ------------------------------
429 // convert token to value & compare type
430 // then add to filter engine
433 if ( propertyName == ISDUPLICATE_PROPERTY ||
434 propertyName == ISFAILEDQC_PROPERTY ||
435 propertyName == ISFIRSTMATE_PROPERTY ||
436 propertyName == ISMAPPED_PROPERTY ||
437 propertyName == ISMATEMAPPED_PROPERTY ||
438 propertyName == ISMATEREVERSESTRAND_PROPERTY ||
439 propertyName == ISPAIRED_PROPERTY ||
440 propertyName == ISPRIMARYALIGNMENT_PROPERTY ||
441 propertyName == ISPROPERPAIR_PROPERTY ||
442 propertyName == ISREVERSESTRAND_PROPERTY ||
443 propertyName == ISSECONDMATE_PROPERTY ||
444 propertyName == ISSINGLETON_PROPERTY
447 FilterEngine<BamAlignmentChecker>::parseToken(token, boolValue, type);
448 m_filterEngine.setProperty(filterName, propertyName, boolValue, type);
451 // int32_t conversion
452 else if ( propertyName == INSERTSIZE_PROPERTY ||
453 propertyName == LENGTH_PROPERTY ||
454 propertyName == MATEPOSITION_PROPERTY ||
455 propertyName == POSITION_PROPERTY
458 FilterEngine<BamAlignmentChecker>::parseToken(token, int32Value, type);
459 m_filterEngine.setProperty(filterName, propertyName, int32Value, type);
462 // uint16_t conversion
463 else if ( propertyName == MAPQUALITY_PROPERTY )
465 FilterEngine<BamAlignmentChecker>::parseToken(token, uint16Value, type);
466 m_filterEngine.setProperty(filterName, propertyName, uint16Value, type);
469 // uint32_t conversion
470 else if ( propertyName == ALIGNMENTFLAG_PROPERTY )
472 FilterEngine<BamAlignmentChecker>::parseToken(token, uint32Value, type);
473 m_filterEngine.setProperty(filterName, propertyName, uint32Value, type);
477 else if ( propertyName == CIGAR_PROPERTY ||
478 propertyName == MATEREFERENCE_PROPERTY ||
479 propertyName == NAME_PROPERTY ||
480 propertyName == QUERYBASES_PROPERTY ||
481 propertyName == REFERENCE_PROPERTY
484 FilterEngine<BamAlignmentChecker>::parseToken(token, stringValue, type);
485 m_filterEngine.setProperty(filterName, propertyName, stringValue, type);
488 else if ( propertyName == TAG_PROPERTY ) {
489 // this will be stored directly as the TAG:VALUE token
490 // (VALUE may contain compare ops, will be parsed out later)
491 m_filterEngine.setProperty(filterName, propertyName, token, PropertyFilterValue::EXACT);
494 // else unknown property
496 cerr << "bamtools filter ERROR: unknown property - " << propertyName << endl;
503 bool FilterTool::FilterToolPrivate::CheckAlignment(const BamAlignment& al) {
504 return m_filterEngine.check(al);
507 const string FilterTool::FilterToolPrivate::GetScriptContents(void) {
509 // open script for reading
510 FILE* inFile = fopen(m_settings->ScriptFilename.c_str(), "rb");
512 cerr << "bamtools filter ERROR: could not open script: "
513 << m_settings->ScriptFilename << " for reading" << endl;
517 // read in entire script contents
519 ostringstream docStream("");
522 // peek ahead, make sure there is data available
523 char ch = fgetc(inFile);
528 // read next block of data
529 if ( fgets(buffer, 1024, inFile) == 0 ) {
530 cerr << "bamtools filter ERROR: could not read script contents" << endl;
540 // import buffer contents to document, return
541 return docStream.str();
544 void FilterTool::FilterToolPrivate::InitProperties(void) {
546 // store property names in vector
547 m_propertyNames.push_back(ALIGNMENTFLAG_PROPERTY);
548 m_propertyNames.push_back(CIGAR_PROPERTY);
549 m_propertyNames.push_back(INSERTSIZE_PROPERTY);
550 m_propertyNames.push_back(ISDUPLICATE_PROPERTY);
551 m_propertyNames.push_back(ISFAILEDQC_PROPERTY);
552 m_propertyNames.push_back(ISFIRSTMATE_PROPERTY);
553 m_propertyNames.push_back(ISMAPPED_PROPERTY);
554 m_propertyNames.push_back(ISMATEMAPPED_PROPERTY);
555 m_propertyNames.push_back(ISMATEREVERSESTRAND_PROPERTY);
556 m_propertyNames.push_back(ISPAIRED_PROPERTY);
557 m_propertyNames.push_back(ISPRIMARYALIGNMENT_PROPERTY);
558 m_propertyNames.push_back(ISPROPERPAIR_PROPERTY);
559 m_propertyNames.push_back(ISREVERSESTRAND_PROPERTY);
560 m_propertyNames.push_back(ISSECONDMATE_PROPERTY);
561 m_propertyNames.push_back(ISSINGLETON_PROPERTY);
562 m_propertyNames.push_back(LENGTH_PROPERTY);
563 m_propertyNames.push_back(MAPQUALITY_PROPERTY);
564 m_propertyNames.push_back(MATEPOSITION_PROPERTY);
565 m_propertyNames.push_back(MATEREFERENCE_PROPERTY);
566 m_propertyNames.push_back(NAME_PROPERTY);
567 m_propertyNames.push_back(POSITION_PROPERTY);
568 m_propertyNames.push_back(QUERYBASES_PROPERTY);
569 m_propertyNames.push_back(REFERENCE_PROPERTY);
570 m_propertyNames.push_back(TAG_PROPERTY);
572 // add vector contents to FilterEngine<BamAlignmentChecker>
573 vector<string>::const_iterator propertyNameIter = m_propertyNames.begin();
574 vector<string>::const_iterator propertyNameEnd = m_propertyNames.end();
575 for ( ; propertyNameIter != propertyNameEnd; ++propertyNameIter )
576 m_filterEngine.addProperty((*propertyNameIter));
579 bool FilterTool::FilterToolPrivate::ParseCommandLine(void) {
581 // add a rule set to filter engine
582 const string CMD = "COMMAND_LINE";
583 m_filterEngine.addFilter(CMD);
585 // map property names to command line args
586 map<string, string> propertyTokens;
587 if ( m_settings->HasAlignmentFlagFilter ) propertyTokens.insert( make_pair(ALIGNMENTFLAG_PROPERTY, m_settings->AlignmentFlagFilter) );
588 if ( m_settings->HasInsertSizeFilter ) propertyTokens.insert( make_pair(INSERTSIZE_PROPERTY, m_settings->InsertSizeFilter) );
589 if ( m_settings->HasIsDuplicateFilter ) propertyTokens.insert( make_pair(ISDUPLICATE_PROPERTY, m_settings->IsDuplicateFilter) );
590 if ( m_settings->HasIsFailedQCFilter ) propertyTokens.insert( make_pair(ISFAILEDQC_PROPERTY, m_settings->IsFailedQCFilter) );
591 if ( m_settings->HasIsFirstMateFilter ) propertyTokens.insert( make_pair(ISFIRSTMATE_PROPERTY, m_settings->IsFirstMateFilter) );
592 if ( m_settings->HasIsMappedFilter ) propertyTokens.insert( make_pair(ISMAPPED_PROPERTY, m_settings->IsMappedFilter) );
593 if ( m_settings->HasIsMateMappedFilter ) propertyTokens.insert( make_pair(ISMATEMAPPED_PROPERTY, m_settings->IsMateMappedFilter) );
594 if ( m_settings->HasIsMateReverseStrandFilter ) propertyTokens.insert( make_pair(ISMATEREVERSESTRAND_PROPERTY, m_settings->IsMateReverseStrandFilter) );
595 if ( m_settings->HasIsPairedFilter ) propertyTokens.insert( make_pair(ISPAIRED_PROPERTY, m_settings->IsPairedFilter) );
596 if ( m_settings->HasIsPrimaryAlignmentFilter ) propertyTokens.insert( make_pair(ISPRIMARYALIGNMENT_PROPERTY, m_settings->IsPrimaryAlignmentFilter) );
597 if ( m_settings->HasIsProperPairFilter ) propertyTokens.insert( make_pair(ISPROPERPAIR_PROPERTY, m_settings->IsProperPairFilter) );
598 if ( m_settings->HasIsReverseStrandFilter ) propertyTokens.insert( make_pair(ISREVERSESTRAND_PROPERTY, m_settings->IsReverseStrandFilter) );
599 if ( m_settings->HasIsSecondMateFilter ) propertyTokens.insert( make_pair(ISSECONDMATE_PROPERTY, m_settings->IsSecondMateFilter) );
600 if ( m_settings->HasIsSingletonFilter ) propertyTokens.insert( make_pair(ISSINGLETON_PROPERTY, m_settings->IsSingletonFilter) );
601 if ( m_settings->HasLengthFilter ) propertyTokens.insert( make_pair(LENGTH_PROPERTY, m_settings->LengthFilter) );
602 if ( m_settings->HasMapQualityFilter ) propertyTokens.insert( make_pair(MAPQUALITY_PROPERTY, m_settings->MapQualityFilter) );
603 if ( m_settings->HasNameFilter ) propertyTokens.insert( make_pair(NAME_PROPERTY, m_settings->NameFilter) );
604 if ( m_settings->HasQueryBasesFilter ) propertyTokens.insert( make_pair(QUERYBASES_PROPERTY, m_settings->QueryBasesFilter) );
605 if ( m_settings->HasTagFilter ) propertyTokens.insert( make_pair(TAG_PROPERTY, m_settings->TagFilter) );
607 // send add these properties to filter set "COMMAND_LINE"
608 return AddPropertyTokensToFilter(CMD, propertyTokens);
611 bool FilterTool::FilterToolPrivate::ParseFilterObject(const string& filterName, const Json::Value& filterObject) {
613 // filter object parsing variables
614 Json::Value null(Json::nullValue);
615 Json::Value propertyValue;
618 map<string, string> propertyTokens;
620 // iterate over known properties
621 vector<string>::const_iterator propertyNameIter = m_propertyNames.begin();
622 vector<string>::const_iterator propertyNameEnd = m_propertyNames.end();
623 for ( ; propertyNameIter != propertyNameEnd; ++propertyNameIter ) {
624 const string& propertyName = (*propertyNameIter);
626 // if property defined in filter, add to token list
627 propertyValue = filterObject.get(propertyName, null);
628 if ( propertyValue != null )
629 propertyTokens.insert( make_pair(propertyName, propertyValue.asString()) );
632 // add this filter to engin
633 m_filterEngine.addFilter(filterName);
635 // add token list to this filter
636 return AddPropertyTokensToFilter(filterName, propertyTokens);
639 bool FilterTool::FilterToolPrivate::ParseScript(void) {
641 // read in script contents from file
642 const string document = GetScriptContents();
644 // set up JsonCPP reader and attempt to parse script
647 if ( !reader.parse(document, root) ) {
648 // use built-in error reporting mechanism to alert user what was wrong with the script
649 cerr << "bamtools filter ERROR: failed to parse script - see error message(s) below" << endl
650 << reader.getFormatedErrorMessages();
654 // initialize return status
657 // see if root object contains multiple filters
658 const Json::Value filters = root["filters"];
659 if ( !filters.isNull() ) {
661 // iterate over any filters found
663 Json::Value::const_iterator filtersIter = filters.begin();
664 Json::Value::const_iterator filtersEnd = filters.end();
665 for ( ; filtersIter != filtersEnd; ++filtersIter, ++filterIndex ) {
666 Json::Value filter = (*filtersIter);
668 // convert filter index to string
671 // if id tag supplied
672 const Json::Value id = filter["id"];
674 filterName = id.asString();
678 stringstream convert;
679 convert << filterIndex;
680 filterName = convert.str();
683 // create & parse filter
684 success &= ParseFilterObject(filterName, filter);
687 // see if user defined a "rule" for these filters
688 // otherwise, use filter engine's default rule behavior
689 string ruleString("");
690 const Json::Value rule = root["rule"];
691 if ( rule.isString() )
692 ruleString = rule.asString();
693 m_filterEngine.setRule(ruleString);
695 // return success/fail
699 // otherwise, root is the only filter (just contains properties)
700 // create & parse filter named "ROOT"
701 else success = ParseFilterObject("ROOT", root);
703 // return success/failure
707 bool FilterTool::FilterToolPrivate::Run(void) {
709 // set to default input if none provided
710 if ( !m_settings->HasInput && !m_settings->HasInputFilelist )
711 m_settings->InputFiles.push_back(Options::StandardIn());
713 // add files in the filelist to the input file list
714 if ( m_settings->HasInputFilelist ) {
716 ifstream filelist(m_settings->InputFilelist.c_str(), ios::in);
717 if ( !filelist.is_open() ) {
718 cerr << "bamtools filter ERROR: could not open input BAM file list... Aborting." << endl;
723 while ( getline(filelist, line) )
724 m_settings->InputFiles.push_back(line);
727 // initialize defined properties & user-specified filters
729 if ( !SetupFilters() )
732 // open reader without index
733 BamMultiReader reader;
734 if ( !reader.Open(m_settings->InputFiles) ) {
735 cerr << "bamtools filter ERROR: could not open input files for reading." << endl;
739 // retrieve reader header & reference data
740 const string headerText = reader.GetHeaderText();
741 filterToolReferences = reader.GetReferenceData();
743 // determine compression mode for BamWriter
744 bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() &&
745 !m_settings->IsForceCompression );
746 BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
747 if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed;
751 writer.SetCompressionMode(compressionMode);
752 if ( !writer.Open(m_settings->OutputFilename, headerText, filterToolReferences) ) {
753 cerr << "bamtools filter ERROR: could not open " << m_settings->OutputFilename << " for writing." << endl;
758 // if no region specified, filter entire file
760 if ( !m_settings->HasRegion ) {
761 while ( reader.GetNextAlignment(al) ) {
762 if ( CheckAlignment(al) )
763 writer.SaveAlignment(al);
767 // otherwise attempt to use region as constraint
770 // if region string parses OK
772 if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
774 // attempt to find index files
775 reader.LocateIndexes();
777 // if index data available for all BAM files, we can use SetRegion
778 if ( reader.HasIndexes() ) {
780 // attempt to use SetRegion(), if failed report error
781 if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) {
782 cerr << "bamtools filter ERROR: set region failed. Check that REGION describes a valid range" << endl;
787 // everything checks out, just iterate through specified region, filtering alignments
788 while ( reader.GetNextAlignment(al) )
789 if ( CheckAlignment(al) )
790 writer.SaveAlignment(al);
793 // no index data available, we have to iterate through until we
794 // find overlapping alignments
796 while ( reader.GetNextAlignment(al) ) {
797 if ( (al.RefID >= region.LeftRefID) && ((al.Position + al.Length) >= region.LeftPosition) &&
798 (al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) )
800 if ( CheckAlignment(al) )
801 writer.SaveAlignment(al);
807 // error parsing REGION string
809 cerr << "bamtools filter ERROR: could not parse REGION: " << m_settings->Region << endl;
810 cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid"
823 bool FilterTool::FilterToolPrivate::SetupFilters(void) {
825 // set up filter engine with supported properties
828 // parse script for filter rules, if given
829 if ( m_settings->HasScript )
830 return ParseScript();
832 // otherwise check command line for filters
833 else return ParseCommandLine();
836 // ---------------------------------------------
837 // FilterTool implementation
839 FilterTool::FilterTool(void)
841 , m_settings(new FilterSettings)
844 // ----------------------------------
845 // set program details
847 const string usage = "[-in <filename> -in <filename> ... | -list <filelist>] "
848 "[-out <filename> | [-forceCompression]] [-region <REGION>] "
849 "[ [-script <filename] | [filterOptions] ]";
851 Options::SetProgramInfo("bamtools filter", "filters BAM file(s)", usage );
853 // ----------------------------------
856 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
858 const string inDesc = "the input BAM file(s)";
859 const string listDesc = "the input BAM file list, one line per file";
860 const string outDesc = "the output BAM file";
861 const string regionDesc = "only read data from this genomic region (see documentation for more details)";
862 const string scriptDesc = "the filter script file (see documentation for more details)";
863 const string forceDesc = "if results are sent to stdout (like when piping to another tool), "
864 "default behavior is to leave output uncompressed. Use this flag to "
865 "override and force compression";
867 Options::AddValueOption("-in", "BAM filename", inDesc, "", m_settings->HasInput, m_settings->InputFiles, IO_Opts, Options::StandardIn());
868 Options::AddValueOption("-list", "filename", listDesc, "", m_settings->HasInputFilelist, m_settings->InputFilelist, IO_Opts);
869 Options::AddValueOption("-out", "BAM filename", outDesc, "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
870 Options::AddValueOption("-region", "REGION", regionDesc, "", m_settings->HasRegion, m_settings->Region, IO_Opts);
871 Options::AddValueOption("-script", "filename", scriptDesc, "", m_settings->HasScript, m_settings->ScriptFilename, IO_Opts);
872 Options::AddOption("-forceCompression",forceDesc, m_settings->IsForceCompression, IO_Opts);
874 // ----------------------------------
875 // general filter options
877 OptionGroup* FilterOpts = Options::CreateOptionGroup("General Filters");
879 const string flagDesc = "keep reads with this *exact* alignment flag (for more detailed queries, see below)";
880 const string insertDesc = "keep reads with insert size that matches pattern";
881 const string lengthDesc = "keep reads with length that matches pattern";
882 const string mapQualDesc = "keep reads with map quality that matches pattern";
883 const string nameDesc = "keep reads with name that matches pattern";
884 const string queryDesc = "keep reads with motif that matches pattern";
885 const string tagDesc = "keep reads with this key=>value pair";
887 Options::AddValueOption("-alignmentFlag", "int", flagDesc, "", m_settings->HasAlignmentFlagFilter, m_settings->AlignmentFlagFilter, FilterOpts);
888 Options::AddValueOption("-insertSize", "int", insertDesc, "", m_settings->HasInsertSizeFilter, m_settings->InsertSizeFilter, FilterOpts);
889 Options::AddValueOption("-length", "int", lengthDesc, "", m_settings->HasLengthFilter, m_settings->LengthFilter, FilterOpts);
890 Options::AddValueOption("-mapQuality", "[0-255]", mapQualDesc, "", m_settings->HasMapQualityFilter, m_settings->MapQualityFilter, FilterOpts);
891 Options::AddValueOption("-name", "string", nameDesc, "", m_settings->HasNameFilter, m_settings->NameFilter, FilterOpts);
892 Options::AddValueOption("-queryBases", "string", queryDesc, "", m_settings->HasQueryBasesFilter, m_settings->QueryBasesFilter, FilterOpts);
893 Options::AddValueOption("-tag", "TAG:VALUE", tagDesc, "", m_settings->HasTagFilter, m_settings->TagFilter, FilterOpts);
895 // ----------------------------------
896 // alignment flag filter options
898 OptionGroup* AlignmentFlagOpts = Options::CreateOptionGroup("Alignment Flag Filters");
900 const string boolArg = "true/false";
901 const string isDupDesc = "keep only alignments that are marked as duplicate?";
902 const string isFailQcDesc = "keep only alignments that failed QC?";
903 const string isFirstMateDesc = "keep only alignments marked as first mate?";
904 const string isMappedDesc = "keep only alignments that were mapped?";
905 const string isMateMappedDesc = "keep only alignments with mates that mapped";
906 const string isMateReverseDesc = "keep only alignments with mate on reverese strand?";
907 const string isPairedDesc = "keep only alignments that were sequenced as paired?";
908 const string isPrimaryDesc = "keep only alignments marked as primary?";
909 const string isProperPairDesc = "keep only alignments that passed PE resolution?";
910 const string isReverseDesc = "keep only alignments on reverse strand?";
911 const string isSecondMateDesc = "keep only alignments marked as second mate?";
912 const string isSingletonDesc = "keep only singletons";
914 Options::AddValueOption("-isDuplicate", boolArg, isDupDesc, "", m_settings->HasIsDuplicateFilter, m_settings->IsDuplicateFilter, AlignmentFlagOpts, TRUE_STR);
915 Options::AddValueOption("-isFailedQC", boolArg, isFailQcDesc, "", m_settings->HasIsFailedQCFilter, m_settings->IsFailedQCFilter, AlignmentFlagOpts, TRUE_STR);
916 Options::AddValueOption("-isFirstMate", boolArg, isFirstMateDesc, "", m_settings->HasIsFirstMateFilter, m_settings->IsFirstMateFilter, AlignmentFlagOpts, TRUE_STR);
917 Options::AddValueOption("-isMapped", boolArg, isMappedDesc, "", m_settings->HasIsMappedFilter, m_settings->IsMappedFilter, AlignmentFlagOpts, TRUE_STR);
918 Options::AddValueOption("-isMateMapped", boolArg, isMateMappedDesc, "", m_settings->HasIsMateMappedFilter, m_settings->IsMateMappedFilter, AlignmentFlagOpts, TRUE_STR);
919 Options::AddValueOption("-isMateReverseStrand", boolArg, isMateReverseDesc, "", m_settings->HasIsMateReverseStrandFilter, m_settings->IsMateReverseStrandFilter, AlignmentFlagOpts, TRUE_STR);
920 Options::AddValueOption("-isPaired", boolArg, isPairedDesc, "", m_settings->HasIsPairedFilter, m_settings->IsPairedFilter, AlignmentFlagOpts, TRUE_STR);
921 Options::AddValueOption("-isPrimaryAlignment", boolArg, isPrimaryDesc, "", m_settings->HasIsPrimaryAlignmentFilter, m_settings->IsPrimaryAlignmentFilter, AlignmentFlagOpts, TRUE_STR);
922 Options::AddValueOption("-isProperPair", boolArg, isProperPairDesc, "", m_settings->HasIsProperPairFilter, m_settings->IsProperPairFilter, AlignmentFlagOpts, TRUE_STR);
923 Options::AddValueOption("-isReverseStrand", boolArg, isReverseDesc, "", m_settings->HasIsReverseStrandFilter, m_settings->IsReverseStrandFilter, AlignmentFlagOpts, TRUE_STR);
924 Options::AddValueOption("-isSecondMate", boolArg, isSecondMateDesc, "", m_settings->HasIsSecondMateFilter, m_settings->IsSecondMateFilter, AlignmentFlagOpts, TRUE_STR);
925 Options::AddValueOption("-isSingleton", boolArg, isSingletonDesc, "", m_settings->HasIsSingletonFilter, m_settings->IsSingletonFilter, AlignmentFlagOpts, TRUE_STR);
928 FilterTool::~FilterTool(void) {
937 int FilterTool::Help(void) {
938 Options::DisplayHelp();
942 int FilterTool::Run(int argc, char* argv[]) {
944 // parse command line arguments
945 Options::Parse(argc, argv, 1);
947 // initialize FilterTool with settings
948 m_impl = new FilterToolPrivate(m_settings);
950 // run FilterTool, return success/fail