]> git.donarmstrong.com Git - bamtools.git/blob - src/toolkit/bamtools_filter.cpp
1189981452cd2cae582eda14283fde483bf2eca9
[bamtools.git] / src / toolkit / bamtools_filter.cpp
1 // ***************************************************************************
2 // bamtools_filter.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 10 December 2012
6 // ---------------------------------------------------------------------------
7 // Filters BAM file(s) according to some user-specified criteria
8 // ***************************************************************************
9
10 #include "bamtools_filter.h"
11
12 #include <api/BamMultiReader.h>
13 #include <api/BamWriter.h>
14 #include <utils/bamtools_filter_engine.h>
15 #include <utils/bamtools_options.h>
16 #include <utils/bamtools_utilities.h>
17 using namespace BamTools;
18
19 #include <jsoncpp/json.h>
20 using namespace Json;
21
22 #include <cstdio>
23 #include <fstream>
24 #include <iostream>
25 #include <sstream>
26 #include <string>
27 #include <vector>
28 using namespace std;
29   
30 namespace BamTools {
31   
32 // -------------------------------  
33 // string literal constants  
34
35 // property names
36 const string ALIGNMENTFLAG_PROPERTY       = "alignmentFlag";
37 const string CIGAR_PROPERTY               = "cigar";
38 const string INSERTSIZE_PROPERTY          = "insertSize";
39 const string ISDUPLICATE_PROPERTY         = "isDuplicate";
40 const string ISFAILEDQC_PROPERTY          = "isFailedQC";
41 const string ISFIRSTMATE_PROPERTY         = "isFirstMate";
42 const string ISMAPPED_PROPERTY            = "isMapped";
43 const string ISMATEMAPPED_PROPERTY        = "isMateMapped";
44 const string ISMATEREVERSESTRAND_PROPERTY = "isMateReverseStrand";
45 const string ISPAIRED_PROPERTY            = "isPaired";
46 const string ISPRIMARYALIGNMENT_PROPERTY  = "isPrimaryAlignment";
47 const string ISPROPERPAIR_PROPERTY        = "isProperPair";
48 const string ISREVERSESTRAND_PROPERTY     = "isReverseStrand";
49 const string ISSECONDMATE_PROPERTY        = "isSecondMate";
50 const string ISSINGLETON_PROPERTY         = "isSingleton";
51 const string MAPQUALITY_PROPERTY          = "mapQuality";
52 const string MATEPOSITION_PROPERTY        = "matePosition";
53 const string MATEREFERENCE_PROPERTY       = "mateReference";
54 const string NAME_PROPERTY                = "name";
55 const string POSITION_PROPERTY            = "position";
56 const string QUERYBASES_PROPERTY          = "queryBases";
57 const string REFERENCE_PROPERTY           = "reference";
58 const string TAG_PROPERTY                 = "tag";
59
60 // boolalpha
61 const string TRUE_STR  = "true";
62 const string FALSE_STR = "false";
63     
64 RefVector filterToolReferences;    
65     
66 struct BamAlignmentChecker {
67     bool check(const PropertyFilter& filter, const BamAlignment& al) {
68       
69         bool keepAlignment = true;
70         const PropertyMap& properties = filter.Properties;
71         PropertyMap::const_iterator propertyIter = properties.begin();
72         PropertyMap::const_iterator propertyEnd  = properties.end();
73         for ( ; propertyIter != propertyEnd; ++propertyIter ) {
74           
75             // check alignment data field depending on propertyName
76             const string& propertyName = (*propertyIter).first;
77             const PropertyFilterValue& valueFilter = (*propertyIter).second;
78             
79             if      ( propertyName == ALIGNMENTFLAG_PROPERTY )  keepAlignment &= valueFilter.check(al.AlignmentFlag);
80             else if ( propertyName == CIGAR_PROPERTY ) {
81                 stringstream cigarSs;
82                 const vector<CigarOp>& cigarData = al.CigarData;
83                 if ( !cigarData.empty() ) {
84                     vector<CigarOp>::const_iterator cigarBegin = cigarData.begin();
85                     vector<CigarOp>::const_iterator cigarIter = cigarBegin;
86                     vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
87                     for ( ; cigarIter != cigarEnd; ++cigarIter ) {
88                         const CigarOp& op = (*cigarIter);
89                         cigarSs << op.Length << op.Type;
90                     }
91                     keepAlignment &= valueFilter.check(cigarSs.str());
92                 }
93             }
94             else if ( propertyName == INSERTSIZE_PROPERTY )           keepAlignment &= valueFilter.check(al.InsertSize);
95             else if ( propertyName == ISDUPLICATE_PROPERTY )          keepAlignment &= valueFilter.check(al.IsDuplicate());
96             else if ( propertyName == ISFAILEDQC_PROPERTY )           keepAlignment &= valueFilter.check(al.IsFailedQC());
97             else if ( propertyName == ISFIRSTMATE_PROPERTY )          keepAlignment &= valueFilter.check(al.IsFirstMate());
98             else if ( propertyName == ISMAPPED_PROPERTY )             keepAlignment &= valueFilter.check(al.IsMapped());
99             else if ( propertyName == ISMATEMAPPED_PROPERTY )         keepAlignment &= valueFilter.check(al.IsMateMapped());
100             else if ( propertyName == ISMATEREVERSESTRAND_PROPERTY )  keepAlignment &= valueFilter.check(al.IsMateReverseStrand());
101             else if ( propertyName == ISPAIRED_PROPERTY )             keepAlignment &= valueFilter.check(al.IsPaired());
102             else if ( propertyName == ISPRIMARYALIGNMENT_PROPERTY )   keepAlignment &= valueFilter.check(al.IsPrimaryAlignment());
103             else if ( propertyName == ISPROPERPAIR_PROPERTY )         keepAlignment &= valueFilter.check(al.IsProperPair());
104             else if ( propertyName == ISREVERSESTRAND_PROPERTY )      keepAlignment &= valueFilter.check(al.IsReverseStrand());
105             else if ( propertyName == ISSECONDMATE_PROPERTY )         keepAlignment &= valueFilter.check(al.IsSecondMate());
106             else if ( propertyName == ISSINGLETON_PROPERTY ) {
107                 const bool isSingleton = al.IsPaired() && al.IsMapped() && !al.IsMateMapped();
108                 keepAlignment &= valueFilter.check(isSingleton);
109             }
110             else if ( propertyName == MAPQUALITY_PROPERTY )           keepAlignment &= valueFilter.check(al.MapQuality);
111             else if ( propertyName == MATEPOSITION_PROPERTY )         keepAlignment &= ( al.IsPaired() && al.IsMateMapped() && valueFilter.check(al.MateRefID) );
112             else if ( propertyName == MATEREFERENCE_PROPERTY ) {
113                 if ( !al.IsPaired() || !al.IsMateMapped() ) return false;
114                 BAMTOOLS_ASSERT_MESSAGE( (al.MateRefID>=0 && (al.MateRefID<(int)filterToolReferences.size())), "Invalid MateRefID");
115                 const string& refName = filterToolReferences.at(al.MateRefID).RefName;
116                 keepAlignment &= valueFilter.check(refName);
117             }
118             else if ( propertyName == NAME_PROPERTY )                 keepAlignment &= valueFilter.check(al.Name);
119             else if ( propertyName == POSITION_PROPERTY )             keepAlignment &= valueFilter.check(al.Position);
120             else if ( propertyName == QUERYBASES_PROPERTY )           keepAlignment &= valueFilter.check(al.QueryBases);
121             else if ( propertyName == REFERENCE_PROPERTY ) {
122                 BAMTOOLS_ASSERT_MESSAGE( (al.RefID>=0 && (al.RefID<(int)filterToolReferences.size())), "Invalid RefID");
123                 const string& refName = filterToolReferences.at(al.RefID).RefName;
124                 keepAlignment &= valueFilter.check(refName);
125             }
126             else if ( propertyName == TAG_PROPERTY ) keepAlignment &= checkAlignmentTag(valueFilter, al);
127             else BAMTOOLS_ASSERT_UNREACHABLE;
128             
129             // if alignment fails at ANY point, just quit and return false
130             if ( !keepAlignment ) return false;
131         }
132       
133         BAMTOOLS_ASSERT_MESSAGE( keepAlignment, "Error in BamAlignmentChecker... keepAlignment should be true here");
134         return keepAlignment;
135     }
136     
137     bool checkAlignmentTag(const PropertyFilterValue& valueFilter, const BamAlignment& al) {
138      
139         // ensure filter contains string data
140         Variant entireTagFilter = valueFilter.Value;
141         if ( !entireTagFilter.is_type<string>() ) return false;
142
143         // localize string from variant
144         const string& entireTagFilterString = entireTagFilter.get<string>();
145
146         // ensure we have at least "XX:x"
147         if ( entireTagFilterString.length() < 4 ) return false;
148
149         // get tagName & lookup in alignment
150         // if found, set tagType to tag type character
151         // if not found, return false
152         const string& tagName = entireTagFilterString.substr(0,2);
153         char tagType = '\0';
154         if ( !al.GetTagType(tagName, tagType) ) return false;
155
156         // remove tagName & ":" from beginning tagFilter
157         string tagFilterString = entireTagFilterString.substr(3);
158
159         // switch on tag type to set tag query value & parse filter token
160         int32_t  intFilterValue,    intQueryValue;
161         uint32_t uintFilterValue,   uintQueryValue;
162         float    realFilterValue,   realQueryValue;
163         string   stringFilterValue, stringQueryValue;
164
165         PropertyFilterValue tagFilter;
166         PropertyFilterValue::ValueCompareType compareType;
167         bool keepAlignment = false;
168         switch (tagType) {
169
170             // signed int tag type
171             case 'c' :
172             case 's' :
173             case 'i' :
174                 if ( al.GetTag(tagName, intQueryValue) ) {
175                     if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, intFilterValue, compareType) ) {
176                         tagFilter.Value = intFilterValue;
177                         tagFilter.Type  = compareType;
178                         keepAlignment   = tagFilter.check(intQueryValue);
179                     }
180                 }
181                 break;
182
183             // unsigned int tag type
184             case 'C' :
185             case 'S' :
186             case 'I' :
187                 if ( al.GetTag(tagName, uintQueryValue) ) {
188                     if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, uintFilterValue, compareType) ) {
189                         tagFilter.Value = uintFilterValue;
190                         tagFilter.Type  = compareType;
191                         keepAlignment   = tagFilter.check(uintQueryValue);
192                     }
193                 }
194                 break;
195
196             // 'real' tag type
197             case 'f' :
198                 if ( al.GetTag(tagName, realQueryValue) ) {
199                     if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, realFilterValue, compareType) ) {
200                         tagFilter.Value = realFilterValue;
201                         tagFilter.Type  = compareType;
202                         keepAlignment   = tagFilter.check(realQueryValue);
203                     }
204                 }
205                 break;
206
207             // string tag type
208             case 'A':
209             case 'Z':
210             case 'H':
211                 if ( al.GetTag(tagName, stringQueryValue) ) {
212                     if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, stringFilterValue, compareType) ) {
213                         tagFilter.Value = stringFilterValue;
214                         tagFilter.Type  = compareType;
215                         keepAlignment   = tagFilter.check(stringQueryValue);
216                     }
217                 }
218                 break;
219
220             // unknown tag type
221             default :
222                 keepAlignment = false;
223         }
224
225         return keepAlignment;
226     }
227 };    
228     
229 } // namespace BamTools
230   
231 // ---------------------------------------------
232 // FilterSettings implementation
233
234 struct FilterTool::FilterSettings {
235
236     // ----------------------------------
237     // IO opts
238
239     // flags
240     bool HasInput;
241     bool HasInputFilelist;
242     bool HasOutput;
243     bool HasRegion;
244     bool HasScript;
245     bool IsForceCompression;
246
247     // filenames
248     vector<string> InputFiles;
249     string InputFilelist;
250     string OutputFilename;
251     string Region;
252     string ScriptFilename;
253
254     // -----------------------------------
255     // General filter opts
256
257     // flags
258     bool HasAlignmentFlagFilter;
259     bool HasInsertSizeFilter;
260     bool HasMapQualityFilter;
261     bool HasNameFilter;
262     bool HasQueryBasesFilter;
263     bool HasTagFilter; //(s)
264
265     // filters
266     string AlignmentFlagFilter;
267     string InsertSizeFilter;
268     string NameFilter;
269     string MapQualityFilter;
270     string QueryBasesFilter;
271     string TagFilter;  // support multiple ?
272
273     // -----------------------------------
274     // AlignmentFlag filter opts
275
276     // flags
277     bool HasIsDuplicateFilter;
278     bool HasIsFailedQCFilter;
279     bool HasIsFirstMateFilter;
280     bool HasIsMappedFilter;
281     bool HasIsMateMappedFilter;
282     bool HasIsMateReverseStrandFilter;
283     bool HasIsPairedFilter;
284     bool HasIsPrimaryAlignmentFilter;
285     bool HasIsProperPairFilter;
286     bool HasIsReverseStrandFilter;
287     bool HasIsSecondMateFilter;
288     bool HasIsSingletonFilter;
289
290     // filters
291     string IsDuplicateFilter;
292     string IsFailedQCFilter;
293     string IsFirstMateFilter;
294     string IsMappedFilter;
295     string IsMateMappedFilter;
296     string IsMateReverseStrandFilter;
297     string IsPairedFilter;
298     string IsPrimaryAlignmentFilter;
299     string IsProperPairFilter;
300     string IsReverseStrandFilter;
301     string IsSecondMateFilter;
302     string IsSingletonFilter;
303
304     // ---------------------------------
305     // constructor
306
307     FilterSettings(void)
308         : HasInput(false)
309         , HasInputFilelist(false)
310         , HasOutput(false)
311         , HasRegion(false)
312         , HasScript(false)
313         , IsForceCompression(false)
314         , OutputFilename(Options::StandardOut())
315         , HasAlignmentFlagFilter(false)
316         , HasInsertSizeFilter(false)
317         , HasMapQualityFilter(false)
318         , HasNameFilter(false)
319         , HasQueryBasesFilter(false)
320         , HasTagFilter(false)
321         , HasIsDuplicateFilter(false)
322         , HasIsFailedQCFilter(false)
323         , HasIsFirstMateFilter(false)
324         , HasIsMappedFilter(false)
325         , HasIsMateMappedFilter(false)
326         , HasIsMateReverseStrandFilter(false)
327         , HasIsPairedFilter(false)
328         , HasIsPrimaryAlignmentFilter(false)
329         , HasIsProperPairFilter(false)
330         , HasIsReverseStrandFilter(false)
331         , HasIsSecondMateFilter(false)
332         , HasIsSingletonFilter(false)
333         , IsDuplicateFilter(TRUE_STR)
334         , IsFailedQCFilter(TRUE_STR)
335         , IsFirstMateFilter(TRUE_STR)
336         , IsMappedFilter(TRUE_STR)
337         , IsMateMappedFilter(TRUE_STR)
338         , IsMateReverseStrandFilter(TRUE_STR)
339         , IsPairedFilter(TRUE_STR)
340         , IsPrimaryAlignmentFilter(TRUE_STR)
341         , IsProperPairFilter(TRUE_STR)
342         , IsReverseStrandFilter(TRUE_STR)
343         , IsSecondMateFilter(TRUE_STR)
344         , IsSingletonFilter(TRUE_STR)
345     { }
346 };
347
348 // ---------------------------------------------
349 // FilterToolPrivate declaration
350
351 class FilterTool::FilterToolPrivate {
352       
353     // ctor & dtor
354     public:
355         FilterToolPrivate(FilterTool::FilterSettings* settings);
356         ~FilterToolPrivate(void);  
357         
358     // 'public' interface
359     public:
360         bool Run(void);
361         
362     // internal methods
363     private:
364         bool AddPropertyTokensToFilter(const string& filterName, const map<string, string>& propertyTokens);
365         bool CheckAlignment(const BamAlignment& al);
366         const string GetScriptContents(void);
367         void InitProperties(void);
368         bool ParseCommandLine(void);
369         bool ParseFilterObject(const string& filterName, const Json::Value& filterObject);
370         bool ParseScript(void);
371         bool SetupFilters(void);
372         
373     // data members
374     private:
375         vector<string> m_propertyNames;
376         FilterTool::FilterSettings* m_settings;
377         FilterEngine<BamAlignmentChecker> m_filterEngine;
378 };
379  
380 // ---------------------------------------------
381 // FilterToolPrivate implementation
382   
383 // constructor  
384 FilterTool::FilterToolPrivate::FilterToolPrivate(FilterTool::FilterSettings* settings)
385     : m_settings(settings)
386 { }  
387   
388 // destructor
389 FilterTool::FilterToolPrivate::~FilterToolPrivate(void) { }
390
391 bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filterName,
392                                                               const map<string,
393                                                               string>& propertyTokens)
394 {
395     // dummy temp values for token parsing
396     bool boolValue;
397     int32_t int32Value;
398     uint16_t uint16Value;
399     uint32_t uint32Value;
400     string stringValue;
401     PropertyFilterValue::ValueCompareType type;
402   
403     // iterate over property token map
404     map<string, string>::const_iterator mapIter = propertyTokens.begin();
405     map<string, string>::const_iterator mapEnd  = propertyTokens.end();
406     for ( ; mapIter != mapEnd; ++mapIter ) {
407       
408         const string& propertyName = (*mapIter).first;
409         const string& token        = (*mapIter).second;
410         
411         // ------------------------------
412         // convert token to value & compare type 
413         // then add to filter engine
414         
415         // bool conversion
416         if ( propertyName == ISDUPLICATE_PROPERTY ||
417              propertyName == ISFAILEDQC_PROPERTY ||
418              propertyName == ISFIRSTMATE_PROPERTY ||
419              propertyName == ISMAPPED_PROPERTY ||
420              propertyName == ISMATEMAPPED_PROPERTY ||
421              propertyName == ISMATEREVERSESTRAND_PROPERTY ||
422              propertyName == ISPAIRED_PROPERTY ||
423              propertyName == ISPRIMARYALIGNMENT_PROPERTY ||
424              propertyName == ISPROPERPAIR_PROPERTY ||
425              propertyName == ISREVERSESTRAND_PROPERTY ||
426              propertyName == ISSECONDMATE_PROPERTY ||
427              propertyName == ISSINGLETON_PROPERTY
428            ) 
429         {
430             FilterEngine<BamAlignmentChecker>::parseToken(token, boolValue, type);
431             m_filterEngine.setProperty(filterName, propertyName, boolValue, type);
432         }
433         
434         // int32_t conversion
435         else if ( propertyName == INSERTSIZE_PROPERTY ||
436                   propertyName == MATEPOSITION_PROPERTY ||
437                   propertyName == POSITION_PROPERTY 
438                 ) 
439         {
440             FilterEngine<BamAlignmentChecker>::parseToken(token, int32Value, type);
441             m_filterEngine.setProperty(filterName, propertyName, int32Value, type);
442         }
443         
444         // uint16_t conversion
445         else if ( propertyName == MAPQUALITY_PROPERTY )
446         {
447             FilterEngine<BamAlignmentChecker>::parseToken(token, uint16Value, type);
448             m_filterEngine.setProperty(filterName, propertyName, uint16Value, type);
449         }
450         
451         // uint32_t conversion
452         else if ( propertyName == ALIGNMENTFLAG_PROPERTY )
453         {
454             FilterEngine<BamAlignmentChecker>::parseToken(token, uint32Value, type);
455             m_filterEngine.setProperty(filterName, propertyName, uint32Value, type);
456         }
457         
458         // string conversion
459         else if ( propertyName == CIGAR_PROPERTY || 
460                   propertyName == MATEREFERENCE_PROPERTY ||
461                   propertyName == NAME_PROPERTY ||
462                   propertyName == QUERYBASES_PROPERTY ||
463                   propertyName == REFERENCE_PROPERTY 
464                 ) 
465         {
466             FilterEngine<BamAlignmentChecker>::parseToken(token, stringValue, type);
467             m_filterEngine.setProperty(filterName, propertyName, stringValue, type);
468         }
469       
470         else if ( propertyName == TAG_PROPERTY ) {
471             // this will be stored directly as the TAG:VALUE token
472             // (VALUE may contain compare ops, will be parsed out later)
473             m_filterEngine.setProperty(filterName, propertyName, token, PropertyFilterValue::EXACT);
474         }
475       
476         // else unknown property 
477         else {
478             cerr << "bamtools filter ERROR: unknown property - " << propertyName << endl;
479             return false;
480         }
481     }
482     return true;
483 }
484
485 bool FilterTool::FilterToolPrivate::CheckAlignment(const BamAlignment& al) {
486     return m_filterEngine.check(al);
487 }
488
489 const string FilterTool::FilterToolPrivate::GetScriptContents(void) {
490   
491     // open script for reading
492     FILE* inFile = fopen(m_settings->ScriptFilename.c_str(), "rb");
493     if ( !inFile ) {
494         cerr << "bamtools filter ERROR: could not open script: "
495              << m_settings->ScriptFilename << " for reading" << endl;
496         return string();
497     }
498     
499     // read in entire script contents  
500     char buffer[1024];
501     ostringstream docStream("");
502     while ( true ) {
503         
504         // peek ahead, make sure there is data available
505         char ch = fgetc(inFile);
506         ungetc(ch, inFile);
507         if( feof(inFile) )
508             break;
509         
510         // read next block of data
511         if ( fgets(buffer, 1024, inFile) == 0 ) {
512             cerr << "bamtools filter ERROR: could not read script contents" << endl;
513             return string();
514         }
515         
516         docStream << buffer;
517     }
518     
519     // close script file
520     fclose(inFile);
521     
522     // import buffer contents to document, return
523     return docStream.str();
524 }
525
526 void FilterTool::FilterToolPrivate::InitProperties(void) {
527   
528     // store property names in vector 
529     m_propertyNames.push_back(ALIGNMENTFLAG_PROPERTY);
530     m_propertyNames.push_back(CIGAR_PROPERTY);
531     m_propertyNames.push_back(INSERTSIZE_PROPERTY);
532     m_propertyNames.push_back(ISDUPLICATE_PROPERTY);
533     m_propertyNames.push_back(ISFAILEDQC_PROPERTY);
534     m_propertyNames.push_back(ISFIRSTMATE_PROPERTY);
535     m_propertyNames.push_back(ISMAPPED_PROPERTY);
536     m_propertyNames.push_back(ISMATEMAPPED_PROPERTY);
537     m_propertyNames.push_back(ISMATEREVERSESTRAND_PROPERTY);
538     m_propertyNames.push_back(ISPAIRED_PROPERTY);
539     m_propertyNames.push_back(ISPRIMARYALIGNMENT_PROPERTY);
540     m_propertyNames.push_back(ISPROPERPAIR_PROPERTY);
541     m_propertyNames.push_back(ISREVERSESTRAND_PROPERTY);
542     m_propertyNames.push_back(ISSECONDMATE_PROPERTY);
543     m_propertyNames.push_back(ISSINGLETON_PROPERTY);
544     m_propertyNames.push_back(MAPQUALITY_PROPERTY);
545     m_propertyNames.push_back(MATEPOSITION_PROPERTY);
546     m_propertyNames.push_back(MATEREFERENCE_PROPERTY);
547     m_propertyNames.push_back(NAME_PROPERTY);
548     m_propertyNames.push_back(POSITION_PROPERTY);
549     m_propertyNames.push_back(QUERYBASES_PROPERTY);
550     m_propertyNames.push_back(REFERENCE_PROPERTY);
551     m_propertyNames.push_back(TAG_PROPERTY);
552     
553     // add vector contents to FilterEngine<BamAlignmentChecker>
554     vector<string>::const_iterator propertyNameIter = m_propertyNames.begin();
555     vector<string>::const_iterator propertyNameEnd  = m_propertyNames.end();
556     for ( ; propertyNameIter != propertyNameEnd; ++propertyNameIter )
557         m_filterEngine.addProperty((*propertyNameIter));
558 }
559
560 bool FilterTool::FilterToolPrivate::ParseCommandLine(void) {
561   
562     // add a rule set to filter engine
563     const string CMD = "COMMAND_LINE";
564     m_filterEngine.addFilter(CMD);
565
566     // map property names to command line args
567     map<string, string> propertyTokens;
568     if ( m_settings->HasAlignmentFlagFilter )       propertyTokens.insert( make_pair(ALIGNMENTFLAG_PROPERTY,       m_settings->AlignmentFlagFilter) );
569     if ( m_settings->HasInsertSizeFilter )          propertyTokens.insert( make_pair(INSERTSIZE_PROPERTY,          m_settings->InsertSizeFilter) );
570     if ( m_settings->HasIsDuplicateFilter )         propertyTokens.insert( make_pair(ISDUPLICATE_PROPERTY,         m_settings->IsDuplicateFilter) );
571     if ( m_settings->HasIsFailedQCFilter )          propertyTokens.insert( make_pair(ISFAILEDQC_PROPERTY,          m_settings->IsFailedQCFilter) );
572     if ( m_settings->HasIsFirstMateFilter )         propertyTokens.insert( make_pair(ISFIRSTMATE_PROPERTY,         m_settings->IsFirstMateFilter) );
573     if ( m_settings->HasIsMappedFilter )            propertyTokens.insert( make_pair(ISMAPPED_PROPERTY,            m_settings->IsMappedFilter) );
574     if ( m_settings->HasIsMateMappedFilter )        propertyTokens.insert( make_pair(ISMATEMAPPED_PROPERTY,        m_settings->IsMateMappedFilter) );
575     if ( m_settings->HasIsMateReverseStrandFilter ) propertyTokens.insert( make_pair(ISMATEREVERSESTRAND_PROPERTY, m_settings->IsMateReverseStrandFilter) );
576     if ( m_settings->HasIsPairedFilter )            propertyTokens.insert( make_pair(ISPAIRED_PROPERTY,            m_settings->IsPairedFilter) );
577     if ( m_settings->HasIsPrimaryAlignmentFilter )  propertyTokens.insert( make_pair(ISPRIMARYALIGNMENT_PROPERTY,  m_settings->IsPrimaryAlignmentFilter) );
578     if ( m_settings->HasIsProperPairFilter )        propertyTokens.insert( make_pair(ISPROPERPAIR_PROPERTY,        m_settings->IsProperPairFilter) );
579     if ( m_settings->HasIsReverseStrandFilter )     propertyTokens.insert( make_pair(ISREVERSESTRAND_PROPERTY,     m_settings->IsReverseStrandFilter) );
580     if ( m_settings->HasIsSecondMateFilter )        propertyTokens.insert( make_pair(ISSECONDMATE_PROPERTY,        m_settings->IsSecondMateFilter) );
581     if ( m_settings->HasIsSingletonFilter )         propertyTokens.insert( make_pair(ISSINGLETON_PROPERTY,         m_settings->IsSingletonFilter) );
582     if ( m_settings->HasMapQualityFilter )          propertyTokens.insert( make_pair(MAPQUALITY_PROPERTY,          m_settings->MapQualityFilter) );
583     if ( m_settings->HasNameFilter )                propertyTokens.insert( make_pair(NAME_PROPERTY,                m_settings->NameFilter) );
584     if ( m_settings->HasQueryBasesFilter )          propertyTokens.insert( make_pair(QUERYBASES_PROPERTY,          m_settings->QueryBasesFilter) );
585     if ( m_settings->HasTagFilter )                 propertyTokens.insert( make_pair(TAG_PROPERTY,                 m_settings->TagFilter) );
586     
587     // send add these properties to filter set "COMMAND_LINE"
588     return AddPropertyTokensToFilter(CMD, propertyTokens);
589 }
590
591 bool FilterTool::FilterToolPrivate::ParseFilterObject(const string& filterName, const Json::Value& filterObject) {
592   
593     // filter object parsing variables
594     Json::Value null(Json::nullValue);
595     Json::Value propertyValue;
596     
597     // store results
598     map<string, string> propertyTokens;
599     
600     // iterate over known properties
601     vector<string>::const_iterator propertyNameIter = m_propertyNames.begin();
602     vector<string>::const_iterator propertyNameEnd  = m_propertyNames.end();
603     for ( ; propertyNameIter != propertyNameEnd; ++propertyNameIter ) {
604         const string& propertyName = (*propertyNameIter);
605         
606         // if property defined in filter, add to token list
607         propertyValue = filterObject.get(propertyName, null);
608         if ( propertyValue != null ) 
609             propertyTokens.insert( make_pair(propertyName, propertyValue.asString()) );
610     }
611   
612     // add this filter to engin
613     m_filterEngine.addFilter(filterName);
614   
615     // add token list to this filter
616     return AddPropertyTokensToFilter(filterName, propertyTokens);
617 }
618
619 bool FilterTool::FilterToolPrivate::ParseScript(void) {
620   
621     // read in script contents from file
622     const string document = GetScriptContents();
623     
624     // set up JsonCPP reader and attempt to parse script
625     Json::Value root;
626     Json::Reader reader;
627     if ( !reader.parse(document, root) ) {
628         // use built-in error reporting mechanism to alert user what was wrong with the script
629         cerr  << "bamtools filter ERROR: failed to parse script - see error message(s) below" << endl
630               << reader.getFormatedErrorMessages();
631         return false;     
632     }
633
634     // initialize return status
635     bool success = true;
636     
637     // see if root object contains multiple filters
638     const Json::Value filters = root["filters"];
639     if ( !filters.isNull() ) {
640       
641         // iterate over any filters found
642         int filterIndex = 0;
643         Json::Value::const_iterator filtersIter = filters.begin();
644         Json::Value::const_iterator filtersEnd  = filters.end();
645         for ( ; filtersIter != filtersEnd; ++filtersIter, ++filterIndex ) {
646             Json::Value filter = (*filtersIter);
647             
648             // convert filter index to string
649             string filterName;
650             
651             // if id tag supplied
652             const Json::Value id = filter["id"];
653             if ( !id.isNull() ) 
654                 filterName = id.asString();
655             
656             // use array index 
657             else {
658                 stringstream convert;
659                 convert << filterIndex;
660                 filterName = convert.str();
661             }
662             
663             // create & parse filter 
664             success &= ParseFilterObject(filterName, filter);
665         }
666         
667         // see if user defined a "rule" for these filters
668         // otherwise, use filter engine's default rule behavior
669         string ruleString("");
670         const Json::Value rule = root["rule"];
671         if ( rule.isString() )
672             ruleString = rule.asString();
673         m_filterEngine.setRule(ruleString);
674           
675         // return success/fail
676         return success;
677     } 
678     
679     // otherwise, root is the only filter (just contains properties)
680     // create & parse filter named "ROOT"
681     else success = ParseFilterObject("ROOT", root);
682     
683     // return success/failure
684     return success;
685 }
686
687 bool FilterTool::FilterToolPrivate::Run(void) {
688     
689     // set to default input if none provided
690     if ( !m_settings->HasInput && !m_settings->HasInputFilelist )
691         m_settings->InputFiles.push_back(Options::StandardIn());
692
693     // add files in the filelist to the input file list
694     if ( m_settings->HasInputFilelist ) {
695
696         ifstream filelist(m_settings->InputFilelist.c_str(), ios::in);
697         if ( !filelist.is_open() ) {
698             cerr << "bamtools filter ERROR: could not open input BAM file list... Aborting." << endl;
699             return false;
700         }
701
702         string line;
703         while ( getline(filelist, line) )
704             m_settings->InputFiles.push_back(line);
705     }
706
707     // initialize defined properties & user-specified filters
708     // quit if failed
709     if ( !SetupFilters() )
710         return false;
711
712     // open reader without index
713     BamMultiReader reader;
714     if ( !reader.Open(m_settings->InputFiles) ) {
715         cerr << "bamtools filter ERROR: could not open input files for reading." << endl;
716         return false;
717     }
718
719     // retrieve reader header & reference data
720     const string headerText = reader.GetHeaderText();
721     filterToolReferences = reader.GetReferenceData();
722     
723     // determine compression mode for BamWriter
724     bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() &&
725                               !m_settings->IsForceCompression );
726     BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
727     if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed;
728
729     // open BamWriter
730     BamWriter writer;
731     writer.SetCompressionMode(compressionMode);
732     if ( !writer.Open(m_settings->OutputFilename, headerText, filterToolReferences) ) {
733         cerr << "bamtools filter ERROR: could not open " << m_settings->OutputFilename << " for writing." << endl;
734         reader.Close();
735         return false;
736     }
737
738     // if no region specified, filter entire file 
739     BamAlignment al;
740     if ( !m_settings->HasRegion ) {
741         while ( reader.GetNextAlignment(al) ) {
742             if ( CheckAlignment(al) ) 
743                 writer.SaveAlignment(al);
744         }
745     }
746     
747     // otherwise attempt to use region as constraint
748     else {
749         
750         // if region string parses OK
751         BamRegion region;
752         if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
753
754             // attempt to find index files
755             reader.LocateIndexes();
756
757             // if index data available for all BAM files, we can use SetRegion
758             if ( reader.HasIndexes() ) {
759
760                 // attempt to use SetRegion(), if failed report error
761                 if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) {
762                     cerr << "bamtools filter ERROR: set region failed. Check that REGION describes a valid range" << endl;
763                     reader.Close();
764                     return false;
765                 } 
766               
767                 // everything checks out, just iterate through specified region, filtering alignments
768                 while ( reader.GetNextAlignment(al) )
769                     if ( CheckAlignment(al) ) 
770                         writer.SaveAlignment(al);
771                 }
772             
773             // no index data available, we have to iterate through until we
774             // find overlapping alignments
775             else {
776                 while ( reader.GetNextAlignment(al) ) {
777                     if ( (al.RefID >= region.LeftRefID)  && ((al.Position + al.Length) >= region.LeftPosition) &&
778                          (al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) ) 
779                     {
780                         if ( CheckAlignment(al) ) 
781                             writer.SaveAlignment(al);
782                     }
783                 }
784             }
785         } 
786         
787         // error parsing REGION string
788         else {
789             cerr << "bamtools filter ERROR: could not parse REGION: " << m_settings->Region << endl;
790             cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid"
791                  << endl;
792             reader.Close();
793             return false;
794         }
795     }
796
797     // clean up & exit
798     reader.Close();
799     writer.Close();
800     return true;
801 }
802
803 bool FilterTool::FilterToolPrivate::SetupFilters(void) {
804   
805     // set up filter engine with supported properties
806     InitProperties();
807     
808     // parse script for filter rules, if given
809     if ( m_settings->HasScript )
810         return ParseScript();
811     
812     // otherwise check command line for filters
813     else return ParseCommandLine();
814 }
815
816 // ---------------------------------------------
817 // FilterTool implementation
818
819 FilterTool::FilterTool(void)
820     : AbstractTool()
821     , m_settings(new FilterSettings)
822     , m_impl(0)
823 {
824     // ----------------------------------
825     // set program details
826
827     const string usage = "[-in <filename> -in <filename> ... | -list <filelist>] "
828                          "[-out <filename> | [-forceCompression]] [-region <REGION>] "
829                          "[ [-script <filename] | [filterOptions] ]";
830
831     Options::SetProgramInfo("bamtools filter", "filters BAM file(s)", usage );
832
833     // ----------------------------------
834     // I/O options
835
836     OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
837
838     const string inDesc     = "the input BAM file(s)";
839     const string listDesc   = "the input BAM file list, one line per file";
840     const string outDesc    = "the output BAM file";
841     const string regionDesc = "only read data from this genomic region (see documentation for more details)";
842     const string scriptDesc = "the filter script file (see documentation for more details)";
843     const string forceDesc  = "if results are sent to stdout (like when piping to another tool), "
844                               "default behavior is to leave output uncompressed. Use this flag to "
845                               "override and force compression";
846
847     Options::AddValueOption("-in",     "BAM filename", inDesc,     "", m_settings->HasInput,  m_settings->InputFiles,     IO_Opts, Options::StandardIn());
848     Options::AddValueOption("-list",   "filename",     listDesc,   "", m_settings->HasInputFilelist,  m_settings->InputFilelist, IO_Opts);
849     Options::AddValueOption("-out",    "BAM filename", outDesc,    "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
850     Options::AddValueOption("-region", "REGION",       regionDesc, "", m_settings->HasRegion, m_settings->Region,         IO_Opts);
851     Options::AddValueOption("-script", "filename",     scriptDesc, "", m_settings->HasScript, m_settings->ScriptFilename, IO_Opts);
852     Options::AddOption("-forceCompression",forceDesc, m_settings->IsForceCompression, IO_Opts);
853
854     // ----------------------------------
855     // general filter options
856
857     OptionGroup* FilterOpts = Options::CreateOptionGroup("General Filters");
858
859     const string flagDesc    = "keep reads with this *exact* alignment flag (for more detailed queries, see below)";
860     const string insertDesc  = "keep reads with insert size that matches pattern";
861     const string mapQualDesc = "keep reads with map quality that matches pattern";
862     const string nameDesc    = "keep reads with name that matches pattern";
863     const string queryDesc   = "keep reads with motif that matches pattern";
864     const string tagDesc     = "keep reads with this key=>value pair";
865
866     Options::AddValueOption("-alignmentFlag", "int",       flagDesc,    "", m_settings->HasAlignmentFlagFilter, m_settings->AlignmentFlagFilter, FilterOpts);
867     Options::AddValueOption("-insertSize",    "int",       insertDesc,  "", m_settings->HasInsertSizeFilter,    m_settings->InsertSizeFilter,    FilterOpts);
868     Options::AddValueOption("-mapQuality",    "[0-255]",   mapQualDesc, "", m_settings->HasMapQualityFilter,    m_settings->MapQualityFilter,    FilterOpts);
869     Options::AddValueOption("-name",          "string",    nameDesc,    "", m_settings->HasNameFilter,          m_settings->NameFilter,          FilterOpts);
870     Options::AddValueOption("-queryBases",    "string",    queryDesc,   "", m_settings->HasQueryBasesFilter,    m_settings->QueryBasesFilter,    FilterOpts);
871     Options::AddValueOption("-tag",           "TAG:VALUE", tagDesc,     "", m_settings->HasTagFilter,           m_settings->TagFilter,           FilterOpts);
872
873     // ----------------------------------
874     // alignment flag filter options
875
876     OptionGroup* AlignmentFlagOpts = Options::CreateOptionGroup("Alignment Flag Filters");
877
878     const string boolArg           = "true/false";
879     const string isDupDesc         = "keep only alignments that are marked as duplicate?";
880     const string isFailQcDesc      = "keep only alignments that failed QC?";
881     const string isFirstMateDesc   = "keep only alignments marked as first mate?";
882     const string isMappedDesc      = "keep only alignments that were mapped?";
883     const string isMateMappedDesc  = "keep only alignments with mates that mapped";
884     const string isMateReverseDesc = "keep only alignments with mate on reverese strand?";
885     const string isPairedDesc      = "keep only alignments that were sequenced as paired?";
886     const string isPrimaryDesc     = "keep only alignments marked as primary?";
887     const string isProperPairDesc  = "keep only alignments that passed PE resolution?";
888     const string isReverseDesc     = "keep only alignments on reverse strand?";
889     const string isSecondMateDesc  = "keep only alignments marked as second mate?";
890     const string isSingletonDesc   = "keep only singletons";
891
892     Options::AddValueOption("-isDuplicate",         boolArg, isDupDesc,         "", m_settings->HasIsDuplicateFilter,         m_settings->IsDuplicateFilter,         AlignmentFlagOpts, TRUE_STR);
893     Options::AddValueOption("-isFailedQC",          boolArg, isFailQcDesc,      "", m_settings->HasIsFailedQCFilter,          m_settings->IsFailedQCFilter,          AlignmentFlagOpts, TRUE_STR);
894     Options::AddValueOption("-isFirstMate",         boolArg, isFirstMateDesc,   "", m_settings->HasIsFirstMateFilter,         m_settings->IsFirstMateFilter,         AlignmentFlagOpts, TRUE_STR);
895     Options::AddValueOption("-isMapped",            boolArg, isMappedDesc,      "", m_settings->HasIsMappedFilter,            m_settings->IsMappedFilter,            AlignmentFlagOpts, TRUE_STR);
896     Options::AddValueOption("-isMateMapped",        boolArg, isMateMappedDesc,  "", m_settings->HasIsMateMappedFilter,        m_settings->IsMateMappedFilter,        AlignmentFlagOpts, TRUE_STR);
897     Options::AddValueOption("-isMateReverseStrand", boolArg, isMateReverseDesc, "", m_settings->HasIsMateReverseStrandFilter, m_settings->IsMateReverseStrandFilter, AlignmentFlagOpts, TRUE_STR);
898     Options::AddValueOption("-isPaired",            boolArg, isPairedDesc,      "", m_settings->HasIsPairedFilter,            m_settings->IsPairedFilter,            AlignmentFlagOpts, TRUE_STR);
899     Options::AddValueOption("-isPrimaryAlignment",  boolArg, isPrimaryDesc,     "", m_settings->HasIsPrimaryAlignmentFilter,  m_settings->IsPrimaryAlignmentFilter,  AlignmentFlagOpts, TRUE_STR);
900     Options::AddValueOption("-isProperPair",        boolArg, isProperPairDesc,  "", m_settings->HasIsProperPairFilter,        m_settings->IsProperPairFilter,        AlignmentFlagOpts, TRUE_STR);
901     Options::AddValueOption("-isReverseStrand",     boolArg, isReverseDesc,     "", m_settings->HasIsReverseStrandFilter,     m_settings->IsReverseStrandFilter,     AlignmentFlagOpts, TRUE_STR);
902     Options::AddValueOption("-isSecondMate",        boolArg, isSecondMateDesc,  "", m_settings->HasIsSecondMateFilter,        m_settings->IsSecondMateFilter,        AlignmentFlagOpts, TRUE_STR);
903     Options::AddValueOption("-isSingleton",         boolArg, isSingletonDesc,   "", m_settings->HasIsSingletonFilter,         m_settings->IsSingletonFilter,         AlignmentFlagOpts, TRUE_STR);
904 }
905
906 FilterTool::~FilterTool(void) {
907
908     delete m_settings;
909     m_settings = 0;
910
911     delete m_impl;
912     m_impl = 0;
913 }
914
915 int FilterTool::Help(void) {
916     Options::DisplayHelp();
917     return 0;
918 }
919
920 int FilterTool::Run(int argc, char* argv[]) {
921
922     // parse command line arguments
923     Options::Parse(argc, argv, 1);
924
925     // initialize FilterTool with settings
926     m_impl = new FilterToolPrivate(m_settings);
927
928     // run FilterTool, return success/fail
929     if ( m_impl->Run() )
930         return 0;
931     else
932         return 1;
933 }