]> git.donarmstrong.com Git - bamtools.git/blob - src/toolkit/bamtools_filter.cpp
Posted implementation of FilterTool.
[bamtools.git] / src / toolkit / bamtools_filter.cpp
1 // ***************************************************************************
2 // bamtools_filter.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 30 August 2010
7 // ---------------------------------------------------------------------------
8 // Filters a single BAM file (or filters multiple BAM files and merges) 
9 // according to some user-specified criteria.
10 // ***************************************************************************
11
12 // std includes
13 #include <cstdio>
14 #include <iostream>
15 #include <sstream>
16 #include <string>
17 #include <vector>
18
19 // BamTools includes
20 #include "bamtools_filter.h"
21 #include "bamtools_filter_engine.h"
22 #include "bamtools_options.h"
23 #include "bamtools_utilities.h"
24 #include "BamReader.h"
25 #include "BamMultiReader.h"
26 #include "BamWriter.h"
27
28 //JsonCPP includes
29 #include "jsoncpp/json.h"
30
31 using namespace std;
32 using namespace BamTools; 
33 using namespace Json;
34   
35 namespace BamTools {
36   
37 // -------------------------------  
38 // string literal constants  
39
40 // property names
41 const string ALIGNMENTFLAG_PROPERTY       = "alignmentFlag";
42 const string INSERTSIZE_PROPERTY          = "insertSize";
43 const string ISDUPLICATE_PROPERTY         = "isDuplicate";
44 const string ISFAILEDQC_PROPERTY          = "isFailedQC";
45 const string ISFIRSTMATE_PROPERTY         = "isFirstMate";
46 const string ISMAPPED_PROPERTY            = "isMapped";
47 const string ISMATEMAPPED_PROPERTY        = "isMateMapped";
48 const string ISMATEREVERSESTRAND_PROPERTY = "isMateReverseStrand";
49 const string ISPAIRED_PROPERTY            = "isPaired";
50 const string ISPRIMARYALIGNMENT_PROPERTY  = "isPrimaryAlignment";
51 const string ISPROPERPAIR_PROPERTY        = "isProperPair";
52 const string ISREVERSESTRAND_PROPERTY     = "isReverseStrand";
53 const string ISSECONDMATE_PROPERTY        = "isSecondMate";
54 const string MAPQUALITY_PROPERTY          = "mapQuality";
55 const string MATEPOSITION_PROPERTY        = "matePosition";
56 const string MATEREFERENCE_PROPERTY       = "mateReference";
57 const string NAME_PROPERTY                = "name";
58 const string POSITION_PROPERTY            = "position";
59 const string QUERYBASES_PROPERTY          = "queryBases";
60 const string REFERENCE_PROPERTY           = "reference";
61
62 // boolalpha
63 const string TRUE_STR  = "true";
64 const string FALSE_STR = "false";
65     
66 } // namespace BamTools
67   
68 // ---------------------------------------------
69 // FilterToolPrivate declaration
70
71 class FilterTool::FilterToolPrivate {
72       
73     // ctor & dtor
74     public:
75         FilterToolPrivate(FilterTool::FilterSettings* settings);
76         ~FilterToolPrivate(void);  
77         
78     // 'public' interface
79     public:
80         bool Run(void);
81         
82     // internal methods
83     private:
84         bool AddPropertyTokensToFilter(const string& filterName, const map<string, string>& propertyTokens);
85         bool CheckAlignment(const BamAlignment& al);
86         const string GetScriptContents(void);
87         void InitProperties(void);
88         bool ParseCommandLine(void);
89         bool ParseFilterObject(const string& filterName, const Json::Value& filterObject);
90         bool ParseScript(void);
91         bool SetupFilters(void);
92         
93     // data members
94     private:
95         vector<string> m_propertyNames;
96         FilterTool::FilterSettings* m_settings;
97         RefVector m_references;
98 };
99   
100 // ---------------------------------------------
101 // FilterSettings implementation
102
103 struct FilterTool::FilterSettings {
104
105     // ----------------------------------
106     // IO opts
107   
108     // flags
109     bool HasInputBamFilename;
110     bool HasOutputBamFilename;
111     bool HasRegion;
112     bool HasScriptFilename;
113     
114     // filenames
115     vector<string> InputFiles;
116     string OutputFilename;
117     string Region;
118     string ScriptFilename;
119     
120     // -----------------------------------
121     // General filter opts
122     
123     // flags
124     bool HasAlignmentFlagFilter;
125     bool HasInsertSizeFilter;
126     bool HasMapQualityFilter;
127     bool HasNameFilter;
128     bool HasQueryBasesFilter;
129     
130 //     bool HasTagFilters;
131
132     // filters
133     string AlignmentFlagFilter;
134     string InsertSizeFilter;
135     string NameFilter;
136     string MapQualityFilter;
137     string QueryBasesFilter;
138     
139 //     vector<string> TagFilters;
140
141     // -----------------------------------
142     // AlignmentFlag filter opts
143     
144     // flags
145     bool HasIsDuplicateFilter;
146     bool HasIsFailedQCFilter;
147     bool HasIsFirstMateFilter;
148     bool HasIsMappedFilter;
149     bool HasIsMateMappedFilter;
150     bool HasIsMateReverseStrandFilter;
151     bool HasIsPairedFilter;
152     bool HasIsPrimaryAlignmentFilter;
153     bool HasIsProperPairFilter;
154     bool HasIsReverseStrandFilter;
155     bool HasIsSecondMateFilter;
156     
157     // filters
158     string IsDuplicateFilter;
159     string IsFailedQCFilter;
160     string IsFirstMateFilter;
161     string IsMappedFilter;
162     string IsMateMappedFilter;
163     string IsMateReverseStrandFilter;
164     string IsPairedFilter;
165     string IsPrimaryAlignmentFilter;
166     string IsProperPairFilter;
167     string IsReverseStrandFilter;
168     string IsSecondMateFilter;
169     
170     // ---------------------------------
171     // constructor
172     
173     FilterSettings(void)
174         : HasInputBamFilename(false)
175         , HasOutputBamFilename(false)
176         , HasRegion(false)
177         , HasScriptFilename(false)
178         , OutputFilename(Options::StandardOut())
179         , HasAlignmentFlagFilter(false)
180         , HasInsertSizeFilter(false)
181         , HasMapQualityFilter(false)
182         , HasNameFilter(false)
183         , HasQueryBasesFilter(false)
184 //         , HasTagFilters(false)
185         , HasIsDuplicateFilter(false)
186         , HasIsFailedQCFilter(false)
187         , HasIsFirstMateFilter(false)
188         , HasIsMappedFilter(false)
189         , HasIsMateMappedFilter(false)
190         , HasIsMateReverseStrandFilter(false)
191         , HasIsPairedFilter(false)
192         , HasIsPrimaryAlignmentFilter(false)
193         , HasIsProperPairFilter(false)
194         , HasIsReverseStrandFilter(false)
195         , HasIsSecondMateFilter(false)
196         , IsDuplicateFilter(TRUE_STR)
197         , IsFailedQCFilter(TRUE_STR)
198         , IsFirstMateFilter(TRUE_STR)
199         , IsMappedFilter(TRUE_STR)
200         , IsMateMappedFilter(TRUE_STR)
201         , IsMateReverseStrandFilter(TRUE_STR)
202         , IsPairedFilter(TRUE_STR)
203         , IsPrimaryAlignmentFilter(TRUE_STR)
204         , IsProperPairFilter(TRUE_STR)
205         , IsReverseStrandFilter(TRUE_STR)
206         , IsSecondMateFilter(TRUE_STR)
207     { }
208 };  
209
210 // ---------------------------------------------
211 // FilterTool implementation
212
213 FilterTool::FilterTool(void)
214     : AbstractTool()
215     , m_settings(new FilterSettings)
216     , m_impl(0)
217 {
218     // set program details
219     Options::SetProgramInfo("bamtools filter", "filters BAM file(s)", "-in <filename> [-in <filename> ... ] -out <filename> -region <REGION> [ [-script <filename] | [filterOptions] ]");
220     
221     OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
222     Options::AddValueOption("-in",     "BAM filename", "the input BAM file(s)", "", m_settings->HasInputBamFilename,  m_settings->InputFiles,     IO_Opts, Options::StandardIn());
223     Options::AddValueOption("-out",    "BAM filename", "the output BAM file",   "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
224     Options::AddValueOption("-region", "REGION",       "only read data from this genomic region (see README for more details)", "", m_settings->HasRegion, m_settings->Region, IO_Opts);
225     Options::AddValueOption("-script", "filename",     "the filter script file (see README for more details)", "", m_settings->HasScriptFilename, m_settings->ScriptFilename, IO_Opts);
226     
227     OptionGroup* FilterOpts = Options::CreateOptionGroup("General Filters");
228     Options::AddValueOption("-alignmentFlag", "int",     "keep reads with this *exact* alignment flag (for more detailed queries, see below)", "", m_settings->HasAlignmentFlagFilter, m_settings->AlignmentFlagFilter, FilterOpts);
229     Options::AddValueOption("-insertSize",    "int",     "keep reads with insert size that mathces pattern",             "", m_settings->HasInsertSizeFilter, m_settings->InsertSizeFilter, FilterOpts);
230     Options::AddValueOption("-mapQuality",    "[0-255]", "keep reads with map quality that matches pattern",             "", m_settings->HasMapQualityFilter, m_settings->MapQualityFilter, FilterOpts);
231     Options::AddValueOption("-name",          "string",  "keep reads with name that matches pattern",                    "", m_settings->HasNameFilter,       m_settings->NameFilter,       FilterOpts);
232     Options::AddValueOption("-queryBases",    "string",  "keep reads with motif that mathces pattern",                   "", m_settings->HasQueryBasesFilter, m_settings->QueryBasesFilter, FilterOpts);
233     
234 //     Options::AddValueOption("-tag", "TAG:VALUE", "keep reads with this key=>value pair. If multiple tags are given, reads must match all", "", m_settings->HasTagFilters, m_settings->TagFilters, FilterOpts);
235     
236     OptionGroup* AlignmentFlagOpts = Options::CreateOptionGroup("Alignment Flag Filters");
237     Options::AddValueOption("-isDuplicate",         "true/false", "keep only alignments that are marked as duplicate?", "", m_settings->HasIsDuplicateFilter,         m_settings->IsDuplicateFilter,         AlignmentFlagOpts, TRUE_STR);
238     Options::AddValueOption("-isFailedQC",          "true/false", "keep only alignments that failed QC?",               "", m_settings->HasIsFailedQCFilter,          m_settings->IsFailedQCFilter,          AlignmentFlagOpts, TRUE_STR);
239     Options::AddValueOption("-isFirstMate",         "true/false", "keep only alignments marked as first mate?",         "", m_settings->HasIsFirstMateFilter,         m_settings->IsFirstMateFilter,         AlignmentFlagOpts, TRUE_STR);
240     Options::AddValueOption("-isMapped",            "true/false", "keep only alignments that were mapped?",             "", m_settings->HasIsMappedFilter,            m_settings->IsMappedFilter,            AlignmentFlagOpts, TRUE_STR);
241     Options::AddValueOption("-isMateMapped",        "true/false", "keep only alignments with mates that mapped",        "", m_settings->HasIsMateMappedFilter,        m_settings->IsMateMappedFilter,        AlignmentFlagOpts, TRUE_STR);
242     Options::AddValueOption("-isMateReverseStrand", "true/false", "keep only alignments with mate on reverese strand?", "", m_settings->HasIsMateReverseStrandFilter, m_settings->IsMateReverseStrandFilter, AlignmentFlagOpts, TRUE_STR);
243     Options::AddValueOption("-isPaired",            "true/false", "keep only alignments that were sequenced as paired?","", m_settings->HasIsPairedFilter,            m_settings->IsPairedFilter,            AlignmentFlagOpts, TRUE_STR);
244     Options::AddValueOption("-isPrimaryAlignment",  "true/false", "keep only alignments marked as primary?",            "", m_settings->HasIsPrimaryAlignmentFilter,  m_settings->IsPrimaryAlignmentFilter,  AlignmentFlagOpts, TRUE_STR);
245     Options::AddValueOption("-isProperPair",        "true/false", "keep only alignments that passed PE resolution?",    "", m_settings->HasIsProperPairFilter,        m_settings->IsProperPairFilter,        AlignmentFlagOpts, TRUE_STR);
246     Options::AddValueOption("-isReverseStrand",     "true/false", "keep only alignments on reverse strand?",            "", m_settings->HasIsReverseStrandFilter,     m_settings->IsReverseStrandFilter,     AlignmentFlagOpts, TRUE_STR);
247     Options::AddValueOption("-isSecondMate",        "true/false", "keep only alignments marked as second mate?",        "", m_settings->HasIsSecondMateFilter,        m_settings->IsSecondMateFilter,        AlignmentFlagOpts, TRUE_STR);
248 }
249
250 FilterTool::~FilterTool(void) {
251     delete m_settings;
252     m_settings = 0;
253     
254     delete m_impl;
255     m_impl = 0;
256 }
257
258 int FilterTool::Help(void) {
259     Options::DisplayHelp();
260     return 0;
261 }
262
263 int FilterTool::Run(int argc, char* argv[]) {
264   
265     // parse command line arguments
266     Options::Parse(argc, argv, 1);
267     
268     // run internal FilterTool implementation, return success/fail
269     m_impl = new FilterToolPrivate(m_settings);
270     
271     if ( m_impl->Run() ) return 0;
272     else return 1;
273 }
274  
275 // ---------------------------------------------
276 // FilterToolPrivate implementation
277   
278 // constructor  
279 FilterTool::FilterToolPrivate::FilterToolPrivate(FilterTool::FilterSettings* settings)
280     : m_settings(settings)
281 { }  
282   
283 // destructor
284 FilterTool::FilterToolPrivate::~FilterToolPrivate(void) { }
285
286 bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filterName, const map<string, string>& propertyTokens) {
287
288   
289     // dummy temp values for token parsing
290     bool boolValue;
291     int32_t int32Value;
292     uint16_t uint16Value;
293     uint32_t uint32Value;
294     string stringValue;
295     PropertyFilterValue::ValueCompareType type;
296   
297     // iterate over property token map
298     map<string, string>::const_iterator mapIter = propertyTokens.begin();
299     map<string, string>::const_iterator mapEnd  = propertyTokens.end();
300     for ( ; mapIter != mapEnd; ++mapIter ) {
301       
302         const string& propertyName = (*mapIter).first;
303         const string& token        = (*mapIter).second;
304         
305         // ------------------------------
306         // convert token to value & compare type 
307         // then add to filter engine
308         
309         // bool conversion
310         if ( propertyName == ISDUPLICATE_PROPERTY ||
311              propertyName == ISFAILEDQC_PROPERTY ||
312              propertyName == ISFIRSTMATE_PROPERTY ||
313              propertyName == ISMAPPED_PROPERTY ||
314              propertyName == ISMATEMAPPED_PROPERTY ||
315              propertyName == ISMATEREVERSESTRAND_PROPERTY ||
316              propertyName == ISPAIRED_PROPERTY ||
317              propertyName == ISPRIMARYALIGNMENT_PROPERTY ||
318              propertyName == ISPROPERPAIR_PROPERTY ||
319              propertyName == ISREVERSESTRAND_PROPERTY ||
320              propertyName == ISSECONDMATE_PROPERTY
321            ) 
322         {
323             FilterEngine::parseToken(token, boolValue, type);
324             FilterEngine::setProperty(filterName, propertyName, boolValue, type);
325         }
326         
327         // int32_t conversion
328         else if ( propertyName == INSERTSIZE_PROPERTY ||
329                   propertyName == MATEPOSITION_PROPERTY ||
330                   propertyName == POSITION_PROPERTY 
331                 ) 
332         {
333             FilterEngine::parseToken(token, int32Value, type);
334             FilterEngine::setProperty(filterName, propertyName, int32Value, type);
335         }
336         
337         // uint16_t conversion
338         else if ( propertyName == MAPQUALITY_PROPERTY )
339         {
340             FilterEngine::parseToken(token, uint16Value, type);
341             FilterEngine::setProperty(filterName, propertyName, uint16Value, type);
342         }
343         
344         // uint32_t conversion
345         else if ( propertyName == ALIGNMENTFLAG_PROPERTY )
346         {
347             FilterEngine::parseToken(token, uint32Value, type);
348             FilterEngine::setProperty(filterName, propertyName, uint32Value, type);
349         }
350         
351         // string conversion
352         else if ( propertyName == MATEREFERENCE_PROPERTY ||
353                   propertyName == NAME_PROPERTY ||
354                   propertyName == QUERYBASES_PROPERTY ||
355                   propertyName == REFERENCE_PROPERTY
356                 ) 
357         {
358             FilterEngine::parseToken(token, stringValue, type);
359             FilterEngine::setProperty(filterName, propertyName, stringValue, type);
360         }
361       
362         // else unknown property 
363         else {
364             cerr << "Unknown property: " << propertyName << "!" << endl;
365             return false;
366         }
367     }
368     return true;
369 }
370
371 bool FilterTool::FilterToolPrivate::CheckAlignment(const BamAlignment& al) {
372
373     bool keepAlignment = true;
374   
375     // only consider properties that are actually enabled
376     // iterate over these enabled properties
377     const vector<string> enabledProperties = FilterEngine::enabledPropertyNames();
378     vector<string>::const_iterator propIter = enabledProperties.begin();
379     vector<string>::const_iterator propEnd  = enabledProperties.end();
380     for ( ; propIter != propEnd; ++propIter ) {
381      
382         // check alignment data field depending on propertyName
383         const string& propertyName = (*propIter);
384         if      ( propertyName == ALIGNMENTFLAG_PROPERTY )        keepAlignment &= FilterEngine::check(ALIGNMENTFLAG_PROPERTY,       al.AlignmentFlag);
385         else if ( propertyName == INSERTSIZE_PROPERTY )           keepAlignment &= FilterEngine::check(INSERTSIZE_PROPERTY,          al.InsertSize);
386         else if ( propertyName == ISDUPLICATE_PROPERTY )          keepAlignment &= FilterEngine::check(ISDUPLICATE_PROPERTY,         al.IsDuplicate());
387         else if ( propertyName == ISFAILEDQC_PROPERTY )           keepAlignment &= FilterEngine::check(ISFAILEDQC_PROPERTY,          al.IsFailedQC());
388         else if ( propertyName == ISFIRSTMATE_PROPERTY )          keepAlignment &= FilterEngine::check(ISFIRSTMATE_PROPERTY,         al.IsFirstMate());
389         else if ( propertyName == ISMAPPED_PROPERTY )             keepAlignment &= FilterEngine::check(ISMAPPED_PROPERTY,            al.IsMapped());
390         else if ( propertyName == ISMATEMAPPED_PROPERTY )         keepAlignment &= FilterEngine::check(ISMATEMAPPED_PROPERTY,        al.IsMateMapped());
391         else if ( propertyName == ISMATEREVERSESTRAND_PROPERTY )  keepAlignment &= FilterEngine::check(ISMATEREVERSESTRAND_PROPERTY, al.IsMateReverseStrand());
392         else if ( propertyName == ISPAIRED_PROPERTY )             keepAlignment &= FilterEngine::check(ISPAIRED_PROPERTY,            al.IsPaired());
393         else if ( propertyName == ISPRIMARYALIGNMENT_PROPERTY )   keepAlignment &= FilterEngine::check(ISPRIMARYALIGNMENT_PROPERTY,  al.IsPrimaryAlignment());
394         else if ( propertyName == ISPROPERPAIR_PROPERTY )         keepAlignment &= FilterEngine::check(ISPROPERPAIR_PROPERTY,        al.IsProperPair());
395         else if ( propertyName == ISREVERSESTRAND_PROPERTY )      keepAlignment &= FilterEngine::check(ISREVERSESTRAND_PROPERTY,     al.IsReverseStrand());
396         else if ( propertyName == ISSECONDMATE_PROPERTY )         keepAlignment &= FilterEngine::check(ISSECONDMATE_PROPERTY,        al.IsSecondMate());
397         else if ( propertyName == MAPQUALITY_PROPERTY )           keepAlignment &= FilterEngine::check(MAPQUALITY_PROPERTY,          al.MapQuality);
398         else if ( propertyName == MATEPOSITION_PROPERTY )         keepAlignment &= ( al.IsPaired() && al.IsMateMapped() && FilterEngine::check(MATEPOSITION_PROPERTY, al.MateRefID) );
399         else if ( propertyName == MATEREFERENCE_PROPERTY ) {
400             if ( !al.IsPaired() || !al.IsMateMapped() ) return false;
401             BAMTOOLS_ASSERT_MESSAGE( (al.MateRefID>=0 && (al.MateRefID<(int)m_references.size())), "Invalid MateRefID");
402             const string& refName = m_references.at(al.MateRefID).RefName;
403             keepAlignment &= FilterEngine::check(MATEREFERENCE_PROPERTY, refName);
404         }
405         else if ( propertyName == NAME_PROPERTY )                 keepAlignment &= FilterEngine::check(NAME_PROPERTY,                al.Name);
406         else if ( propertyName == POSITION_PROPERTY )             keepAlignment &= FilterEngine::check(POSITION_PROPERTY,            al.Position);
407         else if ( propertyName == QUERYBASES_PROPERTY )           keepAlignment &= FilterEngine::check(QUERYBASES_PROPERTY,          al.QueryBases);
408         else if ( propertyName == REFERENCE_PROPERTY ) {
409             BAMTOOLS_ASSERT_MESSAGE( (al.RefID>=0 && (al.RefID<(int)m_references.size())), "Invalid RefID");
410             const string& refName = m_references.at(al.RefID).RefName;
411             keepAlignment &= FilterEngine::check(REFERENCE_PROPERTY, refName);
412         }
413         else BAMTOOLS_ASSERT_MESSAGE( false, "Unknown property");
414         
415         // if alignment fails at ANY point, just quit and return false
416         if ( !keepAlignment ) return false;
417     }
418   
419     // return success (should still be true at this point)
420     return keepAlignment;
421 }
422
423 const string FilterTool::FilterToolPrivate::GetScriptContents(void) {
424   
425     // open script for reading
426     FILE* inFile = fopen(m_settings->ScriptFilename.c_str(), "rb");
427     if ( !inFile ) {
428         cerr << "FilterTool error: Could not open script: " << m_settings->ScriptFilename << " for reading" << endl;
429         return false;
430     }
431     
432     // read in entire script contents  
433     char buffer[1024];
434     ostringstream docStream("");
435     while ( true ) {
436         
437         // peek ahead, make sure there is data available
438         char ch = fgetc(inFile);
439         ungetc(ch, inFile);
440         if( feof(inFile) ) break;       
441         
442         // read next block of data
443         if ( fgets(buffer, 1024, inFile) == 0 ) {
444             cerr << "FilterTool error : could not read from script" << endl;
445             return false;
446         }
447         
448         docStream << buffer;
449     }
450     
451     // close script file
452     fclose(inFile);
453     
454     // import buffer contents to document, return
455     string document = docStream.str();
456     return document;
457 }
458
459 void FilterTool::FilterToolPrivate::InitProperties(void) {
460   
461     // store property names in vector 
462     m_propertyNames.push_back(ALIGNMENTFLAG_PROPERTY);
463     m_propertyNames.push_back(INSERTSIZE_PROPERTY);
464     m_propertyNames.push_back(ISDUPLICATE_PROPERTY);
465     m_propertyNames.push_back(ISFAILEDQC_PROPERTY);
466     m_propertyNames.push_back(ISFIRSTMATE_PROPERTY);
467     m_propertyNames.push_back(ISMAPPED_PROPERTY);
468     m_propertyNames.push_back(ISMATEMAPPED_PROPERTY);
469     m_propertyNames.push_back(ISMATEREVERSESTRAND_PROPERTY);
470     m_propertyNames.push_back(ISPAIRED_PROPERTY);
471     m_propertyNames.push_back(ISPRIMARYALIGNMENT_PROPERTY);
472     m_propertyNames.push_back(ISPROPERPAIR_PROPERTY);
473     m_propertyNames.push_back(ISREVERSESTRAND_PROPERTY);
474     m_propertyNames.push_back(ISSECONDMATE_PROPERTY);
475     m_propertyNames.push_back(MAPQUALITY_PROPERTY);
476     m_propertyNames.push_back(MATEPOSITION_PROPERTY);
477     m_propertyNames.push_back(MATEREFERENCE_PROPERTY);
478     m_propertyNames.push_back(NAME_PROPERTY);
479     m_propertyNames.push_back(POSITION_PROPERTY);
480     m_propertyNames.push_back(QUERYBASES_PROPERTY);
481     m_propertyNames.push_back(REFERENCE_PROPERTY);
482     
483     // add vector contents to FilterEngine
484     vector<string>::const_iterator propertyNameIter = m_propertyNames.begin();
485     vector<string>::const_iterator propertyNameEnd  = m_propertyNames.end();
486     for ( ; propertyNameIter != propertyNameEnd; ++propertyNameIter )
487         FilterEngine::addProperty((*propertyNameIter));
488 }
489
490 bool FilterTool::FilterToolPrivate::ParseCommandLine(void) {
491   
492     // add a rule set to filter engine
493     const string CMD = "COMMAND_LINE";
494     FilterEngine::addFilter(CMD);
495
496     // map property names to command line args
497     map<string, string> propertyTokens;
498     if ( m_settings->HasAlignmentFlagFilter )       propertyTokens.insert( make_pair(ALIGNMENTFLAG_PROPERTY,       m_settings->AlignmentFlagFilter) );
499     if ( m_settings->HasInsertSizeFilter )          propertyTokens.insert( make_pair(INSERTSIZE_PROPERTY,          m_settings->InsertSizeFilter) );
500     if ( m_settings->HasIsDuplicateFilter )         propertyTokens.insert( make_pair(ISDUPLICATE_PROPERTY,         m_settings->IsDuplicateFilter) );
501     if ( m_settings->HasIsFailedQCFilter )          propertyTokens.insert( make_pair(ISFAILEDQC_PROPERTY,          m_settings->IsFailedQCFilter) );
502     if ( m_settings->HasIsFirstMateFilter )         propertyTokens.insert( make_pair(ISFIRSTMATE_PROPERTY,         m_settings->IsFirstMateFilter) );
503     if ( m_settings->HasIsMappedFilter )            propertyTokens.insert( make_pair(ISMAPPED_PROPERTY,            m_settings->IsMappedFilter) );
504     if ( m_settings->HasIsMateMappedFilter )        propertyTokens.insert( make_pair(ISMATEMAPPED_PROPERTY,        m_settings->IsMateMappedFilter) );
505     if ( m_settings->HasIsMateReverseStrandFilter ) propertyTokens.insert( make_pair(ISMATEREVERSESTRAND_PROPERTY, m_settings->IsMateReverseStrandFilter) );
506     if ( m_settings->HasIsPairedFilter )            propertyTokens.insert( make_pair(ISPAIRED_PROPERTY,            m_settings->IsPairedFilter) );
507     if ( m_settings->HasIsPrimaryAlignmentFilter )  propertyTokens.insert( make_pair(ISPRIMARYALIGNMENT_PROPERTY,  m_settings->IsPrimaryAlignmentFilter) );
508     if ( m_settings->HasIsProperPairFilter )        propertyTokens.insert( make_pair(ISPROPERPAIR_PROPERTY,        m_settings->IsProperPairFilter) );
509     if ( m_settings->HasIsReverseStrandFilter )     propertyTokens.insert( make_pair(ISREVERSESTRAND_PROPERTY,     m_settings->IsReverseStrandFilter) );
510     if ( m_settings->HasIsSecondMateFilter )        propertyTokens.insert( make_pair(ISSECONDMATE_PROPERTY,        m_settings->IsSecondMateFilter) );
511     if ( m_settings->HasMapQualityFilter )          propertyTokens.insert( make_pair(MAPQUALITY_PROPERTY,          m_settings->MapQualityFilter) );
512     if ( m_settings->HasNameFilter )                propertyTokens.insert( make_pair(NAME_PROPERTY,                m_settings->NameFilter) );
513     if ( m_settings->HasQueryBasesFilter )          propertyTokens.insert( make_pair(QUERYBASES_PROPERTY,          m_settings->QueryBasesFilter) );
514
515     // send add these properties to filter set "COMMAND_LINE"
516     return AddPropertyTokensToFilter(CMD, propertyTokens);
517 }
518
519 bool FilterTool::FilterToolPrivate::ParseFilterObject(const string& filterName, const Json::Value& filterObject) {
520   
521     // filter object parsing variables
522     Json::Value null(Json::nullValue);
523     Json::Value propertyValue;
524     
525     // store results
526     map<string, string> propertyTokens;
527     
528     // iterate over known properties
529     vector<string>::const_iterator propertyNameIter = m_propertyNames.begin();
530     vector<string>::const_iterator propertyNameEnd  = m_propertyNames.end();
531     for ( ; propertyNameIter != propertyNameEnd; ++propertyNameIter ) {
532         const string& propertyName = (*propertyNameIter);
533         
534         // if property defined in filter, add to token list
535         propertyValue = filterObject.get(propertyName, null);
536         if ( propertyValue != null ) 
537             propertyTokens.insert( make_pair(propertyName, propertyValue.asString()) );
538     }
539   
540     // add this filter to engin
541     FilterEngine::addFilter(filterName);
542   
543     // add token list to this filter
544     return AddPropertyTokensToFilter(filterName, propertyTokens);
545 }
546
547 bool FilterTool::FilterToolPrivate::ParseScript(void) {
548   
549     // read in script contents from file
550     const string document = GetScriptContents();
551     
552     // set up JsonCPP reader and attempt to parse script
553     Json::Value root;
554     Json::Reader reader;
555     if ( !reader.parse(document, root) ) {
556         // use built-in error reporting mechanism to alert user what was wrong with the script
557         cerr  << "Failed to parse configuration\n" << reader.getFormatedErrorMessages();
558         return false;     
559     }
560
561     // see if root object contains multiple filters
562     const Json::Value filters = root["filters"];
563     if ( !filters.isNull() ) {
564       
565         bool success = true;
566       
567         // iterate over any filters found
568         int filterIndex = 0;
569         Json::Value::const_iterator filtersIter = filters.begin();
570         Json::Value::const_iterator filtersEnd  = filters.end();
571         for ( ; filtersIter != filtersEnd; ++filtersIter, ++filterIndex ) {
572             Json::Value filter = (*filtersIter);
573             
574             // convert filter index to string
575             stringstream convert;
576             convert << filterIndex;
577             const string filterName = convert.str();
578             
579             // create & parse filter 
580             success &= ParseFilterObject(filterName, filter);
581             
582         }
583         return success;
584     } 
585     
586     // otherwise, root is the only filter (just contains properties)
587     // create & parse filter named "ROOT"
588     else return ParseFilterObject("ROOT", root);
589 }
590
591
592 bool FilterTool::FilterToolPrivate::Run(void) {
593     
594     // set to default input if none provided
595     if ( !m_settings->HasInputBamFilename ) 
596         m_settings->InputFiles.push_back(Options::StandardIn());
597     
598     // initialize defined properties & user-specified filters
599     // quit if failed
600     if ( !SetupFilters() ) return 1;
601
602     // open reader without index
603     BamMultiReader reader;
604     reader.Open(m_settings->InputFiles, false, true);
605     const string headerText = reader.GetHeaderText();
606     m_references = reader.GetReferenceData();
607     
608     // open writer
609     BamWriter writer;
610     writer.Open(m_settings->OutputFilename, headerText, m_references);
611     
612     // set up error handling
613     ostringstream errorStream("");
614     bool foundError(false);
615     
616     // if no REGION specified, run filter on entire file contents
617     if ( !m_settings->HasRegion ) {
618         BamAlignment al;
619         while ( reader.GetNextAlignment(al) ) {
620             // perform main filter check
621             if ( CheckAlignment(al) ) 
622                 writer.SaveAlignment(al);
623         }
624     }
625     
626     // REGION specified
627     else {
628       
629         // attempt to parse string into BamRegion struct
630         BamRegion region;
631         if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
632
633             // check if there are index files *.bai/*.bti corresponding to the input files
634             bool hasDefaultIndex   = false;
635             bool hasBamtoolsIndex  = false;
636             bool hasNoIndex        = false;
637             int defaultIndexCount   = 0;
638             int bamtoolsIndexCount = 0;
639             for (vector<string>::const_iterator f = m_settings->InputFiles.begin(); f != m_settings->InputFiles.end(); ++f) {
640               
641                 if ( Utilities::FileExists(*f + ".bai") ) {
642                     hasDefaultIndex = true;
643                     ++defaultIndexCount;
644                 }       
645                 
646                 if ( Utilities::FileExists(*f + ".bti") ) {
647                     hasBamtoolsIndex = true;
648                     ++bamtoolsIndexCount;
649                 }
650                   
651                 if ( !hasDefaultIndex && !hasBamtoolsIndex ) {
652                     hasNoIndex = true;
653                     cerr << "*WARNING - could not find index file for " << *f  
654                          << ", parsing whole file(s) to get alignment counts for target region" 
655                          << " (could be slow)" << endl;
656                     break;
657                 }
658             }
659             
660             // determine if index file types are heterogeneous
661             bool hasDifferentIndexTypes = false;
662             if ( defaultIndexCount > 0 && bamtoolsIndexCount > 0 ) {
663                 hasDifferentIndexTypes = true;
664                 cerr << "*WARNING - different index file formats found"  
665                          << ", parsing whole file(s) to get alignment counts for target region" 
666                          << " (could be slow)" << endl;
667             }
668             
669             // if any input file has no index, or if input files use different index formats
670             // can't use BamMultiReader to jump directly (**for now**)
671             if ( hasNoIndex || hasDifferentIndexTypes ) {
672                 
673                 // read through sequentially, but onlt perform filter on reads overlapping REGION
674                 BamAlignment al;
675                 while( reader.GetNextAlignment(al) ) {
676                     if ( (al.RefID >= region.LeftRefID)  && ( (al.Position + al.Length) >= region.LeftPosition ) &&
677                          (al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) ) 
678                     {
679                         // perform main filter check
680                         if ( CheckAlignment(al) ) 
681                             writer.SaveAlignment(al);
682                     }
683                 }
684             }
685             
686             // has index file for each input file (and same format)
687             else {
688               
689                 // this is kind of a hack...?
690                 BamMultiReader reader;
691                 reader.Open(m_settings->InputFiles, true, true, hasDefaultIndex );
692               
693                 if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) {
694                    foundError = true;
695                    errorStream << "Could not set BamReader region to REGION: " << m_settings->Region << endl;
696                 } else {
697                   
698                     // filter only alignments from specified region
699                     BamAlignment al;
700                     while ( reader.GetNextAlignment(al) ) {
701                         // perform main filter check
702                         if ( CheckAlignment(al) ) 
703                             writer.SaveAlignment(al);
704                     }
705                 }
706             }
707             
708         } else {
709             foundError = true;
710             errorStream << "Could not parse REGION: " << m_settings->Region << endl;
711             errorStream << "Be sure REGION is in valid format (see README) and that coordinates are valid for selected references" << endl;
712         }
713     }
714
715     // clean up & exit
716     reader.Close();
717     writer.Close();
718     return 0;
719 }
720
721 bool FilterTool::FilterToolPrivate::SetupFilters(void) {
722   
723     // add known properties to FilterEngine
724     InitProperties();
725     
726     // parse script for filter rules, if given
727     if ( m_settings->HasScriptFilename ) return ParseScript();
728     
729     // otherwise check command line for filters
730     else return ParseCommandLine();
731 }