]> git.donarmstrong.com Git - bamtools.git/blob - src/toolkit/bamtools_filter.cpp
Added "isSingleton" property to filter tool
[bamtools.git] / src / toolkit / bamtools_filter.cpp
1 // ***************************************************************************
2 // bamtools_filter.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 7 April 2011
7 // ---------------------------------------------------------------------------
8 // Filters BAM file(s) according to some user-specified criteria
9 // ***************************************************************************
10
11 #include "bamtools_filter.h"
12
13 #include <api/BamMultiReader.h>
14 #include <api/BamWriter.h>
15 #include <utils/bamtools_filter_engine.h>
16 #include <utils/bamtools_options.h>
17 #include <utils/bamtools_utilities.h>
18 using namespace BamTools;
19
20 #include <jsoncpp/json.h>
21 using namespace Json;
22
23 #include <cstdio>
24 #include <iostream>
25 #include <sstream>
26 #include <string>
27 #include <vector>
28 using namespace std;
29   
30 namespace BamTools {
31   
32 // -------------------------------  
33 // string literal constants  
34
35 // property names
36 const string ALIGNMENTFLAG_PROPERTY       = "alignmentFlag";
37 const string CIGAR_PROPERTY               = "cigar";
38 const string INSERTSIZE_PROPERTY          = "insertSize";
39 const string ISDUPLICATE_PROPERTY         = "isDuplicate";
40 const string ISFAILEDQC_PROPERTY          = "isFailedQC";
41 const string ISFIRSTMATE_PROPERTY         = "isFirstMate";
42 const string ISMAPPED_PROPERTY            = "isMapped";
43 const string ISMATEMAPPED_PROPERTY        = "isMateMapped";
44 const string ISMATEREVERSESTRAND_PROPERTY = "isMateReverseStrand";
45 const string ISPAIRED_PROPERTY            = "isPaired";
46 const string ISPRIMARYALIGNMENT_PROPERTY  = "isPrimaryAlignment";
47 const string ISPROPERPAIR_PROPERTY        = "isProperPair";
48 const string ISREVERSESTRAND_PROPERTY     = "isReverseStrand";
49 const string ISSECONDMATE_PROPERTY        = "isSecondMate";
50 const string ISSINGLETON_PROPERTY         = "isSingleton";
51 const string MAPQUALITY_PROPERTY          = "mapQuality";
52 const string MATEPOSITION_PROPERTY        = "matePosition";
53 const string MATEREFERENCE_PROPERTY       = "mateReference";
54 const string NAME_PROPERTY                = "name";
55 const string POSITION_PROPERTY            = "position";
56 const string QUERYBASES_PROPERTY          = "queryBases";
57 const string REFERENCE_PROPERTY           = "reference";
58 const string TAG_PROPERTY                 = "tag";
59
60 // boolalpha
61 const string TRUE_STR  = "true";
62 const string FALSE_STR = "false";
63     
64 RefVector filterToolReferences;    
65     
66 struct BamAlignmentChecker {
67     bool check(const PropertyFilter& filter, const BamAlignment& al) {
68       
69         bool keepAlignment = true;
70         const PropertyMap& properties = filter.Properties;
71         PropertyMap::const_iterator propertyIter = properties.begin();
72         PropertyMap::const_iterator propertyEnd  = properties.end();
73         for ( ; propertyIter != propertyEnd; ++propertyIter ) {
74           
75             // check alignment data field depending on propertyName
76             const string& propertyName = (*propertyIter).first;
77             const PropertyFilterValue& valueFilter = (*propertyIter).second;
78             
79             if      ( propertyName == ALIGNMENTFLAG_PROPERTY )  keepAlignment &= valueFilter.check(al.AlignmentFlag);
80             else if ( propertyName == CIGAR_PROPERTY ) {
81                 stringstream cigarSs;
82                 const vector<CigarOp>& cigarData = al.CigarData;
83                 if ( !cigarData.empty() ) {
84                     vector<CigarOp>::const_iterator cigarBegin = cigarData.begin();
85                     vector<CigarOp>::const_iterator cigarIter = cigarBegin;
86                     vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
87                     for ( ; cigarIter != cigarEnd; ++cigarIter ) {
88                         const CigarOp& op = (*cigarIter);
89                         cigarSs << op.Length << op.Type;
90                     }
91                     keepAlignment &= valueFilter.check(cigarSs.str());
92                 }
93             }
94             else if ( propertyName == INSERTSIZE_PROPERTY )           keepAlignment &= valueFilter.check(al.InsertSize);
95             else if ( propertyName == ISDUPLICATE_PROPERTY )          keepAlignment &= valueFilter.check(al.IsDuplicate());
96             else if ( propertyName == ISFAILEDQC_PROPERTY )           keepAlignment &= valueFilter.check(al.IsFailedQC());
97             else if ( propertyName == ISFIRSTMATE_PROPERTY )          keepAlignment &= valueFilter.check(al.IsFirstMate());
98             else if ( propertyName == ISMAPPED_PROPERTY )             keepAlignment &= valueFilter.check(al.IsMapped());
99             else if ( propertyName == ISMATEMAPPED_PROPERTY )         keepAlignment &= valueFilter.check(al.IsMateMapped());
100             else if ( propertyName == ISMATEREVERSESTRAND_PROPERTY )  keepAlignment &= valueFilter.check(al.IsMateReverseStrand());
101             else if ( propertyName == ISPAIRED_PROPERTY )             keepAlignment &= valueFilter.check(al.IsPaired());
102             else if ( propertyName == ISPRIMARYALIGNMENT_PROPERTY )   keepAlignment &= valueFilter.check(al.IsPrimaryAlignment());
103             else if ( propertyName == ISPROPERPAIR_PROPERTY )         keepAlignment &= valueFilter.check(al.IsProperPair());
104             else if ( propertyName == ISREVERSESTRAND_PROPERTY )      keepAlignment &= valueFilter.check(al.IsReverseStrand());
105             else if ( propertyName == ISSECONDMATE_PROPERTY )         keepAlignment &= valueFilter.check(al.IsSecondMate());
106             else if ( propertyName == ISSINGLETON_PROPERTY ) {
107                 const bool isSingleton = al.IsPaired() && al.IsMapped() && !al.IsMateMapped();
108                 keepAlignment &= valueFilter.check(isSingleton);
109             }
110             else if ( propertyName == MAPQUALITY_PROPERTY )           keepAlignment &= valueFilter.check(al.MapQuality);
111             else if ( propertyName == MATEPOSITION_PROPERTY )         keepAlignment &= ( al.IsPaired() && al.IsMateMapped() && valueFilter.check(al.MateRefID) );
112             else if ( propertyName == MATEREFERENCE_PROPERTY ) {
113                 if ( !al.IsPaired() || !al.IsMateMapped() ) return false;
114                 BAMTOOLS_ASSERT_MESSAGE( (al.MateRefID>=0 && (al.MateRefID<(int)filterToolReferences.size())), "Invalid MateRefID");
115                 const string& refName = filterToolReferences.at(al.MateRefID).RefName;
116                 keepAlignment &= valueFilter.check(refName);
117             }
118             else if ( propertyName == NAME_PROPERTY )                 keepAlignment &= valueFilter.check(al.Name);
119             else if ( propertyName == POSITION_PROPERTY )             keepAlignment &= valueFilter.check(al.Position);
120             else if ( propertyName == QUERYBASES_PROPERTY )           keepAlignment &= valueFilter.check(al.QueryBases);
121             else if ( propertyName == REFERENCE_PROPERTY ) {
122                 BAMTOOLS_ASSERT_MESSAGE( (al.RefID>=0 && (al.RefID<(int)filterToolReferences.size())), "Invalid RefID");
123                 const string& refName = filterToolReferences.at(al.RefID).RefName;
124                 keepAlignment &= valueFilter.check(refName);
125             }
126             else if ( propertyName == TAG_PROPERTY ) keepAlignment &= checkAlignmentTag(valueFilter, al);
127             else BAMTOOLS_ASSERT_UNREACHABLE;
128             
129             // if alignment fails at ANY point, just quit and return false
130             if ( !keepAlignment ) return false;
131         }
132       
133         BAMTOOLS_ASSERT_MESSAGE( keepAlignment, "Error in BamAlignmentChecker... keepAlignment should be true here");
134         return keepAlignment;
135     }
136     
137     bool checkAlignmentTag(const PropertyFilterValue& valueFilter, const BamAlignment& al) {
138      
139         // ensure filter contains string data
140         Variant entireTagFilter = valueFilter.Value;
141         if ( !entireTagFilter.is_type<string>() ) return false;
142
143         // localize string from variant
144         const string& entireTagFilterString = entireTagFilter.get<string>();
145
146         // ensure we have at least "XX:x"
147         if ( entireTagFilterString.length() < 4 ) return false;
148
149         // get tagName & lookup in alignment
150         // if found, set tagType to tag type character
151         // if not found, return false
152         const string& tagName = entireTagFilterString.substr(0,2);
153         char tagType = '\0';
154         if ( !al.GetTagType(tagName, tagType) ) return false;
155
156         // remove tagName & ":" from beginning tagFilter
157         string tagFilterString = entireTagFilterString.substr(3);
158
159         // switch on tag type to set tag query value & parse filter token
160         int32_t  intFilterValue,    intQueryValue;
161         uint32_t uintFilterValue,   uintQueryValue;
162         float    realFilterValue,   realQueryValue;
163         string   stringFilterValue, stringQueryValue;
164
165         PropertyFilterValue tagFilter;
166         PropertyFilterValue::ValueCompareType compareType;
167         bool keepAlignment = false;
168         switch (tagType) {
169
170             // signed int tag type
171             case 'c' :
172             case 's' :
173             case 'i' :
174                 if ( al.GetTag(tagName, intQueryValue) ) {
175                     if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, intFilterValue, compareType) ) {
176                         tagFilter.Value = intFilterValue;
177                         tagFilter.Type  = compareType;
178                         keepAlignment   = tagFilter.check(intQueryValue);
179                     }
180                 }
181                 break;
182
183             // unsigned int tag type
184             case 'C' :
185             case 'S' :
186             case 'I' :
187                 if ( al.GetTag(tagName, uintQueryValue) ) {
188                     if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, uintFilterValue, compareType) ) {
189                         tagFilter.Value = uintFilterValue;
190                         tagFilter.Type  = compareType;
191                         keepAlignment   = tagFilter.check(uintQueryValue);
192                     }
193                 }
194                 break;
195
196             // 'real' tag type
197             case 'f' :
198                 if ( al.GetTag(tagName, realQueryValue) ) {
199                     if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, realFilterValue, compareType) ) {
200                         tagFilter.Value = realFilterValue;
201                         tagFilter.Type  = compareType;
202                         keepAlignment   = tagFilter.check(realQueryValue);
203                     }
204                 }
205                 break;
206
207             // string tag type
208             case 'A':
209             case 'Z':
210             case 'H':
211                 if ( al.GetTag(tagName, stringQueryValue) ) {
212                     if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, stringFilterValue, compareType) ) {
213                         tagFilter.Value = stringFilterValue;
214                         tagFilter.Type  = compareType;
215                         keepAlignment   = tagFilter.check(stringQueryValue);
216                     }
217                 }
218                 break;
219
220             // unknown tag type
221             default :
222                 keepAlignment = false;
223         }
224
225         return keepAlignment;
226     }
227 };    
228     
229 } // namespace BamTools
230   
231 // ---------------------------------------------
232 // FilterSettings implementation
233
234 struct FilterTool::FilterSettings {
235
236     // ----------------------------------
237     // IO opts
238
239     // flags
240     bool HasInputBamFilename;
241     bool HasOutputBamFilename;
242     bool HasRegion;
243     bool HasScriptFilename;
244     bool IsForceCompression;
245
246     // filenames
247     vector<string> InputFiles;
248     string OutputFilename;
249     string Region;
250     string ScriptFilename;
251
252     // -----------------------------------
253     // General filter opts
254
255     // flags
256     bool HasAlignmentFlagFilter;
257     bool HasInsertSizeFilter;
258     bool HasMapQualityFilter;
259     bool HasNameFilter;
260     bool HasQueryBasesFilter;
261     bool HasTagFilter; //(s)
262
263     // filters
264     string AlignmentFlagFilter;
265     string InsertSizeFilter;
266     string NameFilter;
267     string MapQualityFilter;
268     string QueryBasesFilter;
269     string TagFilter;  // support multiple ?
270
271     // -----------------------------------
272     // AlignmentFlag filter opts
273
274     // flags
275     bool HasIsDuplicateFilter;
276     bool HasIsFailedQCFilter;
277     bool HasIsFirstMateFilter;
278     bool HasIsMappedFilter;
279     bool HasIsMateMappedFilter;
280     bool HasIsMateReverseStrandFilter;
281     bool HasIsPairedFilter;
282     bool HasIsPrimaryAlignmentFilter;
283     bool HasIsProperPairFilter;
284     bool HasIsReverseStrandFilter;
285     bool HasIsSecondMateFilter;
286     bool HasIsSingletonFilter;
287
288     // filters
289     string IsDuplicateFilter;
290     string IsFailedQCFilter;
291     string IsFirstMateFilter;
292     string IsMappedFilter;
293     string IsMateMappedFilter;
294     string IsMateReverseStrandFilter;
295     string IsPairedFilter;
296     string IsPrimaryAlignmentFilter;
297     string IsProperPairFilter;
298     string IsReverseStrandFilter;
299     string IsSecondMateFilter;
300     string IsSingletonFilter;
301
302     // ---------------------------------
303     // constructor
304
305     FilterSettings(void)
306         : HasInputBamFilename(false)
307         , HasOutputBamFilename(false)
308         , HasRegion(false)
309         , HasScriptFilename(false)
310         , IsForceCompression(false)
311         , OutputFilename(Options::StandardOut())
312         , HasAlignmentFlagFilter(false)
313         , HasInsertSizeFilter(false)
314         , HasMapQualityFilter(false)
315         , HasNameFilter(false)
316         , HasQueryBasesFilter(false)
317         , HasTagFilter(false)
318         , HasIsDuplicateFilter(false)
319         , HasIsFailedQCFilter(false)
320         , HasIsFirstMateFilter(false)
321         , HasIsMappedFilter(false)
322         , HasIsMateMappedFilter(false)
323         , HasIsMateReverseStrandFilter(false)
324         , HasIsPairedFilter(false)
325         , HasIsPrimaryAlignmentFilter(false)
326         , HasIsProperPairFilter(false)
327         , HasIsReverseStrandFilter(false)
328         , HasIsSecondMateFilter(false)
329         , HasIsSingletonFilter(false)
330         , IsDuplicateFilter(TRUE_STR)
331         , IsFailedQCFilter(TRUE_STR)
332         , IsFirstMateFilter(TRUE_STR)
333         , IsMappedFilter(TRUE_STR)
334         , IsMateMappedFilter(TRUE_STR)
335         , IsMateReverseStrandFilter(TRUE_STR)
336         , IsPairedFilter(TRUE_STR)
337         , IsPrimaryAlignmentFilter(TRUE_STR)
338         , IsProperPairFilter(TRUE_STR)
339         , IsReverseStrandFilter(TRUE_STR)
340         , IsSecondMateFilter(TRUE_STR)
341         , IsSingletonFilter(TRUE_STR)
342     { }
343 };
344
345 // ---------------------------------------------
346 // FilterToolPrivate declaration
347
348 class FilterTool::FilterToolPrivate {
349       
350     // ctor & dtor
351     public:
352         FilterToolPrivate(FilterTool::FilterSettings* settings);
353         ~FilterToolPrivate(void);  
354         
355     // 'public' interface
356     public:
357         bool Run(void);
358         
359     // internal methods
360     private:
361         bool AddPropertyTokensToFilter(const string& filterName, const map<string, string>& propertyTokens);
362         bool CheckAlignment(const BamAlignment& al);
363         const string GetScriptContents(void);
364         void InitProperties(void);
365         bool ParseCommandLine(void);
366         bool ParseFilterObject(const string& filterName, const Json::Value& filterObject);
367         bool ParseScript(void);
368         bool SetupFilters(void);
369         
370     // data members
371     private:
372         vector<string> m_propertyNames;
373         FilterTool::FilterSettings* m_settings;
374         FilterEngine<BamAlignmentChecker> m_filterEngine;
375 };
376  
377 // ---------------------------------------------
378 // FilterToolPrivate implementation
379   
380 // constructor  
381 FilterTool::FilterToolPrivate::FilterToolPrivate(FilterTool::FilterSettings* settings)
382     : m_settings(settings)
383 { }  
384   
385 // destructor
386 FilterTool::FilterToolPrivate::~FilterToolPrivate(void) { }
387
388 bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filterName,
389                                                               const map<string,
390                                                               string>& propertyTokens)
391 {
392     // dummy temp values for token parsing
393     bool boolValue;
394     int32_t int32Value;
395     uint16_t uint16Value;
396     uint32_t uint32Value;
397     string stringValue;
398     PropertyFilterValue::ValueCompareType type;
399   
400     // iterate over property token map
401     map<string, string>::const_iterator mapIter = propertyTokens.begin();
402     map<string, string>::const_iterator mapEnd  = propertyTokens.end();
403     for ( ; mapIter != mapEnd; ++mapIter ) {
404       
405         const string& propertyName = (*mapIter).first;
406         const string& token        = (*mapIter).second;
407         
408         // ------------------------------
409         // convert token to value & compare type 
410         // then add to filter engine
411         
412         // bool conversion
413         if ( propertyName == ISDUPLICATE_PROPERTY ||
414              propertyName == ISFAILEDQC_PROPERTY ||
415              propertyName == ISFIRSTMATE_PROPERTY ||
416              propertyName == ISMAPPED_PROPERTY ||
417              propertyName == ISMATEMAPPED_PROPERTY ||
418              propertyName == ISMATEREVERSESTRAND_PROPERTY ||
419              propertyName == ISPAIRED_PROPERTY ||
420              propertyName == ISPRIMARYALIGNMENT_PROPERTY ||
421              propertyName == ISPROPERPAIR_PROPERTY ||
422              propertyName == ISREVERSESTRAND_PROPERTY ||
423              propertyName == ISSECONDMATE_PROPERTY ||
424              propertyName == ISSINGLETON_PROPERTY
425            ) 
426         {
427             FilterEngine<BamAlignmentChecker>::parseToken(token, boolValue, type);
428             m_filterEngine.setProperty(filterName, propertyName, boolValue, type);
429         }
430         
431         // int32_t conversion
432         else if ( propertyName == INSERTSIZE_PROPERTY ||
433                   propertyName == MATEPOSITION_PROPERTY ||
434                   propertyName == POSITION_PROPERTY 
435                 ) 
436         {
437             FilterEngine<BamAlignmentChecker>::parseToken(token, int32Value, type);
438             m_filterEngine.setProperty(filterName, propertyName, int32Value, type);
439         }
440         
441         // uint16_t conversion
442         else if ( propertyName == MAPQUALITY_PROPERTY )
443         {
444             FilterEngine<BamAlignmentChecker>::parseToken(token, uint16Value, type);
445             m_filterEngine.setProperty(filterName, propertyName, uint16Value, type);
446         }
447         
448         // uint32_t conversion
449         else if ( propertyName == ALIGNMENTFLAG_PROPERTY )
450         {
451             FilterEngine<BamAlignmentChecker>::parseToken(token, uint32Value, type);
452             m_filterEngine.setProperty(filterName, propertyName, uint32Value, type);
453         }
454         
455         // string conversion
456         else if ( propertyName == CIGAR_PROPERTY || 
457                   propertyName == MATEREFERENCE_PROPERTY ||
458                   propertyName == NAME_PROPERTY ||
459                   propertyName == QUERYBASES_PROPERTY ||
460                   propertyName == REFERENCE_PROPERTY 
461                 ) 
462         {
463             FilterEngine<BamAlignmentChecker>::parseToken(token, stringValue, type);
464             m_filterEngine.setProperty(filterName, propertyName, stringValue, type);
465         }
466       
467     else if ( propertyName == TAG_PROPERTY ) {
468             // this will be stored directly as the TAG:VALUE token 
469             // (VALUE may contain compare ops, will be parsed out later)
470             m_filterEngine.setProperty(filterName, propertyName, token, PropertyFilterValue::EXACT);
471         }
472       
473         // else unknown property 
474         else {
475             cerr << "bamtools filter ERROR: unknown property - " << propertyName << endl;
476             return false;
477         }
478     }
479     return true;
480 }
481
482 bool FilterTool::FilterToolPrivate::CheckAlignment(const BamAlignment& al) {
483     return m_filterEngine.check(al);
484 }
485
486 const string FilterTool::FilterToolPrivate::GetScriptContents(void) {
487   
488     // open script for reading
489     FILE* inFile = fopen(m_settings->ScriptFilename.c_str(), "rb");
490     if ( !inFile ) {
491         cerr << "bamtools filter ERROR: could not open script: "
492              << m_settings->ScriptFilename << " for reading" << endl;
493         return false;
494     }
495     
496     // read in entire script contents  
497     char buffer[1024];
498     ostringstream docStream("");
499     while ( true ) {
500         
501         // peek ahead, make sure there is data available
502         char ch = fgetc(inFile);
503         ungetc(ch, inFile);
504         if( feof(inFile) ) break;       
505         
506         // read next block of data
507         if ( fgets(buffer, 1024, inFile) == 0 ) {
508             cerr << "bamtools filter ERROR: could not read script contents" << endl;
509             return false;
510         }
511         
512         docStream << buffer;
513     }
514     
515     // close script file
516     fclose(inFile);
517     
518     // import buffer contents to document, return
519     string document = docStream.str();
520     return document;
521 }
522
523 void FilterTool::FilterToolPrivate::InitProperties(void) {
524   
525     // store property names in vector 
526     m_propertyNames.push_back(ALIGNMENTFLAG_PROPERTY);
527     m_propertyNames.push_back(CIGAR_PROPERTY);
528     m_propertyNames.push_back(INSERTSIZE_PROPERTY);
529     m_propertyNames.push_back(ISDUPLICATE_PROPERTY);
530     m_propertyNames.push_back(ISFAILEDQC_PROPERTY);
531     m_propertyNames.push_back(ISFIRSTMATE_PROPERTY);
532     m_propertyNames.push_back(ISMAPPED_PROPERTY);
533     m_propertyNames.push_back(ISMATEMAPPED_PROPERTY);
534     m_propertyNames.push_back(ISMATEREVERSESTRAND_PROPERTY);
535     m_propertyNames.push_back(ISPAIRED_PROPERTY);
536     m_propertyNames.push_back(ISPRIMARYALIGNMENT_PROPERTY);
537     m_propertyNames.push_back(ISPROPERPAIR_PROPERTY);
538     m_propertyNames.push_back(ISREVERSESTRAND_PROPERTY);
539     m_propertyNames.push_back(ISSECONDMATE_PROPERTY);
540     m_propertyNames.push_back(ISSINGLETON_PROPERTY);
541     m_propertyNames.push_back(MAPQUALITY_PROPERTY);
542     m_propertyNames.push_back(MATEPOSITION_PROPERTY);
543     m_propertyNames.push_back(MATEREFERENCE_PROPERTY);
544     m_propertyNames.push_back(NAME_PROPERTY);
545     m_propertyNames.push_back(POSITION_PROPERTY);
546     m_propertyNames.push_back(QUERYBASES_PROPERTY);
547     m_propertyNames.push_back(REFERENCE_PROPERTY);
548     m_propertyNames.push_back(TAG_PROPERTY);
549     
550     // add vector contents to FilterEngine<BamAlignmentChecker>
551     vector<string>::const_iterator propertyNameIter = m_propertyNames.begin();
552     vector<string>::const_iterator propertyNameEnd  = m_propertyNames.end();
553     for ( ; propertyNameIter != propertyNameEnd; ++propertyNameIter )
554         m_filterEngine.addProperty((*propertyNameIter));
555 }
556
557 bool FilterTool::FilterToolPrivate::ParseCommandLine(void) {
558   
559     // add a rule set to filter engine
560     const string CMD = "COMMAND_LINE";
561     m_filterEngine.addFilter(CMD);
562
563     // map property names to command line args
564     map<string, string> propertyTokens;
565     if ( m_settings->HasAlignmentFlagFilter )       propertyTokens.insert( make_pair(ALIGNMENTFLAG_PROPERTY,       m_settings->AlignmentFlagFilter) );
566     if ( m_settings->HasInsertSizeFilter )          propertyTokens.insert( make_pair(INSERTSIZE_PROPERTY,          m_settings->InsertSizeFilter) );
567     if ( m_settings->HasIsDuplicateFilter )         propertyTokens.insert( make_pair(ISDUPLICATE_PROPERTY,         m_settings->IsDuplicateFilter) );
568     if ( m_settings->HasIsFailedQCFilter )          propertyTokens.insert( make_pair(ISFAILEDQC_PROPERTY,          m_settings->IsFailedQCFilter) );
569     if ( m_settings->HasIsFirstMateFilter )         propertyTokens.insert( make_pair(ISFIRSTMATE_PROPERTY,         m_settings->IsFirstMateFilter) );
570     if ( m_settings->HasIsMappedFilter )            propertyTokens.insert( make_pair(ISMAPPED_PROPERTY,            m_settings->IsMappedFilter) );
571     if ( m_settings->HasIsMateMappedFilter )        propertyTokens.insert( make_pair(ISMATEMAPPED_PROPERTY,        m_settings->IsMateMappedFilter) );
572     if ( m_settings->HasIsMateReverseStrandFilter ) propertyTokens.insert( make_pair(ISMATEREVERSESTRAND_PROPERTY, m_settings->IsMateReverseStrandFilter) );
573     if ( m_settings->HasIsPairedFilter )            propertyTokens.insert( make_pair(ISPAIRED_PROPERTY,            m_settings->IsPairedFilter) );
574     if ( m_settings->HasIsPrimaryAlignmentFilter )  propertyTokens.insert( make_pair(ISPRIMARYALIGNMENT_PROPERTY,  m_settings->IsPrimaryAlignmentFilter) );
575     if ( m_settings->HasIsProperPairFilter )        propertyTokens.insert( make_pair(ISPROPERPAIR_PROPERTY,        m_settings->IsProperPairFilter) );
576     if ( m_settings->HasIsReverseStrandFilter )     propertyTokens.insert( make_pair(ISREVERSESTRAND_PROPERTY,     m_settings->IsReverseStrandFilter) );
577     if ( m_settings->HasIsSecondMateFilter )        propertyTokens.insert( make_pair(ISSECONDMATE_PROPERTY,        m_settings->IsSecondMateFilter) );
578     if ( m_settings->HasIsSingletonFilter )         propertyTokens.insert( make_pair(ISSINGLETON_PROPERTY,         m_settings->IsSingletonFilter) );
579     if ( m_settings->HasMapQualityFilter )          propertyTokens.insert( make_pair(MAPQUALITY_PROPERTY,          m_settings->MapQualityFilter) );
580     if ( m_settings->HasNameFilter )                propertyTokens.insert( make_pair(NAME_PROPERTY,                m_settings->NameFilter) );
581     if ( m_settings->HasQueryBasesFilter )          propertyTokens.insert( make_pair(QUERYBASES_PROPERTY,          m_settings->QueryBasesFilter) );
582     if ( m_settings->HasTagFilter )                 propertyTokens.insert( make_pair(TAG_PROPERTY,                 m_settings->TagFilter) );
583     
584     // send add these properties to filter set "COMMAND_LINE"
585     return AddPropertyTokensToFilter(CMD, propertyTokens);
586 }
587
588 bool FilterTool::FilterToolPrivate::ParseFilterObject(const string& filterName, const Json::Value& filterObject) {
589   
590     // filter object parsing variables
591     Json::Value null(Json::nullValue);
592     Json::Value propertyValue;
593     
594     // store results
595     map<string, string> propertyTokens;
596     
597     // iterate over known properties
598     vector<string>::const_iterator propertyNameIter = m_propertyNames.begin();
599     vector<string>::const_iterator propertyNameEnd  = m_propertyNames.end();
600     for ( ; propertyNameIter != propertyNameEnd; ++propertyNameIter ) {
601         const string& propertyName = (*propertyNameIter);
602         
603         // if property defined in filter, add to token list
604         propertyValue = filterObject.get(propertyName, null);
605         if ( propertyValue != null ) 
606             propertyTokens.insert( make_pair(propertyName, propertyValue.asString()) );
607     }
608   
609     // add this filter to engin
610     m_filterEngine.addFilter(filterName);
611   
612     // add token list to this filter
613     return AddPropertyTokensToFilter(filterName, propertyTokens);
614 }
615
616 bool FilterTool::FilterToolPrivate::ParseScript(void) {
617   
618     // read in script contents from file
619     const string document = GetScriptContents();
620     
621     // set up JsonCPP reader and attempt to parse script
622     Json::Value root;
623     Json::Reader reader;
624     if ( !reader.parse(document, root) ) {
625         // use built-in error reporting mechanism to alert user what was wrong with the script
626         cerr  << "bamtools filter ERROR: failed to parse script - see error message(s) below" << endl
627               << reader.getFormatedErrorMessages();
628         return false;     
629     }
630
631     // initialize return status
632     bool success = true;
633     
634     // see if root object contains multiple filters
635     const Json::Value filters = root["filters"];
636     if ( !filters.isNull() ) {
637       
638         // iterate over any filters found
639         int filterIndex = 0;
640         Json::Value::const_iterator filtersIter = filters.begin();
641         Json::Value::const_iterator filtersEnd  = filters.end();
642         for ( ; filtersIter != filtersEnd; ++filtersIter, ++filterIndex ) {
643             Json::Value filter = (*filtersIter);
644             
645             // convert filter index to string
646             string filterName;
647             
648             // if id tag supplied
649             const Json::Value id = filter["id"];
650             if ( !id.isNull() ) 
651                 filterName = id.asString();
652             
653             // use array index 
654             else {
655                 stringstream convert;
656                 convert << filterIndex;
657                 filterName = convert.str();
658             }
659             
660             // create & parse filter 
661             success &= ParseFilterObject(filterName, filter);
662         }
663         
664         // see if user defined a "rule" for these filters
665         // otherwise, use filter engine's default rule behavior
666         string ruleString("");
667         const Json::Value rule = root["rule"];
668         if ( rule.isString() )
669             ruleString = rule.asString();
670         m_filterEngine.setRule(ruleString);
671           
672         // return success/fail
673         return success;
674     } 
675     
676     // otherwise, root is the only filter (just contains properties)
677     // create & parse filter named "ROOT"
678     else success = ParseFilterObject("ROOT", root);
679     
680     // return success/failure
681     return success;
682 }
683
684 bool FilterTool::FilterToolPrivate::Run(void) {
685     
686     // set to default input if none provided
687     if ( !m_settings->HasInputBamFilename ) 
688         m_settings->InputFiles.push_back(Options::StandardIn());
689
690     // initialize defined properties & user-specified filters
691     // quit if failed
692     if ( !SetupFilters() ) return false;
693
694     // open reader without index
695     BamMultiReader reader;
696     if ( !reader.Open(m_settings->InputFiles) ) {
697         cerr << "bamtools filter ERROR: could not open input files for reading." << endl;
698         return false;
699     }
700
701     // retrieve reader header & reference data
702     const string headerText = reader.GetHeaderText();
703     filterToolReferences = reader.GetReferenceData();
704     
705     // determine compression mode for BamWriter
706     bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() &&
707                               !m_settings->IsForceCompression );
708     BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
709     if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed;
710
711     // open BamWriter
712     BamWriter writer;
713     writer.SetCompressionMode(compressionMode);
714     if ( !writer.Open(m_settings->OutputFilename, headerText, filterToolReferences) ) {
715         cerr << "bamtools filter ERROR: could not open " << m_settings->OutputFilename << " for writing." << endl;
716         reader.Close();
717         return false;
718     }
719
720     // if no region specified, filter entire file 
721     BamAlignment al;
722     if ( !m_settings->HasRegion ) {
723         while ( reader.GetNextAlignment(al) ) {
724             if ( CheckAlignment(al) ) 
725                 writer.SaveAlignment(al);
726         }
727     }
728     
729     // otherwise attempt to use region as constraint
730     else {
731         
732         // if region string parses OK
733         BamRegion region;
734         if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
735
736             // attempt to find index files
737             reader.LocateIndexes();
738
739             // if index data available for all BAM files, we can use SetRegion
740             if ( reader.HasIndexes() ) {
741
742                 // attempt to use SetRegion(), if failed report error
743                 if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) {
744                     cerr << "bamtools filter ERROR: set region failed. Check that REGION describes a valid range" << endl;
745                     reader.Close();
746                     return false;
747                 } 
748               
749                 // everything checks out, just iterate through specified region, filtering alignments
750                 while ( reader.GetNextAlignment(al) )
751                     if ( CheckAlignment(al) ) 
752                         writer.SaveAlignment(al);
753                 }
754             
755             // no index data available, we have to iterate through until we
756             // find overlapping alignments
757             else {
758                 while ( reader.GetNextAlignment(al) ) {
759                     if ( (al.RefID >= region.LeftRefID)  && ((al.Position + al.Length) >= region.LeftPosition) &&
760                          (al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) ) 
761                     {
762                         if ( CheckAlignment(al) ) 
763                             writer.SaveAlignment(al);
764                     }
765                 }
766             }
767         } 
768         
769         // error parsing REGION string
770         else {
771             cerr << "bamtools filter ERROR: could not parse REGION: " << m_settings->Region << endl;
772             cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid"
773                  << endl;
774             reader.Close();
775             return false;
776         }
777     }
778
779     // clean up & exit
780     reader.Close();
781     writer.Close();
782     return true;
783 }
784
785 bool FilterTool::FilterToolPrivate::SetupFilters(void) {
786   
787     // set up filter engine with supported properties
788     InitProperties();
789     
790     // parse script for filter rules, if given
791     if ( m_settings->HasScriptFilename )
792         return ParseScript();
793     
794     // otherwise check command line for filters
795     else return ParseCommandLine();
796 }
797
798 // ---------------------------------------------
799 // FilterTool implementation
800
801 FilterTool::FilterTool(void)
802     : AbstractTool()
803     , m_settings(new FilterSettings)
804     , m_impl(0)
805 {
806     // ----------------------------------
807     // set program details
808
809     const string usage = "[-in <filename> -in <filename> ...] "
810                          "[-out <filename> | [-forceCompression]] [-region <REGION>] "
811                          "[ [-script <filename] | [filterOptions] ]";
812     Options::SetProgramInfo("bamtools filter", "filters BAM file(s)", usage );
813
814     // ----------------------------------
815     // I/O options
816
817     OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
818
819     const string inDesc     = "the input BAM file(s)";
820     const string outDesc    = "the output BAM file";
821     const string regionDesc = "only read data from this genomic region (see documentation for more details)";
822     const string scriptDesc = "the filter script file (see documentation for more details)";
823     const string forceDesc  = "if results are sent to stdout (like when piping to another tool), "
824                               "default behavior is to leave output uncompressed. Use this flag to "
825                               "override and force compression";
826
827     Options::AddValueOption("-in",     "BAM filename", inDesc,     "", m_settings->HasInputBamFilename,  m_settings->InputFiles,     IO_Opts, Options::StandardIn());
828     Options::AddValueOption("-out",    "BAM filename", outDesc,    "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
829     Options::AddValueOption("-region", "REGION",       regionDesc, "", m_settings->HasRegion,            m_settings->Region,         IO_Opts);
830     Options::AddValueOption("-script", "filename",     scriptDesc, "", m_settings->HasScriptFilename,    m_settings->ScriptFilename, IO_Opts);
831     Options::AddOption("-forceCompression",forceDesc, m_settings->IsForceCompression, IO_Opts);
832
833     // ----------------------------------
834     // general filter options
835
836     OptionGroup* FilterOpts = Options::CreateOptionGroup("General Filters");
837
838     const string flagDesc    = "keep reads with this *exact* alignment flag (for more detailed queries, see below)";
839     const string insertDesc  = "keep reads with insert size that matches pattern";
840     const string mapQualDesc = "keep reads with map quality that matches pattern";
841     const string nameDesc    = "keep reads with name that matches pattern";
842     const string queryDesc   = "keep reads with motif that matches pattern";
843     const string tagDesc     = "keep reads with this key=>value pair";
844
845     Options::AddValueOption("-alignmentFlag", "int",       flagDesc,    "", m_settings->HasAlignmentFlagFilter, m_settings->AlignmentFlagFilter, FilterOpts);
846     Options::AddValueOption("-insertSize",    "int",       insertDesc,  "", m_settings->HasInsertSizeFilter,    m_settings->InsertSizeFilter,    FilterOpts);
847     Options::AddValueOption("-mapQuality",    "[0-255]",   mapQualDesc, "", m_settings->HasMapQualityFilter,    m_settings->MapQualityFilter,    FilterOpts);
848     Options::AddValueOption("-name",          "string",    nameDesc,    "", m_settings->HasNameFilter,          m_settings->NameFilter,          FilterOpts);
849     Options::AddValueOption("-queryBases",    "string",    queryDesc,   "", m_settings->HasQueryBasesFilter,    m_settings->QueryBasesFilter,    FilterOpts);
850     Options::AddValueOption("-tag",           "TAG:VALUE", tagDesc,     "", m_settings->HasTagFilter,           m_settings->TagFilter,           FilterOpts);
851
852     // ----------------------------------
853     // alignment flag filter options
854
855     OptionGroup* AlignmentFlagOpts = Options::CreateOptionGroup("Alignment Flag Filters");
856
857     const string boolArg           = "true/false";
858     const string isDupDesc         = "keep only alignments that are marked as duplicate?";
859     const string isFailQcDesc      = "keep only alignments that failed QC?";
860     const string isFirstMateDesc   = "keep only alignments marked as first mate?";
861     const string isMappedDesc      = "keep only alignments that were mapped?";
862     const string isMateMappedDesc  = "keep only alignments with mates that mapped";
863     const string isMateReverseDesc = "keep only alignments with mate on reverese strand?";
864     const string isPairedDesc      = "keep only alignments that were sequenced as paired?";
865     const string isPrimaryDesc     = "keep only alignments marked as primary?";
866     const string isProperPairDesc  = "keep only alignments that passed PE resolution?";
867     const string isReverseDesc     = "keep only alignments on reverse strand?";
868     const string isSecondMateDesc  = "keep only alignments marked as second mate?";
869     const string isSingletonDesc   = "keep only singletons";
870
871     Options::AddValueOption("-isDuplicate",         boolArg, isDupDesc,         "", m_settings->HasIsDuplicateFilter,         m_settings->IsDuplicateFilter,         AlignmentFlagOpts, TRUE_STR);
872     Options::AddValueOption("-isFailedQC",          boolArg, isFailQcDesc,      "", m_settings->HasIsFailedQCFilter,          m_settings->IsFailedQCFilter,          AlignmentFlagOpts, TRUE_STR);
873     Options::AddValueOption("-isFirstMate",         boolArg, isFirstMateDesc,   "", m_settings->HasIsFirstMateFilter,         m_settings->IsFirstMateFilter,         AlignmentFlagOpts, TRUE_STR);
874     Options::AddValueOption("-isMapped",            boolArg, isMappedDesc,      "", m_settings->HasIsMappedFilter,            m_settings->IsMappedFilter,            AlignmentFlagOpts, TRUE_STR);
875     Options::AddValueOption("-isMateMapped",        boolArg, isMateMappedDesc,  "", m_settings->HasIsMateMappedFilter,        m_settings->IsMateMappedFilter,        AlignmentFlagOpts, TRUE_STR);
876     Options::AddValueOption("-isMateReverseStrand", boolArg, isMateReverseDesc, "", m_settings->HasIsMateReverseStrandFilter, m_settings->IsMateReverseStrandFilter, AlignmentFlagOpts, TRUE_STR);
877     Options::AddValueOption("-isPaired",            boolArg, isPairedDesc,      "", m_settings->HasIsPairedFilter,            m_settings->IsPairedFilter,            AlignmentFlagOpts, TRUE_STR);
878     Options::AddValueOption("-isPrimaryAlignment",  boolArg, isPrimaryDesc,     "", m_settings->HasIsPrimaryAlignmentFilter,  m_settings->IsPrimaryAlignmentFilter,  AlignmentFlagOpts, TRUE_STR);
879     Options::AddValueOption("-isProperPair",        boolArg, isProperPairDesc,  "", m_settings->HasIsProperPairFilter,        m_settings->IsProperPairFilter,        AlignmentFlagOpts, TRUE_STR);
880     Options::AddValueOption("-isReverseStrand",     boolArg, isReverseDesc,     "", m_settings->HasIsReverseStrandFilter,     m_settings->IsReverseStrandFilter,     AlignmentFlagOpts, TRUE_STR);
881     Options::AddValueOption("-isSecondMate",        boolArg, isSecondMateDesc,  "", m_settings->HasIsSecondMateFilter,        m_settings->IsSecondMateFilter,        AlignmentFlagOpts, TRUE_STR);
882     Options::AddValueOption("-isSingleton",         boolArg, isSingletonDesc,   "", m_settings->HasIsSingletonFilter,         m_settings->IsSingletonFilter,         AlignmentFlagOpts, TRUE_STR);
883 }
884
885 FilterTool::~FilterTool(void) {
886
887     delete m_settings;
888     m_settings = 0;
889
890     delete m_impl;
891     m_impl = 0;
892 }
893
894 int FilterTool::Help(void) {
895     Options::DisplayHelp();
896     return 0;
897 }
898
899 int FilterTool::Run(int argc, char* argv[]) {
900
901     // parse command line arguments
902     Options::Parse(argc, argv, 1);
903
904     // initialize FilterTool with settings
905     m_impl = new FilterToolPrivate(m_settings);
906
907     // run FilterTool, return success/fail
908     if ( m_impl->Run() )
909         return 0;
910     else
911         return 1;
912 }