]> git.donarmstrong.com Git - bamtools.git/blob - src/toolkit/bamtools_filter.cpp
Minor cleanup in FilterTool
[bamtools.git] / src / toolkit / bamtools_filter.cpp
1 // ***************************************************************************
2 // bamtools_filter.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 14 October 2011
6 // ---------------------------------------------------------------------------
7 // Filters BAM file(s) according to some user-specified criteria
8 // ***************************************************************************
9
10 #include "bamtools_filter.h"
11
12 #include <api/BamMultiReader.h>
13 #include <api/BamWriter.h>
14 #include <utils/bamtools_filter_engine.h>
15 #include <utils/bamtools_options.h>
16 #include <utils/bamtools_utilities.h>
17 using namespace BamTools;
18
19 #include <jsoncpp/json.h>
20 using namespace Json;
21
22 #include <cstdio>
23 #include <iostream>
24 #include <sstream>
25 #include <string>
26 #include <vector>
27 using namespace std;
28   
29 namespace BamTools {
30   
31 // -------------------------------  
32 // string literal constants  
33
34 // property names
35 const string ALIGNMENTFLAG_PROPERTY       = "alignmentFlag";
36 const string CIGAR_PROPERTY               = "cigar";
37 const string INSERTSIZE_PROPERTY          = "insertSize";
38 const string ISDUPLICATE_PROPERTY         = "isDuplicate";
39 const string ISFAILEDQC_PROPERTY          = "isFailedQC";
40 const string ISFIRSTMATE_PROPERTY         = "isFirstMate";
41 const string ISMAPPED_PROPERTY            = "isMapped";
42 const string ISMATEMAPPED_PROPERTY        = "isMateMapped";
43 const string ISMATEREVERSESTRAND_PROPERTY = "isMateReverseStrand";
44 const string ISPAIRED_PROPERTY            = "isPaired";
45 const string ISPRIMARYALIGNMENT_PROPERTY  = "isPrimaryAlignment";
46 const string ISPROPERPAIR_PROPERTY        = "isProperPair";
47 const string ISREVERSESTRAND_PROPERTY     = "isReverseStrand";
48 const string ISSECONDMATE_PROPERTY        = "isSecondMate";
49 const string ISSINGLETON_PROPERTY         = "isSingleton";
50 const string MAPQUALITY_PROPERTY          = "mapQuality";
51 const string MATEPOSITION_PROPERTY        = "matePosition";
52 const string MATEREFERENCE_PROPERTY       = "mateReference";
53 const string NAME_PROPERTY                = "name";
54 const string POSITION_PROPERTY            = "position";
55 const string QUERYBASES_PROPERTY          = "queryBases";
56 const string REFERENCE_PROPERTY           = "reference";
57 const string TAG_PROPERTY                 = "tag";
58
59 // boolalpha
60 const string TRUE_STR  = "true";
61 const string FALSE_STR = "false";
62     
63 RefVector filterToolReferences;    
64     
65 struct BamAlignmentChecker {
66     bool check(const PropertyFilter& filter, const BamAlignment& al) {
67       
68         bool keepAlignment = true;
69         const PropertyMap& properties = filter.Properties;
70         PropertyMap::const_iterator propertyIter = properties.begin();
71         PropertyMap::const_iterator propertyEnd  = properties.end();
72         for ( ; propertyIter != propertyEnd; ++propertyIter ) {
73           
74             // check alignment data field depending on propertyName
75             const string& propertyName = (*propertyIter).first;
76             const PropertyFilterValue& valueFilter = (*propertyIter).second;
77             
78             if      ( propertyName == ALIGNMENTFLAG_PROPERTY )  keepAlignment &= valueFilter.check(al.AlignmentFlag);
79             else if ( propertyName == CIGAR_PROPERTY ) {
80                 stringstream cigarSs;
81                 const vector<CigarOp>& cigarData = al.CigarData;
82                 if ( !cigarData.empty() ) {
83                     vector<CigarOp>::const_iterator cigarBegin = cigarData.begin();
84                     vector<CigarOp>::const_iterator cigarIter = cigarBegin;
85                     vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
86                     for ( ; cigarIter != cigarEnd; ++cigarIter ) {
87                         const CigarOp& op = (*cigarIter);
88                         cigarSs << op.Length << op.Type;
89                     }
90                     keepAlignment &= valueFilter.check(cigarSs.str());
91                 }
92             }
93             else if ( propertyName == INSERTSIZE_PROPERTY )           keepAlignment &= valueFilter.check(al.InsertSize);
94             else if ( propertyName == ISDUPLICATE_PROPERTY )          keepAlignment &= valueFilter.check(al.IsDuplicate());
95             else if ( propertyName == ISFAILEDQC_PROPERTY )           keepAlignment &= valueFilter.check(al.IsFailedQC());
96             else if ( propertyName == ISFIRSTMATE_PROPERTY )          keepAlignment &= valueFilter.check(al.IsFirstMate());
97             else if ( propertyName == ISMAPPED_PROPERTY )             keepAlignment &= valueFilter.check(al.IsMapped());
98             else if ( propertyName == ISMATEMAPPED_PROPERTY )         keepAlignment &= valueFilter.check(al.IsMateMapped());
99             else if ( propertyName == ISMATEREVERSESTRAND_PROPERTY )  keepAlignment &= valueFilter.check(al.IsMateReverseStrand());
100             else if ( propertyName == ISPAIRED_PROPERTY )             keepAlignment &= valueFilter.check(al.IsPaired());
101             else if ( propertyName == ISPRIMARYALIGNMENT_PROPERTY )   keepAlignment &= valueFilter.check(al.IsPrimaryAlignment());
102             else if ( propertyName == ISPROPERPAIR_PROPERTY )         keepAlignment &= valueFilter.check(al.IsProperPair());
103             else if ( propertyName == ISREVERSESTRAND_PROPERTY )      keepAlignment &= valueFilter.check(al.IsReverseStrand());
104             else if ( propertyName == ISSECONDMATE_PROPERTY )         keepAlignment &= valueFilter.check(al.IsSecondMate());
105             else if ( propertyName == ISSINGLETON_PROPERTY ) {
106                 const bool isSingleton = al.IsPaired() && al.IsMapped() && !al.IsMateMapped();
107                 keepAlignment &= valueFilter.check(isSingleton);
108             }
109             else if ( propertyName == MAPQUALITY_PROPERTY )           keepAlignment &= valueFilter.check(al.MapQuality);
110             else if ( propertyName == MATEPOSITION_PROPERTY )         keepAlignment &= ( al.IsPaired() && al.IsMateMapped() && valueFilter.check(al.MateRefID) );
111             else if ( propertyName == MATEREFERENCE_PROPERTY ) {
112                 if ( !al.IsPaired() || !al.IsMateMapped() ) return false;
113                 BAMTOOLS_ASSERT_MESSAGE( (al.MateRefID>=0 && (al.MateRefID<(int)filterToolReferences.size())), "Invalid MateRefID");
114                 const string& refName = filterToolReferences.at(al.MateRefID).RefName;
115                 keepAlignment &= valueFilter.check(refName);
116             }
117             else if ( propertyName == NAME_PROPERTY )                 keepAlignment &= valueFilter.check(al.Name);
118             else if ( propertyName == POSITION_PROPERTY )             keepAlignment &= valueFilter.check(al.Position);
119             else if ( propertyName == QUERYBASES_PROPERTY )           keepAlignment &= valueFilter.check(al.QueryBases);
120             else if ( propertyName == REFERENCE_PROPERTY ) {
121                 BAMTOOLS_ASSERT_MESSAGE( (al.RefID>=0 && (al.RefID<(int)filterToolReferences.size())), "Invalid RefID");
122                 const string& refName = filterToolReferences.at(al.RefID).RefName;
123                 keepAlignment &= valueFilter.check(refName);
124             }
125             else if ( propertyName == TAG_PROPERTY ) keepAlignment &= checkAlignmentTag(valueFilter, al);
126             else BAMTOOLS_ASSERT_UNREACHABLE;
127             
128             // if alignment fails at ANY point, just quit and return false
129             if ( !keepAlignment ) return false;
130         }
131       
132         BAMTOOLS_ASSERT_MESSAGE( keepAlignment, "Error in BamAlignmentChecker... keepAlignment should be true here");
133         return keepAlignment;
134     }
135     
136     bool checkAlignmentTag(const PropertyFilterValue& valueFilter, const BamAlignment& al) {
137      
138         // ensure filter contains string data
139         Variant entireTagFilter = valueFilter.Value;
140         if ( !entireTagFilter.is_type<string>() ) return false;
141
142         // localize string from variant
143         const string& entireTagFilterString = entireTagFilter.get<string>();
144
145         // ensure we have at least "XX:x"
146         if ( entireTagFilterString.length() < 4 ) return false;
147
148         // get tagName & lookup in alignment
149         // if found, set tagType to tag type character
150         // if not found, return false
151         const string& tagName = entireTagFilterString.substr(0,2);
152         char tagType = '\0';
153         if ( !al.GetTagType(tagName, tagType) ) return false;
154
155         // remove tagName & ":" from beginning tagFilter
156         string tagFilterString = entireTagFilterString.substr(3);
157
158         // switch on tag type to set tag query value & parse filter token
159         int32_t  intFilterValue,    intQueryValue;
160         uint32_t uintFilterValue,   uintQueryValue;
161         float    realFilterValue,   realQueryValue;
162         string   stringFilterValue, stringQueryValue;
163
164         PropertyFilterValue tagFilter;
165         PropertyFilterValue::ValueCompareType compareType;
166         bool keepAlignment = false;
167         switch (tagType) {
168
169             // signed int tag type
170             case 'c' :
171             case 's' :
172             case 'i' :
173                 if ( al.GetTag(tagName, intQueryValue) ) {
174                     if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, intFilterValue, compareType) ) {
175                         tagFilter.Value = intFilterValue;
176                         tagFilter.Type  = compareType;
177                         keepAlignment   = tagFilter.check(intQueryValue);
178                     }
179                 }
180                 break;
181
182             // unsigned int tag type
183             case 'C' :
184             case 'S' :
185             case 'I' :
186                 if ( al.GetTag(tagName, uintQueryValue) ) {
187                     if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, uintFilterValue, compareType) ) {
188                         tagFilter.Value = uintFilterValue;
189                         tagFilter.Type  = compareType;
190                         keepAlignment   = tagFilter.check(uintQueryValue);
191                     }
192                 }
193                 break;
194
195             // 'real' tag type
196             case 'f' :
197                 if ( al.GetTag(tagName, realQueryValue) ) {
198                     if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, realFilterValue, compareType) ) {
199                         tagFilter.Value = realFilterValue;
200                         tagFilter.Type  = compareType;
201                         keepAlignment   = tagFilter.check(realQueryValue);
202                     }
203                 }
204                 break;
205
206             // string tag type
207             case 'A':
208             case 'Z':
209             case 'H':
210                 if ( al.GetTag(tagName, stringQueryValue) ) {
211                     if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, stringFilterValue, compareType) ) {
212                         tagFilter.Value = stringFilterValue;
213                         tagFilter.Type  = compareType;
214                         keepAlignment   = tagFilter.check(stringQueryValue);
215                     }
216                 }
217                 break;
218
219             // unknown tag type
220             default :
221                 keepAlignment = false;
222         }
223
224         return keepAlignment;
225     }
226 };    
227     
228 } // namespace BamTools
229   
230 // ---------------------------------------------
231 // FilterSettings implementation
232
233 struct FilterTool::FilterSettings {
234
235     // ----------------------------------
236     // IO opts
237
238     // flags
239     bool HasInputBamFilename;
240     bool HasOutputBamFilename;
241     bool HasRegion;
242     bool HasScriptFilename;
243     bool IsForceCompression;
244
245     // filenames
246     vector<string> InputFiles;
247     string OutputFilename;
248     string Region;
249     string ScriptFilename;
250
251     // -----------------------------------
252     // General filter opts
253
254     // flags
255     bool HasAlignmentFlagFilter;
256     bool HasInsertSizeFilter;
257     bool HasMapQualityFilter;
258     bool HasNameFilter;
259     bool HasQueryBasesFilter;
260     bool HasTagFilter; //(s)
261
262     // filters
263     string AlignmentFlagFilter;
264     string InsertSizeFilter;
265     string NameFilter;
266     string MapQualityFilter;
267     string QueryBasesFilter;
268     string TagFilter;  // support multiple ?
269
270     // -----------------------------------
271     // AlignmentFlag filter opts
272
273     // flags
274     bool HasIsDuplicateFilter;
275     bool HasIsFailedQCFilter;
276     bool HasIsFirstMateFilter;
277     bool HasIsMappedFilter;
278     bool HasIsMateMappedFilter;
279     bool HasIsMateReverseStrandFilter;
280     bool HasIsPairedFilter;
281     bool HasIsPrimaryAlignmentFilter;
282     bool HasIsProperPairFilter;
283     bool HasIsReverseStrandFilter;
284     bool HasIsSecondMateFilter;
285     bool HasIsSingletonFilter;
286
287     // filters
288     string IsDuplicateFilter;
289     string IsFailedQCFilter;
290     string IsFirstMateFilter;
291     string IsMappedFilter;
292     string IsMateMappedFilter;
293     string IsMateReverseStrandFilter;
294     string IsPairedFilter;
295     string IsPrimaryAlignmentFilter;
296     string IsProperPairFilter;
297     string IsReverseStrandFilter;
298     string IsSecondMateFilter;
299     string IsSingletonFilter;
300
301     // ---------------------------------
302     // constructor
303
304     FilterSettings(void)
305         : HasInputBamFilename(false)
306         , HasOutputBamFilename(false)
307         , HasRegion(false)
308         , HasScriptFilename(false)
309         , IsForceCompression(false)
310         , OutputFilename(Options::StandardOut())
311         , HasAlignmentFlagFilter(false)
312         , HasInsertSizeFilter(false)
313         , HasMapQualityFilter(false)
314         , HasNameFilter(false)
315         , HasQueryBasesFilter(false)
316         , HasTagFilter(false)
317         , HasIsDuplicateFilter(false)
318         , HasIsFailedQCFilter(false)
319         , HasIsFirstMateFilter(false)
320         , HasIsMappedFilter(false)
321         , HasIsMateMappedFilter(false)
322         , HasIsMateReverseStrandFilter(false)
323         , HasIsPairedFilter(false)
324         , HasIsPrimaryAlignmentFilter(false)
325         , HasIsProperPairFilter(false)
326         , HasIsReverseStrandFilter(false)
327         , HasIsSecondMateFilter(false)
328         , HasIsSingletonFilter(false)
329         , IsDuplicateFilter(TRUE_STR)
330         , IsFailedQCFilter(TRUE_STR)
331         , IsFirstMateFilter(TRUE_STR)
332         , IsMappedFilter(TRUE_STR)
333         , IsMateMappedFilter(TRUE_STR)
334         , IsMateReverseStrandFilter(TRUE_STR)
335         , IsPairedFilter(TRUE_STR)
336         , IsPrimaryAlignmentFilter(TRUE_STR)
337         , IsProperPairFilter(TRUE_STR)
338         , IsReverseStrandFilter(TRUE_STR)
339         , IsSecondMateFilter(TRUE_STR)
340         , IsSingletonFilter(TRUE_STR)
341     { }
342 };
343
344 // ---------------------------------------------
345 // FilterToolPrivate declaration
346
347 class FilterTool::FilterToolPrivate {
348       
349     // ctor & dtor
350     public:
351         FilterToolPrivate(FilterTool::FilterSettings* settings);
352         ~FilterToolPrivate(void);  
353         
354     // 'public' interface
355     public:
356         bool Run(void);
357         
358     // internal methods
359     private:
360         bool AddPropertyTokensToFilter(const string& filterName, const map<string, string>& propertyTokens);
361         bool CheckAlignment(const BamAlignment& al);
362         const string GetScriptContents(void);
363         void InitProperties(void);
364         bool ParseCommandLine(void);
365         bool ParseFilterObject(const string& filterName, const Json::Value& filterObject);
366         bool ParseScript(void);
367         bool SetupFilters(void);
368         
369     // data members
370     private:
371         vector<string> m_propertyNames;
372         FilterTool::FilterSettings* m_settings;
373         FilterEngine<BamAlignmentChecker> m_filterEngine;
374 };
375  
376 // ---------------------------------------------
377 // FilterToolPrivate implementation
378   
379 // constructor  
380 FilterTool::FilterToolPrivate::FilterToolPrivate(FilterTool::FilterSettings* settings)
381     : m_settings(settings)
382 { }  
383   
384 // destructor
385 FilterTool::FilterToolPrivate::~FilterToolPrivate(void) { }
386
387 bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filterName,
388                                                               const map<string,
389                                                               string>& propertyTokens)
390 {
391     // dummy temp values for token parsing
392     bool boolValue;
393     int32_t int32Value;
394     uint16_t uint16Value;
395     uint32_t uint32Value;
396     string stringValue;
397     PropertyFilterValue::ValueCompareType type;
398   
399     // iterate over property token map
400     map<string, string>::const_iterator mapIter = propertyTokens.begin();
401     map<string, string>::const_iterator mapEnd  = propertyTokens.end();
402     for ( ; mapIter != mapEnd; ++mapIter ) {
403       
404         const string& propertyName = (*mapIter).first;
405         const string& token        = (*mapIter).second;
406         
407         // ------------------------------
408         // convert token to value & compare type 
409         // then add to filter engine
410         
411         // bool conversion
412         if ( propertyName == ISDUPLICATE_PROPERTY ||
413              propertyName == ISFAILEDQC_PROPERTY ||
414              propertyName == ISFIRSTMATE_PROPERTY ||
415              propertyName == ISMAPPED_PROPERTY ||
416              propertyName == ISMATEMAPPED_PROPERTY ||
417              propertyName == ISMATEREVERSESTRAND_PROPERTY ||
418              propertyName == ISPAIRED_PROPERTY ||
419              propertyName == ISPRIMARYALIGNMENT_PROPERTY ||
420              propertyName == ISPROPERPAIR_PROPERTY ||
421              propertyName == ISREVERSESTRAND_PROPERTY ||
422              propertyName == ISSECONDMATE_PROPERTY ||
423              propertyName == ISSINGLETON_PROPERTY
424            ) 
425         {
426             FilterEngine<BamAlignmentChecker>::parseToken(token, boolValue, type);
427             m_filterEngine.setProperty(filterName, propertyName, boolValue, type);
428         }
429         
430         // int32_t conversion
431         else if ( propertyName == INSERTSIZE_PROPERTY ||
432                   propertyName == MATEPOSITION_PROPERTY ||
433                   propertyName == POSITION_PROPERTY 
434                 ) 
435         {
436             FilterEngine<BamAlignmentChecker>::parseToken(token, int32Value, type);
437             m_filterEngine.setProperty(filterName, propertyName, int32Value, type);
438         }
439         
440         // uint16_t conversion
441         else if ( propertyName == MAPQUALITY_PROPERTY )
442         {
443             FilterEngine<BamAlignmentChecker>::parseToken(token, uint16Value, type);
444             m_filterEngine.setProperty(filterName, propertyName, uint16Value, type);
445         }
446         
447         // uint32_t conversion
448         else if ( propertyName == ALIGNMENTFLAG_PROPERTY )
449         {
450             FilterEngine<BamAlignmentChecker>::parseToken(token, uint32Value, type);
451             m_filterEngine.setProperty(filterName, propertyName, uint32Value, type);
452         }
453         
454         // string conversion
455         else if ( propertyName == CIGAR_PROPERTY || 
456                   propertyName == MATEREFERENCE_PROPERTY ||
457                   propertyName == NAME_PROPERTY ||
458                   propertyName == QUERYBASES_PROPERTY ||
459                   propertyName == REFERENCE_PROPERTY 
460                 ) 
461         {
462             FilterEngine<BamAlignmentChecker>::parseToken(token, stringValue, type);
463             m_filterEngine.setProperty(filterName, propertyName, stringValue, type);
464         }
465       
466     else if ( propertyName == TAG_PROPERTY ) {
467             // this will be stored directly as the TAG:VALUE token 
468             // (VALUE may contain compare ops, will be parsed out later)
469             m_filterEngine.setProperty(filterName, propertyName, token, PropertyFilterValue::EXACT);
470         }
471       
472         // else unknown property 
473         else {
474             cerr << "bamtools filter ERROR: unknown property - " << propertyName << endl;
475             return false;
476         }
477     }
478     return true;
479 }
480
481 bool FilterTool::FilterToolPrivate::CheckAlignment(const BamAlignment& al) {
482     return m_filterEngine.check(al);
483 }
484
485 const string FilterTool::FilterToolPrivate::GetScriptContents(void) {
486   
487     // open script for reading
488     FILE* inFile = fopen(m_settings->ScriptFilename.c_str(), "rb");
489     if ( !inFile ) {
490         cerr << "bamtools filter ERROR: could not open script: "
491              << m_settings->ScriptFilename << " for reading" << endl;
492         return string();
493     }
494     
495     // read in entire script contents  
496     char buffer[1024];
497     ostringstream docStream("");
498     while ( true ) {
499         
500         // peek ahead, make sure there is data available
501         char ch = fgetc(inFile);
502         ungetc(ch, inFile);
503         if( feof(inFile) ) break;       
504         
505         // read next block of data
506         if ( fgets(buffer, 1024, inFile) == 0 ) {
507             cerr << "bamtools filter ERROR: could not read script contents" << endl;
508             return string();
509         }
510         
511         docStream << buffer;
512     }
513     
514     // close script file
515     fclose(inFile);
516     
517     // import buffer contents to document, return
518     return docStream.str();
519 }
520
521 void FilterTool::FilterToolPrivate::InitProperties(void) {
522   
523     // store property names in vector 
524     m_propertyNames.push_back(ALIGNMENTFLAG_PROPERTY);
525     m_propertyNames.push_back(CIGAR_PROPERTY);
526     m_propertyNames.push_back(INSERTSIZE_PROPERTY);
527     m_propertyNames.push_back(ISDUPLICATE_PROPERTY);
528     m_propertyNames.push_back(ISFAILEDQC_PROPERTY);
529     m_propertyNames.push_back(ISFIRSTMATE_PROPERTY);
530     m_propertyNames.push_back(ISMAPPED_PROPERTY);
531     m_propertyNames.push_back(ISMATEMAPPED_PROPERTY);
532     m_propertyNames.push_back(ISMATEREVERSESTRAND_PROPERTY);
533     m_propertyNames.push_back(ISPAIRED_PROPERTY);
534     m_propertyNames.push_back(ISPRIMARYALIGNMENT_PROPERTY);
535     m_propertyNames.push_back(ISPROPERPAIR_PROPERTY);
536     m_propertyNames.push_back(ISREVERSESTRAND_PROPERTY);
537     m_propertyNames.push_back(ISSECONDMATE_PROPERTY);
538     m_propertyNames.push_back(ISSINGLETON_PROPERTY);
539     m_propertyNames.push_back(MAPQUALITY_PROPERTY);
540     m_propertyNames.push_back(MATEPOSITION_PROPERTY);
541     m_propertyNames.push_back(MATEREFERENCE_PROPERTY);
542     m_propertyNames.push_back(NAME_PROPERTY);
543     m_propertyNames.push_back(POSITION_PROPERTY);
544     m_propertyNames.push_back(QUERYBASES_PROPERTY);
545     m_propertyNames.push_back(REFERENCE_PROPERTY);
546     m_propertyNames.push_back(TAG_PROPERTY);
547     
548     // add vector contents to FilterEngine<BamAlignmentChecker>
549     vector<string>::const_iterator propertyNameIter = m_propertyNames.begin();
550     vector<string>::const_iterator propertyNameEnd  = m_propertyNames.end();
551     for ( ; propertyNameIter != propertyNameEnd; ++propertyNameIter )
552         m_filterEngine.addProperty((*propertyNameIter));
553 }
554
555 bool FilterTool::FilterToolPrivate::ParseCommandLine(void) {
556   
557     // add a rule set to filter engine
558     const string CMD = "COMMAND_LINE";
559     m_filterEngine.addFilter(CMD);
560
561     // map property names to command line args
562     map<string, string> propertyTokens;
563     if ( m_settings->HasAlignmentFlagFilter )       propertyTokens.insert( make_pair(ALIGNMENTFLAG_PROPERTY,       m_settings->AlignmentFlagFilter) );
564     if ( m_settings->HasInsertSizeFilter )          propertyTokens.insert( make_pair(INSERTSIZE_PROPERTY,          m_settings->InsertSizeFilter) );
565     if ( m_settings->HasIsDuplicateFilter )         propertyTokens.insert( make_pair(ISDUPLICATE_PROPERTY,         m_settings->IsDuplicateFilter) );
566     if ( m_settings->HasIsFailedQCFilter )          propertyTokens.insert( make_pair(ISFAILEDQC_PROPERTY,          m_settings->IsFailedQCFilter) );
567     if ( m_settings->HasIsFirstMateFilter )         propertyTokens.insert( make_pair(ISFIRSTMATE_PROPERTY,         m_settings->IsFirstMateFilter) );
568     if ( m_settings->HasIsMappedFilter )            propertyTokens.insert( make_pair(ISMAPPED_PROPERTY,            m_settings->IsMappedFilter) );
569     if ( m_settings->HasIsMateMappedFilter )        propertyTokens.insert( make_pair(ISMATEMAPPED_PROPERTY,        m_settings->IsMateMappedFilter) );
570     if ( m_settings->HasIsMateReverseStrandFilter ) propertyTokens.insert( make_pair(ISMATEREVERSESTRAND_PROPERTY, m_settings->IsMateReverseStrandFilter) );
571     if ( m_settings->HasIsPairedFilter )            propertyTokens.insert( make_pair(ISPAIRED_PROPERTY,            m_settings->IsPairedFilter) );
572     if ( m_settings->HasIsPrimaryAlignmentFilter )  propertyTokens.insert( make_pair(ISPRIMARYALIGNMENT_PROPERTY,  m_settings->IsPrimaryAlignmentFilter) );
573     if ( m_settings->HasIsProperPairFilter )        propertyTokens.insert( make_pair(ISPROPERPAIR_PROPERTY,        m_settings->IsProperPairFilter) );
574     if ( m_settings->HasIsReverseStrandFilter )     propertyTokens.insert( make_pair(ISREVERSESTRAND_PROPERTY,     m_settings->IsReverseStrandFilter) );
575     if ( m_settings->HasIsSecondMateFilter )        propertyTokens.insert( make_pair(ISSECONDMATE_PROPERTY,        m_settings->IsSecondMateFilter) );
576     if ( m_settings->HasIsSingletonFilter )         propertyTokens.insert( make_pair(ISSINGLETON_PROPERTY,         m_settings->IsSingletonFilter) );
577     if ( m_settings->HasMapQualityFilter )          propertyTokens.insert( make_pair(MAPQUALITY_PROPERTY,          m_settings->MapQualityFilter) );
578     if ( m_settings->HasNameFilter )                propertyTokens.insert( make_pair(NAME_PROPERTY,                m_settings->NameFilter) );
579     if ( m_settings->HasQueryBasesFilter )          propertyTokens.insert( make_pair(QUERYBASES_PROPERTY,          m_settings->QueryBasesFilter) );
580     if ( m_settings->HasTagFilter )                 propertyTokens.insert( make_pair(TAG_PROPERTY,                 m_settings->TagFilter) );
581     
582     // send add these properties to filter set "COMMAND_LINE"
583     return AddPropertyTokensToFilter(CMD, propertyTokens);
584 }
585
586 bool FilterTool::FilterToolPrivate::ParseFilterObject(const string& filterName, const Json::Value& filterObject) {
587   
588     // filter object parsing variables
589     Json::Value null(Json::nullValue);
590     Json::Value propertyValue;
591     
592     // store results
593     map<string, string> propertyTokens;
594     
595     // iterate over known properties
596     vector<string>::const_iterator propertyNameIter = m_propertyNames.begin();
597     vector<string>::const_iterator propertyNameEnd  = m_propertyNames.end();
598     for ( ; propertyNameIter != propertyNameEnd; ++propertyNameIter ) {
599         const string& propertyName = (*propertyNameIter);
600         
601         // if property defined in filter, add to token list
602         propertyValue = filterObject.get(propertyName, null);
603         if ( propertyValue != null ) 
604             propertyTokens.insert( make_pair(propertyName, propertyValue.asString()) );
605     }
606   
607     // add this filter to engin
608     m_filterEngine.addFilter(filterName);
609   
610     // add token list to this filter
611     return AddPropertyTokensToFilter(filterName, propertyTokens);
612 }
613
614 bool FilterTool::FilterToolPrivate::ParseScript(void) {
615   
616     // read in script contents from file
617     const string document = GetScriptContents();
618     
619     // set up JsonCPP reader and attempt to parse script
620     Json::Value root;
621     Json::Reader reader;
622     if ( !reader.parse(document, root) ) {
623         // use built-in error reporting mechanism to alert user what was wrong with the script
624         cerr  << "bamtools filter ERROR: failed to parse script - see error message(s) below" << endl
625               << reader.getFormatedErrorMessages();
626         return false;     
627     }
628
629     // initialize return status
630     bool success = true;
631     
632     // see if root object contains multiple filters
633     const Json::Value filters = root["filters"];
634     if ( !filters.isNull() ) {
635       
636         // iterate over any filters found
637         int filterIndex = 0;
638         Json::Value::const_iterator filtersIter = filters.begin();
639         Json::Value::const_iterator filtersEnd  = filters.end();
640         for ( ; filtersIter != filtersEnd; ++filtersIter, ++filterIndex ) {
641             Json::Value filter = (*filtersIter);
642             
643             // convert filter index to string
644             string filterName;
645             
646             // if id tag supplied
647             const Json::Value id = filter["id"];
648             if ( !id.isNull() ) 
649                 filterName = id.asString();
650             
651             // use array index 
652             else {
653                 stringstream convert;
654                 convert << filterIndex;
655                 filterName = convert.str();
656             }
657             
658             // create & parse filter 
659             success &= ParseFilterObject(filterName, filter);
660         }
661         
662         // see if user defined a "rule" for these filters
663         // otherwise, use filter engine's default rule behavior
664         string ruleString("");
665         const Json::Value rule = root["rule"];
666         if ( rule.isString() )
667             ruleString = rule.asString();
668         m_filterEngine.setRule(ruleString);
669           
670         // return success/fail
671         return success;
672     } 
673     
674     // otherwise, root is the only filter (just contains properties)
675     // create & parse filter named "ROOT"
676     else success = ParseFilterObject("ROOT", root);
677     
678     // return success/failure
679     return success;
680 }
681
682 bool FilterTool::FilterToolPrivate::Run(void) {
683     
684     // set to default input if none provided
685     if ( !m_settings->HasInputBamFilename ) 
686         m_settings->InputFiles.push_back(Options::StandardIn());
687
688     // initialize defined properties & user-specified filters
689     // quit if failed
690     if ( !SetupFilters() ) return false;
691
692     // open reader without index
693     BamMultiReader reader;
694     if ( !reader.Open(m_settings->InputFiles) ) {
695         cerr << "bamtools filter ERROR: could not open input files for reading." << endl;
696         return false;
697     }
698
699     // retrieve reader header & reference data
700     const string headerText = reader.GetHeaderText();
701     filterToolReferences = reader.GetReferenceData();
702     
703     // determine compression mode for BamWriter
704     bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() &&
705                               !m_settings->IsForceCompression );
706     BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
707     if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed;
708
709     // open BamWriter
710     BamWriter writer;
711     writer.SetCompressionMode(compressionMode);
712     if ( !writer.Open(m_settings->OutputFilename, headerText, filterToolReferences) ) {
713         cerr << "bamtools filter ERROR: could not open " << m_settings->OutputFilename << " for writing." << endl;
714         reader.Close();
715         return false;
716     }
717
718     // if no region specified, filter entire file 
719     BamAlignment al;
720     if ( !m_settings->HasRegion ) {
721         while ( reader.GetNextAlignment(al) ) {
722             if ( CheckAlignment(al) ) 
723                 writer.SaveAlignment(al);
724         }
725     }
726     
727     // otherwise attempt to use region as constraint
728     else {
729         
730         // if region string parses OK
731         BamRegion region;
732         if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
733
734             // attempt to find index files
735             reader.LocateIndexes();
736
737             // if index data available for all BAM files, we can use SetRegion
738             if ( reader.HasIndexes() ) {
739
740                 // attempt to use SetRegion(), if failed report error
741                 if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) {
742                     cerr << "bamtools filter ERROR: set region failed. Check that REGION describes a valid range" << endl;
743                     reader.Close();
744                     return false;
745                 } 
746               
747                 // everything checks out, just iterate through specified region, filtering alignments
748                 while ( reader.GetNextAlignment(al) )
749                     if ( CheckAlignment(al) ) 
750                         writer.SaveAlignment(al);
751                 }
752             
753             // no index data available, we have to iterate through until we
754             // find overlapping alignments
755             else {
756                 while ( reader.GetNextAlignment(al) ) {
757                     if ( (al.RefID >= region.LeftRefID)  && ((al.Position + al.Length) >= region.LeftPosition) &&
758                          (al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) ) 
759                     {
760                         if ( CheckAlignment(al) ) 
761                             writer.SaveAlignment(al);
762                     }
763                 }
764             }
765         } 
766         
767         // error parsing REGION string
768         else {
769             cerr << "bamtools filter ERROR: could not parse REGION: " << m_settings->Region << endl;
770             cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid"
771                  << endl;
772             reader.Close();
773             return false;
774         }
775     }
776
777     // clean up & exit
778     reader.Close();
779     writer.Close();
780     return true;
781 }
782
783 bool FilterTool::FilterToolPrivate::SetupFilters(void) {
784   
785     // set up filter engine with supported properties
786     InitProperties();
787     
788     // parse script for filter rules, if given
789     if ( m_settings->HasScriptFilename )
790         return ParseScript();
791     
792     // otherwise check command line for filters
793     else return ParseCommandLine();
794 }
795
796 // ---------------------------------------------
797 // FilterTool implementation
798
799 FilterTool::FilterTool(void)
800     : AbstractTool()
801     , m_settings(new FilterSettings)
802     , m_impl(0)
803 {
804     // ----------------------------------
805     // set program details
806
807     const string usage = "[-in <filename> -in <filename> ...] "
808                          "[-out <filename> | [-forceCompression]] [-region <REGION>] "
809                          "[ [-script <filename] | [filterOptions] ]";
810     Options::SetProgramInfo("bamtools filter", "filters BAM file(s)", usage );
811
812     // ----------------------------------
813     // I/O options
814
815     OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
816
817     const string inDesc     = "the input BAM file(s)";
818     const string outDesc    = "the output BAM file";
819     const string regionDesc = "only read data from this genomic region (see documentation for more details)";
820     const string scriptDesc = "the filter script file (see documentation for more details)";
821     const string forceDesc  = "if results are sent to stdout (like when piping to another tool), "
822                               "default behavior is to leave output uncompressed. Use this flag to "
823                               "override and force compression";
824
825     Options::AddValueOption("-in",     "BAM filename", inDesc,     "", m_settings->HasInputBamFilename,  m_settings->InputFiles,     IO_Opts, Options::StandardIn());
826     Options::AddValueOption("-out",    "BAM filename", outDesc,    "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
827     Options::AddValueOption("-region", "REGION",       regionDesc, "", m_settings->HasRegion,            m_settings->Region,         IO_Opts);
828     Options::AddValueOption("-script", "filename",     scriptDesc, "", m_settings->HasScriptFilename,    m_settings->ScriptFilename, IO_Opts);
829     Options::AddOption("-forceCompression",forceDesc, m_settings->IsForceCompression, IO_Opts);
830
831     // ----------------------------------
832     // general filter options
833
834     OptionGroup* FilterOpts = Options::CreateOptionGroup("General Filters");
835
836     const string flagDesc    = "keep reads with this *exact* alignment flag (for more detailed queries, see below)";
837     const string insertDesc  = "keep reads with insert size that matches pattern";
838     const string mapQualDesc = "keep reads with map quality that matches pattern";
839     const string nameDesc    = "keep reads with name that matches pattern";
840     const string queryDesc   = "keep reads with motif that matches pattern";
841     const string tagDesc     = "keep reads with this key=>value pair";
842
843     Options::AddValueOption("-alignmentFlag", "int",       flagDesc,    "", m_settings->HasAlignmentFlagFilter, m_settings->AlignmentFlagFilter, FilterOpts);
844     Options::AddValueOption("-insertSize",    "int",       insertDesc,  "", m_settings->HasInsertSizeFilter,    m_settings->InsertSizeFilter,    FilterOpts);
845     Options::AddValueOption("-mapQuality",    "[0-255]",   mapQualDesc, "", m_settings->HasMapQualityFilter,    m_settings->MapQualityFilter,    FilterOpts);
846     Options::AddValueOption("-name",          "string",    nameDesc,    "", m_settings->HasNameFilter,          m_settings->NameFilter,          FilterOpts);
847     Options::AddValueOption("-queryBases",    "string",    queryDesc,   "", m_settings->HasQueryBasesFilter,    m_settings->QueryBasesFilter,    FilterOpts);
848     Options::AddValueOption("-tag",           "TAG:VALUE", tagDesc,     "", m_settings->HasTagFilter,           m_settings->TagFilter,           FilterOpts);
849
850     // ----------------------------------
851     // alignment flag filter options
852
853     OptionGroup* AlignmentFlagOpts = Options::CreateOptionGroup("Alignment Flag Filters");
854
855     const string boolArg           = "true/false";
856     const string isDupDesc         = "keep only alignments that are marked as duplicate?";
857     const string isFailQcDesc      = "keep only alignments that failed QC?";
858     const string isFirstMateDesc   = "keep only alignments marked as first mate?";
859     const string isMappedDesc      = "keep only alignments that were mapped?";
860     const string isMateMappedDesc  = "keep only alignments with mates that mapped";
861     const string isMateReverseDesc = "keep only alignments with mate on reverese strand?";
862     const string isPairedDesc      = "keep only alignments that were sequenced as paired?";
863     const string isPrimaryDesc     = "keep only alignments marked as primary?";
864     const string isProperPairDesc  = "keep only alignments that passed PE resolution?";
865     const string isReverseDesc     = "keep only alignments on reverse strand?";
866     const string isSecondMateDesc  = "keep only alignments marked as second mate?";
867     const string isSingletonDesc   = "keep only singletons";
868
869     Options::AddValueOption("-isDuplicate",         boolArg, isDupDesc,         "", m_settings->HasIsDuplicateFilter,         m_settings->IsDuplicateFilter,         AlignmentFlagOpts, TRUE_STR);
870     Options::AddValueOption("-isFailedQC",          boolArg, isFailQcDesc,      "", m_settings->HasIsFailedQCFilter,          m_settings->IsFailedQCFilter,          AlignmentFlagOpts, TRUE_STR);
871     Options::AddValueOption("-isFirstMate",         boolArg, isFirstMateDesc,   "", m_settings->HasIsFirstMateFilter,         m_settings->IsFirstMateFilter,         AlignmentFlagOpts, TRUE_STR);
872     Options::AddValueOption("-isMapped",            boolArg, isMappedDesc,      "", m_settings->HasIsMappedFilter,            m_settings->IsMappedFilter,            AlignmentFlagOpts, TRUE_STR);
873     Options::AddValueOption("-isMateMapped",        boolArg, isMateMappedDesc,  "", m_settings->HasIsMateMappedFilter,        m_settings->IsMateMappedFilter,        AlignmentFlagOpts, TRUE_STR);
874     Options::AddValueOption("-isMateReverseStrand", boolArg, isMateReverseDesc, "", m_settings->HasIsMateReverseStrandFilter, m_settings->IsMateReverseStrandFilter, AlignmentFlagOpts, TRUE_STR);
875     Options::AddValueOption("-isPaired",            boolArg, isPairedDesc,      "", m_settings->HasIsPairedFilter,            m_settings->IsPairedFilter,            AlignmentFlagOpts, TRUE_STR);
876     Options::AddValueOption("-isPrimaryAlignment",  boolArg, isPrimaryDesc,     "", m_settings->HasIsPrimaryAlignmentFilter,  m_settings->IsPrimaryAlignmentFilter,  AlignmentFlagOpts, TRUE_STR);
877     Options::AddValueOption("-isProperPair",        boolArg, isProperPairDesc,  "", m_settings->HasIsProperPairFilter,        m_settings->IsProperPairFilter,        AlignmentFlagOpts, TRUE_STR);
878     Options::AddValueOption("-isReverseStrand",     boolArg, isReverseDesc,     "", m_settings->HasIsReverseStrandFilter,     m_settings->IsReverseStrandFilter,     AlignmentFlagOpts, TRUE_STR);
879     Options::AddValueOption("-isSecondMate",        boolArg, isSecondMateDesc,  "", m_settings->HasIsSecondMateFilter,        m_settings->IsSecondMateFilter,        AlignmentFlagOpts, TRUE_STR);
880     Options::AddValueOption("-isSingleton",         boolArg, isSingletonDesc,   "", m_settings->HasIsSingletonFilter,         m_settings->IsSingletonFilter,         AlignmentFlagOpts, TRUE_STR);
881 }
882
883 FilterTool::~FilterTool(void) {
884
885     delete m_settings;
886     m_settings = 0;
887
888     delete m_impl;
889     m_impl = 0;
890 }
891
892 int FilterTool::Help(void) {
893     Options::DisplayHelp();
894     return 0;
895 }
896
897 int FilterTool::Run(int argc, char* argv[]) {
898
899     // parse command line arguments
900     Options::Parse(argc, argv, 1);
901
902     // initialize FilterTool with settings
903     m_impl = new FilterToolPrivate(m_settings);
904
905     // run FilterTool, return success/fail
906     if ( m_impl->Run() )
907         return 0;
908     else
909         return 1;
910 }