]> git.donarmstrong.com Git - bamtools.git/blob - src/toolkit/bamtools_filter.cpp
Minor cleanup to toolkit code formatting.
[bamtools.git] / src / toolkit / bamtools_filter.cpp
1 // ***************************************************************************
2 // bamtools_filter.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 7 April 2011
7 // ---------------------------------------------------------------------------
8 // Filters BAM file(s) according to some user-specified criteria
9 // ***************************************************************************
10
11 #include "bamtools_filter.h"
12
13 #include <api/BamMultiReader.h>
14 #include <api/BamWriter.h>
15 #include <utils/bamtools_filter_engine.h>
16 #include <utils/bamtools_options.h>
17 #include <utils/bamtools_utilities.h>
18 using namespace BamTools;
19
20 #include <jsoncpp/json.h>
21 using namespace Json;
22
23 #include <cstdio>
24 #include <iostream>
25 #include <sstream>
26 #include <string>
27 #include <vector>
28 using namespace std;
29   
30 namespace BamTools {
31   
32 // -------------------------------  
33 // string literal constants  
34
35 // property names
36 const string ALIGNMENTFLAG_PROPERTY       = "alignmentFlag";
37 const string CIGAR_PROPERTY               = "cigar";
38 const string INSERTSIZE_PROPERTY          = "insertSize";
39 const string ISDUPLICATE_PROPERTY         = "isDuplicate";
40 const string ISFAILEDQC_PROPERTY          = "isFailedQC";
41 const string ISFIRSTMATE_PROPERTY         = "isFirstMate";
42 const string ISMAPPED_PROPERTY            = "isMapped";
43 const string ISMATEMAPPED_PROPERTY        = "isMateMapped";
44 const string ISMATEREVERSESTRAND_PROPERTY = "isMateReverseStrand";
45 const string ISPAIRED_PROPERTY            = "isPaired";
46 const string ISPRIMARYALIGNMENT_PROPERTY  = "isPrimaryAlignment";
47 const string ISPROPERPAIR_PROPERTY        = "isProperPair";
48 const string ISREVERSESTRAND_PROPERTY     = "isReverseStrand";
49 const string ISSECONDMATE_PROPERTY        = "isSecondMate";
50 const string MAPQUALITY_PROPERTY          = "mapQuality";
51 const string MATEPOSITION_PROPERTY        = "matePosition";
52 const string MATEREFERENCE_PROPERTY       = "mateReference";
53 const string NAME_PROPERTY                = "name";
54 const string POSITION_PROPERTY            = "position";
55 const string QUERYBASES_PROPERTY          = "queryBases";
56 const string REFERENCE_PROPERTY           = "reference";
57 const string TAG_PROPERTY                 = "tag";
58
59 // boolalpha
60 const string TRUE_STR  = "true";
61 const string FALSE_STR = "false";
62     
63 RefVector filterToolReferences;    
64     
65 struct BamAlignmentChecker {
66     bool check(const PropertyFilter& filter, const BamAlignment& al) {
67       
68         bool keepAlignment = true;
69         const PropertyMap& properties = filter.Properties;
70         PropertyMap::const_iterator propertyIter = properties.begin();
71         PropertyMap::const_iterator propertyEnd  = properties.end();
72         for ( ; propertyIter != propertyEnd; ++propertyIter ) {
73           
74             // check alignment data field depending on propertyName
75             const string& propertyName = (*propertyIter).first;
76             const PropertyFilterValue& valueFilter = (*propertyIter).second;
77             
78             if      ( propertyName == ALIGNMENTFLAG_PROPERTY )  keepAlignment &= valueFilter.check(al.AlignmentFlag);
79             else if ( propertyName == CIGAR_PROPERTY ) {
80                 stringstream cigarSs;
81                 const vector<CigarOp>& cigarData = al.CigarData;
82                 if ( !cigarData.empty() ) {
83                     vector<CigarOp>::const_iterator cigarBegin = cigarData.begin();
84                     vector<CigarOp>::const_iterator cigarIter = cigarBegin;
85                     vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
86                     for ( ; cigarIter != cigarEnd; ++cigarIter ) {
87                         const CigarOp& op = (*cigarIter);
88                         cigarSs << op.Length << op.Type;
89                     }
90                     keepAlignment &= valueFilter.check(cigarSs.str());
91                 }
92             }
93             else if ( propertyName == INSERTSIZE_PROPERTY )           keepAlignment &= valueFilter.check(al.InsertSize);
94             else if ( propertyName == ISDUPLICATE_PROPERTY )          keepAlignment &= valueFilter.check(al.IsDuplicate());
95             else if ( propertyName == ISFAILEDQC_PROPERTY )           keepAlignment &= valueFilter.check(al.IsFailedQC());
96             else if ( propertyName == ISFIRSTMATE_PROPERTY )          keepAlignment &= valueFilter.check(al.IsFirstMate());
97             else if ( propertyName == ISMAPPED_PROPERTY )             keepAlignment &= valueFilter.check(al.IsMapped());
98             else if ( propertyName == ISMATEMAPPED_PROPERTY )         keepAlignment &= valueFilter.check(al.IsMateMapped());
99             else if ( propertyName == ISMATEREVERSESTRAND_PROPERTY )  keepAlignment &= valueFilter.check(al.IsMateReverseStrand());
100             else if ( propertyName == ISPAIRED_PROPERTY )             keepAlignment &= valueFilter.check(al.IsPaired());
101             else if ( propertyName == ISPRIMARYALIGNMENT_PROPERTY )   keepAlignment &= valueFilter.check(al.IsPrimaryAlignment());
102             else if ( propertyName == ISPROPERPAIR_PROPERTY )         keepAlignment &= valueFilter.check(al.IsProperPair());
103             else if ( propertyName == ISREVERSESTRAND_PROPERTY )      keepAlignment &= valueFilter.check(al.IsReverseStrand());
104             else if ( propertyName == ISSECONDMATE_PROPERTY )         keepAlignment &= valueFilter.check(al.IsSecondMate());
105             else if ( propertyName == MAPQUALITY_PROPERTY )           keepAlignment &= valueFilter.check(al.MapQuality);
106             else if ( propertyName == MATEPOSITION_PROPERTY )         keepAlignment &= ( al.IsPaired() && al.IsMateMapped() && valueFilter.check(al.MateRefID) );
107             else if ( propertyName == MATEREFERENCE_PROPERTY ) {
108                 if ( !al.IsPaired() || !al.IsMateMapped() ) return false;
109                 BAMTOOLS_ASSERT_MESSAGE( (al.MateRefID>=0 && (al.MateRefID<(int)filterToolReferences.size())), "Invalid MateRefID");
110                 const string& refName = filterToolReferences.at(al.MateRefID).RefName;
111                 keepAlignment &= valueFilter.check(refName);
112             }
113             else if ( propertyName == NAME_PROPERTY )                 keepAlignment &= valueFilter.check(al.Name);
114             else if ( propertyName == POSITION_PROPERTY )             keepAlignment &= valueFilter.check(al.Position);
115             else if ( propertyName == QUERYBASES_PROPERTY )           keepAlignment &= valueFilter.check(al.QueryBases);
116             else if ( propertyName == REFERENCE_PROPERTY ) {
117                 BAMTOOLS_ASSERT_MESSAGE( (al.RefID>=0 && (al.RefID<(int)filterToolReferences.size())), "Invalid RefID");
118                 const string& refName = filterToolReferences.at(al.RefID).RefName;
119                 keepAlignment &= valueFilter.check(refName);
120             }
121             else if ( propertyName == TAG_PROPERTY ) keepAlignment &= checkAlignmentTag(valueFilter, al);
122             else BAMTOOLS_ASSERT_UNREACHABLE;
123             
124             // if alignment fails at ANY point, just quit and return false
125             if ( !keepAlignment ) return false;
126         }
127       
128         BAMTOOLS_ASSERT_MESSAGE( keepAlignment, "Error in BamAlignmentChecker... keepAlignment should be true here");
129         return keepAlignment;
130     }
131     
132     bool checkAlignmentTag(const PropertyFilterValue& valueFilter, const BamAlignment& al) {
133      
134         // ensure filter contains string data
135         Variant entireTagFilter = valueFilter.Value;
136         if ( !entireTagFilter.is_type<string>() ) return false;
137
138         // localize string from variant
139         const string& entireTagFilterString = entireTagFilter.get<string>();
140
141         // ensure we have at least "XX:x"
142         if ( entireTagFilterString.length() < 4 ) return false;
143
144         // get tagName & lookup in alignment
145         // if found, set tagType to tag type character
146         // if not found, return false
147         const string& tagName = entireTagFilterString.substr(0,2);
148         char tagType = '\0';
149         if ( !al.GetTagType(tagName, tagType) ) return false;
150
151         // remove tagName & ":" from beginning tagFilter
152         string tagFilterString = entireTagFilterString.substr(3);
153
154         // switch on tag type to set tag query value & parse filter token
155         int32_t  intFilterValue,    intQueryValue;
156         uint32_t uintFilterValue,   uintQueryValue;
157         float    realFilterValue,   realQueryValue;
158         string   stringFilterValue, stringQueryValue;
159
160         PropertyFilterValue tagFilter;
161         PropertyFilterValue::ValueCompareType compareType;
162         bool keepAlignment = false;
163         switch (tagType) {
164
165             // signed int tag type
166             case 'c' :
167             case 's' :
168             case 'i' :
169                 if ( al.GetTag(tagName, intQueryValue) ) {
170                     if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, intFilterValue, compareType) ) {
171                         tagFilter.Value = intFilterValue;
172                         tagFilter.Type  = compareType;
173                         keepAlignment   = tagFilter.check(intQueryValue);
174                     }
175                 }
176                 break;
177
178             // unsigned int tag type
179             case 'C' :
180             case 'S' :
181             case 'I' :
182                 if ( al.GetTag(tagName, uintQueryValue) ) {
183                     if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, uintFilterValue, compareType) ) {
184                         tagFilter.Value = uintFilterValue;
185                         tagFilter.Type  = compareType;
186                         keepAlignment   = tagFilter.check(uintQueryValue);
187                     }
188                 }
189                 break;
190
191             // 'real' tag type
192             case 'f' :
193                 if ( al.GetTag(tagName, realQueryValue) ) {
194                     if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, realFilterValue, compareType) ) {
195                         tagFilter.Value = realFilterValue;
196                         tagFilter.Type  = compareType;
197                         keepAlignment   = tagFilter.check(realQueryValue);
198                     }
199                 }
200                 break;
201
202             // string tag type
203             case 'A':
204             case 'Z':
205             case 'H':
206                 if ( al.GetTag(tagName, stringQueryValue) ) {
207                     if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, stringFilterValue, compareType) ) {
208                         tagFilter.Value = stringFilterValue;
209                         tagFilter.Type  = compareType;
210                         keepAlignment   = tagFilter.check(stringQueryValue);
211                     }
212                 }
213                 break;
214
215             // unknown tag type
216             default :
217                 keepAlignment = false;
218         }
219
220         return keepAlignment;
221     }
222 };    
223     
224 } // namespace BamTools
225   
226 // ---------------------------------------------
227 // FilterSettings implementation
228
229 struct FilterTool::FilterSettings {
230
231     // ----------------------------------
232     // IO opts
233
234     // flags
235     bool HasInputBamFilename;
236     bool HasOutputBamFilename;
237     bool HasRegion;
238     bool HasScriptFilename;
239     bool IsForceCompression;
240
241     // filenames
242     vector<string> InputFiles;
243     string OutputFilename;
244     string Region;
245     string ScriptFilename;
246
247     // -----------------------------------
248     // General filter opts
249
250     // flags
251     bool HasAlignmentFlagFilter;
252     bool HasInsertSizeFilter;
253     bool HasMapQualityFilter;
254     bool HasNameFilter;
255     bool HasQueryBasesFilter;
256     bool HasTagFilter; //(s)
257
258     // filters
259     string AlignmentFlagFilter;
260     string InsertSizeFilter;
261     string NameFilter;
262     string MapQualityFilter;
263     string QueryBasesFilter;
264     string TagFilter;  // support multiple ?
265
266     // -----------------------------------
267     // AlignmentFlag filter opts
268
269     // flags
270     bool HasIsDuplicateFilter;
271     bool HasIsFailedQCFilter;
272     bool HasIsFirstMateFilter;
273     bool HasIsMappedFilter;
274     bool HasIsMateMappedFilter;
275     bool HasIsMateReverseStrandFilter;
276     bool HasIsPairedFilter;
277     bool HasIsPrimaryAlignmentFilter;
278     bool HasIsProperPairFilter;
279     bool HasIsReverseStrandFilter;
280     bool HasIsSecondMateFilter;
281
282     // filters
283     string IsDuplicateFilter;
284     string IsFailedQCFilter;
285     string IsFirstMateFilter;
286     string IsMappedFilter;
287     string IsMateMappedFilter;
288     string IsMateReverseStrandFilter;
289     string IsPairedFilter;
290     string IsPrimaryAlignmentFilter;
291     string IsProperPairFilter;
292     string IsReverseStrandFilter;
293     string IsSecondMateFilter;
294
295     // ---------------------------------
296     // constructor
297
298     FilterSettings(void)
299         : HasInputBamFilename(false)
300         , HasOutputBamFilename(false)
301         , HasRegion(false)
302         , HasScriptFilename(false)
303         , IsForceCompression(false)
304         , OutputFilename(Options::StandardOut())
305         , HasAlignmentFlagFilter(false)
306         , HasInsertSizeFilter(false)
307         , HasMapQualityFilter(false)
308         , HasNameFilter(false)
309         , HasQueryBasesFilter(false)
310         , HasTagFilter(false)
311         , HasIsDuplicateFilter(false)
312         , HasIsFailedQCFilter(false)
313         , HasIsFirstMateFilter(false)
314         , HasIsMappedFilter(false)
315         , HasIsMateMappedFilter(false)
316         , HasIsMateReverseStrandFilter(false)
317         , HasIsPairedFilter(false)
318         , HasIsPrimaryAlignmentFilter(false)
319         , HasIsProperPairFilter(false)
320         , HasIsReverseStrandFilter(false)
321         , HasIsSecondMateFilter(false)
322         , IsDuplicateFilter(TRUE_STR)
323         , IsFailedQCFilter(TRUE_STR)
324         , IsFirstMateFilter(TRUE_STR)
325         , IsMappedFilter(TRUE_STR)
326         , IsMateMappedFilter(TRUE_STR)
327         , IsMateReverseStrandFilter(TRUE_STR)
328         , IsPairedFilter(TRUE_STR)
329         , IsPrimaryAlignmentFilter(TRUE_STR)
330         , IsProperPairFilter(TRUE_STR)
331         , IsReverseStrandFilter(TRUE_STR)
332         , IsSecondMateFilter(TRUE_STR)
333     { }
334 };
335
336 // ---------------------------------------------
337 // FilterToolPrivate declaration
338
339 class FilterTool::FilterToolPrivate {
340       
341     // ctor & dtor
342     public:
343         FilterToolPrivate(FilterTool::FilterSettings* settings);
344         ~FilterToolPrivate(void);  
345         
346     // 'public' interface
347     public:
348         bool Run(void);
349         
350     // internal methods
351     private:
352         bool AddPropertyTokensToFilter(const string& filterName, const map<string, string>& propertyTokens);
353         bool CheckAlignment(const BamAlignment& al);
354         const string GetScriptContents(void);
355         void InitProperties(void);
356         bool ParseCommandLine(void);
357         bool ParseFilterObject(const string& filterName, const Json::Value& filterObject);
358         bool ParseScript(void);
359         bool SetupFilters(void);
360         
361     // data members
362     private:
363         vector<string> m_propertyNames;
364         FilterTool::FilterSettings* m_settings;
365         FilterEngine<BamAlignmentChecker> m_filterEngine;
366 };
367  
368 // ---------------------------------------------
369 // FilterToolPrivate implementation
370   
371 // constructor  
372 FilterTool::FilterToolPrivate::FilterToolPrivate(FilterTool::FilterSettings* settings)
373     : m_settings(settings)
374 { }  
375   
376 // destructor
377 FilterTool::FilterToolPrivate::~FilterToolPrivate(void) { }
378
379 bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filterName,
380                                                               const map<string,
381                                                               string>& propertyTokens)
382 {
383     // dummy temp values for token parsing
384     bool boolValue;
385     int32_t int32Value;
386     uint16_t uint16Value;
387     uint32_t uint32Value;
388     string stringValue;
389     PropertyFilterValue::ValueCompareType type;
390   
391     // iterate over property token map
392     map<string, string>::const_iterator mapIter = propertyTokens.begin();
393     map<string, string>::const_iterator mapEnd  = propertyTokens.end();
394     for ( ; mapIter != mapEnd; ++mapIter ) {
395       
396         const string& propertyName = (*mapIter).first;
397         const string& token        = (*mapIter).second;
398         
399         // ------------------------------
400         // convert token to value & compare type 
401         // then add to filter engine
402         
403         // bool conversion
404         if ( propertyName == ISDUPLICATE_PROPERTY ||
405              propertyName == ISFAILEDQC_PROPERTY ||
406              propertyName == ISFIRSTMATE_PROPERTY ||
407              propertyName == ISMAPPED_PROPERTY ||
408              propertyName == ISMATEMAPPED_PROPERTY ||
409              propertyName == ISMATEREVERSESTRAND_PROPERTY ||
410              propertyName == ISPAIRED_PROPERTY ||
411              propertyName == ISPRIMARYALIGNMENT_PROPERTY ||
412              propertyName == ISPROPERPAIR_PROPERTY ||
413              propertyName == ISREVERSESTRAND_PROPERTY ||
414              propertyName == ISSECONDMATE_PROPERTY
415            ) 
416         {
417             FilterEngine<BamAlignmentChecker>::parseToken(token, boolValue, type);
418             m_filterEngine.setProperty(filterName, propertyName, boolValue, type);
419         }
420         
421         // int32_t conversion
422         else if ( propertyName == INSERTSIZE_PROPERTY ||
423                   propertyName == MATEPOSITION_PROPERTY ||
424                   propertyName == POSITION_PROPERTY 
425                 ) 
426         {
427             FilterEngine<BamAlignmentChecker>::parseToken(token, int32Value, type);
428             m_filterEngine.setProperty(filterName, propertyName, int32Value, type);
429         }
430         
431         // uint16_t conversion
432         else if ( propertyName == MAPQUALITY_PROPERTY )
433         {
434             FilterEngine<BamAlignmentChecker>::parseToken(token, uint16Value, type);
435             m_filterEngine.setProperty(filterName, propertyName, uint16Value, type);
436         }
437         
438         // uint32_t conversion
439         else if ( propertyName == ALIGNMENTFLAG_PROPERTY )
440         {
441             FilterEngine<BamAlignmentChecker>::parseToken(token, uint32Value, type);
442             m_filterEngine.setProperty(filterName, propertyName, uint32Value, type);
443         }
444         
445         // string conversion
446         else if ( propertyName == CIGAR_PROPERTY || 
447                   propertyName == MATEREFERENCE_PROPERTY ||
448                   propertyName == NAME_PROPERTY ||
449                   propertyName == QUERYBASES_PROPERTY ||
450                   propertyName == REFERENCE_PROPERTY 
451                 ) 
452         {
453             FilterEngine<BamAlignmentChecker>::parseToken(token, stringValue, type);
454             m_filterEngine.setProperty(filterName, propertyName, stringValue, type);
455         }
456       
457     else if ( propertyName == TAG_PROPERTY ) {
458             // this will be stored directly as the TAG:VALUE token 
459             // (VALUE may contain compare ops, will be parsed out later)
460             m_filterEngine.setProperty(filterName, propertyName, token, PropertyFilterValue::EXACT);
461         }
462       
463         // else unknown property 
464         else {
465             cerr << "bamtools filter ERROR: unknown property - " << propertyName << endl;
466             return false;
467         }
468     }
469     return true;
470 }
471
472 bool FilterTool::FilterToolPrivate::CheckAlignment(const BamAlignment& al) {
473     return m_filterEngine.check(al);
474 }
475
476 const string FilterTool::FilterToolPrivate::GetScriptContents(void) {
477   
478     // open script for reading
479     FILE* inFile = fopen(m_settings->ScriptFilename.c_str(), "rb");
480     if ( !inFile ) {
481         cerr << "bamtools filter ERROR: could not open script: "
482              << m_settings->ScriptFilename << " for reading" << endl;
483         return false;
484     }
485     
486     // read in entire script contents  
487     char buffer[1024];
488     ostringstream docStream("");
489     while ( true ) {
490         
491         // peek ahead, make sure there is data available
492         char ch = fgetc(inFile);
493         ungetc(ch, inFile);
494         if( feof(inFile) ) break;       
495         
496         // read next block of data
497         if ( fgets(buffer, 1024, inFile) == 0 ) {
498             cerr << "bamtools filter ERROR: could not read script contents" << endl;
499             return false;
500         }
501         
502         docStream << buffer;
503     }
504     
505     // close script file
506     fclose(inFile);
507     
508     // import buffer contents to document, return
509     string document = docStream.str();
510     return document;
511 }
512
513 void FilterTool::FilterToolPrivate::InitProperties(void) {
514   
515     // store property names in vector 
516     m_propertyNames.push_back(ALIGNMENTFLAG_PROPERTY);
517     m_propertyNames.push_back(CIGAR_PROPERTY);
518     m_propertyNames.push_back(INSERTSIZE_PROPERTY);
519     m_propertyNames.push_back(ISDUPLICATE_PROPERTY);
520     m_propertyNames.push_back(ISFAILEDQC_PROPERTY);
521     m_propertyNames.push_back(ISFIRSTMATE_PROPERTY);
522     m_propertyNames.push_back(ISMAPPED_PROPERTY);
523     m_propertyNames.push_back(ISMATEMAPPED_PROPERTY);
524     m_propertyNames.push_back(ISMATEREVERSESTRAND_PROPERTY);
525     m_propertyNames.push_back(ISPAIRED_PROPERTY);
526     m_propertyNames.push_back(ISPRIMARYALIGNMENT_PROPERTY);
527     m_propertyNames.push_back(ISPROPERPAIR_PROPERTY);
528     m_propertyNames.push_back(ISREVERSESTRAND_PROPERTY);
529     m_propertyNames.push_back(ISSECONDMATE_PROPERTY);
530     m_propertyNames.push_back(MAPQUALITY_PROPERTY);
531     m_propertyNames.push_back(MATEPOSITION_PROPERTY);
532     m_propertyNames.push_back(MATEREFERENCE_PROPERTY);
533     m_propertyNames.push_back(NAME_PROPERTY);
534     m_propertyNames.push_back(POSITION_PROPERTY);
535     m_propertyNames.push_back(QUERYBASES_PROPERTY);
536     m_propertyNames.push_back(REFERENCE_PROPERTY);
537     m_propertyNames.push_back(TAG_PROPERTY);
538     
539     // add vector contents to FilterEngine<BamAlignmentChecker>
540     vector<string>::const_iterator propertyNameIter = m_propertyNames.begin();
541     vector<string>::const_iterator propertyNameEnd  = m_propertyNames.end();
542     for ( ; propertyNameIter != propertyNameEnd; ++propertyNameIter )
543         m_filterEngine.addProperty((*propertyNameIter));
544 }
545
546 bool FilterTool::FilterToolPrivate::ParseCommandLine(void) {
547   
548     // add a rule set to filter engine
549     const string CMD = "COMMAND_LINE";
550     m_filterEngine.addFilter(CMD);
551
552     // map property names to command line args
553     map<string, string> propertyTokens;
554     if ( m_settings->HasAlignmentFlagFilter )       propertyTokens.insert( make_pair(ALIGNMENTFLAG_PROPERTY,       m_settings->AlignmentFlagFilter) );
555     if ( m_settings->HasInsertSizeFilter )          propertyTokens.insert( make_pair(INSERTSIZE_PROPERTY,          m_settings->InsertSizeFilter) );
556     if ( m_settings->HasIsDuplicateFilter )         propertyTokens.insert( make_pair(ISDUPLICATE_PROPERTY,         m_settings->IsDuplicateFilter) );
557     if ( m_settings->HasIsFailedQCFilter )          propertyTokens.insert( make_pair(ISFAILEDQC_PROPERTY,          m_settings->IsFailedQCFilter) );
558     if ( m_settings->HasIsFirstMateFilter )         propertyTokens.insert( make_pair(ISFIRSTMATE_PROPERTY,         m_settings->IsFirstMateFilter) );
559     if ( m_settings->HasIsMappedFilter )            propertyTokens.insert( make_pair(ISMAPPED_PROPERTY,            m_settings->IsMappedFilter) );
560     if ( m_settings->HasIsMateMappedFilter )        propertyTokens.insert( make_pair(ISMATEMAPPED_PROPERTY,        m_settings->IsMateMappedFilter) );
561     if ( m_settings->HasIsMateReverseStrandFilter ) propertyTokens.insert( make_pair(ISMATEREVERSESTRAND_PROPERTY, m_settings->IsMateReverseStrandFilter) );
562     if ( m_settings->HasIsPairedFilter )            propertyTokens.insert( make_pair(ISPAIRED_PROPERTY,            m_settings->IsPairedFilter) );
563     if ( m_settings->HasIsPrimaryAlignmentFilter )  propertyTokens.insert( make_pair(ISPRIMARYALIGNMENT_PROPERTY,  m_settings->IsPrimaryAlignmentFilter) );
564     if ( m_settings->HasIsProperPairFilter )        propertyTokens.insert( make_pair(ISPROPERPAIR_PROPERTY,        m_settings->IsProperPairFilter) );
565     if ( m_settings->HasIsReverseStrandFilter )     propertyTokens.insert( make_pair(ISREVERSESTRAND_PROPERTY,     m_settings->IsReverseStrandFilter) );
566     if ( m_settings->HasIsSecondMateFilter )        propertyTokens.insert( make_pair(ISSECONDMATE_PROPERTY,        m_settings->IsSecondMateFilter) );
567     if ( m_settings->HasMapQualityFilter )          propertyTokens.insert( make_pair(MAPQUALITY_PROPERTY,          m_settings->MapQualityFilter) );
568     if ( m_settings->HasNameFilter )                propertyTokens.insert( make_pair(NAME_PROPERTY,                m_settings->NameFilter) );
569     if ( m_settings->HasQueryBasesFilter )          propertyTokens.insert( make_pair(QUERYBASES_PROPERTY,          m_settings->QueryBasesFilter) );
570     if ( m_settings->HasTagFilter )                 propertyTokens.insert( make_pair(TAG_PROPERTY,                 m_settings->TagFilter) );
571     
572     // send add these properties to filter set "COMMAND_LINE"
573     return AddPropertyTokensToFilter(CMD, propertyTokens);
574 }
575
576 bool FilterTool::FilterToolPrivate::ParseFilterObject(const string& filterName, const Json::Value& filterObject) {
577   
578     // filter object parsing variables
579     Json::Value null(Json::nullValue);
580     Json::Value propertyValue;
581     
582     // store results
583     map<string, string> propertyTokens;
584     
585     // iterate over known properties
586     vector<string>::const_iterator propertyNameIter = m_propertyNames.begin();
587     vector<string>::const_iterator propertyNameEnd  = m_propertyNames.end();
588     for ( ; propertyNameIter != propertyNameEnd; ++propertyNameIter ) {
589         const string& propertyName = (*propertyNameIter);
590         
591         // if property defined in filter, add to token list
592         propertyValue = filterObject.get(propertyName, null);
593         if ( propertyValue != null ) 
594             propertyTokens.insert( make_pair(propertyName, propertyValue.asString()) );
595     }
596   
597     // add this filter to engin
598     m_filterEngine.addFilter(filterName);
599   
600     // add token list to this filter
601     return AddPropertyTokensToFilter(filterName, propertyTokens);
602 }
603
604 bool FilterTool::FilterToolPrivate::ParseScript(void) {
605   
606     // read in script contents from file
607     const string document = GetScriptContents();
608     
609     // set up JsonCPP reader and attempt to parse script
610     Json::Value root;
611     Json::Reader reader;
612     if ( !reader.parse(document, root) ) {
613         // use built-in error reporting mechanism to alert user what was wrong with the script
614         cerr  << "bamtools filter ERROR: failed to parse script - see error message(s) below" << endl
615               << reader.getFormatedErrorMessages();
616         return false;     
617     }
618
619     // initialize return status
620     bool success = true;
621     
622     // see if root object contains multiple filters
623     const Json::Value filters = root["filters"];
624     if ( !filters.isNull() ) {
625       
626         // iterate over any filters found
627         int filterIndex = 0;
628         Json::Value::const_iterator filtersIter = filters.begin();
629         Json::Value::const_iterator filtersEnd  = filters.end();
630         for ( ; filtersIter != filtersEnd; ++filtersIter, ++filterIndex ) {
631             Json::Value filter = (*filtersIter);
632             
633             // convert filter index to string
634             string filterName;
635             
636             // if id tag supplied
637             const Json::Value id = filter["id"];
638             if ( !id.isNull() ) 
639                 filterName = id.asString();
640             
641             // use array index 
642             else {
643                 stringstream convert;
644                 convert << filterIndex;
645                 filterName = convert.str();
646             }
647             
648             // create & parse filter 
649             success &= ParseFilterObject(filterName, filter);
650         }
651         
652         // see if user defined a "rule" for these filters
653         // otherwise, use filter engine's default rule behavior
654         string ruleString("");
655         const Json::Value rule = root["rule"];
656         if ( rule.isString() )
657             ruleString = rule.asString();
658         m_filterEngine.setRule(ruleString);
659           
660         // return success/fail
661         return success;
662     } 
663     
664     // otherwise, root is the only filter (just contains properties)
665     // create & parse filter named "ROOT"
666     else success = ParseFilterObject("ROOT", root);
667     
668     // return success/failure
669     return success;
670 }
671
672 bool FilterTool::FilterToolPrivate::Run(void) {
673     
674     // set to default input if none provided
675     if ( !m_settings->HasInputBamFilename ) 
676         m_settings->InputFiles.push_back(Options::StandardIn());
677
678     // initialize defined properties & user-specified filters
679     // quit if failed
680     if ( !SetupFilters() ) return false;
681
682     // open reader without index
683     BamMultiReader reader;
684     if ( !reader.Open(m_settings->InputFiles) ) {
685         cerr << "bamtools filter ERROR: could not open input files for reading." << endl;
686         return false;
687     }
688
689     // retrieve reader header & reference data
690     const string headerText = reader.GetHeaderText();
691     filterToolReferences = reader.GetReferenceData();
692     
693     // determine compression mode for BamWriter
694     bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() &&
695                               !m_settings->IsForceCompression );
696     BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
697     if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed;
698
699     // open BamWriter
700     BamWriter writer;
701     writer.SetCompressionMode(compressionMode);
702     if ( !writer.Open(m_settings->OutputFilename, headerText, filterToolReferences) ) {
703         cerr << "bamtools filter ERROR: could not open " << m_settings->OutputFilename << " for writing." << endl;
704         reader.Close();
705         return false;
706     }
707
708     // if no region specified, filter entire file 
709     BamAlignment al;
710     if ( !m_settings->HasRegion ) {
711         while ( reader.GetNextAlignment(al) ) {
712             if ( CheckAlignment(al) ) 
713                 writer.SaveAlignment(al);
714         }
715     }
716     
717     // otherwise attempt to use region as constraint
718     else {
719         
720         // if region string parses OK
721         BamRegion region;
722         if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
723
724             // attempt to find index files
725             reader.LocateIndexes();
726
727             // if index data available for all BAM files, we can use SetRegion
728             if ( reader.HasIndexes() ) {
729
730                 // attempt to use SetRegion(), if failed report error
731                 if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) {
732                     cerr << "bamtools filter ERROR: set region failed. Check that REGION describes a valid range" << endl;
733                     reader.Close();
734                     return false;
735                 } 
736               
737                 // everything checks out, just iterate through specified region, filtering alignments
738                 while ( reader.GetNextAlignment(al) )
739                     if ( CheckAlignment(al) ) 
740                         writer.SaveAlignment(al);
741                 }
742             
743             // no index data available, we have to iterate through until we
744             // find overlapping alignments
745             else {
746                 while ( reader.GetNextAlignment(al) ) {
747                     if ( (al.RefID >= region.LeftRefID)  && ((al.Position + al.Length) >= region.LeftPosition) &&
748                          (al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) ) 
749                     {
750                         if ( CheckAlignment(al) ) 
751                             writer.SaveAlignment(al);
752                     }
753                 }
754             }
755         } 
756         
757         // error parsing REGION string
758         else {
759             cerr << "bamtools filter ERROR: could not parse REGION: " << m_settings->Region << endl;
760             cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid"
761                  << endl;
762             reader.Close();
763             return false;
764         }
765     }
766
767     // clean up & exit
768     reader.Close();
769     writer.Close();
770     return true;
771 }
772
773 bool FilterTool::FilterToolPrivate::SetupFilters(void) {
774   
775     // set up filter engine with supported properties
776     InitProperties();
777     
778     // parse script for filter rules, if given
779     if ( m_settings->HasScriptFilename )
780         return ParseScript();
781     
782     // otherwise check command line for filters
783     else return ParseCommandLine();
784 }
785
786 // ---------------------------------------------
787 // FilterTool implementation
788
789 FilterTool::FilterTool(void)
790     : AbstractTool()
791     , m_settings(new FilterSettings)
792     , m_impl(0)
793 {
794     // set program details
795     Options::SetProgramInfo("bamtools filter", "filters BAM file(s)", "[-in <filename> -in <filename> ...] [-out <filename> | [-forceCompression]] [-region <REGION>] [ [-script <filename] | [filterOptions] ]");
796
797     OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
798     Options::AddValueOption("-in",     "BAM filename", "the input BAM file(s)", "", m_settings->HasInputBamFilename,  m_settings->InputFiles,     IO_Opts, Options::StandardIn());
799     Options::AddValueOption("-out",    "BAM filename", "the output BAM file",   "", m_settings->HasOutputBamFilename, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
800     Options::AddValueOption("-region", "REGION",       "only read data from this genomic region (see documentation for more details)", "", m_settings->HasRegion, m_settings->Region, IO_Opts);
801     Options::AddValueOption("-script", "filename",     "the filter script file (see documentation for more details)", "", m_settings->HasScriptFilename, m_settings->ScriptFilename, IO_Opts);
802     Options::AddOption("-forceCompression", "if results are sent to stdout (like when piping to another tool), default behavior is to leave output uncompressed. Use this flag to override and force compression", m_settings->IsForceCompression, IO_Opts);
803
804     OptionGroup* FilterOpts = Options::CreateOptionGroup("General Filters");
805     Options::AddValueOption("-alignmentFlag", "int",        "keep reads with this *exact* alignment flag (for more detailed queries, see below)", "", m_settings->HasAlignmentFlagFilter, m_settings->AlignmentFlagFilter, FilterOpts);
806     Options::AddValueOption("-insertSize",    "int",        "keep reads with insert size that mathces pattern",             "", m_settings->HasInsertSizeFilter, m_settings->InsertSizeFilter, FilterOpts);
807     Options::AddValueOption("-mapQuality",    "[0-255]",    "keep reads with map quality that matches pattern",             "", m_settings->HasMapQualityFilter, m_settings->MapQualityFilter, FilterOpts);
808     Options::AddValueOption("-name",          "string",     "keep reads with name that matches pattern",                    "", m_settings->HasNameFilter,       m_settings->NameFilter,       FilterOpts);
809     Options::AddValueOption("-queryBases",    "string",     "keep reads with motif that mathces pattern",                   "", m_settings->HasQueryBasesFilter, m_settings->QueryBasesFilter, FilterOpts);
810     Options::AddValueOption("-tag",           "TAG:VALUE",  "keep reads with this key=>value pair",                         "", m_settings->HasTagFilter,        m_settings->TagFilter,        FilterOpts);
811
812     OptionGroup* AlignmentFlagOpts = Options::CreateOptionGroup("Alignment Flag Filters");
813     Options::AddValueOption("-isDuplicate",         "true/false", "keep only alignments that are marked as duplicate?", "", m_settings->HasIsDuplicateFilter,         m_settings->IsDuplicateFilter,         AlignmentFlagOpts, TRUE_STR);
814     Options::AddValueOption("-isFailedQC",          "true/false", "keep only alignments that failed QC?",               "", m_settings->HasIsFailedQCFilter,          m_settings->IsFailedQCFilter,          AlignmentFlagOpts, TRUE_STR);
815     Options::AddValueOption("-isFirstMate",         "true/false", "keep only alignments marked as first mate?",         "", m_settings->HasIsFirstMateFilter,         m_settings->IsFirstMateFilter,         AlignmentFlagOpts, TRUE_STR);
816     Options::AddValueOption("-isMapped",            "true/false", "keep only alignments that were mapped?",             "", m_settings->HasIsMappedFilter,            m_settings->IsMappedFilter,            AlignmentFlagOpts, TRUE_STR);
817     Options::AddValueOption("-isMateMapped",        "true/false", "keep only alignments with mates that mapped",        "", m_settings->HasIsMateMappedFilter,        m_settings->IsMateMappedFilter,        AlignmentFlagOpts, TRUE_STR);
818     Options::AddValueOption("-isMateReverseStrand", "true/false", "keep only alignments with mate on reverese strand?", "", m_settings->HasIsMateReverseStrandFilter, m_settings->IsMateReverseStrandFilter, AlignmentFlagOpts, TRUE_STR);
819     Options::AddValueOption("-isPaired",            "true/false", "keep only alignments that were sequenced as paired?","", m_settings->HasIsPairedFilter,            m_settings->IsPairedFilter,            AlignmentFlagOpts, TRUE_STR);
820     Options::AddValueOption("-isPrimaryAlignment",  "true/false", "keep only alignments marked as primary?",            "", m_settings->HasIsPrimaryAlignmentFilter,  m_settings->IsPrimaryAlignmentFilter,  AlignmentFlagOpts, TRUE_STR);
821     Options::AddValueOption("-isProperPair",        "true/false", "keep only alignments that passed PE resolution?",    "", m_settings->HasIsProperPairFilter,        m_settings->IsProperPairFilter,        AlignmentFlagOpts, TRUE_STR);
822     Options::AddValueOption("-isReverseStrand",     "true/false", "keep only alignments on reverse strand?",            "", m_settings->HasIsReverseStrandFilter,     m_settings->IsReverseStrandFilter,     AlignmentFlagOpts, TRUE_STR);
823     Options::AddValueOption("-isSecondMate",        "true/false", "keep only alignments marked as second mate?",        "", m_settings->HasIsSecondMateFilter,        m_settings->IsSecondMateFilter,        AlignmentFlagOpts, TRUE_STR);
824 }
825
826 FilterTool::~FilterTool(void) {
827
828     delete m_settings;
829     m_settings = 0;
830
831     delete m_impl;
832     m_impl = 0;
833 }
834
835 int FilterTool::Help(void) {
836     Options::DisplayHelp();
837     return 0;
838 }
839
840 int FilterTool::Run(int argc, char* argv[]) {
841
842     // parse command line arguments
843     Options::Parse(argc, argv, 1);
844
845     // initialize FilterTool with settings
846     m_impl = new FilterToolPrivate(m_settings);
847
848     // run FilterTool, return success/fail
849     if ( m_impl->Run() )
850         return 0;
851     else
852         return 1;
853 }