]> git.donarmstrong.com Git - bamtools.git/blob - src/toolkit/bamtools_filter.cpp
Fixed: error in FilterTool - mishandling of ASCII tag values
[bamtools.git] / src / toolkit / bamtools_filter.cpp
1 // ***************************************************************************
2 // bamtools_filter.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 27 April 2013
6 // ---------------------------------------------------------------------------
7 // Filters BAM file(s) according to some user-specified criteria
8 // ***************************************************************************
9
10 #include "bamtools_filter.h"
11
12 #include <api/BamMultiReader.h>
13 #include <api/BamWriter.h>
14 #include <utils/bamtools_filter_engine.h>
15 #include <utils/bamtools_options.h>
16 #include <utils/bamtools_utilities.h>
17 using namespace BamTools;
18
19 #include <jsoncpp/json.h>
20 using namespace Json;
21
22 #include <cstdio>
23 #include <fstream>
24 #include <iostream>
25 #include <sstream>
26 #include <string>
27 #include <vector>
28 using namespace std;
29   
30 namespace BamTools {
31   
32 // -------------------------------  
33 // string literal constants  
34
35 // property names
36 const string ALIGNMENTFLAG_PROPERTY       = "alignmentFlag";
37 const string CIGAR_PROPERTY               = "cigar";
38 const string INSERTSIZE_PROPERTY          = "insertSize";
39 const string ISDUPLICATE_PROPERTY         = "isDuplicate";
40 const string ISFAILEDQC_PROPERTY          = "isFailedQC";
41 const string ISFIRSTMATE_PROPERTY         = "isFirstMate";
42 const string ISMAPPED_PROPERTY            = "isMapped";
43 const string ISMATEMAPPED_PROPERTY        = "isMateMapped";
44 const string ISMATEREVERSESTRAND_PROPERTY = "isMateReverseStrand";
45 const string ISPAIRED_PROPERTY            = "isPaired";
46 const string ISPRIMARYALIGNMENT_PROPERTY  = "isPrimaryAlignment";
47 const string ISPROPERPAIR_PROPERTY        = "isProperPair";
48 const string ISREVERSESTRAND_PROPERTY     = "isReverseStrand";
49 const string ISSECONDMATE_PROPERTY        = "isSecondMate";
50 const string ISSINGLETON_PROPERTY         = "isSingleton";
51 const string MAPQUALITY_PROPERTY          = "mapQuality";
52 const string MATEPOSITION_PROPERTY        = "matePosition";
53 const string MATEREFERENCE_PROPERTY       = "mateReference";
54 const string NAME_PROPERTY                = "name";
55 const string POSITION_PROPERTY            = "position";
56 const string QUERYBASES_PROPERTY          = "queryBases";
57 const string REFERENCE_PROPERTY           = "reference";
58 const string TAG_PROPERTY                 = "tag";
59
60 // boolalpha
61 const string TRUE_STR  = "true";
62 const string FALSE_STR = "false";
63     
64 RefVector filterToolReferences;    
65     
66 struct BamAlignmentChecker {
67     bool check(const PropertyFilter& filter, const BamAlignment& al) {
68       
69         bool keepAlignment = true;
70         const PropertyMap& properties = filter.Properties;
71         PropertyMap::const_iterator propertyIter = properties.begin();
72         PropertyMap::const_iterator propertyEnd  = properties.end();
73         for ( ; propertyIter != propertyEnd; ++propertyIter ) {
74           
75             // check alignment data field depending on propertyName
76             const string& propertyName = (*propertyIter).first;
77             const PropertyFilterValue& valueFilter = (*propertyIter).second;
78             
79             if      ( propertyName == ALIGNMENTFLAG_PROPERTY )  keepAlignment &= valueFilter.check(al.AlignmentFlag);
80             else if ( propertyName == CIGAR_PROPERTY ) {
81                 stringstream cigarSs;
82                 const vector<CigarOp>& cigarData = al.CigarData;
83                 if ( !cigarData.empty() ) {
84                     vector<CigarOp>::const_iterator cigarBegin = cigarData.begin();
85                     vector<CigarOp>::const_iterator cigarIter = cigarBegin;
86                     vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
87                     for ( ; cigarIter != cigarEnd; ++cigarIter ) {
88                         const CigarOp& op = (*cigarIter);
89                         cigarSs << op.Length << op.Type;
90                     }
91                     keepAlignment &= valueFilter.check(cigarSs.str());
92                 }
93             }
94             else if ( propertyName == INSERTSIZE_PROPERTY )           keepAlignment &= valueFilter.check(al.InsertSize);
95             else if ( propertyName == ISDUPLICATE_PROPERTY )          keepAlignment &= valueFilter.check(al.IsDuplicate());
96             else if ( propertyName == ISFAILEDQC_PROPERTY )           keepAlignment &= valueFilter.check(al.IsFailedQC());
97             else if ( propertyName == ISFIRSTMATE_PROPERTY )          keepAlignment &= valueFilter.check(al.IsFirstMate());
98             else if ( propertyName == ISMAPPED_PROPERTY )             keepAlignment &= valueFilter.check(al.IsMapped());
99             else if ( propertyName == ISMATEMAPPED_PROPERTY )         keepAlignment &= valueFilter.check(al.IsMateMapped());
100             else if ( propertyName == ISMATEREVERSESTRAND_PROPERTY )  keepAlignment &= valueFilter.check(al.IsMateReverseStrand());
101             else if ( propertyName == ISPAIRED_PROPERTY )             keepAlignment &= valueFilter.check(al.IsPaired());
102             else if ( propertyName == ISPRIMARYALIGNMENT_PROPERTY )   keepAlignment &= valueFilter.check(al.IsPrimaryAlignment());
103             else if ( propertyName == ISPROPERPAIR_PROPERTY )         keepAlignment &= valueFilter.check(al.IsProperPair());
104             else if ( propertyName == ISREVERSESTRAND_PROPERTY )      keepAlignment &= valueFilter.check(al.IsReverseStrand());
105             else if ( propertyName == ISSECONDMATE_PROPERTY )         keepAlignment &= valueFilter.check(al.IsSecondMate());
106             else if ( propertyName == ISSINGLETON_PROPERTY ) {
107                 const bool isSingleton = al.IsPaired() && al.IsMapped() && !al.IsMateMapped();
108                 keepAlignment &= valueFilter.check(isSingleton);
109             }
110             else if ( propertyName == MAPQUALITY_PROPERTY )           keepAlignment &= valueFilter.check(al.MapQuality);
111             else if ( propertyName == MATEPOSITION_PROPERTY )         keepAlignment &= ( al.IsPaired() && al.IsMateMapped() && valueFilter.check(al.MateRefID) );
112             else if ( propertyName == MATEREFERENCE_PROPERTY ) {
113                 if ( !al.IsPaired() || !al.IsMateMapped() ) return false;
114                 BAMTOOLS_ASSERT_MESSAGE( (al.MateRefID>=0 && (al.MateRefID<(int)filterToolReferences.size())), "Invalid MateRefID");
115                 const string& refName = filterToolReferences.at(al.MateRefID).RefName;
116                 keepAlignment &= valueFilter.check(refName);
117             }
118             else if ( propertyName == NAME_PROPERTY )                 keepAlignment &= valueFilter.check(al.Name);
119             else if ( propertyName == POSITION_PROPERTY )             keepAlignment &= valueFilter.check(al.Position);
120             else if ( propertyName == QUERYBASES_PROPERTY )           keepAlignment &= valueFilter.check(al.QueryBases);
121             else if ( propertyName == REFERENCE_PROPERTY ) {
122                 BAMTOOLS_ASSERT_MESSAGE( (al.RefID>=0 && (al.RefID<(int)filterToolReferences.size())), "Invalid RefID");
123                 const string& refName = filterToolReferences.at(al.RefID).RefName;
124                 keepAlignment &= valueFilter.check(refName);
125             }
126             else if ( propertyName == TAG_PROPERTY ) keepAlignment &= checkAlignmentTag(valueFilter, al);
127             else BAMTOOLS_ASSERT_UNREACHABLE;
128             
129             // if alignment fails at ANY point, just quit and return false
130             if ( !keepAlignment ) return false;
131         }
132       
133         BAMTOOLS_ASSERT_MESSAGE( keepAlignment, "Error in BamAlignmentChecker... keepAlignment should be true here");
134         return keepAlignment;
135     }
136     
137     bool checkAlignmentTag(const PropertyFilterValue& valueFilter, const BamAlignment& al) {
138      
139         // ensure filter contains string data
140         Variant entireTagFilter = valueFilter.Value;
141         if ( !entireTagFilter.is_type<string>() ) return false;
142
143         // localize string from variant
144         const string& entireTagFilterString = entireTagFilter.get<string>();
145
146         // ensure we have at least "XX:x"
147         if ( entireTagFilterString.length() < 4 ) return false;
148
149         // get tagName & lookup in alignment
150         // if found, set tagType to tag type character
151         // if not found, return false
152         const string& tagName = entireTagFilterString.substr(0,2);
153         char tagType = '\0';
154         if ( !al.GetTagType(tagName, tagType) ) return false;
155
156         // remove tagName & ":" from beginning tagFilter
157         string tagFilterString = entireTagFilterString.substr(3);
158
159         // switch on tag type to set tag query value & parse filter token
160         int8_t   asciiFilterValue,   asciiQueryValue;
161         int32_t  intFilterValue,    intQueryValue;
162         uint32_t uintFilterValue,   uintQueryValue;
163         float    realFilterValue,   realQueryValue;
164         string   stringFilterValue, stringQueryValue;
165
166         PropertyFilterValue tagFilter;
167         PropertyFilterValue::ValueCompareType compareType;
168         bool keepAlignment = false;
169         switch (tagType) {
170
171             // ASCII tag type
172             case 'A':
173                 if ( al.GetTag(tagName, asciiQueryValue) ) {
174                     if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, asciiFilterValue, compareType) ) {
175                         tagFilter.Value = asciiFilterValue;
176                         tagFilter.Type  = compareType;
177                         keepAlignment   = tagFilter.check(asciiQueryValue);
178                     }
179                 }
180                 break;
181
182             // signed int tag type
183             case 'c' :
184             case 's' :
185             case 'i' :
186                 if ( al.GetTag(tagName, intQueryValue) ) {
187                     if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, intFilterValue, compareType) ) {
188                         tagFilter.Value = intFilterValue;
189                         tagFilter.Type  = compareType;
190                         keepAlignment   = tagFilter.check(intQueryValue);
191                     }
192                 }
193                 break;
194
195             // unsigned int tag type
196             case 'C' :
197             case 'S' :
198             case 'I' :
199                 if ( al.GetTag(tagName, uintQueryValue) ) {
200                     if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, uintFilterValue, compareType) ) {
201                         tagFilter.Value = uintFilterValue;
202                         tagFilter.Type  = compareType;
203                         keepAlignment   = tagFilter.check(uintQueryValue);
204                     }
205                 }
206                 break;
207
208             // 'real' tag type
209             case 'f' :
210                 if ( al.GetTag(tagName, realQueryValue) ) {
211                     if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, realFilterValue, compareType) ) {
212                         tagFilter.Value = realFilterValue;
213                         tagFilter.Type  = compareType;
214                         keepAlignment   = tagFilter.check(realQueryValue);
215                     }
216                 }
217                 break;
218
219             // string tag type
220
221             case 'Z':
222             case 'H':
223                 if ( al.GetTag(tagName, stringQueryValue) ) {
224                     if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, stringFilterValue, compareType) ) {
225                         tagFilter.Value = stringFilterValue;
226                         tagFilter.Type  = compareType;
227                         keepAlignment   = tagFilter.check(stringQueryValue);
228                     }
229                 }
230                 break;
231
232             // unknown tag type
233             default :
234                 keepAlignment = false;
235         }
236
237         return keepAlignment;
238     }
239 };    
240     
241 } // namespace BamTools
242   
243 // ---------------------------------------------
244 // FilterSettings implementation
245
246 struct FilterTool::FilterSettings {
247
248     // ----------------------------------
249     // IO opts
250
251     // flags
252     bool HasInput;
253     bool HasInputFilelist;
254     bool HasOutput;
255     bool HasRegion;
256     bool HasScript;
257     bool IsForceCompression;
258
259     // filenames
260     vector<string> InputFiles;
261     string InputFilelist;
262     string OutputFilename;
263     string Region;
264     string ScriptFilename;
265
266     // -----------------------------------
267     // General filter opts
268
269     // flags
270     bool HasAlignmentFlagFilter;
271     bool HasInsertSizeFilter;
272     bool HasMapQualityFilter;
273     bool HasNameFilter;
274     bool HasQueryBasesFilter;
275     bool HasTagFilter; //(s)
276
277     // filters
278     string AlignmentFlagFilter;
279     string InsertSizeFilter;
280     string NameFilter;
281     string MapQualityFilter;
282     string QueryBasesFilter;
283     string TagFilter;  // support multiple ?
284
285     // -----------------------------------
286     // AlignmentFlag filter opts
287
288     // flags
289     bool HasIsDuplicateFilter;
290     bool HasIsFailedQCFilter;
291     bool HasIsFirstMateFilter;
292     bool HasIsMappedFilter;
293     bool HasIsMateMappedFilter;
294     bool HasIsMateReverseStrandFilter;
295     bool HasIsPairedFilter;
296     bool HasIsPrimaryAlignmentFilter;
297     bool HasIsProperPairFilter;
298     bool HasIsReverseStrandFilter;
299     bool HasIsSecondMateFilter;
300     bool HasIsSingletonFilter;
301
302     // filters
303     string IsDuplicateFilter;
304     string IsFailedQCFilter;
305     string IsFirstMateFilter;
306     string IsMappedFilter;
307     string IsMateMappedFilter;
308     string IsMateReverseStrandFilter;
309     string IsPairedFilter;
310     string IsPrimaryAlignmentFilter;
311     string IsProperPairFilter;
312     string IsReverseStrandFilter;
313     string IsSecondMateFilter;
314     string IsSingletonFilter;
315
316     // ---------------------------------
317     // constructor
318
319     FilterSettings(void)
320         : HasInput(false)
321         , HasInputFilelist(false)
322         , HasOutput(false)
323         , HasRegion(false)
324         , HasScript(false)
325         , IsForceCompression(false)
326         , OutputFilename(Options::StandardOut())
327         , HasAlignmentFlagFilter(false)
328         , HasInsertSizeFilter(false)
329         , HasMapQualityFilter(false)
330         , HasNameFilter(false)
331         , HasQueryBasesFilter(false)
332         , HasTagFilter(false)
333         , HasIsDuplicateFilter(false)
334         , HasIsFailedQCFilter(false)
335         , HasIsFirstMateFilter(false)
336         , HasIsMappedFilter(false)
337         , HasIsMateMappedFilter(false)
338         , HasIsMateReverseStrandFilter(false)
339         , HasIsPairedFilter(false)
340         , HasIsPrimaryAlignmentFilter(false)
341         , HasIsProperPairFilter(false)
342         , HasIsReverseStrandFilter(false)
343         , HasIsSecondMateFilter(false)
344         , HasIsSingletonFilter(false)
345         , IsDuplicateFilter(TRUE_STR)
346         , IsFailedQCFilter(TRUE_STR)
347         , IsFirstMateFilter(TRUE_STR)
348         , IsMappedFilter(TRUE_STR)
349         , IsMateMappedFilter(TRUE_STR)
350         , IsMateReverseStrandFilter(TRUE_STR)
351         , IsPairedFilter(TRUE_STR)
352         , IsPrimaryAlignmentFilter(TRUE_STR)
353         , IsProperPairFilter(TRUE_STR)
354         , IsReverseStrandFilter(TRUE_STR)
355         , IsSecondMateFilter(TRUE_STR)
356         , IsSingletonFilter(TRUE_STR)
357     { }
358 };
359
360 // ---------------------------------------------
361 // FilterToolPrivate declaration
362
363 class FilterTool::FilterToolPrivate {
364       
365     // ctor & dtor
366     public:
367         FilterToolPrivate(FilterTool::FilterSettings* settings);
368         ~FilterToolPrivate(void);  
369         
370     // 'public' interface
371     public:
372         bool Run(void);
373         
374     // internal methods
375     private:
376         bool AddPropertyTokensToFilter(const string& filterName, const map<string, string>& propertyTokens);
377         bool CheckAlignment(const BamAlignment& al);
378         const string GetScriptContents(void);
379         void InitProperties(void);
380         bool ParseCommandLine(void);
381         bool ParseFilterObject(const string& filterName, const Json::Value& filterObject);
382         bool ParseScript(void);
383         bool SetupFilters(void);
384         
385     // data members
386     private:
387         vector<string> m_propertyNames;
388         FilterTool::FilterSettings* m_settings;
389         FilterEngine<BamAlignmentChecker> m_filterEngine;
390 };
391  
392 // ---------------------------------------------
393 // FilterToolPrivate implementation
394   
395 // constructor  
396 FilterTool::FilterToolPrivate::FilterToolPrivate(FilterTool::FilterSettings* settings)
397     : m_settings(settings)
398 { }  
399   
400 // destructor
401 FilterTool::FilterToolPrivate::~FilterToolPrivate(void) { }
402
403 bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filterName,
404                                                               const map<string,
405                                                               string>& propertyTokens)
406 {
407     // dummy temp values for token parsing
408     bool boolValue;
409     int32_t int32Value;
410     uint16_t uint16Value;
411     uint32_t uint32Value;
412     string stringValue;
413     PropertyFilterValue::ValueCompareType type;
414   
415     // iterate over property token map
416     map<string, string>::const_iterator mapIter = propertyTokens.begin();
417     map<string, string>::const_iterator mapEnd  = propertyTokens.end();
418     for ( ; mapIter != mapEnd; ++mapIter ) {
419       
420         const string& propertyName = (*mapIter).first;
421         const string& token        = (*mapIter).second;
422         
423         // ------------------------------
424         // convert token to value & compare type 
425         // then add to filter engine
426         
427         // bool conversion
428         if ( propertyName == ISDUPLICATE_PROPERTY ||
429              propertyName == ISFAILEDQC_PROPERTY ||
430              propertyName == ISFIRSTMATE_PROPERTY ||
431              propertyName == ISMAPPED_PROPERTY ||
432              propertyName == ISMATEMAPPED_PROPERTY ||
433              propertyName == ISMATEREVERSESTRAND_PROPERTY ||
434              propertyName == ISPAIRED_PROPERTY ||
435              propertyName == ISPRIMARYALIGNMENT_PROPERTY ||
436              propertyName == ISPROPERPAIR_PROPERTY ||
437              propertyName == ISREVERSESTRAND_PROPERTY ||
438              propertyName == ISSECONDMATE_PROPERTY ||
439              propertyName == ISSINGLETON_PROPERTY
440            ) 
441         {
442             FilterEngine<BamAlignmentChecker>::parseToken(token, boolValue, type);
443             m_filterEngine.setProperty(filterName, propertyName, boolValue, type);
444         }
445         
446         // int32_t conversion
447         else if ( propertyName == INSERTSIZE_PROPERTY ||
448                   propertyName == MATEPOSITION_PROPERTY ||
449                   propertyName == POSITION_PROPERTY 
450                 ) 
451         {
452             FilterEngine<BamAlignmentChecker>::parseToken(token, int32Value, type);
453             m_filterEngine.setProperty(filterName, propertyName, int32Value, type);
454         }
455         
456         // uint16_t conversion
457         else if ( propertyName == MAPQUALITY_PROPERTY )
458         {
459             FilterEngine<BamAlignmentChecker>::parseToken(token, uint16Value, type);
460             m_filterEngine.setProperty(filterName, propertyName, uint16Value, type);
461         }
462         
463         // uint32_t conversion
464         else if ( propertyName == ALIGNMENTFLAG_PROPERTY )
465         {
466             FilterEngine<BamAlignmentChecker>::parseToken(token, uint32Value, type);
467             m_filterEngine.setProperty(filterName, propertyName, uint32Value, type);
468         }
469         
470         // string conversion
471         else if ( propertyName == CIGAR_PROPERTY || 
472                   propertyName == MATEREFERENCE_PROPERTY ||
473                   propertyName == NAME_PROPERTY ||
474                   propertyName == QUERYBASES_PROPERTY ||
475                   propertyName == REFERENCE_PROPERTY 
476                 ) 
477         {
478             FilterEngine<BamAlignmentChecker>::parseToken(token, stringValue, type);
479             m_filterEngine.setProperty(filterName, propertyName, stringValue, type);
480         }
481       
482         else if ( propertyName == TAG_PROPERTY ) {
483             // this will be stored directly as the TAG:VALUE token
484             // (VALUE may contain compare ops, will be parsed out later)
485             m_filterEngine.setProperty(filterName, propertyName, token, PropertyFilterValue::EXACT);
486         }
487       
488         // else unknown property 
489         else {
490             cerr << "bamtools filter ERROR: unknown property - " << propertyName << endl;
491             return false;
492         }
493     }
494     return true;
495 }
496
497 bool FilterTool::FilterToolPrivate::CheckAlignment(const BamAlignment& al) {
498     return m_filterEngine.check(al);
499 }
500
501 const string FilterTool::FilterToolPrivate::GetScriptContents(void) {
502   
503     // open script for reading
504     FILE* inFile = fopen(m_settings->ScriptFilename.c_str(), "rb");
505     if ( !inFile ) {
506         cerr << "bamtools filter ERROR: could not open script: "
507              << m_settings->ScriptFilename << " for reading" << endl;
508         return string();
509     }
510     
511     // read in entire script contents  
512     char buffer[1024];
513     ostringstream docStream("");
514     while ( true ) {
515         
516         // peek ahead, make sure there is data available
517         char ch = fgetc(inFile);
518         ungetc(ch, inFile);
519         if( feof(inFile) )
520             break;
521         
522         // read next block of data
523         if ( fgets(buffer, 1024, inFile) == 0 ) {
524             cerr << "bamtools filter ERROR: could not read script contents" << endl;
525             return string();
526         }
527         
528         docStream << buffer;
529     }
530     
531     // close script file
532     fclose(inFile);
533     
534     // import buffer contents to document, return
535     return docStream.str();
536 }
537
538 void FilterTool::FilterToolPrivate::InitProperties(void) {
539   
540     // store property names in vector 
541     m_propertyNames.push_back(ALIGNMENTFLAG_PROPERTY);
542     m_propertyNames.push_back(CIGAR_PROPERTY);
543     m_propertyNames.push_back(INSERTSIZE_PROPERTY);
544     m_propertyNames.push_back(ISDUPLICATE_PROPERTY);
545     m_propertyNames.push_back(ISFAILEDQC_PROPERTY);
546     m_propertyNames.push_back(ISFIRSTMATE_PROPERTY);
547     m_propertyNames.push_back(ISMAPPED_PROPERTY);
548     m_propertyNames.push_back(ISMATEMAPPED_PROPERTY);
549     m_propertyNames.push_back(ISMATEREVERSESTRAND_PROPERTY);
550     m_propertyNames.push_back(ISPAIRED_PROPERTY);
551     m_propertyNames.push_back(ISPRIMARYALIGNMENT_PROPERTY);
552     m_propertyNames.push_back(ISPROPERPAIR_PROPERTY);
553     m_propertyNames.push_back(ISREVERSESTRAND_PROPERTY);
554     m_propertyNames.push_back(ISSECONDMATE_PROPERTY);
555     m_propertyNames.push_back(ISSINGLETON_PROPERTY);
556     m_propertyNames.push_back(MAPQUALITY_PROPERTY);
557     m_propertyNames.push_back(MATEPOSITION_PROPERTY);
558     m_propertyNames.push_back(MATEREFERENCE_PROPERTY);
559     m_propertyNames.push_back(NAME_PROPERTY);
560     m_propertyNames.push_back(POSITION_PROPERTY);
561     m_propertyNames.push_back(QUERYBASES_PROPERTY);
562     m_propertyNames.push_back(REFERENCE_PROPERTY);
563     m_propertyNames.push_back(TAG_PROPERTY);
564     
565     // add vector contents to FilterEngine<BamAlignmentChecker>
566     vector<string>::const_iterator propertyNameIter = m_propertyNames.begin();
567     vector<string>::const_iterator propertyNameEnd  = m_propertyNames.end();
568     for ( ; propertyNameIter != propertyNameEnd; ++propertyNameIter )
569         m_filterEngine.addProperty((*propertyNameIter));
570 }
571
572 bool FilterTool::FilterToolPrivate::ParseCommandLine(void) {
573   
574     // add a rule set to filter engine
575     const string CMD = "COMMAND_LINE";
576     m_filterEngine.addFilter(CMD);
577
578     // map property names to command line args
579     map<string, string> propertyTokens;
580     if ( m_settings->HasAlignmentFlagFilter )       propertyTokens.insert( make_pair(ALIGNMENTFLAG_PROPERTY,       m_settings->AlignmentFlagFilter) );
581     if ( m_settings->HasInsertSizeFilter )          propertyTokens.insert( make_pair(INSERTSIZE_PROPERTY,          m_settings->InsertSizeFilter) );
582     if ( m_settings->HasIsDuplicateFilter )         propertyTokens.insert( make_pair(ISDUPLICATE_PROPERTY,         m_settings->IsDuplicateFilter) );
583     if ( m_settings->HasIsFailedQCFilter )          propertyTokens.insert( make_pair(ISFAILEDQC_PROPERTY,          m_settings->IsFailedQCFilter) );
584     if ( m_settings->HasIsFirstMateFilter )         propertyTokens.insert( make_pair(ISFIRSTMATE_PROPERTY,         m_settings->IsFirstMateFilter) );
585     if ( m_settings->HasIsMappedFilter )            propertyTokens.insert( make_pair(ISMAPPED_PROPERTY,            m_settings->IsMappedFilter) );
586     if ( m_settings->HasIsMateMappedFilter )        propertyTokens.insert( make_pair(ISMATEMAPPED_PROPERTY,        m_settings->IsMateMappedFilter) );
587     if ( m_settings->HasIsMateReverseStrandFilter ) propertyTokens.insert( make_pair(ISMATEREVERSESTRAND_PROPERTY, m_settings->IsMateReverseStrandFilter) );
588     if ( m_settings->HasIsPairedFilter )            propertyTokens.insert( make_pair(ISPAIRED_PROPERTY,            m_settings->IsPairedFilter) );
589     if ( m_settings->HasIsPrimaryAlignmentFilter )  propertyTokens.insert( make_pair(ISPRIMARYALIGNMENT_PROPERTY,  m_settings->IsPrimaryAlignmentFilter) );
590     if ( m_settings->HasIsProperPairFilter )        propertyTokens.insert( make_pair(ISPROPERPAIR_PROPERTY,        m_settings->IsProperPairFilter) );
591     if ( m_settings->HasIsReverseStrandFilter )     propertyTokens.insert( make_pair(ISREVERSESTRAND_PROPERTY,     m_settings->IsReverseStrandFilter) );
592     if ( m_settings->HasIsSecondMateFilter )        propertyTokens.insert( make_pair(ISSECONDMATE_PROPERTY,        m_settings->IsSecondMateFilter) );
593     if ( m_settings->HasIsSingletonFilter )         propertyTokens.insert( make_pair(ISSINGLETON_PROPERTY,         m_settings->IsSingletonFilter) );
594     if ( m_settings->HasMapQualityFilter )          propertyTokens.insert( make_pair(MAPQUALITY_PROPERTY,          m_settings->MapQualityFilter) );
595     if ( m_settings->HasNameFilter )                propertyTokens.insert( make_pair(NAME_PROPERTY,                m_settings->NameFilter) );
596     if ( m_settings->HasQueryBasesFilter )          propertyTokens.insert( make_pair(QUERYBASES_PROPERTY,          m_settings->QueryBasesFilter) );
597     if ( m_settings->HasTagFilter )                 propertyTokens.insert( make_pair(TAG_PROPERTY,                 m_settings->TagFilter) );
598     
599     // send add these properties to filter set "COMMAND_LINE"
600     return AddPropertyTokensToFilter(CMD, propertyTokens);
601 }
602
603 bool FilterTool::FilterToolPrivate::ParseFilterObject(const string& filterName, const Json::Value& filterObject) {
604   
605     // filter object parsing variables
606     Json::Value null(Json::nullValue);
607     Json::Value propertyValue;
608     
609     // store results
610     map<string, string> propertyTokens;
611     
612     // iterate over known properties
613     vector<string>::const_iterator propertyNameIter = m_propertyNames.begin();
614     vector<string>::const_iterator propertyNameEnd  = m_propertyNames.end();
615     for ( ; propertyNameIter != propertyNameEnd; ++propertyNameIter ) {
616         const string& propertyName = (*propertyNameIter);
617         
618         // if property defined in filter, add to token list
619         propertyValue = filterObject.get(propertyName, null);
620         if ( propertyValue != null ) 
621             propertyTokens.insert( make_pair(propertyName, propertyValue.asString()) );
622     }
623   
624     // add this filter to engin
625     m_filterEngine.addFilter(filterName);
626   
627     // add token list to this filter
628     return AddPropertyTokensToFilter(filterName, propertyTokens);
629 }
630
631 bool FilterTool::FilterToolPrivate::ParseScript(void) {
632   
633     // read in script contents from file
634     const string document = GetScriptContents();
635     
636     // set up JsonCPP reader and attempt to parse script
637     Json::Value root;
638     Json::Reader reader;
639     if ( !reader.parse(document, root) ) {
640         // use built-in error reporting mechanism to alert user what was wrong with the script
641         cerr  << "bamtools filter ERROR: failed to parse script - see error message(s) below" << endl
642               << reader.getFormatedErrorMessages();
643         return false;     
644     }
645
646     // initialize return status
647     bool success = true;
648     
649     // see if root object contains multiple filters
650     const Json::Value filters = root["filters"];
651     if ( !filters.isNull() ) {
652       
653         // iterate over any filters found
654         int filterIndex = 0;
655         Json::Value::const_iterator filtersIter = filters.begin();
656         Json::Value::const_iterator filtersEnd  = filters.end();
657         for ( ; filtersIter != filtersEnd; ++filtersIter, ++filterIndex ) {
658             Json::Value filter = (*filtersIter);
659             
660             // convert filter index to string
661             string filterName;
662             
663             // if id tag supplied
664             const Json::Value id = filter["id"];
665             if ( !id.isNull() ) 
666                 filterName = id.asString();
667             
668             // use array index 
669             else {
670                 stringstream convert;
671                 convert << filterIndex;
672                 filterName = convert.str();
673             }
674             
675             // create & parse filter 
676             success &= ParseFilterObject(filterName, filter);
677         }
678         
679         // see if user defined a "rule" for these filters
680         // otherwise, use filter engine's default rule behavior
681         string ruleString("");
682         const Json::Value rule = root["rule"];
683         if ( rule.isString() )
684             ruleString = rule.asString();
685         m_filterEngine.setRule(ruleString);
686           
687         // return success/fail
688         return success;
689     } 
690     
691     // otherwise, root is the only filter (just contains properties)
692     // create & parse filter named "ROOT"
693     else success = ParseFilterObject("ROOT", root);
694     
695     // return success/failure
696     return success;
697 }
698
699 bool FilterTool::FilterToolPrivate::Run(void) {
700     
701     // set to default input if none provided
702     if ( !m_settings->HasInput && !m_settings->HasInputFilelist )
703         m_settings->InputFiles.push_back(Options::StandardIn());
704
705     // add files in the filelist to the input file list
706     if ( m_settings->HasInputFilelist ) {
707
708         ifstream filelist(m_settings->InputFilelist.c_str(), ios::in);
709         if ( !filelist.is_open() ) {
710             cerr << "bamtools filter ERROR: could not open input BAM file list... Aborting." << endl;
711             return false;
712         }
713
714         string line;
715         while ( getline(filelist, line) )
716             m_settings->InputFiles.push_back(line);
717     }
718
719     // initialize defined properties & user-specified filters
720     // quit if failed
721     if ( !SetupFilters() )
722         return false;
723
724     // open reader without index
725     BamMultiReader reader;
726     if ( !reader.Open(m_settings->InputFiles) ) {
727         cerr << "bamtools filter ERROR: could not open input files for reading." << endl;
728         return false;
729     }
730
731     // retrieve reader header & reference data
732     const string headerText = reader.GetHeaderText();
733     filterToolReferences = reader.GetReferenceData();
734     
735     // determine compression mode for BamWriter
736     bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() &&
737                               !m_settings->IsForceCompression );
738     BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
739     if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed;
740
741     // open BamWriter
742     BamWriter writer;
743     writer.SetCompressionMode(compressionMode);
744     if ( !writer.Open(m_settings->OutputFilename, headerText, filterToolReferences) ) {
745         cerr << "bamtools filter ERROR: could not open " << m_settings->OutputFilename << " for writing." << endl;
746         reader.Close();
747         return false;
748     }
749
750     // if no region specified, filter entire file 
751     BamAlignment al;
752     if ( !m_settings->HasRegion ) {
753         while ( reader.GetNextAlignment(al) ) {
754             if ( CheckAlignment(al) ) 
755                 writer.SaveAlignment(al);
756         }
757     }
758     
759     // otherwise attempt to use region as constraint
760     else {
761         
762         // if region string parses OK
763         BamRegion region;
764         if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
765
766             // attempt to find index files
767             reader.LocateIndexes();
768
769             // if index data available for all BAM files, we can use SetRegion
770             if ( reader.HasIndexes() ) {
771
772                 // attempt to use SetRegion(), if failed report error
773                 if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) {
774                     cerr << "bamtools filter ERROR: set region failed. Check that REGION describes a valid range" << endl;
775                     reader.Close();
776                     return false;
777                 } 
778               
779                 // everything checks out, just iterate through specified region, filtering alignments
780                 while ( reader.GetNextAlignment(al) )
781                     if ( CheckAlignment(al) ) 
782                         writer.SaveAlignment(al);
783                 }
784             
785             // no index data available, we have to iterate through until we
786             // find overlapping alignments
787             else {
788                 while ( reader.GetNextAlignment(al) ) {
789                     if ( (al.RefID >= region.LeftRefID)  && ((al.Position + al.Length) >= region.LeftPosition) &&
790                          (al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) ) 
791                     {
792                         if ( CheckAlignment(al) ) 
793                             writer.SaveAlignment(al);
794                     }
795                 }
796             }
797         } 
798         
799         // error parsing REGION string
800         else {
801             cerr << "bamtools filter ERROR: could not parse REGION: " << m_settings->Region << endl;
802             cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid"
803                  << endl;
804             reader.Close();
805             return false;
806         }
807     }
808
809     // clean up & exit
810     reader.Close();
811     writer.Close();
812     return true;
813 }
814
815 bool FilterTool::FilterToolPrivate::SetupFilters(void) {
816   
817     // set up filter engine with supported properties
818     InitProperties();
819     
820     // parse script for filter rules, if given
821     if ( m_settings->HasScript )
822         return ParseScript();
823     
824     // otherwise check command line for filters
825     else return ParseCommandLine();
826 }
827
828 // ---------------------------------------------
829 // FilterTool implementation
830
831 FilterTool::FilterTool(void)
832     : AbstractTool()
833     , m_settings(new FilterSettings)
834     , m_impl(0)
835 {
836     // ----------------------------------
837     // set program details
838
839     const string usage = "[-in <filename> -in <filename> ... | -list <filelist>] "
840                          "[-out <filename> | [-forceCompression]] [-region <REGION>] "
841                          "[ [-script <filename] | [filterOptions] ]";
842
843     Options::SetProgramInfo("bamtools filter", "filters BAM file(s)", usage );
844
845     // ----------------------------------
846     // I/O options
847
848     OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
849
850     const string inDesc     = "the input BAM file(s)";
851     const string listDesc   = "the input BAM file list, one line per file";
852     const string outDesc    = "the output BAM file";
853     const string regionDesc = "only read data from this genomic region (see documentation for more details)";
854     const string scriptDesc = "the filter script file (see documentation for more details)";
855     const string forceDesc  = "if results are sent to stdout (like when piping to another tool), "
856                               "default behavior is to leave output uncompressed. Use this flag to "
857                               "override and force compression";
858
859     Options::AddValueOption("-in",     "BAM filename", inDesc,     "", m_settings->HasInput,  m_settings->InputFiles,     IO_Opts, Options::StandardIn());
860     Options::AddValueOption("-list",   "filename",     listDesc,   "", m_settings->HasInputFilelist,  m_settings->InputFilelist, IO_Opts);
861     Options::AddValueOption("-out",    "BAM filename", outDesc,    "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
862     Options::AddValueOption("-region", "REGION",       regionDesc, "", m_settings->HasRegion, m_settings->Region,         IO_Opts);
863     Options::AddValueOption("-script", "filename",     scriptDesc, "", m_settings->HasScript, m_settings->ScriptFilename, IO_Opts);
864     Options::AddOption("-forceCompression",forceDesc, m_settings->IsForceCompression, IO_Opts);
865
866     // ----------------------------------
867     // general filter options
868
869     OptionGroup* FilterOpts = Options::CreateOptionGroup("General Filters");
870
871     const string flagDesc    = "keep reads with this *exact* alignment flag (for more detailed queries, see below)";
872     const string insertDesc  = "keep reads with insert size that matches pattern";
873     const string mapQualDesc = "keep reads with map quality that matches pattern";
874     const string nameDesc    = "keep reads with name that matches pattern";
875     const string queryDesc   = "keep reads with motif that matches pattern";
876     const string tagDesc     = "keep reads with this key=>value pair";
877
878     Options::AddValueOption("-alignmentFlag", "int",       flagDesc,    "", m_settings->HasAlignmentFlagFilter, m_settings->AlignmentFlagFilter, FilterOpts);
879     Options::AddValueOption("-insertSize",    "int",       insertDesc,  "", m_settings->HasInsertSizeFilter,    m_settings->InsertSizeFilter,    FilterOpts);
880     Options::AddValueOption("-mapQuality",    "[0-255]",   mapQualDesc, "", m_settings->HasMapQualityFilter,    m_settings->MapQualityFilter,    FilterOpts);
881     Options::AddValueOption("-name",          "string",    nameDesc,    "", m_settings->HasNameFilter,          m_settings->NameFilter,          FilterOpts);
882     Options::AddValueOption("-queryBases",    "string",    queryDesc,   "", m_settings->HasQueryBasesFilter,    m_settings->QueryBasesFilter,    FilterOpts);
883     Options::AddValueOption("-tag",           "TAG:VALUE", tagDesc,     "", m_settings->HasTagFilter,           m_settings->TagFilter,           FilterOpts);
884
885     // ----------------------------------
886     // alignment flag filter options
887
888     OptionGroup* AlignmentFlagOpts = Options::CreateOptionGroup("Alignment Flag Filters");
889
890     const string boolArg           = "true/false";
891     const string isDupDesc         = "keep only alignments that are marked as duplicate?";
892     const string isFailQcDesc      = "keep only alignments that failed QC?";
893     const string isFirstMateDesc   = "keep only alignments marked as first mate?";
894     const string isMappedDesc      = "keep only alignments that were mapped?";
895     const string isMateMappedDesc  = "keep only alignments with mates that mapped";
896     const string isMateReverseDesc = "keep only alignments with mate on reverese strand?";
897     const string isPairedDesc      = "keep only alignments that were sequenced as paired?";
898     const string isPrimaryDesc     = "keep only alignments marked as primary?";
899     const string isProperPairDesc  = "keep only alignments that passed PE resolution?";
900     const string isReverseDesc     = "keep only alignments on reverse strand?";
901     const string isSecondMateDesc  = "keep only alignments marked as second mate?";
902     const string isSingletonDesc   = "keep only singletons";
903
904     Options::AddValueOption("-isDuplicate",         boolArg, isDupDesc,         "", m_settings->HasIsDuplicateFilter,         m_settings->IsDuplicateFilter,         AlignmentFlagOpts, TRUE_STR);
905     Options::AddValueOption("-isFailedQC",          boolArg, isFailQcDesc,      "", m_settings->HasIsFailedQCFilter,          m_settings->IsFailedQCFilter,          AlignmentFlagOpts, TRUE_STR);
906     Options::AddValueOption("-isFirstMate",         boolArg, isFirstMateDesc,   "", m_settings->HasIsFirstMateFilter,         m_settings->IsFirstMateFilter,         AlignmentFlagOpts, TRUE_STR);
907     Options::AddValueOption("-isMapped",            boolArg, isMappedDesc,      "", m_settings->HasIsMappedFilter,            m_settings->IsMappedFilter,            AlignmentFlagOpts, TRUE_STR);
908     Options::AddValueOption("-isMateMapped",        boolArg, isMateMappedDesc,  "", m_settings->HasIsMateMappedFilter,        m_settings->IsMateMappedFilter,        AlignmentFlagOpts, TRUE_STR);
909     Options::AddValueOption("-isMateReverseStrand", boolArg, isMateReverseDesc, "", m_settings->HasIsMateReverseStrandFilter, m_settings->IsMateReverseStrandFilter, AlignmentFlagOpts, TRUE_STR);
910     Options::AddValueOption("-isPaired",            boolArg, isPairedDesc,      "", m_settings->HasIsPairedFilter,            m_settings->IsPairedFilter,            AlignmentFlagOpts, TRUE_STR);
911     Options::AddValueOption("-isPrimaryAlignment",  boolArg, isPrimaryDesc,     "", m_settings->HasIsPrimaryAlignmentFilter,  m_settings->IsPrimaryAlignmentFilter,  AlignmentFlagOpts, TRUE_STR);
912     Options::AddValueOption("-isProperPair",        boolArg, isProperPairDesc,  "", m_settings->HasIsProperPairFilter,        m_settings->IsProperPairFilter,        AlignmentFlagOpts, TRUE_STR);
913     Options::AddValueOption("-isReverseStrand",     boolArg, isReverseDesc,     "", m_settings->HasIsReverseStrandFilter,     m_settings->IsReverseStrandFilter,     AlignmentFlagOpts, TRUE_STR);
914     Options::AddValueOption("-isSecondMate",        boolArg, isSecondMateDesc,  "", m_settings->HasIsSecondMateFilter,        m_settings->IsSecondMateFilter,        AlignmentFlagOpts, TRUE_STR);
915     Options::AddValueOption("-isSingleton",         boolArg, isSingletonDesc,   "", m_settings->HasIsSingletonFilter,         m_settings->IsSingletonFilter,         AlignmentFlagOpts, TRUE_STR);
916 }
917
918 FilterTool::~FilterTool(void) {
919
920     delete m_settings;
921     m_settings = 0;
922
923     delete m_impl;
924     m_impl = 0;
925 }
926
927 int FilterTool::Help(void) {
928     Options::DisplayHelp();
929     return 0;
930 }
931
932 int FilterTool::Run(int argc, char* argv[]) {
933
934     // parse command line arguments
935     Options::Parse(argc, argv, 1);
936
937     // initialize FilterTool with settings
938     m_impl = new FilterToolPrivate(m_settings);
939
940     // run FilterTool, return success/fail
941     if ( m_impl->Run() )
942         return 0;
943     else
944         return 1;
945 }