]> git.donarmstrong.com Git - bamtools.git/blob - src/toolkit/bamtools_filter.cpp
Added length property to filter tool.
[bamtools.git] / src / toolkit / bamtools_filter.cpp
1 // ***************************************************************************
2 // bamtools_filter.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 3 May 2013
6 // ---------------------------------------------------------------------------
7 // Filters BAM file(s) according to some user-specified criteria
8 // ***************************************************************************
9
10 #include "bamtools_filter.h"
11
12 #include <api/BamMultiReader.h>
13 #include <api/BamWriter.h>
14 #include <utils/bamtools_filter_engine.h>
15 #include <utils/bamtools_options.h>
16 #include <utils/bamtools_utilities.h>
17 using namespace BamTools;
18
19 #include <jsoncpp/json.h>
20 using namespace Json;
21
22 #include <cstdio>
23 #include <fstream>
24 #include <iostream>
25 #include <sstream>
26 #include <string>
27 #include <vector>
28 using namespace std;
29   
30 namespace BamTools {
31   
32 // -------------------------------  
33 // string literal constants  
34
35 // property names
36 const string ALIGNMENTFLAG_PROPERTY       = "alignmentFlag";
37 const string CIGAR_PROPERTY               = "cigar";
38 const string INSERTSIZE_PROPERTY          = "insertSize";
39 const string ISDUPLICATE_PROPERTY         = "isDuplicate";
40 const string ISFAILEDQC_PROPERTY          = "isFailedQC";
41 const string ISFIRSTMATE_PROPERTY         = "isFirstMate";
42 const string ISMAPPED_PROPERTY            = "isMapped";
43 const string ISMATEMAPPED_PROPERTY        = "isMateMapped";
44 const string ISMATEREVERSESTRAND_PROPERTY = "isMateReverseStrand";
45 const string ISPAIRED_PROPERTY            = "isPaired";
46 const string ISPRIMARYALIGNMENT_PROPERTY  = "isPrimaryAlignment";
47 const string ISPROPERPAIR_PROPERTY        = "isProperPair";
48 const string ISREVERSESTRAND_PROPERTY     = "isReverseStrand";
49 const string ISSECONDMATE_PROPERTY        = "isSecondMate";
50 const string ISSINGLETON_PROPERTY         = "isSingleton";
51 const string LENGTH_PROPERTY              = "length";
52 const string MAPQUALITY_PROPERTY          = "mapQuality";
53 const string MATEPOSITION_PROPERTY        = "matePosition";
54 const string MATEREFERENCE_PROPERTY       = "mateReference";
55 const string NAME_PROPERTY                = "name";
56 const string POSITION_PROPERTY            = "position";
57 const string QUERYBASES_PROPERTY          = "queryBases";
58 const string REFERENCE_PROPERTY           = "reference";
59 const string TAG_PROPERTY                 = "tag";
60
61 // boolalpha
62 const string TRUE_STR  = "true";
63 const string FALSE_STR = "false";
64     
65 RefVector filterToolReferences;    
66     
67 struct BamAlignmentChecker {
68     bool check(const PropertyFilter& filter, const BamAlignment& al) {
69       
70         bool keepAlignment = true;
71         const PropertyMap& properties = filter.Properties;
72         PropertyMap::const_iterator propertyIter = properties.begin();
73         PropertyMap::const_iterator propertyEnd  = properties.end();
74         for ( ; propertyIter != propertyEnd; ++propertyIter ) {
75           
76             // check alignment data field depending on propertyName
77             const string& propertyName = (*propertyIter).first;
78             const PropertyFilterValue& valueFilter = (*propertyIter).second;
79             
80             if      ( propertyName == ALIGNMENTFLAG_PROPERTY )  keepAlignment &= valueFilter.check(al.AlignmentFlag);
81             else if ( propertyName == CIGAR_PROPERTY ) {
82                 stringstream cigarSs;
83                 const vector<CigarOp>& cigarData = al.CigarData;
84                 if ( !cigarData.empty() ) {
85                     vector<CigarOp>::const_iterator cigarBegin = cigarData.begin();
86                     vector<CigarOp>::const_iterator cigarIter = cigarBegin;
87                     vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
88                     for ( ; cigarIter != cigarEnd; ++cigarIter ) {
89                         const CigarOp& op = (*cigarIter);
90                         cigarSs << op.Length << op.Type;
91                     }
92                     keepAlignment &= valueFilter.check(cigarSs.str());
93                 }
94             }
95             else if ( propertyName == INSERTSIZE_PROPERTY )           keepAlignment &= valueFilter.check(al.InsertSize);
96             else if ( propertyName == ISDUPLICATE_PROPERTY )          keepAlignment &= valueFilter.check(al.IsDuplicate());
97             else if ( propertyName == ISFAILEDQC_PROPERTY )           keepAlignment &= valueFilter.check(al.IsFailedQC());
98             else if ( propertyName == ISFIRSTMATE_PROPERTY )          keepAlignment &= valueFilter.check(al.IsFirstMate());
99             else if ( propertyName == ISMAPPED_PROPERTY )             keepAlignment &= valueFilter.check(al.IsMapped());
100             else if ( propertyName == ISMATEMAPPED_PROPERTY )         keepAlignment &= valueFilter.check(al.IsMateMapped());
101             else if ( propertyName == ISMATEREVERSESTRAND_PROPERTY )  keepAlignment &= valueFilter.check(al.IsMateReverseStrand());
102             else if ( propertyName == ISPAIRED_PROPERTY )             keepAlignment &= valueFilter.check(al.IsPaired());
103             else if ( propertyName == ISPRIMARYALIGNMENT_PROPERTY )   keepAlignment &= valueFilter.check(al.IsPrimaryAlignment());
104             else if ( propertyName == ISPROPERPAIR_PROPERTY )         keepAlignment &= valueFilter.check(al.IsProperPair());
105             else if ( propertyName == ISREVERSESTRAND_PROPERTY )      keepAlignment &= valueFilter.check(al.IsReverseStrand());
106             else if ( propertyName == ISSECONDMATE_PROPERTY )         keepAlignment &= valueFilter.check(al.IsSecondMate());
107             else if ( propertyName == ISSINGLETON_PROPERTY ) {
108                 const bool isSingleton = al.IsPaired() && al.IsMapped() && !al.IsMateMapped();
109                 keepAlignment &= valueFilter.check(isSingleton);
110             }
111             else if ( propertyName == LENGTH_PROPERTY )               keepAlignment &= valueFilter.check(al.Length);
112             else if ( propertyName == MAPQUALITY_PROPERTY )           keepAlignment &= valueFilter.check(al.MapQuality);
113             else if ( propertyName == MATEPOSITION_PROPERTY )         keepAlignment &= ( al.IsPaired() && al.IsMateMapped() && valueFilter.check(al.MateRefID) );
114             else if ( propertyName == MATEREFERENCE_PROPERTY ) {
115                 if ( !al.IsPaired() || !al.IsMateMapped() ) return false;
116                 BAMTOOLS_ASSERT_MESSAGE( (al.MateRefID>=0 && (al.MateRefID<(int)filterToolReferences.size())), "Invalid MateRefID");
117                 const string& refName = filterToolReferences.at(al.MateRefID).RefName;
118                 keepAlignment &= valueFilter.check(refName);
119             }
120             else if ( propertyName == NAME_PROPERTY )                 keepAlignment &= valueFilter.check(al.Name);
121             else if ( propertyName == POSITION_PROPERTY )             keepAlignment &= valueFilter.check(al.Position);
122             else if ( propertyName == QUERYBASES_PROPERTY )           keepAlignment &= valueFilter.check(al.QueryBases);
123             else if ( propertyName == REFERENCE_PROPERTY ) {
124                 BAMTOOLS_ASSERT_MESSAGE( (al.RefID>=0 && (al.RefID<(int)filterToolReferences.size())), "Invalid RefID");
125                 const string& refName = filterToolReferences.at(al.RefID).RefName;
126                 keepAlignment &= valueFilter.check(refName);
127             }
128             else if ( propertyName == TAG_PROPERTY ) keepAlignment &= checkAlignmentTag(valueFilter, al);
129             else BAMTOOLS_ASSERT_UNREACHABLE;
130             
131             // if alignment fails at ANY point, just quit and return false
132             if ( !keepAlignment ) return false;
133         }
134       
135         BAMTOOLS_ASSERT_MESSAGE( keepAlignment, "Error in BamAlignmentChecker... keepAlignment should be true here");
136         return keepAlignment;
137     }
138     
139     bool checkAlignmentTag(const PropertyFilterValue& valueFilter, const BamAlignment& al) {
140      
141         // ensure filter contains string data
142         Variant entireTagFilter = valueFilter.Value;
143         if ( !entireTagFilter.is_type<string>() ) return false;
144
145         // localize string from variant
146         const string& entireTagFilterString = entireTagFilter.get<string>();
147
148         // ensure we have at least "XX:x"
149         if ( entireTagFilterString.length() < 4 ) return false;
150
151         // get tagName & lookup in alignment
152         // if found, set tagType to tag type character
153         // if not found, return false
154         const string& tagName = entireTagFilterString.substr(0,2);
155         char tagType = '\0';
156         if ( !al.GetTagType(tagName, tagType) ) return false;
157
158         // remove tagName & ":" from beginning tagFilter
159         string tagFilterString = entireTagFilterString.substr(3);
160
161         // switch on tag type to set tag query value & parse filter token
162         int8_t   asciiFilterValue,   asciiQueryValue;
163         int32_t  intFilterValue,    intQueryValue;
164         uint32_t uintFilterValue,   uintQueryValue;
165         float    realFilterValue,   realQueryValue;
166         string   stringFilterValue, stringQueryValue;
167
168         PropertyFilterValue tagFilter;
169         PropertyFilterValue::ValueCompareType compareType;
170         bool keepAlignment = false;
171         switch (tagType) {
172
173             // ASCII tag type
174             case 'A':
175                 if ( al.GetTag(tagName, asciiQueryValue) ) {
176                     if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, asciiFilterValue, compareType) ) {
177                         tagFilter.Value = asciiFilterValue;
178                         tagFilter.Type  = compareType;
179                         keepAlignment   = tagFilter.check(asciiQueryValue);
180                     }
181                 }
182                 break;
183
184             // signed int tag type
185             case 'c' :
186             case 's' :
187             case 'i' :
188                 if ( al.GetTag(tagName, intQueryValue) ) {
189                     if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, intFilterValue, compareType) ) {
190                         tagFilter.Value = intFilterValue;
191                         tagFilter.Type  = compareType;
192                         keepAlignment   = tagFilter.check(intQueryValue);
193                     }
194                 }
195                 break;
196
197             // unsigned int tag type
198             case 'C' :
199             case 'S' :
200             case 'I' :
201                 if ( al.GetTag(tagName, uintQueryValue) ) {
202                     if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, uintFilterValue, compareType) ) {
203                         tagFilter.Value = uintFilterValue;
204                         tagFilter.Type  = compareType;
205                         keepAlignment   = tagFilter.check(uintQueryValue);
206                     }
207                 }
208                 break;
209
210             // 'real' tag type
211             case 'f' :
212                 if ( al.GetTag(tagName, realQueryValue) ) {
213                     if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, realFilterValue, compareType) ) {
214                         tagFilter.Value = realFilterValue;
215                         tagFilter.Type  = compareType;
216                         keepAlignment   = tagFilter.check(realQueryValue);
217                     }
218                 }
219                 break;
220
221             // string tag type
222
223             case 'Z':
224             case 'H':
225                 if ( al.GetTag(tagName, stringQueryValue) ) {
226                     if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, stringFilterValue, compareType) ) {
227                         tagFilter.Value = stringFilterValue;
228                         tagFilter.Type  = compareType;
229                         keepAlignment   = tagFilter.check(stringQueryValue);
230                     }
231                 }
232                 break;
233
234             // unknown tag type
235             default :
236                 keepAlignment = false;
237         }
238
239         return keepAlignment;
240     }
241 };    
242     
243 } // namespace BamTools
244   
245 // ---------------------------------------------
246 // FilterSettings implementation
247
248 struct FilterTool::FilterSettings {
249
250     // ----------------------------------
251     // IO opts
252
253     // flags
254     bool HasInput;
255     bool HasInputFilelist;
256     bool HasOutput;
257     bool HasRegion;
258     bool HasScript;
259     bool IsForceCompression;
260
261     // filenames
262     vector<string> InputFiles;
263     string InputFilelist;
264     string OutputFilename;
265     string Region;
266     string ScriptFilename;
267
268     // -----------------------------------
269     // General filter opts
270
271     // flags
272     bool HasAlignmentFlagFilter;
273     bool HasInsertSizeFilter;
274     bool HasLengthFilter;
275     bool HasMapQualityFilter;
276     bool HasNameFilter;
277     bool HasQueryBasesFilter;
278     bool HasTagFilter; //(s)
279
280     // filters
281     string AlignmentFlagFilter;
282     string InsertSizeFilter;
283     string LengthFilter;
284     string MapQualityFilter;
285     string NameFilter;
286     string QueryBasesFilter;
287     string TagFilter;  // support multiple ?
288
289     // -----------------------------------
290     // AlignmentFlag filter opts
291
292     // flags
293     bool HasIsDuplicateFilter;
294     bool HasIsFailedQCFilter;
295     bool HasIsFirstMateFilter;
296     bool HasIsMappedFilter;
297     bool HasIsMateMappedFilter;
298     bool HasIsMateReverseStrandFilter;
299     bool HasIsPairedFilter;
300     bool HasIsPrimaryAlignmentFilter;
301     bool HasIsProperPairFilter;
302     bool HasIsReverseStrandFilter;
303     bool HasIsSecondMateFilter;
304     bool HasIsSingletonFilter;
305
306     // filters
307     string IsDuplicateFilter;
308     string IsFailedQCFilter;
309     string IsFirstMateFilter;
310     string IsMappedFilter;
311     string IsMateMappedFilter;
312     string IsMateReverseStrandFilter;
313     string IsPairedFilter;
314     string IsPrimaryAlignmentFilter;
315     string IsProperPairFilter;
316     string IsReverseStrandFilter;
317     string IsSecondMateFilter;
318     string IsSingletonFilter;
319
320     // ---------------------------------
321     // constructor
322
323     FilterSettings(void)
324         : HasInput(false)
325         , HasInputFilelist(false)
326         , HasOutput(false)
327         , HasRegion(false)
328         , HasScript(false)
329         , IsForceCompression(false)
330         , OutputFilename(Options::StandardOut())
331         , HasAlignmentFlagFilter(false)
332         , HasInsertSizeFilter(false)
333         , HasLengthFilter(false)
334         , HasMapQualityFilter(false)
335         , HasNameFilter(false)
336         , HasQueryBasesFilter(false)
337         , HasTagFilter(false)
338         , HasIsDuplicateFilter(false)
339         , HasIsFailedQCFilter(false)
340         , HasIsFirstMateFilter(false)
341         , HasIsMappedFilter(false)
342         , HasIsMateMappedFilter(false)
343         , HasIsMateReverseStrandFilter(false)
344         , HasIsPairedFilter(false)
345         , HasIsPrimaryAlignmentFilter(false)
346         , HasIsProperPairFilter(false)
347         , HasIsReverseStrandFilter(false)
348         , HasIsSecondMateFilter(false)
349         , HasIsSingletonFilter(false)
350         , IsDuplicateFilter(TRUE_STR)
351         , IsFailedQCFilter(TRUE_STR)
352         , IsFirstMateFilter(TRUE_STR)
353         , IsMappedFilter(TRUE_STR)
354         , IsMateMappedFilter(TRUE_STR)
355         , IsMateReverseStrandFilter(TRUE_STR)
356         , IsPairedFilter(TRUE_STR)
357         , IsPrimaryAlignmentFilter(TRUE_STR)
358         , IsProperPairFilter(TRUE_STR)
359         , IsReverseStrandFilter(TRUE_STR)
360         , IsSecondMateFilter(TRUE_STR)
361         , IsSingletonFilter(TRUE_STR)
362     { }
363 };
364
365 // ---------------------------------------------
366 // FilterToolPrivate declaration
367
368 class FilterTool::FilterToolPrivate {
369       
370     // ctor & dtor
371     public:
372         FilterToolPrivate(FilterTool::FilterSettings* settings);
373         ~FilterToolPrivate(void);  
374         
375     // 'public' interface
376     public:
377         bool Run(void);
378         
379     // internal methods
380     private:
381         bool AddPropertyTokensToFilter(const string& filterName, const map<string, string>& propertyTokens);
382         bool CheckAlignment(const BamAlignment& al);
383         const string GetScriptContents(void);
384         void InitProperties(void);
385         bool ParseCommandLine(void);
386         bool ParseFilterObject(const string& filterName, const Json::Value& filterObject);
387         bool ParseScript(void);
388         bool SetupFilters(void);
389         
390     // data members
391     private:
392         vector<string> m_propertyNames;
393         FilterTool::FilterSettings* m_settings;
394         FilterEngine<BamAlignmentChecker> m_filterEngine;
395 };
396  
397 // ---------------------------------------------
398 // FilterToolPrivate implementation
399   
400 // constructor  
401 FilterTool::FilterToolPrivate::FilterToolPrivate(FilterTool::FilterSettings* settings)
402     : m_settings(settings)
403 { }  
404   
405 // destructor
406 FilterTool::FilterToolPrivate::~FilterToolPrivate(void) { }
407
408 bool FilterTool::FilterToolPrivate::AddPropertyTokensToFilter(const string& filterName,
409                                                               const map<string,
410                                                               string>& propertyTokens)
411 {
412     // dummy temp values for token parsing
413     bool boolValue;
414     int32_t int32Value;
415     uint16_t uint16Value;
416     uint32_t uint32Value;
417     string stringValue;
418     PropertyFilterValue::ValueCompareType type;
419   
420     // iterate over property token map
421     map<string, string>::const_iterator mapIter = propertyTokens.begin();
422     map<string, string>::const_iterator mapEnd  = propertyTokens.end();
423     for ( ; mapIter != mapEnd; ++mapIter ) {
424       
425         const string& propertyName = (*mapIter).first;
426         const string& token        = (*mapIter).second;
427         
428         // ------------------------------
429         // convert token to value & compare type 
430         // then add to filter engine
431         
432         // bool conversion
433         if ( propertyName == ISDUPLICATE_PROPERTY ||
434              propertyName == ISFAILEDQC_PROPERTY ||
435              propertyName == ISFIRSTMATE_PROPERTY ||
436              propertyName == ISMAPPED_PROPERTY ||
437              propertyName == ISMATEMAPPED_PROPERTY ||
438              propertyName == ISMATEREVERSESTRAND_PROPERTY ||
439              propertyName == ISPAIRED_PROPERTY ||
440              propertyName == ISPRIMARYALIGNMENT_PROPERTY ||
441              propertyName == ISPROPERPAIR_PROPERTY ||
442              propertyName == ISREVERSESTRAND_PROPERTY ||
443              propertyName == ISSECONDMATE_PROPERTY ||
444              propertyName == ISSINGLETON_PROPERTY
445            ) 
446         {
447             FilterEngine<BamAlignmentChecker>::parseToken(token, boolValue, type);
448             m_filterEngine.setProperty(filterName, propertyName, boolValue, type);
449         }
450         
451         // int32_t conversion
452         else if ( propertyName == INSERTSIZE_PROPERTY ||
453                   propertyName == LENGTH_PROPERTY ||
454                   propertyName == MATEPOSITION_PROPERTY ||
455                   propertyName == POSITION_PROPERTY 
456                 ) 
457         {
458             FilterEngine<BamAlignmentChecker>::parseToken(token, int32Value, type);
459             m_filterEngine.setProperty(filterName, propertyName, int32Value, type);
460         }
461         
462         // uint16_t conversion
463         else if ( propertyName == MAPQUALITY_PROPERTY )
464         {
465             FilterEngine<BamAlignmentChecker>::parseToken(token, uint16Value, type);
466             m_filterEngine.setProperty(filterName, propertyName, uint16Value, type);
467         }
468         
469         // uint32_t conversion
470         else if ( propertyName == ALIGNMENTFLAG_PROPERTY )
471         {
472             FilterEngine<BamAlignmentChecker>::parseToken(token, uint32Value, type);
473             m_filterEngine.setProperty(filterName, propertyName, uint32Value, type);
474         }
475         
476         // string conversion
477         else if ( propertyName == CIGAR_PROPERTY || 
478                   propertyName == MATEREFERENCE_PROPERTY ||
479                   propertyName == NAME_PROPERTY ||
480                   propertyName == QUERYBASES_PROPERTY ||
481                   propertyName == REFERENCE_PROPERTY 
482                 ) 
483         {
484             FilterEngine<BamAlignmentChecker>::parseToken(token, stringValue, type);
485             m_filterEngine.setProperty(filterName, propertyName, stringValue, type);
486         }
487       
488         else if ( propertyName == TAG_PROPERTY ) {
489             // this will be stored directly as the TAG:VALUE token
490             // (VALUE may contain compare ops, will be parsed out later)
491             m_filterEngine.setProperty(filterName, propertyName, token, PropertyFilterValue::EXACT);
492         }
493       
494         // else unknown property 
495         else {
496             cerr << "bamtools filter ERROR: unknown property - " << propertyName << endl;
497             return false;
498         }
499     }
500     return true;
501 }
502
503 bool FilterTool::FilterToolPrivate::CheckAlignment(const BamAlignment& al) {
504     return m_filterEngine.check(al);
505 }
506
507 const string FilterTool::FilterToolPrivate::GetScriptContents(void) {
508   
509     // open script for reading
510     FILE* inFile = fopen(m_settings->ScriptFilename.c_str(), "rb");
511     if ( !inFile ) {
512         cerr << "bamtools filter ERROR: could not open script: "
513              << m_settings->ScriptFilename << " for reading" << endl;
514         return string();
515     }
516     
517     // read in entire script contents  
518     char buffer[1024];
519     ostringstream docStream("");
520     while ( true ) {
521         
522         // peek ahead, make sure there is data available
523         char ch = fgetc(inFile);
524         ungetc(ch, inFile);
525         if( feof(inFile) )
526             break;
527         
528         // read next block of data
529         if ( fgets(buffer, 1024, inFile) == 0 ) {
530             cerr << "bamtools filter ERROR: could not read script contents" << endl;
531             return string();
532         }
533         
534         docStream << buffer;
535     }
536     
537     // close script file
538     fclose(inFile);
539     
540     // import buffer contents to document, return
541     return docStream.str();
542 }
543
544 void FilterTool::FilterToolPrivate::InitProperties(void) {
545   
546     // store property names in vector 
547     m_propertyNames.push_back(ALIGNMENTFLAG_PROPERTY);
548     m_propertyNames.push_back(CIGAR_PROPERTY);
549     m_propertyNames.push_back(INSERTSIZE_PROPERTY);
550     m_propertyNames.push_back(ISDUPLICATE_PROPERTY);
551     m_propertyNames.push_back(ISFAILEDQC_PROPERTY);
552     m_propertyNames.push_back(ISFIRSTMATE_PROPERTY);
553     m_propertyNames.push_back(ISMAPPED_PROPERTY);
554     m_propertyNames.push_back(ISMATEMAPPED_PROPERTY);
555     m_propertyNames.push_back(ISMATEREVERSESTRAND_PROPERTY);
556     m_propertyNames.push_back(ISPAIRED_PROPERTY);
557     m_propertyNames.push_back(ISPRIMARYALIGNMENT_PROPERTY);
558     m_propertyNames.push_back(ISPROPERPAIR_PROPERTY);
559     m_propertyNames.push_back(ISREVERSESTRAND_PROPERTY);
560     m_propertyNames.push_back(ISSECONDMATE_PROPERTY);
561     m_propertyNames.push_back(ISSINGLETON_PROPERTY);
562     m_propertyNames.push_back(LENGTH_PROPERTY);
563     m_propertyNames.push_back(MAPQUALITY_PROPERTY);
564     m_propertyNames.push_back(MATEPOSITION_PROPERTY);
565     m_propertyNames.push_back(MATEREFERENCE_PROPERTY);
566     m_propertyNames.push_back(NAME_PROPERTY);
567     m_propertyNames.push_back(POSITION_PROPERTY);
568     m_propertyNames.push_back(QUERYBASES_PROPERTY);
569     m_propertyNames.push_back(REFERENCE_PROPERTY);
570     m_propertyNames.push_back(TAG_PROPERTY);
571     
572     // add vector contents to FilterEngine<BamAlignmentChecker>
573     vector<string>::const_iterator propertyNameIter = m_propertyNames.begin();
574     vector<string>::const_iterator propertyNameEnd  = m_propertyNames.end();
575     for ( ; propertyNameIter != propertyNameEnd; ++propertyNameIter )
576         m_filterEngine.addProperty((*propertyNameIter));
577 }
578
579 bool FilterTool::FilterToolPrivate::ParseCommandLine(void) {
580   
581     // add a rule set to filter engine
582     const string CMD = "COMMAND_LINE";
583     m_filterEngine.addFilter(CMD);
584
585     // map property names to command line args
586     map<string, string> propertyTokens;
587     if ( m_settings->HasAlignmentFlagFilter )       propertyTokens.insert( make_pair(ALIGNMENTFLAG_PROPERTY,       m_settings->AlignmentFlagFilter) );
588     if ( m_settings->HasInsertSizeFilter )          propertyTokens.insert( make_pair(INSERTSIZE_PROPERTY,          m_settings->InsertSizeFilter) );
589     if ( m_settings->HasIsDuplicateFilter )         propertyTokens.insert( make_pair(ISDUPLICATE_PROPERTY,         m_settings->IsDuplicateFilter) );
590     if ( m_settings->HasIsFailedQCFilter )          propertyTokens.insert( make_pair(ISFAILEDQC_PROPERTY,          m_settings->IsFailedQCFilter) );
591     if ( m_settings->HasIsFirstMateFilter )         propertyTokens.insert( make_pair(ISFIRSTMATE_PROPERTY,         m_settings->IsFirstMateFilter) );
592     if ( m_settings->HasIsMappedFilter )            propertyTokens.insert( make_pair(ISMAPPED_PROPERTY,            m_settings->IsMappedFilter) );
593     if ( m_settings->HasIsMateMappedFilter )        propertyTokens.insert( make_pair(ISMATEMAPPED_PROPERTY,        m_settings->IsMateMappedFilter) );
594     if ( m_settings->HasIsMateReverseStrandFilter ) propertyTokens.insert( make_pair(ISMATEREVERSESTRAND_PROPERTY, m_settings->IsMateReverseStrandFilter) );
595     if ( m_settings->HasIsPairedFilter )            propertyTokens.insert( make_pair(ISPAIRED_PROPERTY,            m_settings->IsPairedFilter) );
596     if ( m_settings->HasIsPrimaryAlignmentFilter )  propertyTokens.insert( make_pair(ISPRIMARYALIGNMENT_PROPERTY,  m_settings->IsPrimaryAlignmentFilter) );
597     if ( m_settings->HasIsProperPairFilter )        propertyTokens.insert( make_pair(ISPROPERPAIR_PROPERTY,        m_settings->IsProperPairFilter) );
598     if ( m_settings->HasIsReverseStrandFilter )     propertyTokens.insert( make_pair(ISREVERSESTRAND_PROPERTY,     m_settings->IsReverseStrandFilter) );
599     if ( m_settings->HasIsSecondMateFilter )        propertyTokens.insert( make_pair(ISSECONDMATE_PROPERTY,        m_settings->IsSecondMateFilter) );
600     if ( m_settings->HasIsSingletonFilter )         propertyTokens.insert( make_pair(ISSINGLETON_PROPERTY,         m_settings->IsSingletonFilter) );
601     if ( m_settings->HasLengthFilter )              propertyTokens.insert( make_pair(LENGTH_PROPERTY,              m_settings->LengthFilter) );
602     if ( m_settings->HasMapQualityFilter )          propertyTokens.insert( make_pair(MAPQUALITY_PROPERTY,          m_settings->MapQualityFilter) );
603     if ( m_settings->HasNameFilter )                propertyTokens.insert( make_pair(NAME_PROPERTY,                m_settings->NameFilter) );
604     if ( m_settings->HasQueryBasesFilter )          propertyTokens.insert( make_pair(QUERYBASES_PROPERTY,          m_settings->QueryBasesFilter) );
605     if ( m_settings->HasTagFilter )                 propertyTokens.insert( make_pair(TAG_PROPERTY,                 m_settings->TagFilter) );
606     
607     // send add these properties to filter set "COMMAND_LINE"
608     return AddPropertyTokensToFilter(CMD, propertyTokens);
609 }
610
611 bool FilterTool::FilterToolPrivate::ParseFilterObject(const string& filterName, const Json::Value& filterObject) {
612   
613     // filter object parsing variables
614     Json::Value null(Json::nullValue);
615     Json::Value propertyValue;
616     
617     // store results
618     map<string, string> propertyTokens;
619     
620     // iterate over known properties
621     vector<string>::const_iterator propertyNameIter = m_propertyNames.begin();
622     vector<string>::const_iterator propertyNameEnd  = m_propertyNames.end();
623     for ( ; propertyNameIter != propertyNameEnd; ++propertyNameIter ) {
624         const string& propertyName = (*propertyNameIter);
625         
626         // if property defined in filter, add to token list
627         propertyValue = filterObject.get(propertyName, null);
628         if ( propertyValue != null ) 
629             propertyTokens.insert( make_pair(propertyName, propertyValue.asString()) );
630     }
631   
632     // add this filter to engin
633     m_filterEngine.addFilter(filterName);
634   
635     // add token list to this filter
636     return AddPropertyTokensToFilter(filterName, propertyTokens);
637 }
638
639 bool FilterTool::FilterToolPrivate::ParseScript(void) {
640   
641     // read in script contents from file
642     const string document = GetScriptContents();
643     
644     // set up JsonCPP reader and attempt to parse script
645     Json::Value root;
646     Json::Reader reader;
647     if ( !reader.parse(document, root) ) {
648         // use built-in error reporting mechanism to alert user what was wrong with the script
649         cerr  << "bamtools filter ERROR: failed to parse script - see error message(s) below" << endl
650               << reader.getFormatedErrorMessages();
651         return false;     
652     }
653
654     // initialize return status
655     bool success = true;
656     
657     // see if root object contains multiple filters
658     const Json::Value filters = root["filters"];
659     if ( !filters.isNull() ) {
660       
661         // iterate over any filters found
662         int filterIndex = 0;
663         Json::Value::const_iterator filtersIter = filters.begin();
664         Json::Value::const_iterator filtersEnd  = filters.end();
665         for ( ; filtersIter != filtersEnd; ++filtersIter, ++filterIndex ) {
666             Json::Value filter = (*filtersIter);
667             
668             // convert filter index to string
669             string filterName;
670             
671             // if id tag supplied
672             const Json::Value id = filter["id"];
673             if ( !id.isNull() ) 
674                 filterName = id.asString();
675             
676             // use array index 
677             else {
678                 stringstream convert;
679                 convert << filterIndex;
680                 filterName = convert.str();
681             }
682             
683             // create & parse filter 
684             success &= ParseFilterObject(filterName, filter);
685         }
686         
687         // see if user defined a "rule" for these filters
688         // otherwise, use filter engine's default rule behavior
689         string ruleString("");
690         const Json::Value rule = root["rule"];
691         if ( rule.isString() )
692             ruleString = rule.asString();
693         m_filterEngine.setRule(ruleString);
694           
695         // return success/fail
696         return success;
697     } 
698     
699     // otherwise, root is the only filter (just contains properties)
700     // create & parse filter named "ROOT"
701     else success = ParseFilterObject("ROOT", root);
702     
703     // return success/failure
704     return success;
705 }
706
707 bool FilterTool::FilterToolPrivate::Run(void) {
708     
709     // set to default input if none provided
710     if ( !m_settings->HasInput && !m_settings->HasInputFilelist )
711         m_settings->InputFiles.push_back(Options::StandardIn());
712
713     // add files in the filelist to the input file list
714     if ( m_settings->HasInputFilelist ) {
715
716         ifstream filelist(m_settings->InputFilelist.c_str(), ios::in);
717         if ( !filelist.is_open() ) {
718             cerr << "bamtools filter ERROR: could not open input BAM file list... Aborting." << endl;
719             return false;
720         }
721
722         string line;
723         while ( getline(filelist, line) )
724             m_settings->InputFiles.push_back(line);
725     }
726
727     // initialize defined properties & user-specified filters
728     // quit if failed
729     if ( !SetupFilters() )
730         return false;
731
732     // open reader without index
733     BamMultiReader reader;
734     if ( !reader.Open(m_settings->InputFiles) ) {
735         cerr << "bamtools filter ERROR: could not open input files for reading." << endl;
736         return false;
737     }
738
739     // retrieve reader header & reference data
740     const string headerText = reader.GetHeaderText();
741     filterToolReferences = reader.GetReferenceData();
742     
743     // determine compression mode for BamWriter
744     bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() &&
745                               !m_settings->IsForceCompression );
746     BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
747     if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed;
748
749     // open BamWriter
750     BamWriter writer;
751     writer.SetCompressionMode(compressionMode);
752     if ( !writer.Open(m_settings->OutputFilename, headerText, filterToolReferences) ) {
753         cerr << "bamtools filter ERROR: could not open " << m_settings->OutputFilename << " for writing." << endl;
754         reader.Close();
755         return false;
756     }
757
758     // if no region specified, filter entire file 
759     BamAlignment al;
760     if ( !m_settings->HasRegion ) {
761         while ( reader.GetNextAlignment(al) ) {
762             if ( CheckAlignment(al) ) 
763                 writer.SaveAlignment(al);
764         }
765     }
766     
767     // otherwise attempt to use region as constraint
768     else {
769         
770         // if region string parses OK
771         BamRegion region;
772         if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
773
774             // attempt to find index files
775             reader.LocateIndexes();
776
777             // if index data available for all BAM files, we can use SetRegion
778             if ( reader.HasIndexes() ) {
779
780                 // attempt to use SetRegion(), if failed report error
781                 if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) {
782                     cerr << "bamtools filter ERROR: set region failed. Check that REGION describes a valid range" << endl;
783                     reader.Close();
784                     return false;
785                 } 
786               
787                 // everything checks out, just iterate through specified region, filtering alignments
788                 while ( reader.GetNextAlignment(al) )
789                     if ( CheckAlignment(al) ) 
790                         writer.SaveAlignment(al);
791                 }
792             
793             // no index data available, we have to iterate through until we
794             // find overlapping alignments
795             else {
796                 while ( reader.GetNextAlignment(al) ) {
797                     if ( (al.RefID >= region.LeftRefID)  && ((al.Position + al.Length) >= region.LeftPosition) &&
798                          (al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) ) 
799                     {
800                         if ( CheckAlignment(al) ) 
801                             writer.SaveAlignment(al);
802                     }
803                 }
804             }
805         } 
806         
807         // error parsing REGION string
808         else {
809             cerr << "bamtools filter ERROR: could not parse REGION: " << m_settings->Region << endl;
810             cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid"
811                  << endl;
812             reader.Close();
813             return false;
814         }
815     }
816
817     // clean up & exit
818     reader.Close();
819     writer.Close();
820     return true;
821 }
822
823 bool FilterTool::FilterToolPrivate::SetupFilters(void) {
824   
825     // set up filter engine with supported properties
826     InitProperties();
827     
828     // parse script for filter rules, if given
829     if ( m_settings->HasScript )
830         return ParseScript();
831     
832     // otherwise check command line for filters
833     else return ParseCommandLine();
834 }
835
836 // ---------------------------------------------
837 // FilterTool implementation
838
839 FilterTool::FilterTool(void)
840     : AbstractTool()
841     , m_settings(new FilterSettings)
842     , m_impl(0)
843 {
844     // ----------------------------------
845     // set program details
846
847     const string usage = "[-in <filename> -in <filename> ... | -list <filelist>] "
848                          "[-out <filename> | [-forceCompression]] [-region <REGION>] "
849                          "[ [-script <filename] | [filterOptions] ]";
850
851     Options::SetProgramInfo("bamtools filter", "filters BAM file(s)", usage );
852
853     // ----------------------------------
854     // I/O options
855
856     OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
857
858     const string inDesc     = "the input BAM file(s)";
859     const string listDesc   = "the input BAM file list, one line per file";
860     const string outDesc    = "the output BAM file";
861     const string regionDesc = "only read data from this genomic region (see documentation for more details)";
862     const string scriptDesc = "the filter script file (see documentation for more details)";
863     const string forceDesc  = "if results are sent to stdout (like when piping to another tool), "
864                               "default behavior is to leave output uncompressed. Use this flag to "
865                               "override and force compression";
866
867     Options::AddValueOption("-in",     "BAM filename", inDesc,     "", m_settings->HasInput,  m_settings->InputFiles,     IO_Opts, Options::StandardIn());
868     Options::AddValueOption("-list",   "filename",     listDesc,   "", m_settings->HasInputFilelist,  m_settings->InputFilelist, IO_Opts);
869     Options::AddValueOption("-out",    "BAM filename", outDesc,    "", m_settings->HasOutput, m_settings->OutputFilename, IO_Opts, Options::StandardOut());
870     Options::AddValueOption("-region", "REGION",       regionDesc, "", m_settings->HasRegion, m_settings->Region,         IO_Opts);
871     Options::AddValueOption("-script", "filename",     scriptDesc, "", m_settings->HasScript, m_settings->ScriptFilename, IO_Opts);
872     Options::AddOption("-forceCompression",forceDesc, m_settings->IsForceCompression, IO_Opts);
873
874     // ----------------------------------
875     // general filter options
876
877     OptionGroup* FilterOpts = Options::CreateOptionGroup("General Filters");
878
879     const string flagDesc    = "keep reads with this *exact* alignment flag (for more detailed queries, see below)";
880     const string insertDesc  = "keep reads with insert size that matches pattern";
881     const string lengthDesc  = "keep reads with length that matches pattern";
882     const string mapQualDesc = "keep reads with map quality that matches pattern";
883     const string nameDesc    = "keep reads with name that matches pattern";
884     const string queryDesc   = "keep reads with motif that matches pattern";
885     const string tagDesc     = "keep reads with this key=>value pair";
886
887     Options::AddValueOption("-alignmentFlag", "int",       flagDesc,    "", m_settings->HasAlignmentFlagFilter, m_settings->AlignmentFlagFilter, FilterOpts);
888     Options::AddValueOption("-insertSize",    "int",       insertDesc,  "", m_settings->HasInsertSizeFilter,    m_settings->InsertSizeFilter,    FilterOpts);
889     Options::AddValueOption("-length",        "int",       lengthDesc,  "", m_settings->HasLengthFilter,        m_settings->LengthFilter,        FilterOpts);
890     Options::AddValueOption("-mapQuality",    "[0-255]",   mapQualDesc, "", m_settings->HasMapQualityFilter,    m_settings->MapQualityFilter,    FilterOpts);
891     Options::AddValueOption("-name",          "string",    nameDesc,    "", m_settings->HasNameFilter,          m_settings->NameFilter,          FilterOpts);
892     Options::AddValueOption("-queryBases",    "string",    queryDesc,   "", m_settings->HasQueryBasesFilter,    m_settings->QueryBasesFilter,    FilterOpts);
893     Options::AddValueOption("-tag",           "TAG:VALUE", tagDesc,     "", m_settings->HasTagFilter,           m_settings->TagFilter,           FilterOpts);
894
895     // ----------------------------------
896     // alignment flag filter options
897
898     OptionGroup* AlignmentFlagOpts = Options::CreateOptionGroup("Alignment Flag Filters");
899
900     const string boolArg           = "true/false";
901     const string isDupDesc         = "keep only alignments that are marked as duplicate?";
902     const string isFailQcDesc      = "keep only alignments that failed QC?";
903     const string isFirstMateDesc   = "keep only alignments marked as first mate?";
904     const string isMappedDesc      = "keep only alignments that were mapped?";
905     const string isMateMappedDesc  = "keep only alignments with mates that mapped";
906     const string isMateReverseDesc = "keep only alignments with mate on reverese strand?";
907     const string isPairedDesc      = "keep only alignments that were sequenced as paired?";
908     const string isPrimaryDesc     = "keep only alignments marked as primary?";
909     const string isProperPairDesc  = "keep only alignments that passed PE resolution?";
910     const string isReverseDesc     = "keep only alignments on reverse strand?";
911     const string isSecondMateDesc  = "keep only alignments marked as second mate?";
912     const string isSingletonDesc   = "keep only singletons";
913
914     Options::AddValueOption("-isDuplicate",         boolArg, isDupDesc,         "", m_settings->HasIsDuplicateFilter,         m_settings->IsDuplicateFilter,         AlignmentFlagOpts, TRUE_STR);
915     Options::AddValueOption("-isFailedQC",          boolArg, isFailQcDesc,      "", m_settings->HasIsFailedQCFilter,          m_settings->IsFailedQCFilter,          AlignmentFlagOpts, TRUE_STR);
916     Options::AddValueOption("-isFirstMate",         boolArg, isFirstMateDesc,   "", m_settings->HasIsFirstMateFilter,         m_settings->IsFirstMateFilter,         AlignmentFlagOpts, TRUE_STR);
917     Options::AddValueOption("-isMapped",            boolArg, isMappedDesc,      "", m_settings->HasIsMappedFilter,            m_settings->IsMappedFilter,            AlignmentFlagOpts, TRUE_STR);
918     Options::AddValueOption("-isMateMapped",        boolArg, isMateMappedDesc,  "", m_settings->HasIsMateMappedFilter,        m_settings->IsMateMappedFilter,        AlignmentFlagOpts, TRUE_STR);
919     Options::AddValueOption("-isMateReverseStrand", boolArg, isMateReverseDesc, "", m_settings->HasIsMateReverseStrandFilter, m_settings->IsMateReverseStrandFilter, AlignmentFlagOpts, TRUE_STR);
920     Options::AddValueOption("-isPaired",            boolArg, isPairedDesc,      "", m_settings->HasIsPairedFilter,            m_settings->IsPairedFilter,            AlignmentFlagOpts, TRUE_STR);
921     Options::AddValueOption("-isPrimaryAlignment",  boolArg, isPrimaryDesc,     "", m_settings->HasIsPrimaryAlignmentFilter,  m_settings->IsPrimaryAlignmentFilter,  AlignmentFlagOpts, TRUE_STR);
922     Options::AddValueOption("-isProperPair",        boolArg, isProperPairDesc,  "", m_settings->HasIsProperPairFilter,        m_settings->IsProperPairFilter,        AlignmentFlagOpts, TRUE_STR);
923     Options::AddValueOption("-isReverseStrand",     boolArg, isReverseDesc,     "", m_settings->HasIsReverseStrandFilter,     m_settings->IsReverseStrandFilter,     AlignmentFlagOpts, TRUE_STR);
924     Options::AddValueOption("-isSecondMate",        boolArg, isSecondMateDesc,  "", m_settings->HasIsSecondMateFilter,        m_settings->IsSecondMateFilter,        AlignmentFlagOpts, TRUE_STR);
925     Options::AddValueOption("-isSingleton",         boolArg, isSingletonDesc,   "", m_settings->HasIsSingletonFilter,         m_settings->IsSingletonFilter,         AlignmentFlagOpts, TRUE_STR);
926 }
927
928 FilterTool::~FilterTool(void) {
929
930     delete m_settings;
931     m_settings = 0;
932
933     delete m_impl;
934     m_impl = 0;
935 }
936
937 int FilterTool::Help(void) {
938     Options::DisplayHelp();
939     return 0;
940 }
941
942 int FilterTool::Run(int argc, char* argv[]) {
943
944     // parse command line arguments
945     Options::Parse(argc, argv, 1);
946
947     // initialize FilterTool with settings
948     m_impl = new FilterToolPrivate(m_settings);
949
950     // run FilterTool, return success/fail
951     if ( m_impl->Run() )
952         return 0;
953     else
954         return 1;
955 }