1 // ***************************************************************************
2 // bamtools_resolve.cpp (c) 2011
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 6 July 2011
7 // ---------------------------------------------------------------------------
8 // Resolves paired-end reads (marking the IsProperPair flag as needed).
9 // ***************************************************************************
11 #include "bamtools_resolve.h"
12 #include "bamtools_version.h"
13 #include <api/BamReader.h>
14 #include <api/BamWriter.h>
15 #include <utils/bamtools_options.h>
16 #include <utils/bamtools_utilities.h>
17 using namespace BamTools;
33 // --------------------------------------------------------------------------
34 // general ResolveTool constants
35 // --------------------------------------------------------------------------
37 static const int NUM_MODELS = 8;
38 static const string READ_GROUP_TAG = "RG";
39 static const double DEFAULT_CONFIDENCE_INTERVAL = 0.9973;
40 static const uint16_t DEFAULT_MIN_MAPQUALITY = 1;
41 static const double DEFAULT_UNUSEDMODEL_THRESHOLD = 0.1;
43 // --------------------------------------------------------------------------
44 // stats file constants
45 // --------------------------------------------------------------------------
47 // basic char/string constants
48 static const char COMMENT_CHAR = '#';
49 static const char OPEN_BRACE_CHAR = '[';
50 static const char CLOSE_BRACE_CHAR = ']';
51 static const char EQUAL_CHAR = '=';
52 static const char TAB_CHAR = '\t';
54 static const string WHITESPACE_CHARS = " \t\n";
55 static const string TRUE_KEYWORD = "true";
56 static const string FALSE_KEYWORD = "false";
59 static const size_t NUM_OPTIONS_FIELDS = 2;
60 static const size_t NUM_READGROUPS_FIELDS = 7;
63 static const string INPUT_TOKEN = "[Input]";
64 static const string OPTIONS_TOKEN = "[Options]";
65 static const string READGROUPS_TOKEN = "[ReadGroups]";
68 static const string OPTION_CONFIDENCEINTERVAL = "ConfidenceInterval";
69 static const string OPTION_MINIMUMMAPQUALITY = "MinimumMapQuality";
70 static const string OPTION_UNUSEDMODELTHRESHOLD = "UnusedModelThreshold";
71 static const string OPTION_FORCEMARKREADGROUPS = "ForceMarkReadGroups";
73 // other string constants
74 static const string RG_FIELD_DESCRIPTION =
75 "#<name> <medianFL> <minFL> <maxFL> <topModelID> <nextTopModelID> <isAmbiguous?>";
77 // --------------------------------------------------------------------------
78 // unique readname file constants
79 // --------------------------------------------------------------------------
81 static const string READNAME_FILE_SUFFIX = ".uniq_names.txt";
82 static const string DEFAULT_READNAME_FILE = "bt_resolve_TEMP" + READNAME_FILE_SUFFIX;
84 // --------------------------------------------------------------------------
85 // ModelType implementation
91 vector<int32_t> FragmentLengths;
94 ModelType(const uint16_t id)
97 // preallocate space for 10K fragments per model type
98 FragmentLengths.reserve(10000);
101 // convenience access to internal fragment lengths vector
102 vector<int32_t>::iterator begin(void) { return FragmentLengths.begin(); }
103 vector<int32_t>::const_iterator begin(void) const { return FragmentLengths.begin(); }
104 void clear(void) { FragmentLengths.clear(); }
105 vector<int32_t>::iterator end(void) { return FragmentLengths.end(); }
106 vector<int32_t>::const_iterator end(void) const { return FragmentLengths.end(); }
107 void push_back(const int32_t& x) { FragmentLengths.push_back(x); }
108 size_t size(void) const { return FragmentLengths.size(); }
111 static const uint16_t DUMMY_ID;
114 const uint16_t ModelType::DUMMY_ID = 100;
116 bool operator>(const ModelType& lhs, const ModelType& rhs) {
117 return lhs.size() > rhs.size();
120 uint16_t CalculateModelType(const BamAlignment& al) {
122 // localize alignment's mate positions & orientations for convenience
123 const int32_t m1_begin = ( al.IsFirstMate() ? al.Position : al.MatePosition );
124 const int32_t m2_begin = ( al.IsFirstMate() ? al.MatePosition : al.Position );
125 const bool m1_isReverseStrand = ( al.IsFirstMate() ? al.IsReverseStrand() : al.IsMateReverseStrand() );
126 const bool m2_isReverseStrand = ( al.IsFirstMate() ? al.IsMateReverseStrand() : al.IsReverseStrand() );
128 // determine 'model type'
129 if ( m1_begin < m2_begin ) {
130 if ( !m1_isReverseStrand && !m2_isReverseStrand ) return 0; // ID: 1
131 if ( !m1_isReverseStrand && m2_isReverseStrand ) return 1; // ID: 2
132 if ( m1_isReverseStrand && !m2_isReverseStrand ) return 2; // ID: 3
133 if ( m1_isReverseStrand && m2_isReverseStrand ) return 3; // ID: 4
135 if ( !m2_isReverseStrand && !m1_isReverseStrand ) return 4; // ID: 5
136 if ( !m2_isReverseStrand && m1_isReverseStrand ) return 5; // ID: 6
137 if ( m2_isReverseStrand && !m1_isReverseStrand ) return 6; // ID: 7
138 if ( m2_isReverseStrand && m1_isReverseStrand ) return 7; // ID: 8
142 return ModelType::DUMMY_ID;
145 // --------------------------------------------------------------------------
146 // ReadGroupResolver implementation
148 struct ReadGroupResolver {
151 int32_t MinFragmentLength;
152 int32_t MedianFragmentLength;
153 int32_t MaxFragmentLength;
155 uint16_t NextTopModelId;
158 vector<ModelType> Models;
159 map<string, bool> ReadNames;
162 ReadGroupResolver(void);
165 bool IsValidInsertSize(const BamAlignment& al) const;
166 bool IsValidOrientation(const BamAlignment& al) const;
168 // select 2 best models based on observed data
169 void DetermineTopModels(const string& readGroupName);
172 static double ConfidenceInterval;
173 static double UnusedModelThreshold;
174 static void SetConfidenceInterval(const double& ci);
175 static void SetUnusedModelThreshold(const double& umt);
178 double ReadGroupResolver::ConfidenceInterval = DEFAULT_CONFIDENCE_INTERVAL;
179 double ReadGroupResolver::UnusedModelThreshold = DEFAULT_UNUSEDMODEL_THRESHOLD;
181 ReadGroupResolver::ReadGroupResolver(void)
182 : MinFragmentLength(0)
183 , MedianFragmentLength(0)
184 , MaxFragmentLength(0)
185 , TopModelId(ModelType::DUMMY_ID)
186 , NextTopModelId(ModelType::DUMMY_ID)
190 // pre-allocate space for 8 models
191 Models.reserve(NUM_MODELS);
192 for ( uint16_t i = 0; i < NUM_MODELS; ++i )
193 Models.push_back( ModelType(i+1) );
196 bool ReadGroupResolver::IsValidInsertSize(const BamAlignment& al) const {
197 const int32_t absInsertSize = abs(al.InsertSize);
198 return ( absInsertSize >= MinFragmentLength &&
199 absInsertSize <= MaxFragmentLength );
202 bool ReadGroupResolver::IsValidOrientation(const BamAlignment& al) const {
203 const uint16_t currentModelId = CalculateModelType(al) + 1; // convert model type (array index) to ID number
204 return ( currentModelId == TopModelId || currentModelId == NextTopModelId );
207 void ReadGroupResolver::DetermineTopModels(const string& readGroupName) {
209 // sort models (from most common to least common)
210 sort( Models.begin(), Models.end(), std::greater<ModelType>() );
212 // store top 2 models for later
213 TopModelId = Models[0].ID;
214 NextTopModelId = Models[1].ID;
216 // make sure that the 2 most common models are some threshold more common
217 // than the remaining models
218 const unsigned int activeModelCountSum = Models[0].size() + Models[1].size();
219 if ( activeModelCountSum == 0 ) return; // skip if no data in this read group
220 const unsigned int unusedModelCountSum = Models[2].size() + Models[3].size() +
221 Models[4].size() + Models[5].size() +
222 Models[6].size() + Models[7].size();
223 const double unusedPercentage = (double)unusedModelCountSum / (double)activeModelCountSum;
224 if ( unusedPercentage > UnusedModelThreshold ) {
225 cerr << "WARNING: " << readGroupName << " does not have clearly defined 'top models'" << endl
226 << " The fraction of alignments in bottom 6 models (" << unusedPercentage
227 << ") exceeds threshold: " << UnusedModelThreshold << endl;
231 // emit a warning if the best alignment models are non-standard
232 const bool isModel1Top = (TopModelId == 1) || (NextTopModelId == 1);
233 const bool isModel2Top = (TopModelId == 2) || (NextTopModelId == 2);
234 const bool isModel4Top = (TopModelId == 4) || (NextTopModelId == 4);
235 const bool isModel5Top = (TopModelId == 5) || (NextTopModelId == 5);
236 const bool isModel6Top = (TopModelId == 6) || (NextTopModelId == 6);
237 const bool isModel8Top = (TopModelId == 8) || (NextTopModelId == 8);
239 bool isMatePair = ( isModel4Top && isModel5Top ? true : false );
240 bool isPairedEnd = ( isModel2Top && isModel6Top ? true : false );
241 bool isSolidPair = ( isModel1Top && isModel8Top ? true : false );
243 if ( !isMatePair && !isPairedEnd && !isSolidPair ) {
244 cerr << "WARNING: Found a non-standard alignment model configuration. " << endl
245 << " Using alignment models " << TopModelId << " & " << NextTopModelId
249 // store only the fragments from the best alignment models, then sort
250 vector<int32_t> fragments;
251 fragments.reserve( Models[0].size() + Models[1].size() );
252 fragments.insert( fragments.end(), Models[0].begin(), Models[0].end() );
253 fragments.insert( fragments.end(), Models[1].begin(), Models[1].end() );
254 sort ( fragments.begin(), fragments.end() );
256 // clear out Model fragment data, not needed anymore
259 // skip if no fragments found for this read group
260 if ( fragments.empty() ) {
266 // calculate & store the min,median, & max fragment lengths
267 const unsigned int numFragmentLengths = fragments.size();
268 const double halfNonConfidenceInterval = (1.0 - ReadGroupResolver::ConfidenceInterval)/2.0;
269 const unsigned int minIndex = (unsigned int)(numFragmentLengths * halfNonConfidenceInterval);
270 const unsigned int medianIndex = (unsigned int)(numFragmentLengths * 0.5);
271 const unsigned int maxIndex = (unsigned int)(numFragmentLengths * (1.0-halfNonConfidenceInterval));
273 MinFragmentLength = fragments[minIndex];
274 MedianFragmentLength = fragments[medianIndex];
275 MaxFragmentLength = fragments[maxIndex];
278 void ReadGroupResolver::SetConfidenceInterval(const double& ci) {
279 ConfidenceInterval = ci;
282 void ReadGroupResolver::SetUnusedModelThreshold(const double& umt) {
283 UnusedModelThreshold = umt;
286 // --------------------------------------------------------------------------
287 // ResolveSettings implementation
289 struct ResolveTool::ResolveSettings {
297 bool HasInputBamFile;
298 bool HasOutputBamFile;
300 bool IsForceCompression;
302 // resolve option flags
303 bool HasConfidenceInterval;
304 bool HasForceMarkReadGroups;
305 bool HasMinimumMapQuality;
306 bool HasUnusedModelThreshold;
309 string InputBamFilename;
310 string OutputBamFilename;
311 string StatsFilename;
312 string ReadNamesFilename; // ** N.B. - Only used internally, not set from cmdline **
315 double ConfidenceInterval;
316 uint16_t MinimumMapQuality;
317 double UnusedModelThreshold;
320 ResolveSettings(void)
324 , HasInputBamFile(false)
325 , HasOutputBamFile(false)
326 , HasStatsFile(false)
327 , IsForceCompression(false)
328 , HasConfidenceInterval(false)
329 , HasForceMarkReadGroups(false)
330 , HasMinimumMapQuality(false)
331 , HasUnusedModelThreshold(false)
332 , InputBamFilename(Options::StandardIn())
333 , OutputBamFilename(Options::StandardOut())
335 , ReadNamesFilename(DEFAULT_READNAME_FILE)
336 , ConfidenceInterval(DEFAULT_CONFIDENCE_INTERVAL)
337 , MinimumMapQuality(DEFAULT_MIN_MAPQUALITY)
338 , UnusedModelThreshold(DEFAULT_UNUSEDMODEL_THRESHOLD)
342 // --------------------------------------------------------------------------
343 // ReadNamesFileReader implementation
345 struct ResolveTool::ReadNamesFileReader {
348 ReadNamesFileReader(void) { }
349 ~ReadNamesFileReader(void) { Close(); }
351 // main reader interface
354 bool Open(const string& filename);
355 bool Read(map<string, ReadGroupResolver>& readGroups);
362 void ResolveTool::ReadNamesFileReader::Close(void) {
363 if ( m_stream.is_open() )
367 bool ResolveTool::ReadNamesFileReader::Open(const string& filename) {
369 // make sure stream is fresh
372 // attempt to open filename, return status
373 m_stream.open(filename.c_str(), ifstream::in);
374 return m_stream.good();
377 bool ResolveTool::ReadNamesFileReader::Read(map<string, ReadGroupResolver>& readGroups) {
379 // up-front sanity check
380 if ( !m_stream.is_open() ) return false;
382 // parse read names file
384 vector<string> fields;
385 map<string, ReadGroupResolver>::iterator rgIter;
386 map<string, ReadGroupResolver>::iterator rgEnd = readGroups.end();
387 while ( getline(m_stream, line) ) {
389 // skip if empty line
390 if ( line.empty() ) continue;
392 // split line on '\t'
393 fields = Utilities::Split(line, TAB_CHAR);
394 if ( fields.size() != 2 ) continue;
396 // look up resolver for read group
397 rgIter = readGroups.find( fields[0] );
398 if ( rgIter == rgEnd ) return false;
399 ReadGroupResolver& resolver = (*rgIter).second;
401 // store read name with resolver
402 resolver.ReadNames.insert( make_pair<string,bool>(fields[1], true) ) ;
405 // if here, return success
409 // --------------------------------------------------------------------------
410 // ReadNamesFileWriter implementation
412 struct ResolveTool::ReadNamesFileWriter {
415 ReadNamesFileWriter(void) { }
416 ~ReadNamesFileWriter(void) { Close(); }
418 // main reader interface
421 bool Open(const string& filename);
422 void Write(const string& readGroupName, const string& readName);
429 void ResolveTool::ReadNamesFileWriter::Close(void) {
430 if ( m_stream.is_open() )
434 bool ResolveTool::ReadNamesFileWriter::Open(const string& filename) {
436 // make sure stream is fresh
439 // attempt to open filename, return status
440 m_stream.open(filename.c_str(), ofstream::out);
441 return m_stream.good();
444 void ResolveTool::ReadNamesFileWriter::Write(const string& readGroupName,
445 const string& readName)
447 m_stream << readGroupName << TAB_CHAR << readName << endl;
450 // --------------------------------------------------------------------------
451 // StatsFileReader implementation
453 struct ResolveTool::StatsFileReader {
457 StatsFileReader(void) { }
458 ~StatsFileReader(void) { Close(); }
460 // main reader interface
463 bool Open(const string& filename);
464 bool Read(ResolveTool::ResolveSettings* settings,
465 map<string, ReadGroupResolver>& readGroups);
469 bool IsComment(const string& line) const;
470 bool IsWhitespace(const string& line) const;
471 bool ParseInputLine(const string& line);
472 bool ParseOptionLine(const string& line, ResolveTool::ResolveSettings* settings);
473 bool ParseReadGroupLine(const string& line, map<string, ReadGroupResolver>& readGroups);
474 string SkipCommentsAndWhitespace(void);
480 enum State { None = 0
486 void ResolveTool::StatsFileReader::Close(void) {
487 if ( m_stream.is_open() )
491 bool ResolveTool::StatsFileReader::IsComment(const string& line) const {
492 assert( !line.empty() );
493 return ( line.at(0) == COMMENT_CHAR );
496 bool ResolveTool::StatsFileReader::IsWhitespace(const string& line) const {
499 return ( isspace(line.at(0)) );
502 bool ResolveTool::StatsFileReader::Open(const string& filename) {
504 // make sure stream is fresh
507 // attempt to open filename, return status
508 m_stream.open(filename.c_str(), ifstream::in);
509 return m_stream.good();
512 bool ResolveTool::StatsFileReader::ParseInputLine(const string& /*line*/) {
513 // input lines are ignored (for now at least), tool will use input from command line
517 bool ResolveTool::StatsFileReader::ParseOptionLine(const string& line,
518 ResolveTool::ResolveSettings* settings)
520 // split line into option, value
521 vector<string> fields = Utilities::Split(line, EQUAL_CHAR);
522 if ( fields.size() != NUM_OPTIONS_FIELDS )
524 const string& option = fields.at(0);
525 stringstream value(fields.at(1));
527 // -----------------------------------
528 // handle option based on keyword
530 // ConfidenceInterval
531 if ( option == OPTION_CONFIDENCEINTERVAL ) {
532 value >> settings->ConfidenceInterval;
533 settings->HasConfidenceInterval = true;
537 // ForceMarkReadGroups
538 if ( option == OPTION_FORCEMARKREADGROUPS ) {
539 value >> settings->HasForceMarkReadGroups;
544 if ( option == OPTION_MINIMUMMAPQUALITY ) {
545 value >> settings->MinimumMapQuality;
546 settings->HasMinimumMapQuality = true;
550 // UnusedModelThreshold
551 if ( option == OPTION_UNUSEDMODELTHRESHOLD ) {
552 value >> settings->UnusedModelThreshold;
553 settings->HasUnusedModelThreshold = true;
557 // otherwise unknown option
558 cerr << "bamtools resolve ERROR - unrecognized option: " << option << " in stats file" << endl;
562 bool ResolveTool::StatsFileReader::ParseReadGroupLine(const string& line,
563 map<string, ReadGroupResolver>& readGroups)
565 // split read group data in to fields
566 vector<string> fields = Utilities::Split(line, WHITESPACE_CHARS);
567 if ( fields.size() != NUM_READGROUPS_FIELDS ) return false;
570 const string& name = fields.at(0);
572 // populate RG's 'resolver' data
573 ReadGroupResolver resolver;
575 stringstream dataStream;
576 dataStream.str(fields.at(1));
577 dataStream >> resolver.MedianFragmentLength;
580 dataStream.str(fields.at(2));
581 dataStream >> resolver.MinFragmentLength;
584 dataStream.str(fields.at(3));
585 dataStream >> resolver.MaxFragmentLength;
588 dataStream.str(fields.at(4));
589 dataStream >> resolver.TopModelId;
592 dataStream.str(fields.at(5));
593 dataStream >> resolver.NextTopModelId;
596 resolver.IsAmbiguous = ( fields.at(6) == TRUE_KEYWORD );
598 // store RG entry and return success
599 readGroups.insert( make_pair<string, ReadGroupResolver>(name, resolver) );
603 bool ResolveTool::StatsFileReader::Read(ResolveTool::ResolveSettings* settings,
604 map<string, ReadGroupResolver>& readGroups)
606 // up-front sanity checks
607 if ( !m_stream.is_open() || settings == 0 )
610 // clear out read group data
614 State currentState = StatsFileReader::None;
617 string line = SkipCommentsAndWhitespace();
618 while ( !line.empty() ) {
620 bool foundError = false;
622 // switch state on keyword found
623 if ( Utilities::StartsWith(line, INPUT_TOKEN) )
624 currentState = StatsFileReader::InInput;
625 else if ( Utilities::StartsWith(line, OPTIONS_TOKEN) )
626 currentState = StatsFileReader::InOptions;
627 else if ( Utilities::StartsWith(line, READGROUPS_TOKEN) )
628 currentState = StatsFileReader::InReadGroups;
630 // otherwise parse data line, depending on state
632 if ( currentState == StatsFileReader::InInput )
633 foundError = !ParseInputLine(line);
634 else if ( currentState == StatsFileReader::InOptions )
635 foundError = !ParseOptionLine(line, settings);
636 else if ( currentState == StatsFileReader::InReadGroups )
637 foundError = !ParseReadGroupLine(line, readGroups);
642 // break out if error found
647 line = SkipCommentsAndWhitespace();
650 // if here, return success
654 string ResolveTool::StatsFileReader::SkipCommentsAndWhitespace(void) {
657 if ( m_stream.eof() )
659 getline(m_stream, line);
660 } while ( IsWhitespace(line) || IsComment(line) );
664 // --------------------------------------------------------------------------
665 // StatsFileReader implementation
667 struct ResolveTool::StatsFileWriter {
671 StatsFileWriter(void) { }
672 ~StatsFileWriter(void) { Close(); }
674 // main reader interface
677 bool Open(const string& filename);
678 bool Write(ResolveTool::ResolveSettings* settings,
679 const map<string, ReadGroupResolver>& readGroups);
683 void WriteHeader(void);
684 void WriteInput(ResolveTool::ResolveSettings* settings);
685 void WriteOptions(ResolveTool::ResolveSettings* settings);
686 void WriteReadGroups(const map<string, ReadGroupResolver>& readGroups);
693 void ResolveTool::StatsFileWriter::Close(void) {
694 if ( m_stream.is_open() )
698 bool ResolveTool::StatsFileWriter::Open(const string& filename) {
700 // make sure stream is fresh
703 // attempt to open filename, return status
704 m_stream.open(filename.c_str(), ofstream::out);
705 return m_stream.good();
708 bool ResolveTool::StatsFileWriter::Write(ResolveTool::ResolveSettings* settings,
709 const map<string, ReadGroupResolver>& readGroups)
711 // return failure if file not open
712 if ( !m_stream.is_open() )
715 // write stats file elements
717 WriteInput(settings);
718 WriteOptions(settings);
719 WriteReadGroups(readGroups);
725 void ResolveTool::StatsFileWriter::WriteHeader(void) {
727 // stringify current bamtools version
728 stringstream versionStream("");
730 << BAMTOOLS_VERSION_MAJOR << "."
731 << BAMTOOLS_VERSION_MINOR << "."
732 << BAMTOOLS_VERSION_BUILD;
734 // # bamtools resolve (vX.Y.Z)
737 m_stream << COMMENT_CHAR << " bamtools resolve (" << versionStream.str() << ")" << endl
741 void ResolveTool::StatsFileWriter::WriteInput(ResolveTool::ResolveSettings* settings) {
747 m_stream << INPUT_TOKEN << endl
748 << settings->InputBamFilename << endl
752 void ResolveTool::StatsFileWriter::WriteOptions(ResolveTool::ResolveSettings* settings) {
755 // ConfidenceInterval=<double>
756 // ForceMarkReadGroups=<true|false>
757 // MinimumMapQuality=<uint16_t>
758 // UnusedModelThreshold=<double>
761 m_stream << OPTIONS_TOKEN << endl
762 << OPTION_CONFIDENCEINTERVAL << EQUAL_CHAR << settings->ConfidenceInterval << endl
763 << OPTION_MINIMUMMAPQUALITY << EQUAL_CHAR << settings->MinimumMapQuality << endl
764 << OPTION_FORCEMARKREADGROUPS << EQUAL_CHAR << boolalpha << settings->HasForceMarkReadGroups << endl
765 << OPTION_UNUSEDMODELTHRESHOLD << EQUAL_CHAR << settings->UnusedModelThreshold << endl
769 void ResolveTool::StatsFileWriter::WriteReadGroups(const map<string, ReadGroupResolver>& readGroups) {
772 // #<name> <medianFL> <minFL> <maxFL> <topModelID> <nextTopModelID> <isAmbiguous?>
773 m_stream << READGROUPS_TOKEN << endl
774 << RG_FIELD_DESCRIPTION << endl;
776 // iterate over read groups
777 map<string, ReadGroupResolver>::const_iterator rgIter = readGroups.begin();
778 map<string, ReadGroupResolver>::const_iterator rgEnd = readGroups.end();
779 for ( ; rgIter != rgEnd; ++rgIter ) {
780 const string& name = (*rgIter).first;
781 const ReadGroupResolver& resolver = (*rgIter).second;
783 // skip if read group has no data
784 if ( !resolver.HasData )
787 // write read group data
788 m_stream << name << TAB_CHAR
789 << resolver.MedianFragmentLength << TAB_CHAR
790 << resolver.MinFragmentLength << TAB_CHAR
791 << resolver.MaxFragmentLength << TAB_CHAR
792 << resolver.TopModelId << TAB_CHAR
793 << resolver.NextTopModelId << TAB_CHAR
794 << boolalpha << resolver.IsAmbiguous
798 // extra newline at end
802 // --------------------------------------------------------------------------
803 // ResolveToolPrivate implementation
805 struct ResolveTool::ResolveToolPrivate {
809 ResolveToolPrivate(ResolveTool::ResolveSettings* settings)
810 : m_settings(settings)
812 ~ResolveToolPrivate(void) { }
814 // 'public' interface
820 bool CheckSettings(vector<string>& errors);
821 bool MakeStats(void);
822 void ParseHeader(const SamHeader& header);
823 bool ReadStatsFile(void);
824 void ResolveAlignment(BamAlignment& al);
825 bool ResolvePairs(void);
826 bool WriteStatsFile(void);
830 ResolveTool::ResolveSettings* m_settings;
831 map<string, ReadGroupResolver> m_readGroups;
834 bool ResolveTool::ResolveToolPrivate::CheckSettings(vector<string>& errors) {
836 // ensure clean slate
840 if ( m_settings->IsMakeStats ) {
843 if ( m_settings->IsMarkPairs )
844 errors.push_back("Cannot run in both -makeStats & -markPairs modes. Please select ONE.");
845 if ( m_settings->IsTwoPass )
846 errors.push_back("Cannot run in both -makeStats & -twoPass modes. Please select ONE.");
848 // error if output BAM options supplied
849 if ( m_settings->HasOutputBamFile )
850 errors.push_back("Cannot use -out (output BAM file) in -makeStats mode.");
851 if ( m_settings->IsForceCompression )
852 errors.push_back("Cannot use -forceCompression. No output BAM file is being generated.");
854 // make sure required stats file supplied
855 if ( !m_settings->HasStatsFile )
856 errors.push_back("Ouptut stats filename required for -makeStats mode. Please specify one using -stats option.");
858 // check for UseStats options
859 if ( m_settings->HasForceMarkReadGroups )
860 errors.push_back("Cannot use -forceMarkReadGroups. -markPairs options are DISABLED in -makeStats mode.");
864 else if ( m_settings->IsMarkPairs ) {
867 if ( m_settings->IsMakeStats )
868 errors.push_back("Cannot run in both -makeStats & -markPairs modes. Please select ONE.");
869 if ( m_settings->IsTwoPass )
870 errors.push_back("Cannot run in both -markPairs & -twoPass modes. Please select ONE.");
872 // make sure required stats file supplied
873 if ( !m_settings->HasStatsFile )
874 errors.push_back("Input stats filename required for -markPairs mode. Please specify one using -stats option.");
876 // check for MakeStats options
877 if ( m_settings->HasConfidenceInterval )
878 errors.push_back("Cannot use -ci. -makeStats options are DISABLED is -markPairs mode.");
882 else if ( m_settings->IsTwoPass ) {
885 if ( m_settings->IsMakeStats )
886 errors.push_back("Cannot run in both -makeStats & -twoPass modes. Please select ONE.");
887 if ( m_settings->IsMarkPairs )
888 errors.push_back("Cannot run in both -markPairs & -twoPass modes. Please select ONE.");
890 // make sure input is file not stdin
891 if ( !m_settings->HasInputBamFile || m_settings->InputBamFilename == Options::StandardIn() )
892 errors.push_back("Cannot run -twoPass mode with BAM data from stdin. Please specify existing file using -in option.");
897 errors.push_back("No resolve mode specified. Please select ONE of the following: -makeStats, -markPairs, or -twoPass. See help for more info.");
899 // boundary checks on values
900 if ( m_settings->HasConfidenceInterval ) {
901 if ( m_settings->ConfidenceInterval < 0.0 || m_settings->ConfidenceInterval > 1.0 )
902 errors.push_back("Invalid confidence interval. Must be between 0 and 1");
904 if ( m_settings->HasMinimumMapQuality ) {
905 if ( m_settings->MinimumMapQuality >= 256 )
906 errors.push_back("Invalid minimum map quality. Must be between 0 and 255");
908 if ( m_settings->HasUnusedModelThreshold ) {
909 if ( m_settings->UnusedModelThreshold < 0.0 || m_settings->UnusedModelThreshold > 1.0 )
910 errors.push_back("Invalid unused model threshold. Must be between 0 and 1");
913 // return success if no errors found
914 return ( errors.empty() );
917 bool ResolveTool::ResolveToolPrivate::MakeStats(void) {
919 // pull resolver settings from command-line settings
920 ReadGroupResolver::SetConfidenceInterval(m_settings->ConfidenceInterval);
921 ReadGroupResolver::SetUnusedModelThreshold(m_settings->UnusedModelThreshold);
923 // open our BAM reader
925 if ( !bamReader.Open(m_settings->InputBamFilename) ) {
926 cerr << "bamtools resolve ERROR: could not open input BAM file: "
927 << m_settings->InputBamFilename << endl;
931 // retrieve header & parse for read groups
932 const SamHeader& header = bamReader.GetHeader();
935 // open ReadNamesFileWriter
936 ResolveTool::ReadNamesFileWriter readNamesWriter;
937 if ( !readNamesWriter.Open(m_settings->ReadNamesFilename) ) {
938 cerr << "bamtools resolve ERROR: could not open (temp) output read names file: "
939 << m_settings->ReadNamesFilename << endl;
944 // read through BAM file
946 string readGroup("");
947 map<string, ReadGroupResolver>::iterator rgIter;
948 map<string, bool>::iterator readNameIter;
949 while ( bamReader.GetNextAlignmentCore(al) ) {
951 // skip if alignment is not paired, mapped, nor mate is mapped
952 if ( !al.IsPaired() || !al.IsMapped() || !al.IsMateMapped() )
955 // skip if alignment & mate not on same reference sequence
956 if ( al.RefID != al.MateRefID ) continue;
958 // flesh out the char data, so we can retrieve its read group ID
961 // get read group from alignment (OK if empty)
963 al.GetTag(READ_GROUP_TAG, readGroup);
965 // look up resolver for read group
966 rgIter = m_readGroups.find(readGroup);
967 if ( rgIter == m_readGroups.end() ) {
968 cerr << "bamtools resolve ERROR - unable to calculate stats, unknown read group encountered: "
969 << readGroup << endl;
973 ReadGroupResolver& resolver = (*rgIter).second;
975 // determine unique-ness of current alignment
976 const bool isCurrentMateUnique = ( al.MapQuality >= m_settings->MinimumMapQuality );
979 readNameIter = resolver.ReadNames.find(al.Name);
981 // if read name found (current alignment's mate already parsed)
982 if ( readNameIter != resolver.ReadNames.end() ) {
984 // if both unique mates are unique, store read name & insert size for later
985 const bool isStoredMateUnique = (*readNameIter).second;
986 if ( isCurrentMateUnique && isStoredMateUnique ) {
988 // save read name in temp file as candidates for later pair marking
989 readNamesWriter.Write(readGroup, al.Name);
991 // determine model type & store fragment length for stats calculation
992 const uint16_t currentModelType = CalculateModelType(al);
993 assert( currentModelType != ModelType::DUMMY_ID );
994 resolver.Models[currentModelType].push_back( abs(al.InsertSize) );
997 // unique or not, remove read name from map
998 resolver.ReadNames.erase(readNameIter);
1001 // if read name not found, store new entry
1002 else resolver.ReadNames.insert( make_pair<string, bool>(al.Name, isCurrentMateUnique) );
1006 readNamesWriter.Close();
1009 // iterate back through read groups
1010 map<string, ReadGroupResolver>::iterator rgEnd = m_readGroups.end();
1011 for ( rgIter = m_readGroups.begin(); rgIter != rgEnd; ++rgIter ) {
1012 const string& name = (*rgIter).first;
1013 ReadGroupResolver& resolver = (*rgIter).second;
1015 // calculate acceptable orientation & insert sizes for this read group
1016 resolver.DetermineTopModels(name);
1018 // clear out left over read names
1019 // (these have mates that did not pass filters or were already removed as non-unique)
1020 resolver.ReadNames.clear();
1023 // if we get here, return success
1027 void ResolveTool::ResolveToolPrivate::ParseHeader(const SamHeader& header) {
1029 // iterate over header read groups, creating a 'resolver' for each
1030 SamReadGroupConstIterator rgIter = header.ReadGroups.ConstBegin();
1031 SamReadGroupConstIterator rgEnd = header.ReadGroups.ConstEnd();
1032 for ( ; rgIter != rgEnd; ++rgIter ) {
1033 const SamReadGroup& rg = (*rgIter);
1034 m_readGroups.insert( make_pair<string, ReadGroupResolver>(rg.ID, ReadGroupResolver()) );
1038 bool ResolveTool::ResolveToolPrivate::ReadStatsFile(void) {
1040 // skip if no filename provided
1041 if ( m_settings->StatsFilename.empty() )
1044 // attempt to open stats file
1045 ResolveTool::StatsFileReader statsReader;
1046 if ( !statsReader.Open(m_settings->StatsFilename) ) {
1047 cerr << "bamtools resolve ERROR - could not open stats file: "
1048 << m_settings->StatsFilename << " for reading" << endl;
1052 // attempt to read stats data
1053 if ( !statsReader.Read(m_settings, m_readGroups) ) {
1054 cerr << "bamtools resolve ERROR - could not parse stats file: "
1055 << m_settings->StatsFilename << " for data" << endl;
1063 void ResolveTool::ResolveToolPrivate::ResolveAlignment(BamAlignment& al) {
1065 // clear proper-pair flag
1066 al.SetIsProperPair(false);
1068 // quit check if alignment is not from paired-end read
1069 if ( !al.IsPaired() ) return;
1071 // quit check if either alignment or its mate are unmapped
1072 if ( !al.IsMapped() || !al.IsMateMapped() ) return;
1074 // quit check if alignment & its mate are on differenct references
1075 if ( al.RefID != al.MateRefID ) return;
1077 // quit check if map quality less than cutoff
1078 if ( al.MapQuality < m_settings->MinimumMapQuality ) return;
1080 // get read group from alignment
1081 // empty string if not found, this is OK - we handle empty read group case
1082 string readGroupName("");
1083 al.GetTag(READ_GROUP_TAG, readGroupName);
1085 // look up read group's 'resolver'
1086 map<string, ReadGroupResolver>::iterator rgIter = m_readGroups.find(readGroupName);
1087 if ( rgIter == m_readGroups.end() ) {
1088 cerr << "bamtools resolve ERROR - read group found that was not in header: "
1089 << readGroupName << endl;
1092 const ReadGroupResolver& resolver = (*rgIter).second;
1094 // quit check if pairs are not in proper orientation (can differ for each RG)
1095 if ( !resolver.IsValidOrientation(al) ) return;
1097 // quit check if pairs are not within "reasonable" distance (can differ for each RG)
1098 if ( !resolver.IsValidInsertSize(al) ) return;
1100 // quit check if alignment is not a "candidate proper pair"
1101 map<string, bool>::const_iterator readNameIter;
1102 readNameIter = resolver.ReadNames.find(al.Name);
1103 if ( readNameIter == resolver.ReadNames.end() )
1106 // if we get here, alignment is OK - set 'proper pair' flag
1107 al.SetIsProperPair(true);
1110 bool ResolveTool::ResolveToolPrivate::ResolvePairs(void) {
1112 // open file containing read names of candidate proper pairs
1113 ResolveTool::ReadNamesFileReader readNamesReader;
1114 if ( !readNamesReader.Open(m_settings->ReadNamesFilename) ) {
1115 cerr << "bamtools resolve ERROR: could not open (temp) inputput read names file: "
1116 << m_settings->ReadNamesFilename << endl;
1120 // parse read names (matching with corresponding read groups)
1121 if ( !readNamesReader.Read(m_readGroups) ) {
1122 cerr << "bamtools resolve ERROR: could not read candidate read names from file: "
1123 << m_settings->ReadNamesFilename << endl;
1124 readNamesReader.Close();
1128 // close read name file reader & delete temp file
1129 readNamesReader.Close();
1130 if ( remove(m_settings->ReadNamesFilename.c_str()) != 0 ) {
1131 cerr << "bamtools resolve WARNING: could not delete temp file: "
1132 << m_settings->ReadNamesFilename << endl;
1135 // open our BAM reader
1137 if ( !reader.Open(m_settings->InputBamFilename) ) {
1138 cerr << "bamtools resolve ERROR: could not open input BAM file: "
1139 << m_settings->InputBamFilename << endl;
1143 // retrieve header & reference dictionary info
1144 const SamHeader& header = reader.GetHeader();
1145 const RefVector& references = reader.GetReferenceData();
1147 // determine compression mode for BamWriter
1148 bool writeUncompressed = ( m_settings->OutputBamFilename == Options::StandardOut() &&
1149 !m_settings->IsForceCompression );
1150 BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
1151 if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed;
1155 writer.SetCompressionMode(compressionMode);
1156 if ( !writer.Open(m_settings->OutputBamFilename, header, references) ) {
1157 cerr << "bamtools resolve ERROR: could not open "
1158 << m_settings->OutputBamFilename << " for writing." << endl;
1163 // plow through alignments, setting/clearing 'proper pair' flag
1164 // and writing to new output BAM file
1166 while ( reader.GetNextAlignment(al) ) {
1167 ResolveAlignment(al);
1168 writer.SaveAlignment(al);
1171 // clean up & return success
1177 bool ResolveTool::ResolveToolPrivate::Run(void) {
1179 // verify that command line settings are acceptable
1180 vector<string> errors;
1181 if ( !CheckSettings(errors) ) {
1182 cerr << "bamtools resolve ERROR - invalid settings: " << endl;
1183 vector<string>::const_iterator errorIter = errors.begin();
1184 vector<string>::const_iterator errorEnd = errors.end();
1185 for ( ; errorIter != errorEnd; ++errorIter )
1186 cerr << (*errorIter) << endl;
1190 // initialize read group map with default (empty name) read group
1191 m_readGroups.insert( make_pair<string, ReadGroupResolver>("", ReadGroupResolver()) );
1193 // init readname filename
1194 // uses (adjusted) stats filename if provided (req'd for makeStats, markPairs modes; optional for twoPass)
1195 // else keep default filename
1196 if ( m_settings->HasStatsFile )
1197 m_settings->ReadNamesFilename = m_settings->StatsFilename + READNAME_FILE_SUFFIX;
1200 if ( m_settings->IsMakeStats ) {
1202 // generate stats data
1203 if ( !MakeStats() ) {
1204 cerr << "bamtools resolve ERROR - could not generate stats" << endl;
1208 // write stats to file
1209 if ( !WriteStatsFile() ) {
1210 cerr << "bamtools resolve ERROR - could not write stats file: "
1211 << m_settings->StatsFilename << endl;
1217 else if ( m_settings->IsMarkPairs ) {
1219 // read stats from file
1220 if ( !ReadStatsFile() ) {
1221 cerr << "bamtools resolve ERROR - could not read stats file: "
1222 << m_settings->StatsFilename << endl;
1226 // do paired-end resolution
1227 if ( !ResolvePairs() ) {
1228 cerr << "bamtools resolve ERROR - could not resolve pairs" << endl;
1236 // generate stats data
1237 if ( !MakeStats() ) {
1238 cerr << "bamtools resolve ERROR - could not generate stats" << endl;
1242 // if stats file requested
1243 if ( m_settings->HasStatsFile ) {
1245 // write stats to file
1246 // emit warning if write fails, but paired-end resolution should be allowed to proceed
1247 if ( !WriteStatsFile() )
1248 cerr << "bamtools resolve WARNING - could not write stats file: "
1249 << m_settings->StatsFilename << endl;
1252 // do paired-end resolution
1253 if ( !ResolvePairs() ) {
1254 cerr << "bamtools resolve ERROR - could not resolve pairs" << endl;
1263 bool ResolveTool::ResolveToolPrivate::WriteStatsFile(void) {
1265 // skip if no filename provided
1266 if ( m_settings->StatsFilename.empty() )
1269 // attempt to open stats file
1270 ResolveTool::StatsFileWriter statsWriter;
1271 if ( !statsWriter.Open(m_settings->StatsFilename) ) {
1272 cerr << "bamtools resolve ERROR - could not open stats file: "
1273 << m_settings->StatsFilename << " for writing" << endl;
1277 // attempt to write stats data
1278 if ( !statsWriter.Write(m_settings, m_readGroups) ) {
1279 cerr << "bamtools resolve ERROR - could not write stats file: "
1280 << m_settings->StatsFilename << " for data" << endl;
1288 // --------------------------------------------------------------------------
1289 // ResolveTool implementation
1291 ResolveTool::ResolveTool(void)
1293 , m_settings(new ResolveSettings)
1296 // set description texts
1297 const string programDescription = "resolves paired-end reads (marking the IsProperPair flag as needed)";
1298 const string programUsage = "<mode> [options] [-in <filename>] [-out <filename> | [-forceCompression] ] [-stats <filename>]";
1299 const string inputBamDescription = "the input BAM file(s)";
1300 const string outputBamDescription = "the output BAM file";
1301 const string statsFileDescription = "input/output stats file, depending on selected mode (see below). "
1302 "This file is human-readable, storing fragment length data generated per read group, as well as "
1303 "the options used to configure the -makeStats mode";
1304 const string forceCompressionDescription = "if results are sent to stdout (like when piping to another tool), "
1305 "default behavior is to leave output uncompressed."
1306 "Use this flag to override and force compression. This feature is disabled in -makeStats mode.";
1307 const string makeStatsDescription = "generates a fragment-length stats file from the input BAM. "
1308 "Data is written to file specified using the -stats option. "
1309 "MarkPairs Mode Settings are DISABLED.";
1310 const string markPairsDescription = "generates an output BAM with alignments marked with proper-pair status. "
1311 "Stats data is read from file specified using the -stats option. "
1312 "MakeStats Mode Settings are DISABLED";
1313 const string twoPassDescription = "combines the -makeStats & -markPairs modes into a single command. "
1314 "However, due to the two-pass nature of paired-end resolution, piping BAM data via stdin is DISABLED. "
1315 "You must supply an explicit input BAM file. Output BAM may be piped to stdout, however, if desired. "
1316 "All MakeStats & MarkPairs Mode Settings are available. "
1317 "The intermediate stats file is not necessary, but if the -stats options is used, then one will be generated. "
1318 "You may find this useful for documentation purposes.";
1319 const string minMapQualDescription = "minimum map quality. Used in -makeStats mode as a heuristic for determining a mate's "
1320 "uniqueness. Used in -markPairs mode as a filter for marking candidate proper pairs.";
1321 const string confidenceIntervalDescription = "confidence interval. Set min/max fragment lengths such that we capture "
1322 "this fraction of pairs";
1323 const string unusedModelThresholdDescription = "unused model threshold. The resolve tool considers 8 possible orientation models "
1324 "for pairs. The top 2 are selected for later use when actually marking alignments. This value determines the "
1325 "cutoff for marking a read group as ambiguous. Meaning that if the ratio of the number of alignments from bottom 6 models "
1326 "to the top 2 is greater than this threshold, then the read group is flagged as ambiguous. By default, NO alignments "
1327 "from ambiguous read groups will be marked as proper pairs. You may override this behavior with the -force option "
1328 "in -markPairs mode";
1329 const string forceMarkDescription = "forces all read groups to be marked according to their top 2 'orientation models'. "
1330 "When generating stats, the 2 (out of 8 possible) models with the most observations are chosen as the top models for each read group. "
1331 "If the remaining 6 models account for more than some threshold ([default=10%], see -umt), then the read group is marked as ambiguous. "
1332 "The default behavior is that for an ambiguous read group, NONE of its alignments are marked as proper-pairs. "
1333 "By setting this option, a read group's ambiguity flag will be ignored, and all of its alignments will be compared to the top 2 models.";
1335 // set program details
1336 Options::SetProgramInfo("bamtools resolve", programDescription, programUsage);
1338 // set up I/O options
1339 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
1340 Options::AddValueOption("-in", "BAM filename", inputBamDescription, "",
1341 m_settings->HasInputBamFile, m_settings->InputBamFilename,
1342 IO_Opts, Options::StandardIn());
1343 Options::AddValueOption("-out", "BAM filename", outputBamDescription, "",
1344 m_settings->HasOutputBamFile, m_settings->OutputBamFilename,
1345 IO_Opts, Options::StandardOut());
1346 Options::AddValueOption("-stats", "STATS filename", statsFileDescription, "",
1347 m_settings->HasStatsFile, m_settings->StatsFilename, IO_Opts);
1348 Options::AddOption("-forceCompression", forceCompressionDescription,
1349 m_settings->IsForceCompression, IO_Opts);
1351 OptionGroup* ModeOpts = Options::CreateOptionGroup("Resolve Modes (must select ONE of the following)");
1352 Options::AddOption("-makeStats", makeStatsDescription, m_settings->IsMakeStats, ModeOpts);
1353 Options::AddOption("-markPairs", markPairsDescription, m_settings->IsMarkPairs, ModeOpts);
1354 Options::AddOption("-twoPass", twoPassDescription, m_settings->IsTwoPass, ModeOpts);
1356 OptionGroup* GeneralOpts = Options::CreateOptionGroup("General Resolve Options (available in all modes)");
1357 Options::AddValueOption("-minMQ", "unsigned short", minMapQualDescription, "",
1358 m_settings->HasMinimumMapQuality, m_settings->MinimumMapQuality, GeneralOpts);
1360 OptionGroup* MakeStatsOpts = Options::CreateOptionGroup("MakeStats Mode Options (disabled in -markPairs mode)");
1361 Options::AddValueOption("-ci", "double", confidenceIntervalDescription, "",
1362 m_settings->HasConfidenceInterval, m_settings->ConfidenceInterval, MakeStatsOpts);
1363 Options::AddValueOption("-umt", "double", unusedModelThresholdDescription, "",
1364 m_settings->HasUnusedModelThreshold, m_settings->UnusedModelThreshold, MakeStatsOpts);
1366 OptionGroup* MarkPairsOpts = Options::CreateOptionGroup("MarkPairs Mode Options (disabled in -makeStats mode)");
1367 Options::AddOption("-force", forceMarkDescription, m_settings->HasForceMarkReadGroups, MarkPairsOpts);
1370 ResolveTool::~ResolveTool(void) {
1379 int ResolveTool::Help(void) {
1380 Options::DisplayHelp();
1384 int ResolveTool::Run(int argc, char* argv[]) {
1386 // parse command line arguments
1387 Options::Parse(argc, argv, 1);
1389 // initialize ResolveTool
1390 m_impl = new ResolveToolPrivate(m_settings);
1392 // run ResolveTool, return success/failure
1393 if ( m_impl->Run() )