1 // ***************************************************************************
2 // bamtools_resolve.cpp (c) 2011
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 14 June 2011
7 // ---------------------------------------------------------------------------
8 // Resolves paired-end reads (marking the IsProperPair flag as needed).
9 // ***************************************************************************
11 #include "bamtools_resolve.h"
12 #include "bamtools_version.h"
13 #include <api/BamReader.h>
14 #include <api/BamWriter.h>
15 #include <utils/bamtools_options.h>
16 #include <utils/bamtools_utilities.h>
17 using namespace BamTools;
32 // --------------------------------------------------------------------------
33 // general ResolveTool constants
34 // --------------------------------------------------------------------------
36 static const int NUM_MODELS = 8;
37 static const string READ_GROUP_TAG = "RG";
38 static const double DEFAULT_CONFIDENCE_INTERVAL = 0.9973;
39 static const double DEFAULT_UNUSEDMODEL_THRESHOLD = 0.1;
41 // --------------------------------------------------------------------------
42 // stats file constants
43 // --------------------------------------------------------------------------
45 // basic char/string constants
46 static const char COMMENT_CHAR = '#';
47 static const char OPEN_BRACE_CHAR = '[';
48 static const char CLOSE_BRACE_CHAR = ']';
49 static const char EQUAL_CHAR = '=';
50 static const char TAB_CHAR = '\t';
52 static const string WHITESPACE_CHARS = " \t\n";
53 static const string TRUE_KEYWORD = "true";
54 static const string FALSE_KEYWORD = "false";
57 static const size_t NUM_OPTIONS_FIELDS = 2;
58 static const size_t NUM_READGROUPS_FIELDS = 7;
61 static const string INPUT_TOKEN = "[Input]";
62 static const string OPTIONS_TOKEN = "[Options]";
63 static const string READGROUPS_TOKEN = "[ReadGroups]";
66 static const string OPTION_CONFIDENCEINTERVAL = "ConfidenceInterval";
67 static const string OPTION_UNUSEDMODELTHRESHOLD = "UnusedModelThreshold";
68 static const string OPTION_FORCEMARKREADGROUPS = "ForceMarkReadGroups";
70 // other string constants
71 static const string RG_FIELD_DESCRIPTION =
72 "#<name> <medianFL> <minFL> <maxFL> <topModelID> <nextTopModelID> <isAmbiguous?>";
74 // --------------------------------------------------------------------------
75 // ModelType implementation
81 vector<int32_t> FragmentLengths;
84 ModelType(const uint16_t id)
87 // preallocate space for 10K fragments per model type
88 FragmentLengths.reserve(10000);
91 // convenience access to internal fragment lengths vector
92 vector<int32_t>::iterator begin(void) { return FragmentLengths.begin(); }
93 vector<int32_t>::const_iterator begin(void) const { return FragmentLengths.begin(); }
94 void clear(void) { FragmentLengths.clear(); }
95 vector<int32_t>::iterator end(void) { return FragmentLengths.end(); }
96 vector<int32_t>::const_iterator end(void) const { return FragmentLengths.end(); }
97 void push_back(const int32_t& x) { FragmentLengths.push_back(x); }
98 size_t size(void) const { return FragmentLengths.size(); }
101 static const uint16_t DUMMY_ID;
104 const uint16_t ModelType::DUMMY_ID = 100;
106 bool operator>(const ModelType& lhs, const ModelType& rhs) {
107 return lhs.size() > rhs.size();
110 uint16_t CalculateModelType(const BamAlignment& al) {
112 // localize alignment's mate positions & orientations for convenience
113 const int32_t m1_begin = ( al.IsFirstMate() ? al.Position : al.MatePosition );
114 const int32_t m2_begin = ( al.IsFirstMate() ? al.MatePosition : al.Position );
115 const bool m1_isReverseStrand = ( al.IsFirstMate() ? al.IsReverseStrand() : al.IsMateReverseStrand() );
116 const bool m2_isReverseStrand = ( al.IsFirstMate() ? al.IsMateReverseStrand() : al.IsReverseStrand() );
118 // determine 'model type'
119 if ( m1_begin < m2_begin ) {
120 if ( !m1_isReverseStrand && !m2_isReverseStrand ) return 0; // ID: 1
121 if ( !m1_isReverseStrand && m2_isReverseStrand ) return 1; // ID: 2
122 if ( m1_isReverseStrand && !m2_isReverseStrand ) return 2; // ID: 3
123 if ( m1_isReverseStrand && m2_isReverseStrand ) return 3; // ID: 4
125 if ( !m2_isReverseStrand && !m1_isReverseStrand ) return 4; // ID: 5
126 if ( !m2_isReverseStrand && m1_isReverseStrand ) return 5; // ID: 6
127 if ( m2_isReverseStrand && !m1_isReverseStrand ) return 6; // ID: 7
128 if ( m2_isReverseStrand && m1_isReverseStrand ) return 7; // ID: 8
132 return ModelType::DUMMY_ID;
135 // --------------------------------------------------------------------------
136 // ReadGroupResolver implementation
138 struct ReadGroupResolver {
141 int32_t MinFragmentLength;
142 int32_t MedianFragmentLength;
143 int32_t MaxFragmentLength;
145 uint16_t NextTopModelId;
148 vector<ModelType> Models;
151 ReadGroupResolver(void);
154 bool IsValidInsertSize(const BamAlignment& al) const;
155 bool IsValidOrientation(const BamAlignment& al) const;
157 // select 2 best models based on observed data
158 void DetermineTopModels(const string& readGroupName);
161 static double ConfidenceInterval;
162 static double UnusedModelThreshold;
163 static void SetConfidenceInterval(const double& ci);
164 static void SetUnusedModelThreshold(const double& umt);
167 double ReadGroupResolver::ConfidenceInterval = DEFAULT_CONFIDENCE_INTERVAL;
168 double ReadGroupResolver::UnusedModelThreshold = DEFAULT_UNUSEDMODEL_THRESHOLD;
170 ReadGroupResolver::ReadGroupResolver(void)
171 : MinFragmentLength(0)
172 , MedianFragmentLength(0)
173 , MaxFragmentLength(0)
174 , TopModelId(ModelType::DUMMY_ID)
175 , NextTopModelId(ModelType::DUMMY_ID)
179 // pre-allocate space for 8 models
180 Models.reserve(NUM_MODELS);
181 for ( uint16_t i = 0; i < NUM_MODELS; ++i )
182 Models.push_back( ModelType(i+1) );
185 bool ReadGroupResolver::IsValidInsertSize(const BamAlignment& al) const {
186 const int32_t absInsertSize = abs(al.InsertSize);
187 return ( absInsertSize >= MinFragmentLength &&
188 absInsertSize <= MaxFragmentLength );
191 bool ReadGroupResolver::IsValidOrientation(const BamAlignment& al) const {
192 const uint16_t currentModelId = CalculateModelType(al) + 1; // convert model type (array index) to ID number
193 return ( currentModelId == TopModelId || currentModelId == NextTopModelId );
196 void ReadGroupResolver::DetermineTopModels(const string& readGroupName) {
198 // sort models (from most common to least common)
199 sort( Models.begin(), Models.end(), std::greater<ModelType>() );
201 // store top 2 models for later
202 TopModelId = Models[0].ID;
203 NextTopModelId = Models[1].ID;
205 // make sure that the 2 most common models are some threshold more common
206 // than the remaining models
207 const unsigned int activeModelCountSum = Models[0].size() + Models[1].size();
208 if ( activeModelCountSum == 0 ) return; // skip if no data in this read group
209 const unsigned int unusedModelCountSum = Models[2].size() + Models[3].size() +
210 Models[4].size() + Models[5].size() +
211 Models[6].size() + Models[7].size();
212 const double unusedPercentage = (double)unusedModelCountSum / (double)activeModelCountSum;
213 if ( unusedPercentage > UnusedModelThreshold ) {
214 cerr << "WARNING: " << readGroupName << " does not have clearly defined 'top models'" << endl
215 << " The fraction of alignments in bottom 6 models (" << unusedPercentage
216 << ") exceeds threshold: " << UnusedModelThreshold << endl;
220 // emit a warning if the best alignment models are non-standard
221 const bool isModel1Top = (TopModelId == 1) || (NextTopModelId == 1);
222 const bool isModel2Top = (TopModelId == 2) || (NextTopModelId == 2);
223 const bool isModel4Top = (TopModelId == 4) || (NextTopModelId == 4);
224 const bool isModel5Top = (TopModelId == 5) || (NextTopModelId == 5);
225 const bool isModel6Top = (TopModelId == 6) || (NextTopModelId == 6);
226 const bool isModel8Top = (TopModelId == 8) || (NextTopModelId == 8);
228 bool isMatePair = ( isModel4Top && isModel5Top ? true : false );
229 bool isPairedEnd = ( isModel2Top && isModel6Top ? true : false );
230 bool isSolidPair = ( isModel1Top && isModel8Top ? true : false );
232 if ( !isMatePair && !isPairedEnd && !isSolidPair ) {
233 cerr << "WARNING: Found a non-standard alignment model configuration. " << endl
234 << " Using alignment models " << TopModelId << " & " << NextTopModelId
238 // store only the fragments from the best alignment models, then sort
239 vector<int32_t> fragments;
240 fragments.reserve( Models[0].size() + Models[1].size() );
241 fragments.insert( fragments.end(), Models[0].begin(), Models[0].end() );
242 fragments.insert( fragments.end(), Models[1].begin(), Models[1].end() );
243 sort ( fragments.begin(), fragments.end() );
245 // clear out Model fragment data, not needed anymore
248 // skip if no fragments found for this read group
249 if ( fragments.empty() ) {
255 // calculate & store the min,median, & max fragment lengths
256 const unsigned int numFragmentLengths = fragments.size();
257 const double halfNonConfidenceInterval = (1.0 - ReadGroupResolver::ConfidenceInterval)/2.0;
258 const unsigned int minIndex = (unsigned int)(numFragmentLengths * halfNonConfidenceInterval);
259 const unsigned int medianIndex = (unsigned int)(numFragmentLengths * 0.5);
260 const unsigned int maxIndex = (unsigned int)(numFragmentLengths * (1.0-halfNonConfidenceInterval));
262 MinFragmentLength = fragments[minIndex];
263 MedianFragmentLength = fragments[medianIndex];
264 MaxFragmentLength = fragments[maxIndex];
267 void ReadGroupResolver::SetConfidenceInterval(const double& ci) {
268 ConfidenceInterval = ci;
271 void ReadGroupResolver::SetUnusedModelThreshold(const double& umt) {
272 UnusedModelThreshold = umt;
275 // --------------------------------------------------------------------------
276 // ResolveSettings implementation
278 struct ResolveTool::ResolveSettings {
281 bool HasInputBamFile;
282 bool HasOutputBamFile;
283 bool IsForceCompression;
288 bool HasConfidenceInterval;
289 bool HasUnusedModelThreshold;
292 string InputBamFilename;
293 string OutputBamFilename;
294 string StatsFilename;
297 double ConfidenceInterval;
298 double UnusedModelThreshold;
299 bool HasForceMarkReadGroups;
302 ResolveSettings(void)
303 : HasInputBamFile(false)
304 , HasOutputBamFile(false)
305 , IsForceCompression(false)
306 , HasStatsFile(false)
310 , HasConfidenceInterval(false)
311 , HasUnusedModelThreshold(false)
312 , InputBamFilename(Options::StandardIn())
313 , OutputBamFilename(Options::StandardOut())
315 , ConfidenceInterval(DEFAULT_CONFIDENCE_INTERVAL)
316 , UnusedModelThreshold(DEFAULT_UNUSEDMODEL_THRESHOLD)
317 , HasForceMarkReadGroups(false)
321 // --------------------------------------------------------------------------
322 // StatsFileReader implementation
324 struct ResolveTool::StatsFileReader {
328 StatsFileReader(void) { }
329 ~StatsFileReader(void) { Close(); }
331 // main reader interface
334 bool Open(const std::string& filename);
335 bool Read(ResolveTool::ResolveSettings* settings,
336 map<string, ReadGroupResolver>& readGroups);
340 bool IsComment(const string& line) const;
341 bool IsWhitespace(const string& line) const;
342 bool ParseInputLine(const string& line);
343 bool ParseOptionLine(const string& line, ResolveTool::ResolveSettings* settings);
344 bool ParseReadGroupLine(const string& line, map<string, ReadGroupResolver>& readGroups);
345 string SkipCommentsAndWhitespace(void);
351 enum State { None = 0
357 void ResolveTool::StatsFileReader::Close(void) {
358 if ( m_stream.is_open() )
362 bool ResolveTool::StatsFileReader::IsComment(const string& line) const {
363 assert( !line.empty() );
364 return ( line.at(0) == COMMENT_CHAR );
367 bool ResolveTool::StatsFileReader::IsWhitespace(const string& line) const {
370 return ( isspace(line.at(0)) );
373 bool ResolveTool::StatsFileReader::Open(const string& filename) {
375 // make sure stream is fresh
378 // attempt to open filename, return status
379 m_stream.open(filename.c_str(), ifstream::in);
380 return m_stream.good();
383 bool ResolveTool::StatsFileReader::ParseInputLine(const string& /*line*/) {
384 // input lines are ignored (for now at least), tool will use input from command line
388 bool ResolveTool::StatsFileReader::ParseOptionLine(const string& line,
389 ResolveTool::ResolveSettings* settings)
391 // split line into option, value
392 vector<string> fields = Utilities::Split(line, EQUAL_CHAR);
393 if ( fields.size() != NUM_OPTIONS_FIELDS )
395 const string& option = fields.at(0);
396 stringstream value(fields.at(1));
398 // -----------------------------------
399 // handle option based on keyword
401 // ConfidenceInterval
402 if ( option == OPTION_CONFIDENCEINTERVAL ) {
403 value >> settings->ConfidenceInterval;
404 settings->HasConfidenceInterval = true;
408 // UnusedModelThreshold
409 if ( option == OPTION_UNUSEDMODELTHRESHOLD ) {
410 value >> settings->UnusedModelThreshold;
411 settings->HasUnusedModelThreshold = true;
415 // ForceMarkReadGroups
416 if ( option == OPTION_FORCEMARKREADGROUPS ) {
417 value >> settings->HasForceMarkReadGroups;
421 // otherwise unknown option
422 cerr << "bamtools resolve ERROR - unrecognized option: " << option << " in stats file" << endl;
426 bool ResolveTool::StatsFileReader::ParseReadGroupLine(const string& line,
427 map<string, ReadGroupResolver>& readGroups)
429 // split read group data in to fields
430 vector<string> fields = Utilities::Split(line, WHITESPACE_CHARS);
431 if ( fields.size() != NUM_READGROUPS_FIELDS ) return false;
434 const string& name = fields.at(0);
436 // populate RG's 'resolver' data
437 ReadGroupResolver resolver;
439 stringstream dataStream;
440 dataStream.str(fields.at(1));
441 dataStream >> resolver.MedianFragmentLength;
444 dataStream.str(fields.at(2));
445 dataStream >> resolver.MinFragmentLength;
448 dataStream.str(fields.at(3));
449 dataStream >> resolver.MaxFragmentLength;
452 dataStream.str(fields.at(4));
453 dataStream >> resolver.TopModelId;
456 dataStream.str(fields.at(5));
457 dataStream >> resolver.NextTopModelId;
460 resolver.IsAmbiguous = ( fields.at(6) == TRUE_KEYWORD );
462 // store RG entry and return success
463 readGroups.insert( make_pair<string, ReadGroupResolver>(name, resolver) );
467 bool ResolveTool::StatsFileReader::Read(ResolveTool::ResolveSettings* settings,
468 map<string, ReadGroupResolver>& readGroups)
470 // up-front sanity checks
471 if ( !m_stream.is_open() || settings == 0 )
474 // clear out read group data
478 State currentState = StatsFileReader::None;
481 string line = SkipCommentsAndWhitespace();
482 while ( !line.empty() ) {
484 bool foundError = false;
486 // switch state on keyword found
487 if ( Utilities::StartsWith(line, INPUT_TOKEN) )
488 currentState = StatsFileReader::InInput;
489 else if ( Utilities::StartsWith(line, OPTIONS_TOKEN) )
490 currentState = StatsFileReader::InOptions;
491 else if ( Utilities::StartsWith(line, READGROUPS_TOKEN) )
492 currentState = StatsFileReader::InReadGroups;
494 // otherwise parse data line, depending on state
496 if ( currentState == StatsFileReader::InInput )
497 foundError = !ParseInputLine(line);
498 else if ( currentState == StatsFileReader::InOptions )
499 foundError = !ParseOptionLine(line, settings);
500 else if ( currentState == StatsFileReader::InReadGroups )
501 foundError = !ParseReadGroupLine(line, readGroups);
506 // break out if error found
511 line = SkipCommentsAndWhitespace();
514 // if here, return success
518 string ResolveTool::StatsFileReader::SkipCommentsAndWhitespace(void) {
521 if ( m_stream.eof() )
523 getline(m_stream, line);
524 } while ( IsWhitespace(line) || IsComment(line) );
528 // --------------------------------------------------------------------------
529 // StatsFileReader implementation
531 struct ResolveTool::StatsFileWriter {
535 StatsFileWriter(void) { }
536 ~StatsFileWriter(void) { Close(); }
538 // main reader interface
541 bool Open(const std::string& filename);
542 bool Write(ResolveTool::ResolveSettings* settings,
543 const map<string, ReadGroupResolver>& readGroups);
547 void WriteHeader(void);
548 void WriteInput(ResolveTool::ResolveSettings* settings);
549 void WriteOptions(ResolveTool::ResolveSettings* settings);
550 void WriteReadGroups(const map<string, ReadGroupResolver>& readGroups);
557 void ResolveTool::StatsFileWriter::Close(void) {
558 if ( m_stream.is_open() )
562 bool ResolveTool::StatsFileWriter::Open(const string& filename) {
564 // make sure stream is fresh
567 // attempt to open filename, return status
568 m_stream.open(filename.c_str(), ofstream::out);
569 return m_stream.good();
572 bool ResolveTool::StatsFileWriter::Write(ResolveTool::ResolveSettings* settings,
573 const map<string, ReadGroupResolver>& readGroups)
575 // return failure if file not open
576 if ( !m_stream.is_open() )
579 // write stats file elements
581 WriteInput(settings);
582 WriteOptions(settings);
583 WriteReadGroups(readGroups);
589 void ResolveTool::StatsFileWriter::WriteHeader(void) {
591 // stringify current bamtools version
592 stringstream versionStream("");
594 << BAMTOOLS_VERSION_MAJOR << "."
595 << BAMTOOLS_VERSION_MINOR << "."
596 << BAMTOOLS_VERSION_BUILD;
598 // # bamtools resolve (vX.Y.Z)
601 m_stream << COMMENT_CHAR << " bamtools resolve (" << versionStream.str() << ")" << endl
605 void ResolveTool::StatsFileWriter::WriteInput(ResolveTool::ResolveSettings* settings) {
611 m_stream << INPUT_TOKEN << endl
612 << settings->InputBamFilename << endl
616 void ResolveTool::StatsFileWriter::WriteOptions(ResolveTool::ResolveSettings* settings) {
619 // ConfidenceInterval=<double>
620 // UnusedModelThreshold=<double>
621 // ForceMarkReadGroups=<true|false>
624 m_stream << OPTIONS_TOKEN << endl
625 << OPTION_CONFIDENCEINTERVAL << EQUAL_CHAR << settings->ConfidenceInterval << endl
626 << OPTION_UNUSEDMODELTHRESHOLD << EQUAL_CHAR << settings->UnusedModelThreshold << endl
627 << OPTION_FORCEMARKREADGROUPS << EQUAL_CHAR << boolalpha << settings->HasForceMarkReadGroups << endl
631 void ResolveTool::StatsFileWriter::WriteReadGroups(const map<string, ReadGroupResolver>& readGroups) {
634 // #<name> <medianFL> <minFL> <maxFL> <topModelID> <nextTopModelID> <isAmbiguous?>
635 m_stream << READGROUPS_TOKEN << endl
636 << RG_FIELD_DESCRIPTION << endl;
638 // iterate over read groups
639 map<string, ReadGroupResolver>::const_iterator rgIter = readGroups.begin();
640 map<string, ReadGroupResolver>::const_iterator rgEnd = readGroups.end();
641 for ( ; rgIter != rgEnd; ++rgIter ) {
642 const string& name = (*rgIter).first;
643 const ReadGroupResolver& resolver = (*rgIter).second;
645 // skip if read group has no data
646 if ( !resolver.HasData )
649 // write read group data
650 m_stream << name << TAB_CHAR
651 << resolver.MedianFragmentLength << TAB_CHAR
652 << resolver.MinFragmentLength << TAB_CHAR
653 << resolver.MaxFragmentLength << TAB_CHAR
654 << resolver.TopModelId << TAB_CHAR
655 << resolver.NextTopModelId << TAB_CHAR
656 << boolalpha << resolver.IsAmbiguous
660 // extra newline at end
664 // --------------------------------------------------------------------------
665 // ResolveToolPrivate implementation
667 struct ResolveTool::ResolveToolPrivate {
671 ResolveToolPrivate(ResolveTool::ResolveSettings* settings)
672 : m_settings(settings)
674 ~ResolveToolPrivate(void) { }
676 // 'public' interface
682 bool CheckSettings(vector<string>& errors);
683 bool MakeStats(void);
684 void ParseHeader(const SamHeader& header);
685 bool ReadStatsFile(void);
686 void ResolveAlignment(BamAlignment& al);
687 bool ResolvePairs(void);
688 bool WriteStatsFile(void);
692 ResolveTool::ResolveSettings* m_settings;
693 map<string, ReadGroupResolver> m_readGroups;
696 bool ResolveTool::ResolveToolPrivate::CheckSettings(vector<string>& errors) {
698 // ensure clean slate
702 if ( m_settings->IsMakeStats ) {
705 if ( m_settings->IsMarkPairs )
706 errors.push_back("Cannot run in both -makeStats & -markPairs modes. Please select ONE.");
707 if ( m_settings->IsTwoPass )
708 errors.push_back("Cannot run in both -makeStats & -twoPass modes. Please select ONE.");
710 // error if output BAM options supplied
711 if ( m_settings->HasOutputBamFile )
712 errors.push_back("Cannot use -out (output BAM file) in -makeStats mode.");
713 if ( m_settings->IsForceCompression )
714 errors.push_back("Cannot use -forceCompression. No output BAM file is being generated.");
716 // make sure required stats file supplied
717 if ( !m_settings->HasStatsFile )
718 errors.push_back("Ouptut stats filename required for -makeStats mode. Please specify one using -stats option.");
720 // check for UseStats options
721 if ( m_settings->HasForceMarkReadGroups )
722 errors.push_back("Cannot use -forceMarkReadGroups. -markPairs options are DISABLED in -makeStats mode.");
726 else if ( m_settings->IsMarkPairs ) {
729 if ( m_settings->IsMakeStats )
730 errors.push_back("Cannot run in both -makeStats & -markPairs modes. Please select ONE.");
731 if ( m_settings->IsTwoPass )
732 errors.push_back("Cannot run in both -markPairs & -twoPass modes. Please select ONE.");
734 // make sure required stats file supplied
735 if ( !m_settings->HasStatsFile )
736 errors.push_back("Input stats filename required for -markPairs mode. Please specify one using -stats option.");
738 // check for MakeStats options
739 if ( m_settings->HasConfidenceInterval )
740 errors.push_back("Cannot use -ci. -makeStats options are DISABLED is -markPairs mode.");
744 else if ( m_settings->IsTwoPass ) {
747 if ( m_settings->IsMakeStats )
748 errors.push_back("Cannot run in both -makeStats & -twoPass modes. Please select ONE.");
749 if ( m_settings->IsMarkPairs )
750 errors.push_back("Cannot run in both -markPairs & -twoPass modes. Please select ONE.");
752 // make sure input is file not stdin
753 if ( !m_settings->HasInputBamFile || m_settings->InputBamFilename == Options::StandardIn() )
754 errors.push_back("Cannot run -twoPass mode with BAM data from stdin. Please specify existing file using -in option.");
759 errors.push_back("No resolve mode specified. Please select ONE of the following: -makeStats, -markPairs, or -twoPass. See help for more info.");
761 // boundary checks on values
762 if ( m_settings->HasConfidenceInterval ) {
763 if ( m_settings->ConfidenceInterval < 0.0 || m_settings->ConfidenceInterval > 1.0 )
764 errors.push_back("Invalid confidence interval. Must be between 0 and 1");
766 if ( m_settings->HasUnusedModelThreshold ) {
767 if ( m_settings->UnusedModelThreshold < 0.0 || m_settings->UnusedModelThreshold > 1.0 )
768 errors.push_back("Invalid unused model threshold. Must be between 0 and 1");
771 // return success if no errors found
772 return ( errors.empty() );
775 bool ResolveTool::ResolveToolPrivate::MakeStats(void) {
777 // pull resolver settings from command-line settings
778 ReadGroupResolver::SetConfidenceInterval(m_settings->ConfidenceInterval);
779 ReadGroupResolver::SetUnusedModelThreshold(m_settings->UnusedModelThreshold);
781 // open our BAM reader
783 if ( !reader.Open(m_settings->InputBamFilename) ) {
784 cerr << "bamtools resolve ERROR: could not open input BAM file: "
785 << m_settings->InputBamFilename << endl;
789 // retrieve header & parse for read groups
790 const SamHeader& header = reader.GetHeader();
793 // read through BAM file
795 string readGroup("");
796 map<string, ReadGroupResolver>::iterator rgIter;
797 while ( reader.GetNextAlignmentCore(al) ) {
799 // skip if alignment is not paired, mapped, nor mate is mapped
800 if ( !al.IsPaired() || !al.IsMapped() || !al.IsMateMapped() )
803 // skip if alignment & mate not on same reference sequence
804 if ( al.RefID != al.MateRefID ) continue;
806 // skip if map quality is 0
807 if ( al.MapQuality == 0 ) continue;
809 // skip if insert size is less than (we'll count its mate)
810 // or equal to zero (single-end or missing data)
811 if ( al.InsertSize <= 0 ) continue;
813 // determine model type, skip if model unknown
814 const uint16_t currentModelType = CalculateModelType(al);
815 assert( currentModelType != ModelType::DUMMY_ID );
817 // flesh out the char data, so we can retrieve its read group ID
820 // get read group from alignment (OK if empty)
822 al.GetTag(READ_GROUP_TAG, readGroup);
824 // look up resolver for read group
825 rgIter = m_readGroups.find(readGroup);
826 if ( rgIter == m_readGroups.end() ) {
827 cerr << "bamtools resolve ERROR - unable to calculate stats, unknown read group encountered: "
828 << readGroup << endl;
832 ReadGroupResolver& resolver = (*rgIter).second;
834 // update stats for current read group, current model type
835 resolver.Models[currentModelType].push_back(al.InsertSize);
838 // iterate back through read groups
839 map<string, ReadGroupResolver>::iterator rgEnd = m_readGroups.end();
840 for ( rgIter = m_readGroups.begin(); rgIter != rgEnd; ++rgIter ) {
841 const string& name = (*rgIter).first;
842 ReadGroupResolver& resolver = (*rgIter).second;
844 // calculate acceptable orientation & insert sizes for this read group
845 resolver.DetermineTopModels(name);
848 // close reader & return success
853 void ResolveTool::ResolveToolPrivate::ParseHeader(const SamHeader& header) {
855 // iterate over header read groups, creating a 'resolver' for each
856 SamReadGroupConstIterator rgIter = header.ReadGroups.ConstBegin();
857 SamReadGroupConstIterator rgEnd = header.ReadGroups.ConstEnd();
858 for ( ; rgIter != rgEnd; ++rgIter ) {
859 const SamReadGroup& rg = (*rgIter);
860 m_readGroups.insert( make_pair<string, ReadGroupResolver>(rg.ID, ReadGroupResolver()) );
864 bool ResolveTool::ResolveToolPrivate::ReadStatsFile(void) {
866 // skip if no filename provided
867 if ( m_settings->StatsFilename.empty() )
870 // attempt to open stats file
871 ResolveTool::StatsFileReader statsReader;
872 if ( !statsReader.Open(m_settings->StatsFilename) ) {
873 cerr << "bamtools resolve ERROR - could not open stats file: "
874 << m_settings->StatsFilename << " for reading" << endl;
878 // attempt to read stats data
879 if ( !statsReader.Read(m_settings, m_readGroups) ) {
880 cerr << "bamtools resolve ERROR - could not parse stats file: "
881 << m_settings->StatsFilename << " for data" << endl;
889 void ResolveTool::ResolveToolPrivate::ResolveAlignment(BamAlignment& al) {
891 // clear proper-pair flag
892 al.SetIsProperPair(false);
894 // quit check if alignment is not from paired-end read
895 if ( !al.IsPaired() ) return;
897 // quit check if either alignment or its mate are unmapped
898 if ( !al.IsMapped() || !al.IsMateMapped() ) return;
900 // quit check if alignment & its mate are on differenct references
901 if ( al.RefID != al.MateRefID ) return;
903 // quit check if map quality is 0
904 if ( al.MapQuality == 0 ) return;
906 // get read group from alignment
907 // empty string if not found, this is OK - we handle empty read group case
908 string readGroupName("");
909 al.GetTag(READ_GROUP_TAG, readGroupName);
911 // look up read group's 'resolver'
912 map<string, ReadGroupResolver>::iterator rgIter = m_readGroups.find(readGroupName);
913 if ( rgIter == m_readGroups.end() ) {
914 cerr << "bamtools resolve ERROR - read group found that was not in header: "
915 << readGroupName << endl;
918 const ReadGroupResolver& resolver = (*rgIter).second;
920 // quit check if pairs are not in proper orientation (can differ for each RG)
921 if ( !resolver.IsValidOrientation(al) ) return;
923 // quit check if pairs are not within "reasonable" distance (can differ for each RG)
924 if ( !resolver.IsValidInsertSize(al) ) return;
926 // if we get here, alignment is OK - set 'proper pair' flag
927 al.SetIsProperPair(true);
930 bool ResolveTool::ResolveToolPrivate::ResolvePairs(void) {
932 // open our BAM reader
934 if ( !reader.Open(m_settings->InputBamFilename) ) {
935 cerr << "bamtools resolve ERROR: could not open input BAM file: "
936 << m_settings->InputBamFilename << endl;
940 // retrieve header & reference dictionary info
941 const SamHeader& header = reader.GetHeader();
942 const RefVector& references = reader.GetReferenceData();
944 // determine compression mode for BamWriter
945 bool writeUncompressed = ( m_settings->OutputBamFilename == Options::StandardOut() &&
946 !m_settings->IsForceCompression );
947 BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
948 if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed;
952 writer.SetCompressionMode(compressionMode);
953 if ( !writer.Open(m_settings->OutputBamFilename, header, references) ) {
954 cerr << "bamtools resolve ERROR: could not open "
955 << m_settings->OutputBamFilename << " for writing." << endl;
960 // plow through alignments, setting/clearing 'proper pair' flag
961 // and writing to new output BAM file
963 while ( reader.GetNextAlignment(al) ) {
964 ResolveAlignment(al);
965 writer.SaveAlignment(al);
968 // clean up & return success
974 bool ResolveTool::ResolveToolPrivate::Run(void) {
976 // verify that command line settings are acceptable
977 vector<string> errors;
978 if ( !CheckSettings(errors) ) {
979 cerr << "bamtools resolve ERROR - invalid settings: " << endl;
980 vector<string>::const_iterator errorIter = errors.begin();
981 vector<string>::const_iterator errorEnd = errors.end();
982 for ( ; errorIter != errorEnd; ++errorIter )
983 cerr << (*errorIter) << endl;
987 // initialize read group map with default (empty name) read group
988 m_readGroups.insert( make_pair<string, ReadGroupResolver>("", ReadGroupResolver()) );
991 if ( m_settings->IsMakeStats ) {
993 // generate stats data
994 if ( !MakeStats() ) {
995 cerr << "bamtools resolve ERROR - could not generate stats" << endl;
999 // write stats to file
1000 if ( !WriteStatsFile() ) {
1001 cerr << "bamtools resolve ERROR - could not write stats file: "
1002 << m_settings->StatsFilename << endl;
1008 else if ( m_settings->IsMarkPairs ) {
1010 // read stats from file
1011 if ( !ReadStatsFile() ) {
1012 cerr << "bamtools resolve ERROR - could not read stats file: "
1013 << m_settings->StatsFilename << endl;
1017 // do paired-end resolution
1018 if ( !ResolvePairs() ) {
1019 cerr << "bamtools resolve ERROR - could not resolve pairs" << endl;
1027 // generate stats data
1028 if ( !MakeStats() ) {
1029 cerr << "bamtools resolve ERROR - could not generate stats" << endl;
1033 // if stats file requested
1034 if ( m_settings->HasStatsFile ) {
1036 // write stats to file
1037 // emit warning if write fails, but paired-end resolution should be allowed to proceed
1038 if ( !WriteStatsFile() )
1039 cerr << "bamtools resolve WARNING - could not write stats file: "
1040 << m_settings->StatsFilename << endl;
1043 // do paired-end resolution
1044 if ( !ResolvePairs() ) {
1045 cerr << "bamtools resolve ERROR - could not resolve pairs" << endl;
1054 bool ResolveTool::ResolveToolPrivate::WriteStatsFile(void) {
1056 // skip if no filename provided
1057 if ( m_settings->StatsFilename.empty() )
1060 // attempt to open stats file
1061 ResolveTool::StatsFileWriter statsWriter;
1062 if ( !statsWriter.Open(m_settings->StatsFilename) ) {
1063 cerr << "bamtools resolve ERROR - could not open stats file: "
1064 << m_settings->StatsFilename << " for writing" << endl;
1068 // attempt to write stats data
1069 if ( !statsWriter.Write(m_settings, m_readGroups) ) {
1070 cerr << "bamtools resolve ERROR - could not write stats file: "
1071 << m_settings->StatsFilename << " for data" << endl;
1079 // --------------------------------------------------------------------------
1080 // ResolveTool implementation
1082 ResolveTool::ResolveTool(void)
1084 , m_settings(new ResolveSettings)
1087 // set description texts
1088 const string programDescription = "resolves paired-end reads (marking the IsProperPair flag as needed)";
1089 const string programUsage = "<mode> [options] [-in <filename>] [-out <filename> | [-forceCompression] ] [-stats <filename>]";
1090 const string inputBamDescription = "the input BAM file(s)";
1091 const string outputBamDescription = "the output BAM file";
1092 const string statsFileDescription = "input/output stats file, depending on selected mode (see below). "
1093 "This file is human-readable, storing fragment length data generated per read group, as well as "
1094 "the options used to configure the -makeStats mode";
1095 const string forceCompressionDescription = "if results are sent to stdout (like when piping to another tool), "
1096 "default behavior is to leave output uncompressed."
1097 "Use this flag to override and force compression. This feature is disabled in -makeStats mode.";
1098 const string makeStatsDescription = "generates a fragment-length stats file from the input BAM. "
1099 "Data is written to file specified using the -stats option. "
1100 "MarkPairs Mode Settings are DISABLED.";
1101 const string markPairsDescription = "generates an output BAM with alignments marked with proper-pair status. "
1102 "Stats data is read from file specified using the -stats option. "
1103 "MakeStats Mode Settings are DISABLED";
1104 const string twoPassDescription = "combines the -makeStats & -markPairs modes into a single command. "
1105 "However, due to the two-pass nature of paired-end resolution, piping BAM data via stdin is DISABLED. "
1106 "You must supply an explicit input BAM file. Output BAM may be piped to stdout, however, if desired. "
1107 "All MakeStats & MarkPairs Mode Settings are available. "
1108 "The intermediate stats file is not necessary, but if the -stats options is used, then one will be generated. "
1109 "You may find this useful for documentation purposes.";
1110 const string confidenceIntervalDescription = "confidence interval. Set min/max fragment lengths such that we capture "
1111 "this fraction of pairs";
1112 const string unusedModelThresholdDescription = "unused model threshold. The resolve tool considers 8 possible orientation models "
1113 "for pairs. The top 2 are selected for later use when actually marking alignments. This value determines the "
1114 "cutoff for marking a read group as ambiguous. Meaning that if the ratio of the number of alignments from bottom 6 models "
1115 "to the top 2 is greater than this threshold, then the read group is flagged as ambiguous. By default, NO alignments "
1116 "from ambiguous read groups will be marked as proper pairs. You may override this behavior with the -force option "
1117 "in -markPairs mode";
1118 const string forceMarkDescription = "forces all read groups to be marked according to their top 2 'orientation models'. "
1119 "When generating stats, the 2 (out of 8 possible) models with the most observations are chosen as the top models for each read group. "
1120 "If the remaining 6 models account for more than some threshold ([default=10%], see -umt), then the read group is marked as ambiguous. "
1121 "The default behavior is that for an ambiguous read group, NONE of its alignments are marked as proper-pairs. "
1122 "By setting this option, a read group's ambiguity flag will be ignored, and all of its alignments will be compared to the top 2 models.";
1124 // set program details
1125 Options::SetProgramInfo("bamtools resolve", programDescription, programUsage);
1127 // set up I/O options
1128 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
1129 Options::AddValueOption("-in", "BAM filename", inputBamDescription, "",
1130 m_settings->HasInputBamFile, m_settings->InputBamFilename,
1131 IO_Opts, Options::StandardIn());
1132 Options::AddValueOption("-out", "BAM filename", outputBamDescription, "",
1133 m_settings->HasOutputBamFile, m_settings->OutputBamFilename,
1134 IO_Opts, Options::StandardOut());
1135 Options::AddValueOption("-stats", "STATS filename", statsFileDescription, "",
1136 m_settings->HasStatsFile, m_settings->StatsFilename, IO_Opts);
1137 Options::AddOption("-forceCompression", forceCompressionDescription,
1138 m_settings->IsForceCompression, IO_Opts);
1140 OptionGroup* ModeOpts = Options::CreateOptionGroup("Resolve Modes (must select ONE of the following)");
1141 Options::AddOption("-makeStats", makeStatsDescription, m_settings->IsMakeStats, ModeOpts);
1142 Options::AddOption("-markPairs", markPairsDescription, m_settings->IsMarkPairs, ModeOpts);
1143 Options::AddOption("-twoPass", twoPassDescription, m_settings->IsTwoPass, ModeOpts);
1145 OptionGroup* MakeStatsOpts = Options::CreateOptionGroup("MakeStats Mode Options (disabled in -markPairs mode)");
1146 Options::AddValueOption("-ci", "double", confidenceIntervalDescription, "",
1147 m_settings->HasConfidenceInterval, m_settings->ConfidenceInterval, MakeStatsOpts);
1148 Options::AddValueOption("-umt", "double", unusedModelThresholdDescription, "",
1149 m_settings->HasUnusedModelThreshold, m_settings->UnusedModelThreshold, MakeStatsOpts);
1151 OptionGroup* MarkPairsOpts = Options::CreateOptionGroup("MarkPairs Mode Options (disabled in -makeStats mode)");
1152 Options::AddOption("-force", forceMarkDescription, m_settings->HasForceMarkReadGroups, MarkPairsOpts);
1155 ResolveTool::~ResolveTool(void) {
1164 int ResolveTool::Help(void) {
1165 Options::DisplayHelp();
1169 int ResolveTool::Run(int argc, char* argv[]) {
1171 // parse command line arguments
1172 Options::Parse(argc, argv, 1);
1174 // initialize ResolveTool
1175 m_impl = new ResolveToolPrivate(m_settings);
1177 // run ResolveTool, return success/failure
1178 if ( m_impl->Run() )