1 // ***************************************************************************
2 // bamtools_resolve.cpp (c) 2011
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 11 June 2011
7 // ---------------------------------------------------------------------------
8 // Resolves paired-end reads (marking the IsProperPair flag as needed).
9 // ***************************************************************************
11 #include "bamtools_resolve.h"
12 #include "bamtools_version.h"
13 #include <api/BamReader.h>
14 #include <api/BamWriter.h>
15 #include <utils/bamtools_options.h>
16 #include <utils/bamtools_utilities.h>
17 using namespace BamTools;
32 // --------------------------------------------------------------------------
33 // general ResolveTool constants
34 // --------------------------------------------------------------------------
36 static const int NUM_MODELS = 8;
37 static const string READ_GROUP_TAG = "RG";
38 static const double DEFAULT_CONFIDENCE_INTERVAL = 0.9973;
39 static const double DEFAULT_UNUSEDMODEL_THRESHOLD = 0.1;
41 // --------------------------------------------------------------------------
42 // stats file constants
43 // --------------------------------------------------------------------------
45 // basic char/string constants
46 static const char COMMENT_CHAR = '#';
47 static const char OPEN_BRACE_CHAR = '[';
48 static const char CLOSE_BRACE_CHAR = ']';
49 static const char EQUAL_CHAR = '=';
50 static const char TAB_CHAR = '\t';
52 static const string WHITESPACE_CHARS = " \t\n";
53 static const string TRUE_KEYWORD = "true";
54 static const string FALSE_KEYWORD = "false";
57 static const size_t NUM_OPTIONS_FIELDS = 2;
58 static const size_t NUM_READGROUPS_FIELDS = 7;
61 static const string INPUT_TOKEN = "[Input]";
62 static const string OPTIONS_TOKEN = "[Options]";
63 static const string READGROUPS_TOKEN = "[ReadGroups]";
66 static const string OPTION_CONFIDENCEINTERVAL = "ConfidenceInterval";
67 static const string OPTION_UNUSEDMODELTHRESHOLD = "UnusedModelThreshold";
68 static const string OPTION_FORCEMARKREADGROUPS = "ForceMarkReadGroups";
70 // --------------------------------------------------------------------------
71 // ModelType implementation
77 vector<uint32_t> FragmentLengths;
80 ModelType(const uint16_t id)
83 // preallocate space for 10K fragments per model type
84 FragmentLengths.reserve(10000);
87 // convenience access to internal fragment lengths vector
88 vector<uint32_t>::iterator begin(void) { return FragmentLengths.begin(); }
89 vector<uint32_t>::const_iterator begin(void) const { return FragmentLengths.begin(); }
90 void clear(void) { FragmentLengths.clear(); }
91 vector<uint32_t>::iterator end(void) { return FragmentLengths.end(); }
92 vector<uint32_t>::const_iterator end(void) const { return FragmentLengths.end(); }
93 void push_back(const uint32_t& x) { FragmentLengths.push_back(x); }
94 size_t size(void) const { return FragmentLengths.size(); }
97 static const uint16_t DUMMY_ID;
100 const uint16_t ModelType::DUMMY_ID = 100;
102 bool operator>(const ModelType& lhs, const ModelType& rhs) {
103 return lhs.size() > rhs.size();
106 uint16_t CalculateModelType(const BamAlignment& al) {
108 // localize alignment's mate positions & orientations for convenience
109 const int32_t m1_begin = ( al.IsFirstMate() ? al.Position : al.MatePosition );
110 const int32_t m2_begin = ( al.IsFirstMate() ? al.MatePosition : al.Position );
111 const bool m1_isReverseStrand = ( al.IsFirstMate() ? al.IsReverseStrand() : al.IsMateReverseStrand() );
112 const bool m2_isReverseStrand = ( al.IsFirstMate() ? al.IsMateReverseStrand() : al.IsReverseStrand() );
114 // determine 'model type'
115 if ( m1_begin < m2_begin ) {
116 if ( !m1_isReverseStrand && !m2_isReverseStrand ) return 0;
117 if ( !m1_isReverseStrand && m2_isReverseStrand ) return 1;
118 if ( m1_isReverseStrand && !m2_isReverseStrand ) return 2;
119 if ( m1_isReverseStrand && m2_isReverseStrand ) return 3;
121 if ( !m2_isReverseStrand && !m1_isReverseStrand ) return 4;
122 if ( !m2_isReverseStrand && m1_isReverseStrand ) return 5;
123 if ( m2_isReverseStrand && !m1_isReverseStrand ) return 6;
124 if ( m2_isReverseStrand && m1_isReverseStrand ) return 7;
128 return ModelType::DUMMY_ID;
131 // --------------------------------------------------------------------------
132 // ReadGroupResolver implementation
134 struct ReadGroupResolver {
137 int32_t MinFragmentLength;
138 int32_t MedianFragmentLength;
139 int32_t MaxFragmentLength;
141 uint16_t NextTopModelId;
144 vector<ModelType> Models;
147 ReadGroupResolver(void);
150 bool IsValidInsertSize(const BamAlignment& al) const;
151 bool IsValidOrientation(const BamAlignment& al) const;
153 // select 2 best models based on observed data
154 void DetermineTopModels(const string& readGroupName);
157 static double ConfidenceInterval;
158 static double UnusedModelThreshold;
159 static void SetConfidenceInterval(const double& ci);
160 static void SetUnusedModelThreshold(const double& umt);
163 double ReadGroupResolver::ConfidenceInterval = DEFAULT_CONFIDENCE_INTERVAL;
164 double ReadGroupResolver::UnusedModelThreshold = DEFAULT_UNUSEDMODEL_THRESHOLD;
166 ReadGroupResolver::ReadGroupResolver(void)
167 : MinFragmentLength(0)
168 , MedianFragmentLength(0)
169 , MaxFragmentLength(0)
170 , TopModelId(ModelType::DUMMY_ID)
171 , NextTopModelId(ModelType::DUMMY_ID)
175 // pre-allocate space for 8 models
176 Models.reserve(NUM_MODELS);
177 for ( uint16_t i = 0; i < NUM_MODELS; ++i )
178 Models.push_back( ModelType(i+1) );
181 bool ReadGroupResolver::IsValidInsertSize(const BamAlignment& al) const {
182 return ( al.InsertSize >= MinFragmentLength &&
183 al.InsertSize <= MaxFragmentLength );
186 bool ReadGroupResolver::IsValidOrientation(const BamAlignment& al) const {
187 const uint16_t currentModel = CalculateModelType(al);
188 return ( currentModel == TopModelId || currentModel == NextTopModelId );
191 void ReadGroupResolver::DetermineTopModels(const string& readGroupName) {
193 // sort models (most common -> least common)
194 sort( Models.begin(), Models.end(), std::greater<ModelType>() );
196 // store top 2 models for later
197 TopModelId = Models[0].ID;
198 NextTopModelId = Models[1].ID;
200 // make sure that the 2 most common models are some threshold more common
201 // than the remaining models
202 const unsigned int activeModelCountSum = Models[0].size() + Models[1].size();
203 if ( activeModelCountSum == 0 ) return; // skip if no data in this read group
204 const unsigned int unusedModelCountSum = Models[2].size() + Models[3].size() +
205 Models[4].size() + Models[5].size() +
206 Models[6].size() + Models[7].size();
207 const double unusedPercentage = (double)unusedModelCountSum / (double)activeModelCountSum;
208 if ( unusedPercentage > UnusedModelThreshold ) {
209 cerr << "WARNING: " << readGroupName << " does not have clearly defined 'top models'" << endl
210 << " The fraction of alignments in bottom 6 models (" << unusedPercentage
211 << ") exceeds threshold: " << UnusedModelThreshold << endl;
215 // emit a warning if the best alignment models are non-standard
216 const bool isModel1Top = (TopModelId == 1) || (NextTopModelId == 1);
217 const bool isModel2Top = (TopModelId == 2) || (NextTopModelId == 2);
218 const bool isModel4Top = (TopModelId == 4) || (NextTopModelId == 4);
219 const bool isModel5Top = (TopModelId == 5) || (NextTopModelId == 5);
220 const bool isModel6Top = (TopModelId == 6) || (NextTopModelId == 6);
221 const bool isModel8Top = (TopModelId == 8) || (NextTopModelId == 8);
223 bool isMatePair = ( isModel4Top && isModel5Top ? true : false );
224 bool isPairedEnd = ( isModel2Top && isModel6Top ? true : false );
225 bool isSolidPair = ( isModel1Top && isModel8Top ? true : false );
227 if ( !isMatePair && !isPairedEnd && !isSolidPair ) {
228 cerr << "WARNING: Found a non-standard alignment model configuration. " << endl
229 << " Using alignment models " << TopModelId << " & " << NextTopModelId
233 // store only the fragments from the best alignment models, then sort
234 vector<uint32_t> fragments;
235 fragments.reserve( Models[0].size() + Models[1].size() );
236 fragments.insert( fragments.end(), Models[0].begin(), Models[0].end() );
237 fragments.insert( fragments.end(), Models[1].begin(), Models[1].end() );
238 sort ( fragments.begin(), fragments.end() );
240 // clear out Model fragment data, not needed anymore
243 // determine min,median, & max fragment lengths for each read group
244 const double halfNonConfidenceInterval = (1.0 - ReadGroupResolver::ConfidenceInterval)/2.0;
245 const unsigned int numFragmentLengths = fragments.size();
246 if ( numFragmentLengths == 0 ) return;
248 const unsigned int minIndex = (unsigned int)(numFragmentLengths * halfNonConfidenceInterval);
249 const unsigned int medianIndex = (unsigned int)(numFragmentLengths * 0.5);
250 const unsigned int maxIndex = (unsigned int)(numFragmentLengths * (1.0-halfNonConfidenceInterval));
252 MinFragmentLength = fragments[minIndex];
253 MedianFragmentLength = fragments[medianIndex];
254 MaxFragmentLength = fragments[maxIndex];
258 void ReadGroupResolver::SetConfidenceInterval(const double& ci) {
259 ConfidenceInterval = ci;
262 void ReadGroupResolver::SetUnusedModelThreshold(const double& umt) {
263 UnusedModelThreshold = umt;
266 // --------------------------------------------------------------------------
267 // ResolveSettings implementation
269 struct ResolveTool::ResolveSettings {
272 bool HasInputBamFile;
273 bool HasOutputBamFile;
274 bool IsForceCompression;
279 bool HasConfidenceInterval;
280 bool HasUnusedModelThreshold;
283 string InputBamFilename;
284 string OutputBamFilename;
285 string StatsFilename;
288 double ConfidenceInterval;
289 double UnusedModelThreshold;
290 bool HasForceMarkReadGroups;
293 ResolveSettings(void)
294 : HasInputBamFile(false)
295 , HasOutputBamFile(false)
296 , IsForceCompression(false)
297 , HasStatsFile(false)
301 , HasConfidenceInterval(false)
302 , HasUnusedModelThreshold(false)
303 , InputBamFilename(Options::StandardIn())
304 , OutputBamFilename(Options::StandardOut())
306 , ConfidenceInterval(DEFAULT_CONFIDENCE_INTERVAL)
307 , UnusedModelThreshold(DEFAULT_UNUSEDMODEL_THRESHOLD)
308 , HasForceMarkReadGroups(false)
312 // --------------------------------------------------------------------------
313 // StatsFileReader implementation
315 struct ResolveTool::StatsFileReader {
319 StatsFileReader(void) { }
320 ~StatsFileReader(void) { Close(); }
322 // main reader interface
325 bool Open(const std::string& filename);
326 bool Read(ResolveTool::ResolveSettings* settings,
327 map<string, ReadGroupResolver>& readGroups);
331 bool IsComment(const string& line) const;
332 bool IsWhitespace(const string& line) const;
333 bool ParseInputLine(const string& line);
334 bool ParseOptionLine(const string& line, ResolveTool::ResolveSettings* settings);
335 bool ParseReadGroupLine(const string& line, map<string, ReadGroupResolver>& readGroups);
336 string SkipCommentsAndWhitespace(void);
342 enum State { None = 0
348 void ResolveTool::StatsFileReader::Close(void) {
349 if ( m_stream.is_open() )
353 bool ResolveTool::StatsFileReader::IsComment(const string& line) const {
354 assert( !line.empty() );
355 return ( line.at(0) == COMMENT_CHAR );
358 bool ResolveTool::StatsFileReader::IsWhitespace(const string& line) const {
361 return ( isspace(line.at(0)) );
364 bool ResolveTool::StatsFileReader::Open(const string& filename) {
366 // make sure stream is fresh
369 // attempt to open filename, return status
370 m_stream.open(filename.c_str(), ifstream::in);
371 return m_stream.good();
374 bool ResolveTool::StatsFileReader::ParseInputLine(const string& /*line*/) {
375 // input lines are ignored (for now at least), tool will use input from command line
379 bool ResolveTool::StatsFileReader::ParseOptionLine(const string& line,
380 ResolveTool::ResolveSettings* settings)
382 // split line into option, value
383 vector<string> fields = Utilities::Split(line, EQUAL_CHAR);
384 if ( fields.size() != NUM_OPTIONS_FIELDS )
386 const string& option = fields.at(0);
387 stringstream value(fields.at(1));
389 // -----------------------------------
390 // handle option based on keyword
392 // ConfidenceInterval
393 if ( option == OPTION_CONFIDENCEINTERVAL ) {
394 value >> settings->ConfidenceInterval;
395 settings->HasConfidenceInterval = true;
399 // UnusedModelThreshold
400 if ( option == OPTION_UNUSEDMODELTHRESHOLD ) {
401 value >> settings->UnusedModelThreshold;
402 settings->HasUnusedModelThreshold = true;
406 // ForceMarkReadGroups
407 if ( option == OPTION_FORCEMARKREADGROUPS ) {
408 value >> settings->HasForceMarkReadGroups;
412 // otherwise unknown option
413 cerr << "bamtools resolve ERROR - unrecognized option: " << option << " in stats file" << endl;
417 bool ResolveTool::StatsFileReader::ParseReadGroupLine(const string& line,
418 map<string, ReadGroupResolver>& readGroups)
420 // split read group data in to fields
421 vector<string> fields = Utilities::Split(line, WHITESPACE_CHARS);
422 if ( fields.size() != NUM_READGROUPS_FIELDS ) return false;
425 const string& name = fields.at(0);
427 // populate RG's 'resolver' data
428 ReadGroupResolver resolver;
430 stringstream dataStream;
431 dataStream.str(fields.at(1));
432 dataStream >> resolver.MedianFragmentLength;
435 dataStream.str(fields.at(2));
436 dataStream >> resolver.MinFragmentLength;
439 dataStream.str(fields.at(3));
440 dataStream >> resolver.MaxFragmentLength;
443 dataStream.str(fields.at(4));
444 dataStream >> resolver.TopModelId;
447 dataStream.str(fields.at(5));
448 dataStream >> resolver.NextTopModelId;
451 resolver.IsAmbiguous = ( fields.at(6) == TRUE_KEYWORD );
453 // store RG entry and return success
454 readGroups.insert( make_pair<string, ReadGroupResolver>(name, resolver) );
458 bool ResolveTool::StatsFileReader::Read(ResolveTool::ResolveSettings* settings,
459 map<string, ReadGroupResolver>& readGroups)
461 // up-front sanity checks
462 if ( !m_stream.is_open() || settings == 0 )
465 // clear out read group data
469 State currentState = StatsFileReader::None;
472 string line = SkipCommentsAndWhitespace();
473 while ( !line.empty() ) {
475 bool foundError = false;
477 // switch state on keyword found
478 if ( Utilities::StartsWith(line, INPUT_TOKEN) )
479 currentState = StatsFileReader::InInput;
480 else if ( Utilities::StartsWith(line, OPTIONS_TOKEN) )
481 currentState = StatsFileReader::InOptions;
482 else if ( Utilities::StartsWith(line, READGROUPS_TOKEN) )
483 currentState = StatsFileReader::InReadGroups;
485 // otherwise parse data line, depending on state
487 if ( currentState == StatsFileReader::InInput )
488 foundError = !ParseInputLine(line);
489 else if ( currentState == StatsFileReader::InOptions )
490 foundError = !ParseOptionLine(line, settings);
491 else if ( currentState == StatsFileReader::InReadGroups )
492 foundError = !ParseReadGroupLine(line, readGroups);
497 // break out if error found
502 line = SkipCommentsAndWhitespace();
505 // if here, return success
509 string ResolveTool::StatsFileReader::SkipCommentsAndWhitespace(void) {
512 if ( m_stream.eof() )
514 getline(m_stream, line);
515 } while ( IsWhitespace(line) || IsComment(line) );
519 // --------------------------------------------------------------------------
520 // StatsFileReader implementation
522 struct ResolveTool::StatsFileWriter {
526 StatsFileWriter(void) { }
527 ~StatsFileWriter(void) { Close(); }
529 // main reader interface
532 bool Open(const std::string& filename);
533 bool Write(ResolveTool::ResolveSettings* settings,
534 const map<string, ReadGroupResolver>& readGroups);
538 void WriteHeader(void);
539 void WriteInput(ResolveTool::ResolveSettings* settings);
540 void WriteOptions(ResolveTool::ResolveSettings* settings);
541 void WriteReadGroups(const map<string, ReadGroupResolver>& readGroups);
548 void ResolveTool::StatsFileWriter::Close(void) {
549 if ( m_stream.is_open() )
553 bool ResolveTool::StatsFileWriter::Open(const string& filename) {
555 // make sure stream is fresh
558 // attempt to open filename, return status
559 m_stream.open(filename.c_str(), ofstream::out);
560 return m_stream.good();
563 bool ResolveTool::StatsFileWriter::Write(ResolveTool::ResolveSettings* settings,
564 const map<string, ReadGroupResolver>& readGroups)
566 // return failure if file not open
567 if ( !m_stream.is_open() )
570 // write stats file elements
572 WriteInput(settings);
573 WriteOptions(settings);
574 WriteReadGroups(readGroups);
580 void ResolveTool::StatsFileWriter::WriteHeader(void) {
582 // stringify current bamtools version
583 stringstream versionStream("");
585 << BAMTOOLS_VERSION_MAJOR << "."
586 << BAMTOOLS_VERSION_MINOR << "."
587 << BAMTOOLS_VERSION_BUILD;
589 // # bamtools resolve (vX.Y.Z)
592 m_stream << COMMENT_CHAR << " bamtools resolve (" << versionStream.str() << ")" << endl
596 void ResolveTool::StatsFileWriter::WriteInput(ResolveTool::ResolveSettings* settings) {
602 m_stream << INPUT_TOKEN << endl
603 << settings->InputBamFilename << endl
607 void ResolveTool::StatsFileWriter::WriteOptions(ResolveTool::ResolveSettings* settings) {
610 // ConfidenceInterval=<double>
611 // UnusedModelThreshold=<double>
612 // ForceMarkReadGroups=<true|false>
615 m_stream << OPTIONS_TOKEN << endl
616 << OPTION_CONFIDENCEINTERVAL << EQUAL_CHAR << settings->ConfidenceInterval << endl
617 << OPTION_UNUSEDMODELTHRESHOLD << EQUAL_CHAR << settings->UnusedModelThreshold << endl
618 << OPTION_FORCEMARKREADGROUPS << EQUAL_CHAR << boolalpha << settings->HasForceMarkReadGroups << endl
622 void ResolveTool::StatsFileWriter::WriteReadGroups(const map<string, ReadGroupResolver>& readGroups) {
625 m_stream << READGROUPS_TOKEN << endl;
627 // iterate over read groups
628 map<string, ReadGroupResolver>::const_iterator rgIter = readGroups.begin();
629 map<string, ReadGroupResolver>::const_iterator rgEnd = readGroups.end();
630 for ( ; rgIter != rgEnd; ++rgIter ) {
631 const string& name = (*rgIter).first;
632 const ReadGroupResolver& resolver = (*rgIter).second;
634 // skip if read group has no data
635 if ( !resolver.HasData )
638 // write read group data
639 // <name> <medianFragmentLength> <minFragmentLength> <maxFragmentLength> <topModelId> <nextTopModelId> <isAmbiguous?>
640 m_stream << name << TAB_CHAR
641 << resolver.MedianFragmentLength << TAB_CHAR
642 << resolver.MinFragmentLength << TAB_CHAR
643 << resolver.MaxFragmentLength << TAB_CHAR
644 << resolver.TopModelId << TAB_CHAR
645 << resolver.NextTopModelId << TAB_CHAR
646 << boolalpha << resolver.IsAmbiguous
650 // extra newline at end
654 // --------------------------------------------------------------------------
655 // ResolveToolPrivate implementation
657 struct ResolveTool::ResolveToolPrivate {
661 ResolveToolPrivate(ResolveTool::ResolveSettings* settings)
662 : m_settings(settings)
664 ~ResolveToolPrivate(void) { }
666 // 'public' interface
672 bool CheckSettings(vector<string>& errors);
673 bool MakeStats(void);
674 void ParseHeader(const SamHeader& header);
675 bool ReadStatsFile(void);
676 void ResolveAlignment(BamAlignment& al);
677 bool ResolvePairs(void);
678 bool WriteStatsFile(void);
682 ResolveTool::ResolveSettings* m_settings;
683 map<string, ReadGroupResolver> m_readGroups;
686 bool ResolveTool::ResolveToolPrivate::CheckSettings(vector<string>& errors) {
688 // ensure clean slate
692 if ( m_settings->IsMakeStats ) {
695 if ( m_settings->IsMarkPairs )
696 errors.push_back("Cannot run in both -makeStats & -markPairs modes. Please select ONE.");
697 if ( m_settings->IsTwoPass )
698 errors.push_back("Cannot run in both -makeStats & -twoPass modes. Please select ONE.");
700 // error if output BAM options supplied
701 if ( m_settings->HasOutputBamFile )
702 errors.push_back("Cannot use -out (output BAM file) in -makeStats mode.");
703 if ( m_settings->IsForceCompression )
704 errors.push_back("Cannot use -forceCompression. No output BAM file is being generated.");
706 // make sure required stats file supplied
707 if ( !m_settings->HasStatsFile )
708 errors.push_back("Ouptut stats filename required for -makeStats mode. Please specify one using -stats option.");
710 // check for UseStats options
711 if ( m_settings->HasForceMarkReadGroups )
712 errors.push_back("Cannot use -forceMarkReadGroups. -markPairs options are DISABLED in -makeStats mode.");
716 else if ( m_settings->IsMarkPairs ) {
719 if ( m_settings->IsMakeStats )
720 errors.push_back("Cannot run in both -makeStats & -markPairs modes. Please select ONE.");
721 if ( m_settings->IsTwoPass )
722 errors.push_back("Cannot run in both -markPairs & -twoPass modes. Please select ONE.");
724 // make sure required stats file supplied
725 if ( !m_settings->HasStatsFile )
726 errors.push_back("Input stats filename required for -markPairs mode. Please specify one using -stats option.");
728 // check for MakeStats options
729 if ( m_settings->HasConfidenceInterval )
730 errors.push_back("Cannot use -ci. -makeStats options are DISABLED is -markPairs mode.");
734 else if ( m_settings->IsTwoPass ) {
737 if ( m_settings->IsMakeStats )
738 errors.push_back("Cannot run in both -makeStats & -twoPass modes. Please select ONE.");
739 if ( m_settings->IsMarkPairs )
740 errors.push_back("Cannot run in both -markPairs & -twoPass modes. Please select ONE.");
742 // make sure input is file not stdin
743 if ( !m_settings->HasInputBamFile || m_settings->InputBamFilename == Options::StandardIn() )
744 errors.push_back("Cannot run -twoPass mode with BAM data from stdin. Please specify existing file using -in option.");
749 errors.push_back("No resolve mode specified. Please select ONE of the following: -makeStats, -markPairs, or -twoPass. See help for more info.");
751 // boundary checks on values
752 if ( m_settings->HasConfidenceInterval ) {
753 if ( m_settings->ConfidenceInterval < 0.0 || m_settings->ConfidenceInterval > 1.0 )
754 errors.push_back("Invalid confidence interval. Must be between 0 and 1");
756 if ( m_settings->HasUnusedModelThreshold ) {
757 if ( m_settings->UnusedModelThreshold < 0.0 || m_settings->UnusedModelThreshold > 1.0 )
758 errors.push_back("Invalid unused model threshold. Must be between 0 and 1");
761 // return success if no errors found
762 return ( errors.empty() );
765 bool ResolveTool::ResolveToolPrivate::MakeStats(void) {
767 // pull resolver settings from command-line settings
768 ReadGroupResolver::SetConfidenceInterval(m_settings->ConfidenceInterval);
769 ReadGroupResolver::SetUnusedModelThreshold(m_settings->UnusedModelThreshold);
771 // open our BAM reader
773 if ( !reader.Open(m_settings->InputBamFilename) ) {
774 cerr << "bamtools resolve ERROR: could not open input BAM file: "
775 << m_settings->InputBamFilename << endl;
779 // retrieve header & reference dictionary info
780 const SamHeader& header = reader.GetHeader();
782 // parse BAM header for read groups
785 // read through BAM file
787 string readGroup("");
788 map<string, ReadGroupResolver>::iterator rgIter;
789 while ( reader.GetNextAlignmentCore(al) ) {
791 // skip if alignment is not paired, mapped, nor mate is mapped
792 if ( !al.IsPaired() || !al.IsMapped() || !al.IsMateMapped() )
795 // determine model type, skip if model unknown
796 const uint16_t currentModelType = CalculateModelType(al);
797 assert( currentModelType != ModelType::DUMMY_ID );
799 // flesh out the char data, so we can retrieve its read group ID
802 // get read group from alignment (OK if empty)
804 al.GetTag(READ_GROUP_TAG, readGroup);
806 // look up resolver for read group
807 rgIter = m_readGroups.find(readGroup);
808 if ( rgIter == m_readGroups.end() ) {
809 cerr << "bamtools resolve ERROR - unable to calculate stats, unknown read group encountered: "
810 << readGroup << endl;
814 ReadGroupResolver& resolver = (*rgIter).second;
816 // update stats for current read group, current model type
817 resolver.Models[currentModelType].push_back(al.InsertSize);
820 // iterate back through read groups
821 map<string, ReadGroupResolver>::iterator rgEnd = m_readGroups.end();
822 for ( rgIter = m_readGroups.begin(); rgIter != rgEnd; ++rgIter ) {
823 const string& name = (*rgIter).first;
824 ReadGroupResolver& resolver = (*rgIter).second;
826 // calculate acceptable orientation & insert sizes for this read group
827 resolver.DetermineTopModels(name);
830 // close reader & return success
835 void ResolveTool::ResolveToolPrivate::ParseHeader(const SamHeader& header) {
837 // iterate over header read groups, creating a 'resolver' for each
838 SamReadGroupConstIterator rgIter = header.ReadGroups.ConstBegin();
839 SamReadGroupConstIterator rgEnd = header.ReadGroups.ConstEnd();
840 for ( ; rgIter != rgEnd; ++rgIter ) {
841 const SamReadGroup& rg = (*rgIter);
842 m_readGroups.insert( make_pair<string, ReadGroupResolver>(rg.ID, ReadGroupResolver()) );
846 bool ResolveTool::ResolveToolPrivate::ReadStatsFile(void) {
848 // skip if no filename provided
849 if ( m_settings->StatsFilename.empty() )
852 // attempt to open stats file
853 ResolveTool::StatsFileReader statsReader;
854 if ( !statsReader.Open(m_settings->StatsFilename) ) {
855 cerr << "bamtools resolve ERROR - could not open stats file: "
856 << m_settings->StatsFilename << " for reading" << endl;
860 // attempt to read stats data
861 if ( !statsReader.Read(m_settings, m_readGroups) ) {
862 cerr << "bamtools resolve ERROR - could not parse stats file: "
863 << m_settings->StatsFilename << " for data" << endl;
871 void ResolveTool::ResolveToolPrivate::ResolveAlignment(BamAlignment& al) {
873 // clear proper-pair flag
874 al.SetIsProperPair(false);
876 // quit check if alignment is not from paired-end read
877 if ( !al.IsPaired() ) return;
879 // quit check if either alignment or its mate are unmapped
880 if ( !al.IsMapped() || !al.IsMateMapped() ) return;
882 // quit check if map quality is 0
883 if ( al.MapQuality == 0 ) return;
885 // get read group from alignment
886 // empty string if not found, this is OK - we handle empty read group case
887 string readGroupName("");
888 al.GetTag(READ_GROUP_TAG, readGroupName);
890 // look up read group's 'resolver'
891 map<string, ReadGroupResolver>::iterator rgIter = m_readGroups.find(readGroupName);
892 if ( rgIter == m_readGroups.end() ) {
893 cerr << "bamtools resolve ERROR - read group found that was not in header: "
894 << readGroupName << endl;
897 const ReadGroupResolver& resolver = (*rgIter).second;
899 // quit check if pairs are not in proper orientation (can differ for each RG)
900 if ( !resolver.IsValidOrientation(al) ) return;
902 // quit check if pairs are not within "reasonable" distance (can differ for each RG)
903 if ( !resolver.IsValidInsertSize(al) ) return;
905 // if we get here, alignment is OK - set 'proper pair' flag
906 al.SetIsProperPair(true);
909 bool ResolveTool::ResolveToolPrivate::ResolvePairs(void) {
911 // open our BAM reader
913 if ( !reader.Open(m_settings->InputBamFilename) ) {
914 cerr << "bamtools resolve ERROR: could not open input BAM file: "
915 << m_settings->InputBamFilename << endl;
919 // retrieve header & reference dictionary info
920 const SamHeader& header = reader.GetHeader();
921 const RefVector& references = reader.GetReferenceData();
923 // determine compression mode for BamWriter
924 bool writeUncompressed = ( m_settings->OutputBamFilename == Options::StandardOut() &&
925 !m_settings->IsForceCompression );
926 BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
927 if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed;
931 writer.SetCompressionMode(compressionMode);
932 if ( !writer.Open(m_settings->OutputBamFilename, header, references) ) {
933 cerr << "bamtools resolve ERROR: could not open "
934 << m_settings->OutputBamFilename << " for writing." << endl;
939 // plow through alignments, setting/clearing 'proper pair' flag
940 // and writing to new output BAM file
942 while ( reader.GetNextAlignment(al) ) {
943 ResolveAlignment(al);
944 writer.SaveAlignment(al);
947 // clean up & return success
953 bool ResolveTool::ResolveToolPrivate::Run(void) {
955 // verify that command line settings are acceptable
956 vector<string> errors;
957 if ( !CheckSettings(errors) ) {
958 cerr << "bamtools resolve ERROR - invalid settings: " << endl;
959 vector<string>::const_iterator errorIter = errors.begin();
960 vector<string>::const_iterator errorEnd = errors.end();
961 for ( ; errorIter != errorEnd; ++errorIter )
962 cerr << (*errorIter) << endl;
966 // initialize read group map with default (empty name) read group
967 m_readGroups.insert( make_pair<string, ReadGroupResolver>("", ReadGroupResolver()) );
970 if ( m_settings->IsMakeStats ) {
972 // generate stats data
973 if ( !MakeStats() ) {
974 cerr << "bamtools resolve ERROR - could not generate stats" << endl;
978 // write stats to file
979 if ( !WriteStatsFile() ) {
980 cerr << "bamtools resolve ERROR - could not write stats file: "
981 << m_settings->StatsFilename << endl;
987 else if ( m_settings->IsMarkPairs ) {
989 // read stats from file
990 if ( !ReadStatsFile() ) {
991 cerr << "bamtools resolve ERROR - could not read stats file: "
992 << m_settings->StatsFilename << endl;
996 // do paired-end resolution
997 if ( !ResolvePairs() ) {
998 cerr << "bamtools resolve ERROR - could not resolve pairs" << endl;
1006 // generate stats data
1007 if ( !MakeStats() ) {
1008 cerr << "bamtools resolve ERROR - could not generate stats" << endl;
1012 // if stats file requested
1013 if ( m_settings->HasStatsFile ) {
1015 // write stats to file
1016 // emit warning if write fails, but paired-end resolution should be allowed to proceed
1017 if ( !WriteStatsFile() )
1018 cerr << "bamtools resolve WARNING - could not write stats file: "
1019 << m_settings->StatsFilename << endl;
1022 // do paired-end resolution
1023 if ( !ResolvePairs() ) {
1024 cerr << "bamtools resolve ERROR - could not resolve pairs" << endl;
1033 bool ResolveTool::ResolveToolPrivate::WriteStatsFile(void) {
1035 // skip if no filename provided
1036 if ( m_settings->StatsFilename.empty() )
1039 // attempt to open stats file
1040 ResolveTool::StatsFileWriter statsWriter;
1041 if ( !statsWriter.Open(m_settings->StatsFilename) ) {
1042 cerr << "bamtools resolve ERROR - could not open stats file: "
1043 << m_settings->StatsFilename << " for writing" << endl;
1047 // attempt to write stats data
1048 if ( !statsWriter.Write(m_settings, m_readGroups) ) {
1049 cerr << "bamtools resolve ERROR - could not write stats file: "
1050 << m_settings->StatsFilename << " for data" << endl;
1058 // --------------------------------------------------------------------------
1059 // ResolveTool implementation
1061 ResolveTool::ResolveTool(void)
1063 , m_settings(new ResolveSettings)
1066 // set description texts
1067 const string programDescription = "resolves paired-end reads (marking the IsProperPair flag as needed)";
1068 const string programUsage = "<mode> [options] [-in <filename>] [-out <filename> | [-forceCompression] ] [-stats <filename>]";
1069 const string inputBamDescription = "the input BAM file(s)";
1070 const string outputBamDescription = "the output BAM file";
1071 const string statsFileDescription = "input/output stats file, depending on selected mode (see below). "
1072 "This file is human-readable, storing fragment length data generated per read group, as well as "
1073 "the options used to configure the -makeStats mode";
1074 const string forceCompressionDescription = "if results are sent to stdout (like when piping to another tool), "
1075 "default behavior is to leave output uncompressed."
1076 "Use this flag to override and force compression. This feature is disabled in -makeStats mode.";
1077 const string makeStatsDescription = "generates a fragment-length stats file from the input BAM. "
1078 "Data is written to file specified using the -stats option. "
1079 "MarkPairs Mode Settings are DISABLED.";
1080 const string markPairsDescription = "generates an output BAM with alignments marked with proper-pair status. "
1081 "Stats data is read from file specified using the -stats option. "
1082 "MakeStats Mode Settings are DISABLED";
1083 const string twoPassDescription = "combines the -makeStats & -markPairs modes into a single command. "
1084 "However, due to the two-pass nature of paired-end resolution, piping BAM data via stdin is DISABLED. "
1085 "You must supply an explicit input BAM file. Output BAM may be piped to stdout, however, if desired. "
1086 "All MakeStats & MarkPairs Mode Settings are available. "
1087 "The intermediate stats file is not necessary, but if the -stats options is used, then one will be generated. "
1088 "You may find this useful for documentation purposes.";
1089 const string confidenceIntervalDescription = "confidence interval. Set min/max fragment lengths such that we capture "
1090 "this fraction of pairs";
1091 const string unusedModelThresholdDescription = "unused model threshold. The resolve tool considers 8 possible orientation models "
1092 "for pairs. The top 2 are selected for later use when actually marking alignments. This value determines the "
1093 "cutoff for marking a read group as ambiguous. Meaning that if the ratio of the number of alignments from bottom 6 models "
1094 "to the top 2 is greater than this threshold, then the read group is flagged as ambiguous. By default, NO alignments "
1095 "from ambiguous read groups will be marked as proper pairs. You may override this behavior with the -force option "
1096 "in -markPairs mode";
1097 const string forceMarkDescription = "forces all read groups to be marked according to their top 2 'orientation models'. "
1098 "When generating stats, the 2 (out of 8 possible) models with the most observations are chosen as the top models for each read group. "
1099 "If the remaining 6 models account for more than some threshold ([default=10%], see -umt), then the read group is marked as ambiguous. "
1100 "The default behavior is that for an ambiguous read group, NONE of its alignments are marked as proper-pairs. "
1101 "By setting this option, a read group's ambiguity flag will be ignored, and all of its alignments will be compared to the top 2 models.";
1103 // set program details
1104 Options::SetProgramInfo("bamtools resolve", programDescription, programUsage);
1106 // set up I/O options
1107 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
1108 Options::AddValueOption("-in", "BAM filename", inputBamDescription, "",
1109 m_settings->HasInputBamFile, m_settings->InputBamFilename,
1110 IO_Opts, Options::StandardIn());
1111 Options::AddValueOption("-out", "BAM filename", outputBamDescription, "",
1112 m_settings->HasOutputBamFile, m_settings->OutputBamFilename,
1113 IO_Opts, Options::StandardOut());
1114 Options::AddValueOption("-stats", "STATS filename", statsFileDescription, "",
1115 m_settings->HasStatsFile, m_settings->StatsFilename, IO_Opts);
1116 Options::AddOption("-forceCompression", forceCompressionDescription,
1117 m_settings->IsForceCompression, IO_Opts);
1119 OptionGroup* ModeOpts = Options::CreateOptionGroup("Resolve Modes (must select ONE of the following)");
1120 Options::AddOption("-makeStats", makeStatsDescription, m_settings->IsMakeStats, ModeOpts);
1121 Options::AddOption("-markPairs", markPairsDescription, m_settings->IsMarkPairs, ModeOpts);
1122 Options::AddOption("-twoPass", twoPassDescription, m_settings->IsTwoPass, ModeOpts);
1124 OptionGroup* MakeStatsOpts = Options::CreateOptionGroup("MakeStats Mode Options (disabled in -markPairs mode)");
1125 Options::AddValueOption("-ci", "double", confidenceIntervalDescription, "",
1126 m_settings->HasConfidenceInterval, m_settings->ConfidenceInterval, MakeStatsOpts);
1127 Options::AddValueOption("-umt", "double", unusedModelThresholdDescription, "",
1128 m_settings->HasUnusedModelThreshold, m_settings->UnusedModelThreshold, MakeStatsOpts);
1130 OptionGroup* MarkPairsOpts = Options::CreateOptionGroup("MarkPairs Mode Options (disabled in -makeStats mode)");
1131 Options::AddOption("-force", forceMarkDescription, m_settings->HasForceMarkReadGroups, MarkPairsOpts);
1134 ResolveTool::~ResolveTool(void) {
1143 int ResolveTool::Help(void) {
1144 Options::DisplayHelp();
1148 int ResolveTool::Run(int argc, char* argv[]) {
1150 // parse command line arguments
1151 Options::Parse(argc, argv, 1);
1153 // initialize ResolveTool
1154 m_impl = new ResolveToolPrivate(m_settings);
1156 // run ResolveTool, return success/failure
1157 if ( m_impl->Run() )