1 // ***************************************************************************
2 // bamtools_resolve.cpp (c) 2011
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 28 June 2011
7 // ---------------------------------------------------------------------------
8 // Resolves paired-end reads (marking the IsProperPair flag as needed).
9 // ***************************************************************************
11 #include "bamtools_resolve.h"
12 #include "bamtools_version.h"
13 #include <api/BamReader.h>
14 #include <api/BamWriter.h>
15 #include <utils/bamtools_options.h>
16 #include <utils/bamtools_utilities.h>
17 using namespace BamTools;
33 // --------------------------------------------------------------------------
34 // general ResolveTool constants
35 // --------------------------------------------------------------------------
37 static const int NUM_MODELS = 8;
38 static const string READ_GROUP_TAG = "RG";
39 static const double DEFAULT_CONFIDENCE_INTERVAL = 0.9973;
40 static const double DEFAULT_UNUSEDMODEL_THRESHOLD = 0.1;
42 // --------------------------------------------------------------------------
43 // stats file constants
44 // --------------------------------------------------------------------------
46 // basic char/string constants
47 static const char COMMENT_CHAR = '#';
48 static const char OPEN_BRACE_CHAR = '[';
49 static const char CLOSE_BRACE_CHAR = ']';
50 static const char EQUAL_CHAR = '=';
51 static const char TAB_CHAR = '\t';
53 static const string WHITESPACE_CHARS = " \t\n";
54 static const string TRUE_KEYWORD = "true";
55 static const string FALSE_KEYWORD = "false";
58 static const size_t NUM_OPTIONS_FIELDS = 2;
59 static const size_t NUM_READGROUPS_FIELDS = 7;
62 static const string INPUT_TOKEN = "[Input]";
63 static const string OPTIONS_TOKEN = "[Options]";
64 static const string READGROUPS_TOKEN = "[ReadGroups]";
67 static const string OPTION_CONFIDENCEINTERVAL = "ConfidenceInterval";
68 static const string OPTION_UNUSEDMODELTHRESHOLD = "UnusedModelThreshold";
69 static const string OPTION_FORCEMARKREADGROUPS = "ForceMarkReadGroups";
71 // other string constants
72 static const string RG_FIELD_DESCRIPTION =
73 "#<name> <medianFL> <minFL> <maxFL> <topModelID> <nextTopModelID> <isAmbiguous?>";
75 // --------------------------------------------------------------------------
76 // unique readname file constants
77 // --------------------------------------------------------------------------
79 static const string READNAME_FILE_SUFFIX = ".uniq_names.txt";
80 static const string DEFAULT_READNAME_FILE = "bt_resolve_TEMP" + READNAME_FILE_SUFFIX;
82 // --------------------------------------------------------------------------
83 // ModelType implementation
89 vector<int32_t> FragmentLengths;
92 ModelType(const uint16_t id)
95 // preallocate space for 10K fragments per model type
96 FragmentLengths.reserve(10000);
99 // convenience access to internal fragment lengths vector
100 vector<int32_t>::iterator begin(void) { return FragmentLengths.begin(); }
101 vector<int32_t>::const_iterator begin(void) const { return FragmentLengths.begin(); }
102 void clear(void) { FragmentLengths.clear(); }
103 vector<int32_t>::iterator end(void) { return FragmentLengths.end(); }
104 vector<int32_t>::const_iterator end(void) const { return FragmentLengths.end(); }
105 void push_back(const int32_t& x) { FragmentLengths.push_back(x); }
106 size_t size(void) const { return FragmentLengths.size(); }
109 static const uint16_t DUMMY_ID;
112 const uint16_t ModelType::DUMMY_ID = 100;
114 bool operator>(const ModelType& lhs, const ModelType& rhs) {
115 return lhs.size() > rhs.size();
118 uint16_t CalculateModelType(const BamAlignment& al) {
120 // localize alignment's mate positions & orientations for convenience
121 const int32_t m1_begin = ( al.IsFirstMate() ? al.Position : al.MatePosition );
122 const int32_t m2_begin = ( al.IsFirstMate() ? al.MatePosition : al.Position );
123 const bool m1_isReverseStrand = ( al.IsFirstMate() ? al.IsReverseStrand() : al.IsMateReverseStrand() );
124 const bool m2_isReverseStrand = ( al.IsFirstMate() ? al.IsMateReverseStrand() : al.IsReverseStrand() );
126 // determine 'model type'
127 if ( m1_begin < m2_begin ) {
128 if ( !m1_isReverseStrand && !m2_isReverseStrand ) return 0; // ID: 1
129 if ( !m1_isReverseStrand && m2_isReverseStrand ) return 1; // ID: 2
130 if ( m1_isReverseStrand && !m2_isReverseStrand ) return 2; // ID: 3
131 if ( m1_isReverseStrand && m2_isReverseStrand ) return 3; // ID: 4
133 if ( !m2_isReverseStrand && !m1_isReverseStrand ) return 4; // ID: 5
134 if ( !m2_isReverseStrand && m1_isReverseStrand ) return 5; // ID: 6
135 if ( m2_isReverseStrand && !m1_isReverseStrand ) return 6; // ID: 7
136 if ( m2_isReverseStrand && m1_isReverseStrand ) return 7; // ID: 8
140 return ModelType::DUMMY_ID;
143 // --------------------------------------------------------------------------
144 // ReadGroupResolver implementation
146 struct ReadGroupResolver {
149 int32_t MinFragmentLength;
150 int32_t MedianFragmentLength;
151 int32_t MaxFragmentLength;
153 uint16_t NextTopModelId;
156 vector<ModelType> Models;
157 map<string, bool> ReadNames;
160 ReadGroupResolver(void);
163 bool IsValidInsertSize(const BamAlignment& al) const;
164 bool IsValidOrientation(const BamAlignment& al) const;
166 // select 2 best models based on observed data
167 void DetermineTopModels(const string& readGroupName);
170 static double ConfidenceInterval;
171 static double UnusedModelThreshold;
172 static void SetConfidenceInterval(const double& ci);
173 static void SetUnusedModelThreshold(const double& umt);
176 double ReadGroupResolver::ConfidenceInterval = DEFAULT_CONFIDENCE_INTERVAL;
177 double ReadGroupResolver::UnusedModelThreshold = DEFAULT_UNUSEDMODEL_THRESHOLD;
179 ReadGroupResolver::ReadGroupResolver(void)
180 : MinFragmentLength(0)
181 , MedianFragmentLength(0)
182 , MaxFragmentLength(0)
183 , TopModelId(ModelType::DUMMY_ID)
184 , NextTopModelId(ModelType::DUMMY_ID)
188 // pre-allocate space for 8 models
189 Models.reserve(NUM_MODELS);
190 for ( uint16_t i = 0; i < NUM_MODELS; ++i )
191 Models.push_back( ModelType(i+1) );
194 bool ReadGroupResolver::IsValidInsertSize(const BamAlignment& al) const {
195 const int32_t absInsertSize = abs(al.InsertSize);
196 return ( absInsertSize >= MinFragmentLength &&
197 absInsertSize <= MaxFragmentLength );
200 bool ReadGroupResolver::IsValidOrientation(const BamAlignment& al) const {
201 const uint16_t currentModelId = CalculateModelType(al) + 1; // convert model type (array index) to ID number
202 return ( currentModelId == TopModelId || currentModelId == NextTopModelId );
205 void ReadGroupResolver::DetermineTopModels(const string& readGroupName) {
207 // sort models (from most common to least common)
208 sort( Models.begin(), Models.end(), std::greater<ModelType>() );
210 // store top 2 models for later
211 TopModelId = Models[0].ID;
212 NextTopModelId = Models[1].ID;
214 // make sure that the 2 most common models are some threshold more common
215 // than the remaining models
216 const unsigned int activeModelCountSum = Models[0].size() + Models[1].size();
217 if ( activeModelCountSum == 0 ) return; // skip if no data in this read group
218 const unsigned int unusedModelCountSum = Models[2].size() + Models[3].size() +
219 Models[4].size() + Models[5].size() +
220 Models[6].size() + Models[7].size();
221 const double unusedPercentage = (double)unusedModelCountSum / (double)activeModelCountSum;
222 if ( unusedPercentage > UnusedModelThreshold ) {
223 cerr << "WARNING: " << readGroupName << " does not have clearly defined 'top models'" << endl
224 << " The fraction of alignments in bottom 6 models (" << unusedPercentage
225 << ") exceeds threshold: " << UnusedModelThreshold << endl;
229 // emit a warning if the best alignment models are non-standard
230 const bool isModel1Top = (TopModelId == 1) || (NextTopModelId == 1);
231 const bool isModel2Top = (TopModelId == 2) || (NextTopModelId == 2);
232 const bool isModel4Top = (TopModelId == 4) || (NextTopModelId == 4);
233 const bool isModel5Top = (TopModelId == 5) || (NextTopModelId == 5);
234 const bool isModel6Top = (TopModelId == 6) || (NextTopModelId == 6);
235 const bool isModel8Top = (TopModelId == 8) || (NextTopModelId == 8);
237 bool isMatePair = ( isModel4Top && isModel5Top ? true : false );
238 bool isPairedEnd = ( isModel2Top && isModel6Top ? true : false );
239 bool isSolidPair = ( isModel1Top && isModel8Top ? true : false );
241 if ( !isMatePair && !isPairedEnd && !isSolidPair ) {
242 cerr << "WARNING: Found a non-standard alignment model configuration. " << endl
243 << " Using alignment models " << TopModelId << " & " << NextTopModelId
247 // store only the fragments from the best alignment models, then sort
248 vector<int32_t> fragments;
249 fragments.reserve( Models[0].size() + Models[1].size() );
250 fragments.insert( fragments.end(), Models[0].begin(), Models[0].end() );
251 fragments.insert( fragments.end(), Models[1].begin(), Models[1].end() );
252 sort ( fragments.begin(), fragments.end() );
254 // clear out Model fragment data, not needed anymore
257 // skip if no fragments found for this read group
258 if ( fragments.empty() ) {
264 // calculate & store the min,median, & max fragment lengths
265 const unsigned int numFragmentLengths = fragments.size();
266 const double halfNonConfidenceInterval = (1.0 - ReadGroupResolver::ConfidenceInterval)/2.0;
267 const unsigned int minIndex = (unsigned int)(numFragmentLengths * halfNonConfidenceInterval);
268 const unsigned int medianIndex = (unsigned int)(numFragmentLengths * 0.5);
269 const unsigned int maxIndex = (unsigned int)(numFragmentLengths * (1.0-halfNonConfidenceInterval));
271 MinFragmentLength = fragments[minIndex];
272 MedianFragmentLength = fragments[medianIndex];
273 MaxFragmentLength = fragments[maxIndex];
276 void ReadGroupResolver::SetConfidenceInterval(const double& ci) {
277 ConfidenceInterval = ci;
280 void ReadGroupResolver::SetUnusedModelThreshold(const double& umt) {
281 UnusedModelThreshold = umt;
284 // --------------------------------------------------------------------------
285 // ResolveSettings implementation
287 struct ResolveTool::ResolveSettings {
290 bool HasInputBamFile;
291 bool HasOutputBamFile;
292 bool IsForceCompression;
297 bool HasConfidenceInterval;
298 bool HasUnusedModelThreshold;
301 string InputBamFilename;
302 string OutputBamFilename;
303 string StatsFilename;
304 string ReadNamesFilename; // ** N.B. - Only used internally, not set from cmdline **
307 double ConfidenceInterval;
308 double UnusedModelThreshold;
309 bool HasForceMarkReadGroups;
312 ResolveSettings(void)
313 : HasInputBamFile(false)
314 , HasOutputBamFile(false)
315 , IsForceCompression(false)
316 , HasStatsFile(false)
320 , HasConfidenceInterval(false)
321 , HasUnusedModelThreshold(false)
322 , InputBamFilename(Options::StandardIn())
323 , OutputBamFilename(Options::StandardOut())
325 , ReadNamesFilename(DEFAULT_READNAME_FILE)
326 , ConfidenceInterval(DEFAULT_CONFIDENCE_INTERVAL)
327 , UnusedModelThreshold(DEFAULT_UNUSEDMODEL_THRESHOLD)
328 , HasForceMarkReadGroups(false)
332 // --------------------------------------------------------------------------
333 // ReadNamesFileReader implementation
335 struct ResolveTool::ReadNamesFileReader {
338 ReadNamesFileReader(void) { }
339 ~ReadNamesFileReader(void) { Close(); }
341 // main reader interface
344 bool Open(const string& filename);
345 bool Read(map<string, ReadGroupResolver>& readGroups);
352 void ResolveTool::ReadNamesFileReader::Close(void) {
353 if ( m_stream.is_open() )
357 bool ResolveTool::ReadNamesFileReader::Open(const string& filename) {
359 // make sure stream is fresh
362 // attempt to open filename, return status
363 m_stream.open(filename.c_str(), ifstream::in);
364 return m_stream.good();
367 bool ResolveTool::ReadNamesFileReader::Read(map<string, ReadGroupResolver>& readGroups) {
369 // up-front sanity check
370 if ( !m_stream.is_open() ) return false;
372 // parse read names file
374 vector<string> fields;
375 map<string, ReadGroupResolver>::iterator rgIter;
376 map<string, ReadGroupResolver>::iterator rgEnd = readGroups.end();
377 while ( getline(m_stream, line) ) {
379 // skip if empty line
380 if ( line.empty() ) continue;
382 // split line on '\t'
383 fields = Utilities::Split(line, TAB_CHAR);
384 if ( fields.size() != 2 ) continue;
386 // look up resolver for read group
387 rgIter = readGroups.find( fields[0] );
388 if ( rgIter == rgEnd ) return false;
389 ReadGroupResolver& resolver = (*rgIter).second;
391 // store read name with resolver
392 resolver.ReadNames.insert( make_pair<string,bool>(fields[1], true) ) ;
395 // if here, return success
399 // --------------------------------------------------------------------------
400 // ReadNamesFileWriter implementation
402 struct ResolveTool::ReadNamesFileWriter {
405 ReadNamesFileWriter(void) { }
406 ~ReadNamesFileWriter(void) { Close(); }
408 // main reader interface
411 bool Open(const string& filename);
412 void Write(const string& readGroupName, const string& readName);
419 void ResolveTool::ReadNamesFileWriter::Close(void) {
420 if ( m_stream.is_open() )
424 bool ResolveTool::ReadNamesFileWriter::Open(const string& filename) {
426 // make sure stream is fresh
429 // attempt to open filename, return status
430 m_stream.open(filename.c_str(), ofstream::out);
431 return m_stream.good();
434 void ResolveTool::ReadNamesFileWriter::Write(const string& readGroupName,
435 const string& readName)
437 m_stream << readGroupName << TAB_CHAR << readName << endl;
440 // --------------------------------------------------------------------------
441 // StatsFileReader implementation
443 struct ResolveTool::StatsFileReader {
447 StatsFileReader(void) { }
448 ~StatsFileReader(void) { Close(); }
450 // main reader interface
453 bool Open(const string& filename);
454 bool Read(ResolveTool::ResolveSettings* settings,
455 map<string, ReadGroupResolver>& readGroups);
459 bool IsComment(const string& line) const;
460 bool IsWhitespace(const string& line) const;
461 bool ParseInputLine(const string& line);
462 bool ParseOptionLine(const string& line, ResolveTool::ResolveSettings* settings);
463 bool ParseReadGroupLine(const string& line, map<string, ReadGroupResolver>& readGroups);
464 string SkipCommentsAndWhitespace(void);
470 enum State { None = 0
476 void ResolveTool::StatsFileReader::Close(void) {
477 if ( m_stream.is_open() )
481 bool ResolveTool::StatsFileReader::IsComment(const string& line) const {
482 assert( !line.empty() );
483 return ( line.at(0) == COMMENT_CHAR );
486 bool ResolveTool::StatsFileReader::IsWhitespace(const string& line) const {
489 return ( isspace(line.at(0)) );
492 bool ResolveTool::StatsFileReader::Open(const string& filename) {
494 // make sure stream is fresh
497 // attempt to open filename, return status
498 m_stream.open(filename.c_str(), ifstream::in);
499 return m_stream.good();
502 bool ResolveTool::StatsFileReader::ParseInputLine(const string& /*line*/) {
503 // input lines are ignored (for now at least), tool will use input from command line
507 bool ResolveTool::StatsFileReader::ParseOptionLine(const string& line,
508 ResolveTool::ResolveSettings* settings)
510 // split line into option, value
511 vector<string> fields = Utilities::Split(line, EQUAL_CHAR);
512 if ( fields.size() != NUM_OPTIONS_FIELDS )
514 const string& option = fields.at(0);
515 stringstream value(fields.at(1));
517 // -----------------------------------
518 // handle option based on keyword
520 // ConfidenceInterval
521 if ( option == OPTION_CONFIDENCEINTERVAL ) {
522 value >> settings->ConfidenceInterval;
523 settings->HasConfidenceInterval = true;
527 // UnusedModelThreshold
528 if ( option == OPTION_UNUSEDMODELTHRESHOLD ) {
529 value >> settings->UnusedModelThreshold;
530 settings->HasUnusedModelThreshold = true;
534 // ForceMarkReadGroups
535 if ( option == OPTION_FORCEMARKREADGROUPS ) {
536 value >> settings->HasForceMarkReadGroups;
540 // otherwise unknown option
541 cerr << "bamtools resolve ERROR - unrecognized option: " << option << " in stats file" << endl;
545 bool ResolveTool::StatsFileReader::ParseReadGroupLine(const string& line,
546 map<string, ReadGroupResolver>& readGroups)
548 // split read group data in to fields
549 vector<string> fields = Utilities::Split(line, WHITESPACE_CHARS);
550 if ( fields.size() != NUM_READGROUPS_FIELDS ) return false;
553 const string& name = fields.at(0);
555 // populate RG's 'resolver' data
556 ReadGroupResolver resolver;
558 stringstream dataStream;
559 dataStream.str(fields.at(1));
560 dataStream >> resolver.MedianFragmentLength;
563 dataStream.str(fields.at(2));
564 dataStream >> resolver.MinFragmentLength;
567 dataStream.str(fields.at(3));
568 dataStream >> resolver.MaxFragmentLength;
571 dataStream.str(fields.at(4));
572 dataStream >> resolver.TopModelId;
575 dataStream.str(fields.at(5));
576 dataStream >> resolver.NextTopModelId;
579 resolver.IsAmbiguous = ( fields.at(6) == TRUE_KEYWORD );
581 // store RG entry and return success
582 readGroups.insert( make_pair<string, ReadGroupResolver>(name, resolver) );
586 bool ResolveTool::StatsFileReader::Read(ResolveTool::ResolveSettings* settings,
587 map<string, ReadGroupResolver>& readGroups)
589 // up-front sanity checks
590 if ( !m_stream.is_open() || settings == 0 )
593 // clear out read group data
597 State currentState = StatsFileReader::None;
600 string line = SkipCommentsAndWhitespace();
601 while ( !line.empty() ) {
603 bool foundError = false;
605 // switch state on keyword found
606 if ( Utilities::StartsWith(line, INPUT_TOKEN) )
607 currentState = StatsFileReader::InInput;
608 else if ( Utilities::StartsWith(line, OPTIONS_TOKEN) )
609 currentState = StatsFileReader::InOptions;
610 else if ( Utilities::StartsWith(line, READGROUPS_TOKEN) )
611 currentState = StatsFileReader::InReadGroups;
613 // otherwise parse data line, depending on state
615 if ( currentState == StatsFileReader::InInput )
616 foundError = !ParseInputLine(line);
617 else if ( currentState == StatsFileReader::InOptions )
618 foundError = !ParseOptionLine(line, settings);
619 else if ( currentState == StatsFileReader::InReadGroups )
620 foundError = !ParseReadGroupLine(line, readGroups);
625 // break out if error found
630 line = SkipCommentsAndWhitespace();
633 // if here, return success
637 string ResolveTool::StatsFileReader::SkipCommentsAndWhitespace(void) {
640 if ( m_stream.eof() )
642 getline(m_stream, line);
643 } while ( IsWhitespace(line) || IsComment(line) );
647 // --------------------------------------------------------------------------
648 // StatsFileReader implementation
650 struct ResolveTool::StatsFileWriter {
654 StatsFileWriter(void) { }
655 ~StatsFileWriter(void) { Close(); }
657 // main reader interface
660 bool Open(const string& filename);
661 bool Write(ResolveTool::ResolveSettings* settings,
662 const map<string, ReadGroupResolver>& readGroups);
666 void WriteHeader(void);
667 void WriteInput(ResolveTool::ResolveSettings* settings);
668 void WriteOptions(ResolveTool::ResolveSettings* settings);
669 void WriteReadGroups(const map<string, ReadGroupResolver>& readGroups);
676 void ResolveTool::StatsFileWriter::Close(void) {
677 if ( m_stream.is_open() )
681 bool ResolveTool::StatsFileWriter::Open(const string& filename) {
683 // make sure stream is fresh
686 // attempt to open filename, return status
687 m_stream.open(filename.c_str(), ofstream::out);
688 return m_stream.good();
691 bool ResolveTool::StatsFileWriter::Write(ResolveTool::ResolveSettings* settings,
692 const map<string, ReadGroupResolver>& readGroups)
694 // return failure if file not open
695 if ( !m_stream.is_open() )
698 // write stats file elements
700 WriteInput(settings);
701 WriteOptions(settings);
702 WriteReadGroups(readGroups);
708 void ResolveTool::StatsFileWriter::WriteHeader(void) {
710 // stringify current bamtools version
711 stringstream versionStream("");
713 << BAMTOOLS_VERSION_MAJOR << "."
714 << BAMTOOLS_VERSION_MINOR << "."
715 << BAMTOOLS_VERSION_BUILD;
717 // # bamtools resolve (vX.Y.Z)
720 m_stream << COMMENT_CHAR << " bamtools resolve (" << versionStream.str() << ")" << endl
724 void ResolveTool::StatsFileWriter::WriteInput(ResolveTool::ResolveSettings* settings) {
730 m_stream << INPUT_TOKEN << endl
731 << settings->InputBamFilename << endl
735 void ResolveTool::StatsFileWriter::WriteOptions(ResolveTool::ResolveSettings* settings) {
738 // ConfidenceInterval=<double>
739 // UnusedModelThreshold=<double>
740 // ForceMarkReadGroups=<true|false>
743 m_stream << OPTIONS_TOKEN << endl
744 << OPTION_CONFIDENCEINTERVAL << EQUAL_CHAR << settings->ConfidenceInterval << endl
745 << OPTION_UNUSEDMODELTHRESHOLD << EQUAL_CHAR << settings->UnusedModelThreshold << endl
746 << OPTION_FORCEMARKREADGROUPS << EQUAL_CHAR << boolalpha << settings->HasForceMarkReadGroups << endl
750 void ResolveTool::StatsFileWriter::WriteReadGroups(const map<string, ReadGroupResolver>& readGroups) {
753 // #<name> <medianFL> <minFL> <maxFL> <topModelID> <nextTopModelID> <isAmbiguous?>
754 m_stream << READGROUPS_TOKEN << endl
755 << RG_FIELD_DESCRIPTION << endl;
757 // iterate over read groups
758 map<string, ReadGroupResolver>::const_iterator rgIter = readGroups.begin();
759 map<string, ReadGroupResolver>::const_iterator rgEnd = readGroups.end();
760 for ( ; rgIter != rgEnd; ++rgIter ) {
761 const string& name = (*rgIter).first;
762 const ReadGroupResolver& resolver = (*rgIter).second;
764 // skip if read group has no data
765 if ( !resolver.HasData )
768 // write read group data
769 m_stream << name << TAB_CHAR
770 << resolver.MedianFragmentLength << TAB_CHAR
771 << resolver.MinFragmentLength << TAB_CHAR
772 << resolver.MaxFragmentLength << TAB_CHAR
773 << resolver.TopModelId << TAB_CHAR
774 << resolver.NextTopModelId << TAB_CHAR
775 << boolalpha << resolver.IsAmbiguous
779 // extra newline at end
783 // --------------------------------------------------------------------------
784 // ResolveToolPrivate implementation
786 struct ResolveTool::ResolveToolPrivate {
790 ResolveToolPrivate(ResolveTool::ResolveSettings* settings)
791 : m_settings(settings)
793 ~ResolveToolPrivate(void) { }
795 // 'public' interface
801 bool CheckSettings(vector<string>& errors);
802 bool MakeStats(void);
803 void ParseHeader(const SamHeader& header);
804 bool ReadStatsFile(void);
805 void ResolveAlignment(BamAlignment& al);
806 bool ResolvePairs(void);
807 bool WriteStatsFile(void);
811 ResolveTool::ResolveSettings* m_settings;
812 map<string, ReadGroupResolver> m_readGroups;
815 bool ResolveTool::ResolveToolPrivate::CheckSettings(vector<string>& errors) {
817 // ensure clean slate
821 if ( m_settings->IsMakeStats ) {
824 if ( m_settings->IsMarkPairs )
825 errors.push_back("Cannot run in both -makeStats & -markPairs modes. Please select ONE.");
826 if ( m_settings->IsTwoPass )
827 errors.push_back("Cannot run in both -makeStats & -twoPass modes. Please select ONE.");
829 // error if output BAM options supplied
830 if ( m_settings->HasOutputBamFile )
831 errors.push_back("Cannot use -out (output BAM file) in -makeStats mode.");
832 if ( m_settings->IsForceCompression )
833 errors.push_back("Cannot use -forceCompression. No output BAM file is being generated.");
835 // make sure required stats file supplied
836 if ( !m_settings->HasStatsFile )
837 errors.push_back("Ouptut stats filename required for -makeStats mode. Please specify one using -stats option.");
839 // check for UseStats options
840 if ( m_settings->HasForceMarkReadGroups )
841 errors.push_back("Cannot use -forceMarkReadGroups. -markPairs options are DISABLED in -makeStats mode.");
845 else if ( m_settings->IsMarkPairs ) {
848 if ( m_settings->IsMakeStats )
849 errors.push_back("Cannot run in both -makeStats & -markPairs modes. Please select ONE.");
850 if ( m_settings->IsTwoPass )
851 errors.push_back("Cannot run in both -markPairs & -twoPass modes. Please select ONE.");
853 // make sure required stats file supplied
854 if ( !m_settings->HasStatsFile )
855 errors.push_back("Input stats filename required for -markPairs mode. Please specify one using -stats option.");
857 // check for MakeStats options
858 if ( m_settings->HasConfidenceInterval )
859 errors.push_back("Cannot use -ci. -makeStats options are DISABLED is -markPairs mode.");
863 else if ( m_settings->IsTwoPass ) {
866 if ( m_settings->IsMakeStats )
867 errors.push_back("Cannot run in both -makeStats & -twoPass modes. Please select ONE.");
868 if ( m_settings->IsMarkPairs )
869 errors.push_back("Cannot run in both -markPairs & -twoPass modes. Please select ONE.");
871 // make sure input is file not stdin
872 if ( !m_settings->HasInputBamFile || m_settings->InputBamFilename == Options::StandardIn() )
873 errors.push_back("Cannot run -twoPass mode with BAM data from stdin. Please specify existing file using -in option.");
878 errors.push_back("No resolve mode specified. Please select ONE of the following: -makeStats, -markPairs, or -twoPass. See help for more info.");
880 // boundary checks on values
881 if ( m_settings->HasConfidenceInterval ) {
882 if ( m_settings->ConfidenceInterval < 0.0 || m_settings->ConfidenceInterval > 1.0 )
883 errors.push_back("Invalid confidence interval. Must be between 0 and 1");
885 if ( m_settings->HasUnusedModelThreshold ) {
886 if ( m_settings->UnusedModelThreshold < 0.0 || m_settings->UnusedModelThreshold > 1.0 )
887 errors.push_back("Invalid unused model threshold. Must be between 0 and 1");
890 // return success if no errors found
891 return ( errors.empty() );
894 bool ResolveTool::ResolveToolPrivate::MakeStats(void) {
896 // pull resolver settings from command-line settings
897 ReadGroupResolver::SetConfidenceInterval(m_settings->ConfidenceInterval);
898 ReadGroupResolver::SetUnusedModelThreshold(m_settings->UnusedModelThreshold);
900 // open our BAM reader
902 if ( !bamReader.Open(m_settings->InputBamFilename) ) {
903 cerr << "bamtools resolve ERROR: could not open input BAM file: "
904 << m_settings->InputBamFilename << endl;
908 // retrieve header & parse for read groups
909 const SamHeader& header = bamReader.GetHeader();
912 // open ReadNamesFileWriter
913 ResolveTool::ReadNamesFileWriter readNamesWriter;
914 if ( !readNamesWriter.Open(m_settings->ReadNamesFilename) ) {
915 cerr << "bamtools resolve ERROR: could not open (temp) output read names file: "
916 << m_settings->ReadNamesFilename << endl;
921 // read through BAM file
923 string readGroup("");
924 map<string, ReadGroupResolver>::iterator rgIter;
925 map<string, bool>::iterator readNameIter;
926 while ( bamReader.GetNextAlignmentCore(al) ) {
928 // skip if alignment is not paired, mapped, nor mate is mapped
929 if ( !al.IsPaired() || !al.IsMapped() || !al.IsMateMapped() )
932 // skip if alignment & mate not on same reference sequence
933 if ( al.RefID != al.MateRefID ) continue;
935 // flesh out the char data, so we can retrieve its read group ID
938 // get read group from alignment (OK if empty)
940 al.GetTag(READ_GROUP_TAG, readGroup);
942 // look up resolver for read group
943 rgIter = m_readGroups.find(readGroup);
944 if ( rgIter == m_readGroups.end() ) {
945 cerr << "bamtools resolve ERROR - unable to calculate stats, unknown read group encountered: "
946 << readGroup << endl;
950 ReadGroupResolver& resolver = (*rgIter).second;
952 // determine unique-ness of current alignment
953 const bool isCurrentMateUnique = ( al.MapQuality != 0 );
956 readNameIter = resolver.ReadNames.find(al.Name);
958 // if read name found (current alignment's mate already parsed)
959 if ( readNameIter != resolver.ReadNames.end() ) {
961 // if both unique mates are unique, store read name & insert size for later
962 const bool isStoredMateUnique = (*readNameIter).second;
963 if ( isCurrentMateUnique && isStoredMateUnique ) {
965 // save read name in temp file as candidates for later pair marking
966 readNamesWriter.Write(readGroup, al.Name);
968 // determine model type & store fragment length for stats calculation
969 const uint16_t currentModelType = CalculateModelType(al);
970 assert( currentModelType != ModelType::DUMMY_ID );
971 resolver.Models[currentModelType].push_back( abs(al.InsertSize) );
974 // unique or not, remove read name from map
975 resolver.ReadNames.erase(readNameIter);
978 // if read name not found, store new entry
979 else resolver.ReadNames.insert( make_pair<string, bool>(al.Name, isCurrentMateUnique) );
983 readNamesWriter.Close();
986 // iterate back through read groups
987 map<string, ReadGroupResolver>::iterator rgEnd = m_readGroups.end();
988 for ( rgIter = m_readGroups.begin(); rgIter != rgEnd; ++rgIter ) {
989 const string& name = (*rgIter).first;
990 ReadGroupResolver& resolver = (*rgIter).second;
992 // calculate acceptable orientation & insert sizes for this read group
993 resolver.DetermineTopModels(name);
995 // clear out left over read names
996 // (these have mates that did not pass filters or were already removed as non-unique)
997 resolver.ReadNames.clear();
1000 // if we get here, return success
1004 void ResolveTool::ResolveToolPrivate::ParseHeader(const SamHeader& header) {
1006 // iterate over header read groups, creating a 'resolver' for each
1007 SamReadGroupConstIterator rgIter = header.ReadGroups.ConstBegin();
1008 SamReadGroupConstIterator rgEnd = header.ReadGroups.ConstEnd();
1009 for ( ; rgIter != rgEnd; ++rgIter ) {
1010 const SamReadGroup& rg = (*rgIter);
1011 m_readGroups.insert( make_pair<string, ReadGroupResolver>(rg.ID, ReadGroupResolver()) );
1015 bool ResolveTool::ResolveToolPrivate::ReadStatsFile(void) {
1017 // skip if no filename provided
1018 if ( m_settings->StatsFilename.empty() )
1021 // attempt to open stats file
1022 ResolveTool::StatsFileReader statsReader;
1023 if ( !statsReader.Open(m_settings->StatsFilename) ) {
1024 cerr << "bamtools resolve ERROR - could not open stats file: "
1025 << m_settings->StatsFilename << " for reading" << endl;
1029 // attempt to read stats data
1030 if ( !statsReader.Read(m_settings, m_readGroups) ) {
1031 cerr << "bamtools resolve ERROR - could not parse stats file: "
1032 << m_settings->StatsFilename << " for data" << endl;
1040 void ResolveTool::ResolveToolPrivate::ResolveAlignment(BamAlignment& al) {
1042 // clear proper-pair flag
1043 al.SetIsProperPair(false);
1045 // quit check if alignment is not from paired-end read
1046 if ( !al.IsPaired() ) return;
1048 // quit check if either alignment or its mate are unmapped
1049 if ( !al.IsMapped() || !al.IsMateMapped() ) return;
1051 // quit check if alignment & its mate are on differenct references
1052 if ( al.RefID != al.MateRefID ) return;
1054 // quit check if map quality is 0
1055 if ( al.MapQuality == 0 ) return;
1057 // get read group from alignment
1058 // empty string if not found, this is OK - we handle empty read group case
1059 string readGroupName("");
1060 al.GetTag(READ_GROUP_TAG, readGroupName);
1062 // look up read group's 'resolver'
1063 map<string, ReadGroupResolver>::iterator rgIter = m_readGroups.find(readGroupName);
1064 if ( rgIter == m_readGroups.end() ) {
1065 cerr << "bamtools resolve ERROR - read group found that was not in header: "
1066 << readGroupName << endl;
1069 const ReadGroupResolver& resolver = (*rgIter).second;
1071 // quit check if pairs are not in proper orientation (can differ for each RG)
1072 if ( !resolver.IsValidOrientation(al) ) return;
1074 // quit check if pairs are not within "reasonable" distance (can differ for each RG)
1075 if ( !resolver.IsValidInsertSize(al) ) return;
1077 // quit check if alignment is not a "candidate proper pair"
1078 map<string, bool>::const_iterator readNameIter;
1079 readNameIter = resolver.ReadNames.find(al.Name);
1080 if ( readNameIter == resolver.ReadNames.end() )
1083 // if we get here, alignment is OK - set 'proper pair' flag
1084 al.SetIsProperPair(true);
1087 bool ResolveTool::ResolveToolPrivate::ResolvePairs(void) {
1089 // open file containing read names of candidate proper pairs
1090 ResolveTool::ReadNamesFileReader readNamesReader;
1091 if ( !readNamesReader.Open(m_settings->ReadNamesFilename) ) {
1092 cerr << "bamtools resolve ERROR: could not open (temp) inputput read names file: "
1093 << m_settings->ReadNamesFilename << endl;
1097 // parse read names (matching with corresponding read groups)
1098 if ( !readNamesReader.Read(m_readGroups) ) {
1099 cerr << "bamtools resolve ERROR: could not read candidate read names from file: "
1100 << m_settings->ReadNamesFilename << endl;
1101 readNamesReader.Close();
1105 // close read name file reader & delete temp file
1106 readNamesReader.Close();
1107 if ( remove(m_settings->ReadNamesFilename.c_str()) != 0 ) {
1108 cerr << "bamtools resolve WARNING: could not delete temp file: "
1109 << m_settings->ReadNamesFilename << endl;
1112 // open our BAM reader
1114 if ( !reader.Open(m_settings->InputBamFilename) ) {
1115 cerr << "bamtools resolve ERROR: could not open input BAM file: "
1116 << m_settings->InputBamFilename << endl;
1120 // retrieve header & reference dictionary info
1121 const SamHeader& header = reader.GetHeader();
1122 const RefVector& references = reader.GetReferenceData();
1124 // determine compression mode for BamWriter
1125 bool writeUncompressed = ( m_settings->OutputBamFilename == Options::StandardOut() &&
1126 !m_settings->IsForceCompression );
1127 BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
1128 if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed;
1132 writer.SetCompressionMode(compressionMode);
1133 if ( !writer.Open(m_settings->OutputBamFilename, header, references) ) {
1134 cerr << "bamtools resolve ERROR: could not open "
1135 << m_settings->OutputBamFilename << " for writing." << endl;
1140 // plow through alignments, setting/clearing 'proper pair' flag
1141 // and writing to new output BAM file
1143 while ( reader.GetNextAlignment(al) ) {
1144 ResolveAlignment(al);
1145 writer.SaveAlignment(al);
1148 // clean up & return success
1154 bool ResolveTool::ResolveToolPrivate::Run(void) {
1156 // verify that command line settings are acceptable
1157 vector<string> errors;
1158 if ( !CheckSettings(errors) ) {
1159 cerr << "bamtools resolve ERROR - invalid settings: " << endl;
1160 vector<string>::const_iterator errorIter = errors.begin();
1161 vector<string>::const_iterator errorEnd = errors.end();
1162 for ( ; errorIter != errorEnd; ++errorIter )
1163 cerr << (*errorIter) << endl;
1167 // initialize read group map with default (empty name) read group
1168 m_readGroups.insert( make_pair<string, ReadGroupResolver>("", ReadGroupResolver()) );
1170 // init readname filename
1171 // uses (adjusted) stats filename if provided (req'd for makeStats, markPairs modes; optional for twoPass)
1172 // else keep default filename
1173 if ( m_settings->HasStatsFile )
1174 m_settings->ReadNamesFilename = m_settings->StatsFilename + READNAME_FILE_SUFFIX;
1177 if ( m_settings->IsMakeStats ) {
1179 // generate stats data
1180 if ( !MakeStats() ) {
1181 cerr << "bamtools resolve ERROR - could not generate stats" << endl;
1185 // write stats to file
1186 if ( !WriteStatsFile() ) {
1187 cerr << "bamtools resolve ERROR - could not write stats file: "
1188 << m_settings->StatsFilename << endl;
1194 else if ( m_settings->IsMarkPairs ) {
1196 // read stats from file
1197 if ( !ReadStatsFile() ) {
1198 cerr << "bamtools resolve ERROR - could not read stats file: "
1199 << m_settings->StatsFilename << endl;
1203 // do paired-end resolution
1204 if ( !ResolvePairs() ) {
1205 cerr << "bamtools resolve ERROR - could not resolve pairs" << endl;
1213 // generate stats data
1214 if ( !MakeStats() ) {
1215 cerr << "bamtools resolve ERROR - could not generate stats" << endl;
1219 // if stats file requested
1220 if ( m_settings->HasStatsFile ) {
1222 // write stats to file
1223 // emit warning if write fails, but paired-end resolution should be allowed to proceed
1224 if ( !WriteStatsFile() )
1225 cerr << "bamtools resolve WARNING - could not write stats file: "
1226 << m_settings->StatsFilename << endl;
1229 // do paired-end resolution
1230 if ( !ResolvePairs() ) {
1231 cerr << "bamtools resolve ERROR - could not resolve pairs" << endl;
1240 bool ResolveTool::ResolveToolPrivate::WriteStatsFile(void) {
1242 // skip if no filename provided
1243 if ( m_settings->StatsFilename.empty() )
1246 // attempt to open stats file
1247 ResolveTool::StatsFileWriter statsWriter;
1248 if ( !statsWriter.Open(m_settings->StatsFilename) ) {
1249 cerr << "bamtools resolve ERROR - could not open stats file: "
1250 << m_settings->StatsFilename << " for writing" << endl;
1254 // attempt to write stats data
1255 if ( !statsWriter.Write(m_settings, m_readGroups) ) {
1256 cerr << "bamtools resolve ERROR - could not write stats file: "
1257 << m_settings->StatsFilename << " for data" << endl;
1265 // --------------------------------------------------------------------------
1266 // ResolveTool implementation
1268 ResolveTool::ResolveTool(void)
1270 , m_settings(new ResolveSettings)
1273 // set description texts
1274 const string programDescription = "resolves paired-end reads (marking the IsProperPair flag as needed)";
1275 const string programUsage = "<mode> [options] [-in <filename>] [-out <filename> | [-forceCompression] ] [-stats <filename>]";
1276 const string inputBamDescription = "the input BAM file(s)";
1277 const string outputBamDescription = "the output BAM file";
1278 const string statsFileDescription = "input/output stats file, depending on selected mode (see below). "
1279 "This file is human-readable, storing fragment length data generated per read group, as well as "
1280 "the options used to configure the -makeStats mode";
1281 const string forceCompressionDescription = "if results are sent to stdout (like when piping to another tool), "
1282 "default behavior is to leave output uncompressed."
1283 "Use this flag to override and force compression. This feature is disabled in -makeStats mode.";
1284 const string makeStatsDescription = "generates a fragment-length stats file from the input BAM. "
1285 "Data is written to file specified using the -stats option. "
1286 "MarkPairs Mode Settings are DISABLED.";
1287 const string markPairsDescription = "generates an output BAM with alignments marked with proper-pair status. "
1288 "Stats data is read from file specified using the -stats option. "
1289 "MakeStats Mode Settings are DISABLED";
1290 const string twoPassDescription = "combines the -makeStats & -markPairs modes into a single command. "
1291 "However, due to the two-pass nature of paired-end resolution, piping BAM data via stdin is DISABLED. "
1292 "You must supply an explicit input BAM file. Output BAM may be piped to stdout, however, if desired. "
1293 "All MakeStats & MarkPairs Mode Settings are available. "
1294 "The intermediate stats file is not necessary, but if the -stats options is used, then one will be generated. "
1295 "You may find this useful for documentation purposes.";
1296 const string confidenceIntervalDescription = "confidence interval. Set min/max fragment lengths such that we capture "
1297 "this fraction of pairs";
1298 const string unusedModelThresholdDescription = "unused model threshold. The resolve tool considers 8 possible orientation models "
1299 "for pairs. The top 2 are selected for later use when actually marking alignments. This value determines the "
1300 "cutoff for marking a read group as ambiguous. Meaning that if the ratio of the number of alignments from bottom 6 models "
1301 "to the top 2 is greater than this threshold, then the read group is flagged as ambiguous. By default, NO alignments "
1302 "from ambiguous read groups will be marked as proper pairs. You may override this behavior with the -force option "
1303 "in -markPairs mode";
1304 const string forceMarkDescription = "forces all read groups to be marked according to their top 2 'orientation models'. "
1305 "When generating stats, the 2 (out of 8 possible) models with the most observations are chosen as the top models for each read group. "
1306 "If the remaining 6 models account for more than some threshold ([default=10%], see -umt), then the read group is marked as ambiguous. "
1307 "The default behavior is that for an ambiguous read group, NONE of its alignments are marked as proper-pairs. "
1308 "By setting this option, a read group's ambiguity flag will be ignored, and all of its alignments will be compared to the top 2 models.";
1310 // set program details
1311 Options::SetProgramInfo("bamtools resolve", programDescription, programUsage);
1313 // set up I/O options
1314 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
1315 Options::AddValueOption("-in", "BAM filename", inputBamDescription, "",
1316 m_settings->HasInputBamFile, m_settings->InputBamFilename,
1317 IO_Opts, Options::StandardIn());
1318 Options::AddValueOption("-out", "BAM filename", outputBamDescription, "",
1319 m_settings->HasOutputBamFile, m_settings->OutputBamFilename,
1320 IO_Opts, Options::StandardOut());
1321 Options::AddValueOption("-stats", "STATS filename", statsFileDescription, "",
1322 m_settings->HasStatsFile, m_settings->StatsFilename, IO_Opts);
1323 Options::AddOption("-forceCompression", forceCompressionDescription,
1324 m_settings->IsForceCompression, IO_Opts);
1326 OptionGroup* ModeOpts = Options::CreateOptionGroup("Resolve Modes (must select ONE of the following)");
1327 Options::AddOption("-makeStats", makeStatsDescription, m_settings->IsMakeStats, ModeOpts);
1328 Options::AddOption("-markPairs", markPairsDescription, m_settings->IsMarkPairs, ModeOpts);
1329 Options::AddOption("-twoPass", twoPassDescription, m_settings->IsTwoPass, ModeOpts);
1331 OptionGroup* MakeStatsOpts = Options::CreateOptionGroup("MakeStats Mode Options (disabled in -markPairs mode)");
1332 Options::AddValueOption("-ci", "double", confidenceIntervalDescription, "",
1333 m_settings->HasConfidenceInterval, m_settings->ConfidenceInterval, MakeStatsOpts);
1334 Options::AddValueOption("-umt", "double", unusedModelThresholdDescription, "",
1335 m_settings->HasUnusedModelThreshold, m_settings->UnusedModelThreshold, MakeStatsOpts);
1337 OptionGroup* MarkPairsOpts = Options::CreateOptionGroup("MarkPairs Mode Options (disabled in -makeStats mode)");
1338 Options::AddOption("-force", forceMarkDescription, m_settings->HasForceMarkReadGroups, MarkPairsOpts);
1341 ResolveTool::~ResolveTool(void) {
1350 int ResolveTool::Help(void) {
1351 Options::DisplayHelp();
1355 int ResolveTool::Run(int argc, char* argv[]) {
1357 // parse command line arguments
1358 Options::Parse(argc, argv, 1);
1360 // initialize ResolveTool
1361 m_impl = new ResolveToolPrivate(m_settings);
1363 // run ResolveTool, return success/failure
1364 if ( m_impl->Run() )