1 // ***************************************************************************
2 // bamtools_resolve.cpp (c) 2011
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 24 July 2013 (DB)
6 // ---------------------------------------------------------------------------
7 // Resolves paired-end reads (marking the IsProperPair flag as needed).
8 // ***************************************************************************
10 #include "bamtools_resolve.h"
11 #include "bamtools_version.h"
12 #include <api/BamReader.h>
13 #include <api/BamWriter.h>
14 #include <utils/bamtools_options.h>
15 #include <utils/bamtools_utilities.h>
16 using namespace BamTools;
32 // --------------------------------------------------------------------------
33 // general ResolveTool constants
34 // --------------------------------------------------------------------------
36 static const int NUM_MODELS = 8;
37 static const string READ_GROUP_TAG = "RG";
38 static const double DEFAULT_CONFIDENCE_INTERVAL = 0.9973;
39 static const uint16_t DEFAULT_MIN_MAPQUALITY = 1;
40 static const double DEFAULT_UNUSEDMODEL_THRESHOLD = 0.1;
42 // --------------------------------------------------------------------------
43 // stats file constants
44 // --------------------------------------------------------------------------
46 // basic char/string constants
47 static const char COMMENT_CHAR = '#';
48 static const char OPEN_BRACE_CHAR = '[';
49 static const char CLOSE_BRACE_CHAR = ']';
50 static const char EQUAL_CHAR = '=';
51 static const char TAB_CHAR = '\t';
53 static const string WHITESPACE_CHARS = " \t\n";
54 static const string TRUE_KEYWORD = "true";
55 static const string FALSE_KEYWORD = "false";
58 static const size_t NUM_OPTIONS_FIELDS = 2;
59 static const size_t NUM_READGROUPS_FIELDS = 7;
62 static const string INPUT_TOKEN = "[Input]";
63 static const string OPTIONS_TOKEN = "[Options]";
64 static const string READGROUPS_TOKEN = "[ReadGroups]";
67 static const string OPTION_CONFIDENCEINTERVAL = "ConfidenceInterval";
68 static const string OPTION_MINIMUMMAPQUALITY = "MinimumMapQuality";
69 static const string OPTION_UNUSEDMODELTHRESHOLD = "UnusedModelThreshold";
70 static const string OPTION_FORCEMARKREADGROUPS = "ForceMarkReadGroups";
72 // other string constants
73 static const string RG_FIELD_DESCRIPTION =
74 "#<name> <medianFL> <minFL> <maxFL> <topModelID> <nextTopModelID> <isAmbiguous?>";
76 static const string MODEL_DESCRIPTION =
77 "# ------------- Model Types Description ---------------\n"
79 "# ID Position Orientation \n"
80 "# 1 mate1 < mate2 mate1:forward, mate2:forward \n"
81 "# 2 mate1 < mate2 mate1:forward, mate2:reverse \n"
82 "# 3 mate1 < mate2 mate1:reverse, mate2:forward \n"
83 "# 4 mate1 < mate2 mate1:reverse, mate2:reverse \n"
84 "# 5 mate2 < mate1 mate2:forward, mate1:forward \n"
85 "# 6 mate2 < mate1 mate2:forward, mate1:reverse \n"
86 "# 7 mate2 < mate1 mate2:reverse, mate1:forward \n"
87 "# 8 mate2 < mate1 mate2:reverse, mate1:reverse \n"
88 "# -----------------------------------------------------\n";
90 // --------------------------------------------------------------------------
91 // unique readname file constants
92 // --------------------------------------------------------------------------
94 static const string READNAME_FILE_SUFFIX = ".uniq_names.txt";
95 static const string DEFAULT_READNAME_FILE = "bt_resolve_TEMP" + READNAME_FILE_SUFFIX;
97 // --------------------------------------------------------------------------
98 // ModelType implementation
104 vector<int32_t> FragmentLengths;
107 ModelType(const uint16_t id)
110 // preallocate space for 10K fragments per model type
111 FragmentLengths.reserve(10000);
114 // convenience access to internal fragment lengths vector
115 vector<int32_t>::iterator begin(void) { return FragmentLengths.begin(); }
116 vector<int32_t>::const_iterator begin(void) const { return FragmentLengths.begin(); }
117 void clear(void) { FragmentLengths.clear(); }
118 vector<int32_t>::iterator end(void) { return FragmentLengths.end(); }
119 vector<int32_t>::const_iterator end(void) const { return FragmentLengths.end(); }
120 void push_back(const int32_t& x) { FragmentLengths.push_back(x); }
121 size_t size(void) const { return FragmentLengths.size(); }
124 static const uint16_t DUMMY_ID;
127 const uint16_t ModelType::DUMMY_ID = 100;
129 bool operator>(const ModelType& lhs, const ModelType& rhs) {
130 return lhs.size() > rhs.size();
133 uint16_t CalculateModelType(const BamAlignment& al) {
135 // localize alignment's mate positions & orientations for convenience
136 const int32_t m1_begin = ( al.IsFirstMate() ? al.Position : al.MatePosition );
137 const int32_t m2_begin = ( al.IsFirstMate() ? al.MatePosition : al.Position );
138 const bool m1_isReverseStrand = ( al.IsFirstMate() ? al.IsReverseStrand() : al.IsMateReverseStrand() );
139 const bool m2_isReverseStrand = ( al.IsFirstMate() ? al.IsMateReverseStrand() : al.IsReverseStrand() );
141 // determine 'model type'
142 if ( m1_begin < m2_begin ) {
143 if ( !m1_isReverseStrand && !m2_isReverseStrand ) return 0; // ID: 1
144 if ( !m1_isReverseStrand && m2_isReverseStrand ) return 1; // ID: 2
145 if ( m1_isReverseStrand && !m2_isReverseStrand ) return 2; // ID: 3
146 if ( m1_isReverseStrand && m2_isReverseStrand ) return 3; // ID: 4
148 if ( !m2_isReverseStrand && !m1_isReverseStrand ) return 4; // ID: 5
149 if ( !m2_isReverseStrand && m1_isReverseStrand ) return 5; // ID: 6
150 if ( m2_isReverseStrand && !m1_isReverseStrand ) return 6; // ID: 7
151 if ( m2_isReverseStrand && m1_isReverseStrand ) return 7; // ID: 8
155 return ModelType::DUMMY_ID;
158 // --------------------------------------------------------------------------
159 // ReadGroupResolver implementation
161 struct ReadGroupResolver {
164 int32_t MinFragmentLength;
165 int32_t MedianFragmentLength;
166 int32_t MaxFragmentLength;
168 uint16_t NextTopModelId;
171 vector<ModelType> Models;
172 map<string, bool> ReadNames;
175 ReadGroupResolver(void);
178 bool IsValidInsertSize(const BamAlignment& al) const;
179 bool IsValidOrientation(const BamAlignment& al) const;
181 // select 2 best models based on observed data
182 void DetermineTopModels(const string& readGroupName);
185 static double ConfidenceInterval;
186 static double UnusedModelThreshold;
187 static void SetConfidenceInterval(const double& ci);
188 static void SetUnusedModelThreshold(const double& umt);
191 double ReadGroupResolver::ConfidenceInterval = DEFAULT_CONFIDENCE_INTERVAL;
192 double ReadGroupResolver::UnusedModelThreshold = DEFAULT_UNUSEDMODEL_THRESHOLD;
194 ReadGroupResolver::ReadGroupResolver(void)
195 : MinFragmentLength(0)
196 , MedianFragmentLength(0)
197 , MaxFragmentLength(0)
198 , TopModelId(ModelType::DUMMY_ID)
199 , NextTopModelId(ModelType::DUMMY_ID)
203 // pre-allocate space for 8 models
204 Models.reserve(NUM_MODELS);
205 for ( uint16_t i = 0; i < NUM_MODELS; ++i )
206 Models.push_back( ModelType(i+1) );
209 bool ReadGroupResolver::IsValidInsertSize(const BamAlignment& al) const {
210 const int32_t absInsertSize = abs(al.InsertSize);
211 return ( absInsertSize >= MinFragmentLength &&
212 absInsertSize <= MaxFragmentLength );
215 bool ReadGroupResolver::IsValidOrientation(const BamAlignment& al) const {
216 const uint16_t currentModelId = CalculateModelType(al) + 1; // convert model type (array index) to ID number
217 return ( currentModelId == TopModelId || currentModelId == NextTopModelId );
220 void ReadGroupResolver::DetermineTopModels(const string& readGroupName) {
222 // sort models (from most common to least common)
223 sort( Models.begin(), Models.end(), std::greater<ModelType>() );
225 // store top 2 models for later
226 TopModelId = Models[0].ID;
227 NextTopModelId = Models[1].ID;
229 // make sure that the 2 most common models are some threshold more common
230 // than the remaining models
231 const unsigned int activeModelCountSum = Models[0].size() + Models[1].size();
232 if ( activeModelCountSum == 0 ) return; // skip if no data in this read group
233 const unsigned int unusedModelCountSum = Models[2].size() + Models[3].size() +
234 Models[4].size() + Models[5].size() +
235 Models[6].size() + Models[7].size();
236 const double unusedPercentage = (double)unusedModelCountSum / (double)activeModelCountSum;
237 if ( unusedPercentage > UnusedModelThreshold ) {
238 cerr << "WARNING: " << readGroupName << " does not have clearly defined 'top models'" << endl
239 << " The fraction of alignments in bottom 6 models (" << unusedPercentage
240 << ") exceeds threshold: " << UnusedModelThreshold << endl;
244 // emit a warning if the best alignment models are non-standard
245 const bool isModel1Top = (TopModelId == 1) || (NextTopModelId == 1);
246 const bool isModel2Top = (TopModelId == 2) || (NextTopModelId == 2);
247 const bool isModel4Top = (TopModelId == 4) || (NextTopModelId == 4);
248 const bool isModel5Top = (TopModelId == 5) || (NextTopModelId == 5);
249 const bool isModel6Top = (TopModelId == 6) || (NextTopModelId == 6);
250 const bool isModel8Top = (TopModelId == 8) || (NextTopModelId == 8);
252 bool isMatePair = ( isModel4Top && isModel5Top ? true : false );
253 bool isPairedEnd = ( isModel2Top && isModel6Top ? true : false );
254 bool isSolidPair = ( isModel1Top && isModel8Top ? true : false );
256 if ( !isMatePair && !isPairedEnd && !isSolidPair ) {
257 cerr << "WARNING: Found a non-standard alignment model configuration. " << endl
258 << " Using alignment models " << TopModelId << " & " << NextTopModelId
262 // store only the fragments from the best alignment models, then sort
263 vector<int32_t> fragments;
264 fragments.reserve( Models[0].size() + Models[1].size() );
265 fragments.insert( fragments.end(), Models[0].begin(), Models[0].end() );
266 fragments.insert( fragments.end(), Models[1].begin(), Models[1].end() );
267 sort ( fragments.begin(), fragments.end() );
269 // clear out Model fragment data, not needed anymore
272 // skip if no fragments found for this read group
273 if ( fragments.empty() ) {
279 // calculate & store the min,median, & max fragment lengths
280 const unsigned int numFragmentLengths = fragments.size();
281 const double halfNonConfidenceInterval = (1.0 - ReadGroupResolver::ConfidenceInterval)/2.0;
282 const unsigned int minIndex = (unsigned int)(numFragmentLengths * halfNonConfidenceInterval);
283 const unsigned int medianIndex = (unsigned int)(numFragmentLengths * 0.5);
284 const unsigned int maxIndex = (unsigned int)(numFragmentLengths * (1.0-halfNonConfidenceInterval));
286 MinFragmentLength = fragments[minIndex];
287 MedianFragmentLength = fragments[medianIndex];
288 MaxFragmentLength = fragments[maxIndex];
291 void ReadGroupResolver::SetConfidenceInterval(const double& ci) {
292 ConfidenceInterval = ci;
295 void ReadGroupResolver::SetUnusedModelThreshold(const double& umt) {
296 UnusedModelThreshold = umt;
299 // --------------------------------------------------------------------------
300 // ResolveSettings implementation
302 struct ResolveTool::ResolveSettings {
310 bool HasInputBamFile;
311 bool HasOutputBamFile;
313 bool IsForceCompression;
315 // resolve option flags
316 bool HasConfidenceInterval;
317 bool HasForceMarkReadGroups;
318 bool HasMinimumMapQuality;
319 bool HasUnusedModelThreshold;
322 string InputBamFilename;
323 string OutputBamFilename;
324 string StatsFilename;
325 string ReadNamesFilename; // ** N.B. - Only used internally, not set from cmdline **
328 double ConfidenceInterval;
329 uint16_t MinimumMapQuality;
330 double UnusedModelThreshold;
333 ResolveSettings(void)
337 , HasInputBamFile(false)
338 , HasOutputBamFile(false)
339 , HasStatsFile(false)
340 , IsForceCompression(false)
341 , HasConfidenceInterval(false)
342 , HasForceMarkReadGroups(false)
343 , HasMinimumMapQuality(false)
344 , HasUnusedModelThreshold(false)
345 , InputBamFilename(Options::StandardIn())
346 , OutputBamFilename(Options::StandardOut())
348 , ReadNamesFilename(DEFAULT_READNAME_FILE)
349 , ConfidenceInterval(DEFAULT_CONFIDENCE_INTERVAL)
350 , MinimumMapQuality(DEFAULT_MIN_MAPQUALITY)
351 , UnusedModelThreshold(DEFAULT_UNUSEDMODEL_THRESHOLD)
355 // --------------------------------------------------------------------------
356 // ReadNamesFileReader implementation
358 struct ResolveTool::ReadNamesFileReader {
361 ReadNamesFileReader(void) { }
362 ~ReadNamesFileReader(void) { Close(); }
364 // main reader interface
367 bool Open(const string& filename);
368 bool Read(map<string, ReadGroupResolver>& readGroups);
375 void ResolveTool::ReadNamesFileReader::Close(void) {
376 if ( m_stream.is_open() )
380 bool ResolveTool::ReadNamesFileReader::Open(const string& filename) {
382 // make sure stream is fresh
385 // attempt to open filename, return status
386 m_stream.open(filename.c_str(), ifstream::in);
387 return m_stream.good();
390 bool ResolveTool::ReadNamesFileReader::Read(map<string, ReadGroupResolver>& readGroups) {
392 // up-front sanity check
393 if ( !m_stream.is_open() ) return false;
395 // parse read names file
397 vector<string> fields;
398 map<string, ReadGroupResolver>::iterator rgIter;
399 map<string, ReadGroupResolver>::iterator rgEnd = readGroups.end();
400 while ( getline(m_stream, line) ) {
402 // skip if empty line
403 if ( line.empty() ) continue;
405 // split line on '\t'
406 fields = Utilities::Split(line, TAB_CHAR);
407 if ( fields.size() != 2 ) continue;
409 // look up resolver for read group
410 rgIter = readGroups.find( fields[0] );
411 if ( rgIter == rgEnd ) return false;
412 ReadGroupResolver& resolver = (*rgIter).second;
414 // store read name with resolver
415 resolver.ReadNames.insert( make_pair<string,bool>(fields[1], true) ) ;
418 // if here, return success
422 // --------------------------------------------------------------------------
423 // ReadNamesFileWriter implementation
425 struct ResolveTool::ReadNamesFileWriter {
428 ReadNamesFileWriter(void) { }
429 ~ReadNamesFileWriter(void) { Close(); }
431 // main reader interface
434 bool Open(const string& filename);
435 void Write(const string& readGroupName, const string& readName);
442 void ResolveTool::ReadNamesFileWriter::Close(void) {
443 if ( m_stream.is_open() )
447 bool ResolveTool::ReadNamesFileWriter::Open(const string& filename) {
449 // make sure stream is fresh
452 // attempt to open filename, return status
453 m_stream.open(filename.c_str(), ofstream::out);
454 return m_stream.good();
457 void ResolveTool::ReadNamesFileWriter::Write(const string& readGroupName,
458 const string& readName)
460 m_stream << readGroupName << TAB_CHAR << readName << endl;
463 // --------------------------------------------------------------------------
464 // StatsFileReader implementation
466 struct ResolveTool::StatsFileReader {
470 StatsFileReader(void) { }
471 ~StatsFileReader(void) { Close(); }
473 // main reader interface
476 bool Open(const string& filename);
477 bool Read(ResolveTool::ResolveSettings* settings,
478 map<string, ReadGroupResolver>& readGroups);
482 bool IsComment(const string& line) const;
483 bool IsWhitespace(const string& line) const;
484 bool ParseInputLine(const string& line);
485 bool ParseOptionLine(const string& line, ResolveTool::ResolveSettings* settings);
486 bool ParseReadGroupLine(const string& line, map<string, ReadGroupResolver>& readGroups);
487 string SkipCommentsAndWhitespace(void);
493 enum State { None = 0
499 void ResolveTool::StatsFileReader::Close(void) {
500 if ( m_stream.is_open() )
504 bool ResolveTool::StatsFileReader::IsComment(const string& line) const {
505 assert( !line.empty() );
506 return ( line.at(0) == COMMENT_CHAR );
509 bool ResolveTool::StatsFileReader::IsWhitespace(const string& line) const {
512 return ( isspace(line.at(0)) );
515 bool ResolveTool::StatsFileReader::Open(const string& filename) {
517 // make sure stream is fresh
520 // attempt to open filename, return status
521 m_stream.open(filename.c_str(), ifstream::in);
522 return m_stream.good();
525 bool ResolveTool::StatsFileReader::ParseInputLine(const string& /*line*/) {
526 // input lines are ignored (for now at least), tool will use input from command line
530 bool ResolveTool::StatsFileReader::ParseOptionLine(const string& line,
531 ResolveTool::ResolveSettings* settings)
533 // split line into option, value
534 vector<string> fields = Utilities::Split(line, EQUAL_CHAR);
535 if ( fields.size() != NUM_OPTIONS_FIELDS )
537 const string& option = fields.at(0);
538 stringstream value(fields.at(1));
540 // -----------------------------------
541 // handle option based on keyword
543 // ConfidenceInterval
544 if ( option == OPTION_CONFIDENCEINTERVAL ) {
545 value >> settings->ConfidenceInterval;
546 settings->HasConfidenceInterval = true;
550 // ForceMarkReadGroups
551 if ( option == OPTION_FORCEMARKREADGROUPS ) {
552 value >> settings->HasForceMarkReadGroups;
557 if ( option == OPTION_MINIMUMMAPQUALITY ) {
558 value >> settings->MinimumMapQuality;
559 settings->HasMinimumMapQuality = true;
563 // UnusedModelThreshold
564 if ( option == OPTION_UNUSEDMODELTHRESHOLD ) {
565 value >> settings->UnusedModelThreshold;
566 settings->HasUnusedModelThreshold = true;
570 // otherwise unknown option
571 cerr << "bamtools resolve ERROR - unrecognized option: " << option << " in stats file" << endl;
575 bool ResolveTool::StatsFileReader::ParseReadGroupLine(const string& line,
576 map<string, ReadGroupResolver>& readGroups)
578 // split read group data in to fields
579 vector<string> fields = Utilities::Split(line, WHITESPACE_CHARS);
580 if ( fields.size() != NUM_READGROUPS_FIELDS ) return false;
583 const string& name = fields.at(0);
585 // populate RG's 'resolver' data
586 ReadGroupResolver resolver;
588 stringstream dataStream;
589 dataStream.str(fields.at(1));
590 dataStream >> resolver.MedianFragmentLength;
593 dataStream.str(fields.at(2));
594 dataStream >> resolver.MinFragmentLength;
597 dataStream.str(fields.at(3));
598 dataStream >> resolver.MaxFragmentLength;
601 dataStream.str(fields.at(4));
602 dataStream >> resolver.TopModelId;
605 dataStream.str(fields.at(5));
606 dataStream >> resolver.NextTopModelId;
609 resolver.IsAmbiguous = ( fields.at(6) == TRUE_KEYWORD );
611 // store RG entry and return success
612 readGroups.insert( make_pair<string, ReadGroupResolver>(name, resolver) );
616 bool ResolveTool::StatsFileReader::Read(ResolveTool::ResolveSettings* settings,
617 map<string, ReadGroupResolver>& readGroups)
619 // up-front sanity checks
620 if ( !m_stream.is_open() || settings == 0 )
623 // clear out read group data
627 State currentState = StatsFileReader::None;
630 string line = SkipCommentsAndWhitespace();
631 while ( !line.empty() ) {
633 bool foundError = false;
635 // switch state on keyword found
636 if ( Utilities::StartsWith(line, INPUT_TOKEN) )
637 currentState = StatsFileReader::InInput;
638 else if ( Utilities::StartsWith(line, OPTIONS_TOKEN) )
639 currentState = StatsFileReader::InOptions;
640 else if ( Utilities::StartsWith(line, READGROUPS_TOKEN) )
641 currentState = StatsFileReader::InReadGroups;
643 // otherwise parse data line, depending on state
645 if ( currentState == StatsFileReader::InInput )
646 foundError = !ParseInputLine(line);
647 else if ( currentState == StatsFileReader::InOptions )
648 foundError = !ParseOptionLine(line, settings);
649 else if ( currentState == StatsFileReader::InReadGroups )
650 foundError = !ParseReadGroupLine(line, readGroups);
655 // break out if error found
660 line = SkipCommentsAndWhitespace();
663 // if here, return success
667 string ResolveTool::StatsFileReader::SkipCommentsAndWhitespace(void) {
670 if ( m_stream.eof() )
672 getline(m_stream, line);
673 } while ( IsWhitespace(line) || IsComment(line) );
677 // --------------------------------------------------------------------------
678 // StatsFileReader implementation
680 struct ResolveTool::StatsFileWriter {
684 StatsFileWriter(void) { }
685 ~StatsFileWriter(void) { Close(); }
687 // main reader interface
690 bool Open(const string& filename);
691 bool Write(ResolveTool::ResolveSettings* settings,
692 const map<string, ReadGroupResolver>& readGroups);
696 void WriteHeader(void);
697 void WriteInput(ResolveTool::ResolveSettings* settings);
698 void WriteOptions(ResolveTool::ResolveSettings* settings);
699 void WriteReadGroups(const map<string, ReadGroupResolver>& readGroups);
706 void ResolveTool::StatsFileWriter::Close(void) {
707 if ( m_stream.is_open() )
711 bool ResolveTool::StatsFileWriter::Open(const string& filename) {
713 // make sure stream is fresh
716 // attempt to open filename, return status
717 m_stream.open(filename.c_str(), ofstream::out);
718 return m_stream.good();
721 bool ResolveTool::StatsFileWriter::Write(ResolveTool::ResolveSettings* settings,
722 const map<string, ReadGroupResolver>& readGroups)
724 // return failure if file not open
725 if ( !m_stream.is_open() )
728 // write stats file elements
730 WriteInput(settings);
731 WriteOptions(settings);
732 WriteReadGroups(readGroups);
738 void ResolveTool::StatsFileWriter::WriteHeader(void) {
740 // stringify current bamtools version
741 stringstream versionStream("");
743 << BAMTOOLS_VERSION_MAJOR << "."
744 << BAMTOOLS_VERSION_MINOR << "."
745 << BAMTOOLS_VERSION_BUILD;
747 // # bamtools resolve (vX.Y.Z)
749 // # MODEL DESCRIPTION - see above for actual text
752 m_stream << COMMENT_CHAR << " bamtools resolve (" << versionStream.str() << ")" << endl
753 << COMMENT_CHAR << endl
758 void ResolveTool::StatsFileWriter::WriteInput(ResolveTool::ResolveSettings* settings) {
764 m_stream << INPUT_TOKEN << endl
765 << settings->InputBamFilename << endl
769 void ResolveTool::StatsFileWriter::WriteOptions(ResolveTool::ResolveSettings* settings) {
772 // ConfidenceInterval=<double>
773 // ForceMarkReadGroups=<true|false>
774 // MinimumMapQuality=<uint16_t>
775 // UnusedModelThreshold=<double>
778 m_stream << OPTIONS_TOKEN << endl
779 << OPTION_CONFIDENCEINTERVAL << EQUAL_CHAR << settings->ConfidenceInterval << endl
780 << OPTION_FORCEMARKREADGROUPS << EQUAL_CHAR << boolalpha << settings->HasForceMarkReadGroups << endl
781 << OPTION_MINIMUMMAPQUALITY << EQUAL_CHAR << settings->MinimumMapQuality << endl
782 << OPTION_UNUSEDMODELTHRESHOLD << EQUAL_CHAR << settings->UnusedModelThreshold << endl
786 void ResolveTool::StatsFileWriter::WriteReadGroups(const map<string, ReadGroupResolver>& readGroups) {
789 // #<name> <medianFL> <minFL> <maxFL> <topModelID> <nextTopModelID> <isAmbiguous?>
790 m_stream << READGROUPS_TOKEN << endl
791 << RG_FIELD_DESCRIPTION << endl;
793 // iterate over read groups
794 map<string, ReadGroupResolver>::const_iterator rgIter = readGroups.begin();
795 map<string, ReadGroupResolver>::const_iterator rgEnd = readGroups.end();
796 for ( ; rgIter != rgEnd; ++rgIter ) {
797 const string& name = (*rgIter).first;
798 const ReadGroupResolver& resolver = (*rgIter).second;
800 // skip if read group has no data
801 if ( !resolver.HasData )
804 // write read group data
805 m_stream << name << TAB_CHAR
806 << resolver.MedianFragmentLength << TAB_CHAR
807 << resolver.MinFragmentLength << TAB_CHAR
808 << resolver.MaxFragmentLength << TAB_CHAR
809 << resolver.TopModelId << TAB_CHAR
810 << resolver.NextTopModelId << TAB_CHAR
811 << boolalpha << resolver.IsAmbiguous
815 // extra newline at end
819 // --------------------------------------------------------------------------
820 // ResolveToolPrivate implementation
822 struct ResolveTool::ResolveToolPrivate {
826 ResolveToolPrivate(ResolveTool::ResolveSettings* settings)
827 : m_settings(settings)
829 ~ResolveToolPrivate(void) { }
831 // 'public' interface
837 bool CheckSettings(vector<string>& errors);
838 bool MakeStats(void);
839 void ParseHeader(const SamHeader& header);
840 bool ReadStatsFile(void);
841 void ResolveAlignment(BamAlignment& al);
842 bool ResolvePairs(void);
843 bool WriteStatsFile(void);
847 ResolveTool::ResolveSettings* m_settings;
848 map<string, ReadGroupResolver> m_readGroups;
851 bool ResolveTool::ResolveToolPrivate::CheckSettings(vector<string>& errors) {
853 // ensure clean slate
857 if ( m_settings->IsMakeStats ) {
860 if ( m_settings->IsMarkPairs )
861 errors.push_back("Cannot run in both -makeStats & -markPairs modes. Please select ONE.");
862 if ( m_settings->IsTwoPass )
863 errors.push_back("Cannot run in both -makeStats & -twoPass modes. Please select ONE.");
865 // error if output BAM options supplied
866 if ( m_settings->HasOutputBamFile )
867 errors.push_back("Cannot use -out (output BAM file) in -makeStats mode.");
868 if ( m_settings->IsForceCompression )
869 errors.push_back("Cannot use -forceCompression. No output BAM file is being generated.");
871 // make sure required stats file supplied
872 if ( !m_settings->HasStatsFile )
873 errors.push_back("Ouptut stats filename required for -makeStats mode. Please specify one using -stats option.");
875 // check for UseStats options
876 if ( m_settings->HasForceMarkReadGroups )
877 errors.push_back("Cannot use -forceMarkReadGroups. -markPairs options are DISABLED in -makeStats mode.");
881 else if ( m_settings->IsMarkPairs ) {
884 if ( m_settings->IsMakeStats )
885 errors.push_back("Cannot run in both -makeStats & -markPairs modes. Please select ONE.");
886 if ( m_settings->IsTwoPass )
887 errors.push_back("Cannot run in both -markPairs & -twoPass modes. Please select ONE.");
889 // make sure required stats file supplied
890 if ( !m_settings->HasStatsFile )
891 errors.push_back("Input stats filename required for -markPairs mode. Please specify one using -stats option.");
893 // check for MakeStats options
894 if ( m_settings->HasConfidenceInterval )
895 errors.push_back("Cannot use -ci. -makeStats options are DISABLED is -markPairs mode.");
899 else if ( m_settings->IsTwoPass ) {
902 if ( m_settings->IsMakeStats )
903 errors.push_back("Cannot run in both -makeStats & -twoPass modes. Please select ONE.");
904 if ( m_settings->IsMarkPairs )
905 errors.push_back("Cannot run in both -markPairs & -twoPass modes. Please select ONE.");
907 // make sure input is file not stdin
908 if ( !m_settings->HasInputBamFile || m_settings->InputBamFilename == Options::StandardIn() )
909 errors.push_back("Cannot run -twoPass mode with BAM data from stdin. Please specify existing file using -in option.");
914 errors.push_back("No resolve mode specified. Please select ONE of the following: -makeStats, -markPairs, or -twoPass. See help for more info.");
916 // boundary checks on values
917 if ( m_settings->HasConfidenceInterval ) {
918 if ( m_settings->ConfidenceInterval < 0.0 || m_settings->ConfidenceInterval > 1.0 )
919 errors.push_back("Invalid confidence interval. Must be between 0 and 1");
921 if ( m_settings->HasMinimumMapQuality ) {
922 if ( m_settings->MinimumMapQuality >= 256 )
923 errors.push_back("Invalid minimum map quality. Must be between 0 and 255");
925 if ( m_settings->HasUnusedModelThreshold ) {
926 if ( m_settings->UnusedModelThreshold < 0.0 || m_settings->UnusedModelThreshold > 1.0 )
927 errors.push_back("Invalid unused model threshold. Must be between 0 and 1");
930 // return success if no errors found
931 return ( errors.empty() );
934 bool ResolveTool::ResolveToolPrivate::MakeStats(void) {
936 // pull resolver settings from command-line settings
937 ReadGroupResolver::SetConfidenceInterval(m_settings->ConfidenceInterval);
938 ReadGroupResolver::SetUnusedModelThreshold(m_settings->UnusedModelThreshold);
940 // open our BAM reader
942 if ( !bamReader.Open(m_settings->InputBamFilename) ) {
943 cerr << "bamtools resolve ERROR: could not open input BAM file: "
944 << m_settings->InputBamFilename << endl;
948 // retrieve header & parse for read groups
949 const SamHeader& header = bamReader.GetHeader();
952 // open ReadNamesFileWriter
953 ResolveTool::ReadNamesFileWriter readNamesWriter;
954 if ( !readNamesWriter.Open(m_settings->ReadNamesFilename) ) {
955 cerr << "bamtools resolve ERROR: could not open (temp) output read names file: "
956 << m_settings->ReadNamesFilename << endl;
961 // read through BAM file
963 string readGroup("");
964 map<string, ReadGroupResolver>::iterator rgIter;
965 map<string, bool>::iterator readNameIter;
966 while ( bamReader.GetNextAlignmentCore(al) ) {
968 // skip if alignment is not paired, mapped, nor mate is mapped
969 if ( !al.IsPaired() || !al.IsMapped() || !al.IsMateMapped() )
972 // skip if alignment & mate not on same reference sequence
973 if ( al.RefID != al.MateRefID ) continue;
975 // flesh out the char data, so we can retrieve its read group ID
978 // get read group from alignment (OK if empty)
980 al.GetTag(READ_GROUP_TAG, readGroup);
982 // look up resolver for read group
983 rgIter = m_readGroups.find(readGroup);
984 if ( rgIter == m_readGroups.end() ) {
985 cerr << "bamtools resolve ERROR - unable to calculate stats, unknown read group encountered: "
986 << readGroup << endl;
990 ReadGroupResolver& resolver = (*rgIter).second;
992 // determine unique-ness of current alignment
993 const bool isCurrentMateUnique = ( al.MapQuality >= m_settings->MinimumMapQuality );
996 readNameIter = resolver.ReadNames.find(al.Name);
998 // if read name found (current alignment's mate already parsed)
999 if ( readNameIter != resolver.ReadNames.end() ) {
1001 // if both unique mates are unique, store read name & insert size for later
1002 const bool isStoredMateUnique = (*readNameIter).second;
1003 if ( isCurrentMateUnique && isStoredMateUnique ) {
1005 // save read name in temp file as candidates for later pair marking
1006 readNamesWriter.Write(readGroup, al.Name);
1008 // determine model type & store fragment length for stats calculation
1009 const uint16_t currentModelType = CalculateModelType(al);
1010 assert( currentModelType != ModelType::DUMMY_ID );
1011 resolver.Models[currentModelType].push_back( abs(al.InsertSize) );
1014 // unique or not, remove read name from map
1015 resolver.ReadNames.erase(readNameIter);
1018 // if read name not found, store new entry
1019 else resolver.ReadNames.insert( make_pair<string, bool>(al.Name, isCurrentMateUnique) );
1023 readNamesWriter.Close();
1026 // iterate back through read groups
1027 map<string, ReadGroupResolver>::iterator rgEnd = m_readGroups.end();
1028 for ( rgIter = m_readGroups.begin(); rgIter != rgEnd; ++rgIter ) {
1029 const string& name = (*rgIter).first;
1030 ReadGroupResolver& resolver = (*rgIter).second;
1032 // calculate acceptable orientation & insert sizes for this read group
1033 resolver.DetermineTopModels(name);
1035 // clear out left over read names
1036 // (these have mates that did not pass filters or were already removed as non-unique)
1037 resolver.ReadNames.clear();
1040 // if we get here, return success
1044 void ResolveTool::ResolveToolPrivate::ParseHeader(const SamHeader& header) {
1046 // iterate over header read groups, creating a 'resolver' for each
1047 SamReadGroupConstIterator rgIter = header.ReadGroups.ConstBegin();
1048 SamReadGroupConstIterator rgEnd = header.ReadGroups.ConstEnd();
1049 for ( ; rgIter != rgEnd; ++rgIter ) {
1050 const SamReadGroup& rg = (*rgIter);
1051 m_readGroups.insert( make_pair<string, ReadGroupResolver>(rg.ID, ReadGroupResolver()) );
1055 bool ResolveTool::ResolveToolPrivate::ReadStatsFile(void) {
1057 // skip if no filename provided
1058 if ( m_settings->StatsFilename.empty() )
1061 // attempt to open stats file
1062 ResolveTool::StatsFileReader statsReader;
1063 if ( !statsReader.Open(m_settings->StatsFilename) ) {
1064 cerr << "bamtools resolve ERROR - could not open stats file: "
1065 << m_settings->StatsFilename << " for reading" << endl;
1069 // attempt to read stats data
1070 if ( !statsReader.Read(m_settings, m_readGroups) ) {
1071 cerr << "bamtools resolve ERROR - could not parse stats file: "
1072 << m_settings->StatsFilename << " for data" << endl;
1080 void ResolveTool::ResolveToolPrivate::ResolveAlignment(BamAlignment& al) {
1082 // clear proper-pair flag
1083 al.SetIsProperPair(false);
1085 // quit check if alignment is not from paired-end read
1086 if ( !al.IsPaired() ) return;
1088 // quit check if either alignment or its mate are unmapped
1089 if ( !al.IsMapped() || !al.IsMateMapped() ) return;
1091 // quit check if alignment & its mate are on differenct references
1092 if ( al.RefID != al.MateRefID ) return;
1094 // quit check if map quality less than cutoff
1095 if ( al.MapQuality < m_settings->MinimumMapQuality ) return;
1097 // get read group from alignment
1098 // empty string if not found, this is OK - we handle empty read group case
1099 string readGroupName("");
1100 al.GetTag(READ_GROUP_TAG, readGroupName);
1102 // look up read group's 'resolver'
1103 map<string, ReadGroupResolver>::iterator rgIter = m_readGroups.find(readGroupName);
1104 if ( rgIter == m_readGroups.end() ) {
1105 cerr << "bamtools resolve ERROR - read group found that was not in header: "
1106 << readGroupName << endl;
1109 const ReadGroupResolver& resolver = (*rgIter).second;
1111 // quit check if pairs are not in proper orientation (can differ for each RG)
1112 if ( !resolver.IsValidOrientation(al) ) return;
1114 // quit check if pairs are not within "reasonable" distance (can differ for each RG)
1115 if ( !resolver.IsValidInsertSize(al) ) return;
1117 // quit check if alignment is not a "candidate proper pair"
1118 map<string, bool>::const_iterator readNameIter;
1119 readNameIter = resolver.ReadNames.find(al.Name);
1120 if ( readNameIter == resolver.ReadNames.end() )
1123 // if we get here, alignment is OK - set 'proper pair' flag
1124 al.SetIsProperPair(true);
1127 bool ResolveTool::ResolveToolPrivate::ResolvePairs(void) {
1129 // open file containing read names of candidate proper pairs
1130 ResolveTool::ReadNamesFileReader readNamesReader;
1131 if ( !readNamesReader.Open(m_settings->ReadNamesFilename) ) {
1132 cerr << "bamtools resolve ERROR: could not open (temp) inputput read names file: "
1133 << m_settings->ReadNamesFilename << endl;
1137 // parse read names (matching with corresponding read groups)
1138 if ( !readNamesReader.Read(m_readGroups) ) {
1139 cerr << "bamtools resolve ERROR: could not read candidate read names from file: "
1140 << m_settings->ReadNamesFilename << endl;
1141 readNamesReader.Close();
1145 // close read name file reader & delete temp file
1146 readNamesReader.Close();
1147 if ( remove(m_settings->ReadNamesFilename.c_str()) != 0 ) {
1148 cerr << "bamtools resolve WARNING: could not delete temp file: "
1149 << m_settings->ReadNamesFilename << endl;
1152 // open our BAM reader
1154 if ( !reader.Open(m_settings->InputBamFilename) ) {
1155 cerr << "bamtools resolve ERROR: could not open input BAM file: "
1156 << m_settings->InputBamFilename << endl;
1160 // retrieve header & reference dictionary info
1161 const SamHeader& header = reader.GetHeader();
1162 const RefVector& references = reader.GetReferenceData();
1164 // determine compression mode for BamWriter
1165 bool writeUncompressed = ( m_settings->OutputBamFilename == Options::StandardOut() &&
1166 !m_settings->IsForceCompression );
1167 BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
1168 if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed;
1172 writer.SetCompressionMode(compressionMode);
1173 if ( !writer.Open(m_settings->OutputBamFilename, header, references) ) {
1174 cerr << "bamtools resolve ERROR: could not open "
1175 << m_settings->OutputBamFilename << " for writing." << endl;
1180 // plow through alignments, setting/clearing 'proper pair' flag
1181 // and writing to new output BAM file
1183 while ( reader.GetNextAlignment(al) ) {
1184 ResolveAlignment(al);
1185 writer.SaveAlignment(al);
1188 // clean up & return success
1194 bool ResolveTool::ResolveToolPrivate::Run(void) {
1196 // verify that command line settings are acceptable
1197 vector<string> errors;
1198 if ( !CheckSettings(errors) ) {
1199 cerr << "bamtools resolve ERROR - invalid settings: " << endl;
1200 vector<string>::const_iterator errorIter = errors.begin();
1201 vector<string>::const_iterator errorEnd = errors.end();
1202 for ( ; errorIter != errorEnd; ++errorIter )
1203 cerr << (*errorIter) << endl;
1207 // initialize read group map with default (empty name) read group
1208 m_readGroups.insert( make_pair<string, ReadGroupResolver>("", ReadGroupResolver()) );
1210 // init readname filename
1211 // uses (adjusted) stats filename if provided (req'd for makeStats, markPairs modes; optional for twoPass)
1212 // else keep default filename
1213 if ( m_settings->HasStatsFile )
1214 m_settings->ReadNamesFilename = m_settings->StatsFilename + READNAME_FILE_SUFFIX;
1217 if ( m_settings->IsMakeStats ) {
1219 // generate stats data
1220 if ( !MakeStats() ) {
1221 cerr << "bamtools resolve ERROR - could not generate stats" << endl;
1225 // write stats to file
1226 if ( !WriteStatsFile() ) {
1227 cerr << "bamtools resolve ERROR - could not write stats file: "
1228 << m_settings->StatsFilename << endl;
1234 else if ( m_settings->IsMarkPairs ) {
1236 // read stats from file
1237 if ( !ReadStatsFile() ) {
1238 cerr << "bamtools resolve ERROR - could not read stats file: "
1239 << m_settings->StatsFilename << endl;
1243 // do paired-end resolution
1244 if ( !ResolvePairs() ) {
1245 cerr << "bamtools resolve ERROR - could not resolve pairs" << endl;
1253 // generate stats data
1254 if ( !MakeStats() ) {
1255 cerr << "bamtools resolve ERROR - could not generate stats" << endl;
1259 // if stats file requested
1260 if ( m_settings->HasStatsFile ) {
1262 // write stats to file
1263 // emit warning if write fails, but paired-end resolution should be allowed to proceed
1264 if ( !WriteStatsFile() )
1265 cerr << "bamtools resolve WARNING - could not write stats file: "
1266 << m_settings->StatsFilename << endl;
1269 // do paired-end resolution
1270 if ( !ResolvePairs() ) {
1271 cerr << "bamtools resolve ERROR - could not resolve pairs" << endl;
1280 bool ResolveTool::ResolveToolPrivate::WriteStatsFile(void) {
1282 // skip if no filename provided
1283 if ( m_settings->StatsFilename.empty() )
1286 // attempt to open stats file
1287 ResolveTool::StatsFileWriter statsWriter;
1288 if ( !statsWriter.Open(m_settings->StatsFilename) ) {
1289 cerr << "bamtools resolve ERROR - could not open stats file: "
1290 << m_settings->StatsFilename << " for writing" << endl;
1294 // attempt to write stats data
1295 if ( !statsWriter.Write(m_settings, m_readGroups) ) {
1296 cerr << "bamtools resolve ERROR - could not write stats file: "
1297 << m_settings->StatsFilename << " for data" << endl;
1305 // --------------------------------------------------------------------------
1306 // ResolveTool implementation
1308 ResolveTool::ResolveTool(void)
1310 , m_settings(new ResolveSettings)
1313 // set description texts
1314 const string programDescription = "resolves paired-end reads (marking the IsProperPair flag as needed)";
1315 const string programUsage = "<mode> [options] [-in <filename>] [-out <filename> | [-forceCompression] ] [-stats <filename>]";
1316 const string inputBamDescription = "the input BAM file(s)";
1317 const string outputBamDescription = "the output BAM file";
1318 const string statsFileDescription = "input/output stats file, depending on selected mode (see below). "
1319 "This file is human-readable, storing fragment length data generated per read group, as well as "
1320 "the options used to configure the -makeStats mode";
1321 const string forceCompressionDescription = "if results are sent to stdout (like when piping to another tool), "
1322 "default behavior is to leave output uncompressed."
1323 "Use this flag to override and force compression. This feature is disabled in -makeStats mode.";
1324 const string makeStatsDescription = "generates a fragment-length stats file from the input BAM. "
1325 "Data is written to file specified using the -stats option. "
1326 "MarkPairs Mode Settings are DISABLED.";
1327 const string markPairsDescription = "generates an output BAM with alignments marked with proper-pair status. "
1328 "Stats data is read from file specified using the -stats option. "
1329 "MakeStats Mode Settings are DISABLED";
1330 const string twoPassDescription = "combines the -makeStats & -markPairs modes into a single command. "
1331 "However, due to the two-pass nature of paired-end resolution, piping BAM data via stdin is DISABLED. "
1332 "You must supply an explicit input BAM file. Output BAM may be piped to stdout, however, if desired. "
1333 "All MakeStats & MarkPairs Mode Settings are available. "
1334 "The intermediate stats file is not necessary, but if the -stats options is used, then one will be generated. "
1335 "You may find this useful for documentation purposes.";
1336 const string minMapQualDescription = "minimum map quality. Used in -makeStats mode as a heuristic for determining a mate's "
1337 "uniqueness. Used in -markPairs mode as a filter for marking candidate proper pairs.";
1338 const string confidenceIntervalDescription = "confidence interval. Set min/max fragment lengths such that we capture "
1339 "this fraction of pairs";
1340 const string unusedModelThresholdDescription = "unused model threshold. The resolve tool considers 8 possible orientation models "
1341 "for pairs. The top 2 are selected for later use when actually marking alignments. This value determines the "
1342 "cutoff for marking a read group as ambiguous. Meaning that if the ratio of the number of alignments from bottom 6 models "
1343 "to the top 2 is greater than this threshold, then the read group is flagged as ambiguous. By default, NO alignments "
1344 "from ambiguous read groups will be marked as proper pairs. You may override this behavior with the -force option "
1345 "in -markPairs mode";
1346 const string forceMarkDescription = "forces all read groups to be marked according to their top 2 'orientation models'. "
1347 "When generating stats, the 2 (out of 8 possible) models with the most observations are chosen as the top models for each read group. "
1348 "If the remaining 6 models account for more than some threshold ([default=10%], see -umt), then the read group is marked as ambiguous. "
1349 "The default behavior is that for an ambiguous read group, NONE of its alignments are marked as proper-pairs. "
1350 "By setting this option, a read group's ambiguity flag will be ignored, and all of its alignments will be compared to the top 2 models.";
1352 // set program details
1353 Options::SetProgramInfo("bamtools resolve", programDescription, programUsage);
1355 // set up I/O options
1356 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
1357 Options::AddValueOption("-in", "BAM filename", inputBamDescription, "",
1358 m_settings->HasInputBamFile, m_settings->InputBamFilename,
1359 IO_Opts, Options::StandardIn());
1360 Options::AddValueOption("-out", "BAM filename", outputBamDescription, "",
1361 m_settings->HasOutputBamFile, m_settings->OutputBamFilename,
1362 IO_Opts, Options::StandardOut());
1363 Options::AddValueOption("-stats", "STATS filename", statsFileDescription, "",
1364 m_settings->HasStatsFile, m_settings->StatsFilename, IO_Opts);
1365 Options::AddOption("-forceCompression", forceCompressionDescription,
1366 m_settings->IsForceCompression, IO_Opts);
1368 OptionGroup* ModeOpts = Options::CreateOptionGroup("Resolve Modes (must select ONE of the following)");
1369 Options::AddOption("-makeStats", makeStatsDescription, m_settings->IsMakeStats, ModeOpts);
1370 Options::AddOption("-markPairs", markPairsDescription, m_settings->IsMarkPairs, ModeOpts);
1371 Options::AddOption("-twoPass", twoPassDescription, m_settings->IsTwoPass, ModeOpts);
1373 OptionGroup* GeneralOpts = Options::CreateOptionGroup("General Resolve Options (available in all modes)");
1374 Options::AddValueOption("-minMQ", "unsigned short", minMapQualDescription, "",
1375 m_settings->HasMinimumMapQuality, m_settings->MinimumMapQuality, GeneralOpts);
1377 OptionGroup* MakeStatsOpts = Options::CreateOptionGroup("MakeStats Mode Options (disabled in -markPairs mode)");
1378 Options::AddValueOption("-ci", "double", confidenceIntervalDescription, "",
1379 m_settings->HasConfidenceInterval, m_settings->ConfidenceInterval, MakeStatsOpts);
1380 Options::AddValueOption("-umt", "double", unusedModelThresholdDescription, "",
1381 m_settings->HasUnusedModelThreshold, m_settings->UnusedModelThreshold, MakeStatsOpts);
1383 OptionGroup* MarkPairsOpts = Options::CreateOptionGroup("MarkPairs Mode Options (disabled in -makeStats mode)");
1384 Options::AddOption("-force", forceMarkDescription, m_settings->HasForceMarkReadGroups, MarkPairsOpts);
1387 ResolveTool::~ResolveTool(void) {
1396 int ResolveTool::Help(void) {
1397 Options::DisplayHelp();
1401 int ResolveTool::Run(int argc, char* argv[]) {
1403 // parse command line arguments
1404 Options::Parse(argc, argv, 1);
1406 // initialize ResolveTool
1407 m_impl = new ResolveToolPrivate(m_settings);
1409 // run ResolveTool, return success/failure
1410 if ( m_impl->Run() )