From: Derek Date: Tue, 22 Jun 2010 02:23:32 +0000 (-0400) Subject: Rough implementation of sort tool. Generate lots of smaller sorted (STL sort with... X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=a9e3b2afa1a84d7d92bb54e5c99b6ff341c1328a;p=bamtools.git Rough implementation of sort tool. Generate lots of smaller sorted (STL sort with 'custom' compare fxn) temp files in one pass, using a specified buffer size. Uses BamMultiReader paired with BamWriter to re-merge all temp files back into a single output BAM, also in one pass. Some work remains on optimizing parameters (e.g. default buffer size), scalability (single pass generates lots of temp files), & parallelization to make tool more sophisticated & robust. --- diff --git a/bamtools_sort.cpp b/bamtools_sort.cpp index 67c38d9..816ac82 100644 --- a/bamtools_sort.cpp +++ b/bamtools_sort.cpp @@ -3,21 +3,89 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 26 May 2010 +// Last modified: 21 June 2010 (DB) // --------------------------------------------------------------------------- // Sorts an input BAM file (default by position) and stores in a new BAM file. // *************************************************************************** +#include +#include #include +#include #include +#include #include "bamtools_sort.h" #include "bamtools_options.h" #include "BamReader.h" +#include "BamMultiReader.h" +#include "BamWriter.h" using namespace std; using namespace BamTools; +namespace BamTools { + + // defaults + // + // ** These defaults should be tweaked & 'optimized' per testing ** // + // I say 'optimized' because each system will naturally perform + // differently. We will attempt to determine a sensible + // compromise that should perform well on average. 
+ const unsigned int SORT_DEFAULT_MAX_BUFFER_COUNT = 10000; // max numberOfAlignments for buffer + const unsigned int SORT_DEFAULT_MAX_BUFFER_MEMORY = 1024; // Mb + + // ----------------------------------- + // comparison objects (for sorting) + + struct SortLessThanPosition { + bool operator() (const BamAlignment& lhs, const BamAlignment& rhs) { + if ( lhs.RefID != rhs.RefID ) + return lhs.RefID < rhs.RefID; + else + return lhs.Position < rhs.Position; + } + }; + + struct SortLessThanName { + bool operator() (const BamAlignment& lhs, const BamAlignment& rhs) { + return lhs.Name < rhs.Name; + } + }; + +} // namespace BamTools + +// --------------------------------------------- +// SortToolPrivate declaration +class SortTool::SortToolPrivate { + + // ctor & dtor + public: + SortToolPrivate(SortTool::SortSettings* settings); + ~SortToolPrivate(void); + + // 'public' interface + public: + bool Run(void); + + // internal methods + private: + bool GenerateSortedRuns(void); + bool HandleBufferContents(vector& buffer); + bool MergeSortedRuns(void); + bool WriteTempFile(const vector& buffer, const string& tempFilename); + void SortBuffer(vector& buffer); + + // data members + private: + SortTool::SortSettings* m_settings; + string m_tempFilenameStub; + int m_numberOfRuns; + string m_headerText; + RefVector m_references; + vector m_tempFilenames; +}; + // --------------------------------------------- // SortSettings implementation @@ -25,19 +93,31 @@ struct SortTool::SortSettings { // flags bool HasInputBamFilename; + bool HasMaxBufferCount; + bool HasMaxBufferMemory; bool HasOutputBamFilename; + bool IsSortingByName; // filenames string InputBamFilename; string OutputBamFilename; + // parameters + unsigned int MaxBufferCount; + unsigned int MaxBufferMemory; + // constructor SortSettings(void) : HasInputBamFilename(false) + , HasMaxBufferCount(false) + , HasMaxBufferMemory(false) , HasOutputBamFilename(false) + , IsSortingByName(false) , 
InputBamFilename(Options::StandardIn()) , OutputBamFilename(Options::StandardOut()) - { } + , MaxBufferCount(SORT_DEFAULT_MAX_BUFFER_COUNT) + , MaxBufferMemory(SORT_DEFAULT_MAX_BUFFER_MEMORY) + { } }; // --------------------------------------------- @@ -46,6 +126,7 @@ struct SortTool::SortSettings { SortTool::SortTool(void) : AbstractTool() , m_settings(new SortSettings) + , m_impl(0) { // set program details Options::SetProgramInfo("bamtools sort", "sorts a BAM file", "[-in ] [-out ]"); @@ -54,11 +135,22 @@ SortTool::SortTool(void) OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputBamFilename, m_settings->InputBamFilename, IO_Opts, Options::StandardIn()); Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputBamFilename, IO_Opts, Options::StandardOut()); + + OptionGroup* SortOpts = Options::CreateOptionGroup("Sorting Methods"); + Options::AddOption("-byname", "sort by alignment name", m_settings->IsSortingByName, SortOpts); + + OptionGroup* MemOpts = Options::CreateOptionGroup("Memory Settings"); + Options::AddValueOption("-n", "count", "max number of alignments per tempfile", "", m_settings->HasMaxBufferCount, m_settings->MaxBufferCount, MemOpts, SORT_DEFAULT_MAX_BUFFER_COUNT); + Options::AddValueOption("-mem", "Mb", "max memory to use", "", m_settings->HasMaxBufferMemory, m_settings->MaxBufferMemory, MemOpts, SORT_DEFAULT_MAX_BUFFER_MEMORY); } SortTool::~SortTool(void) { + delete m_settings; m_settings = 0; + + delete m_impl; + m_impl = 0; } int SortTool::Help(void) { @@ -71,7 +163,154 @@ int SortTool::Run(int argc, char* argv[]) { // parse command line arguments Options::Parse(argc, argv, 1); + // run internal SortTool implementation, return success/fail + m_impl = new SortToolPrivate(m_settings); + + if ( m_impl->Run() ) return 0; + else return 1; +} + +// 
--------------------------------------------- +// SortToolPrivate implementation + +// constructor +SortTool::SortToolPrivate::SortToolPrivate(SortTool::SortSettings* settings) + : m_settings(settings) + , m_tempFilenameStub("bamtools.sort.temp.") + , m_numberOfRuns(0) +{ } + +// destructor +SortTool::SortToolPrivate::~SortToolPrivate(void) { } + +// generates multiple sorted temp BAM files from single unsorted BAM file +bool SortTool::SortToolPrivate::GenerateSortedRuns(void) { + + // open input BAM file + BamReader inputReader; + inputReader.Open(m_settings->InputBamFilename); + + // get basic data that will be shared by all temp/output files + m_headerText = inputReader.GetHeaderText(); + m_references = inputReader.GetReferenceData(); + + // set up alignments buffer + vector buffer; + buffer.reserve(m_settings->MaxBufferCount); + + // while data available + BamAlignment al; + while ( inputReader.GetNextAlignmentCore(al) ) { + + // store alignments in buffer + buffer.push_back(al); + + // if buffer is full, handle contents (sort & write to temp file) + if ( buffer.size() == m_settings->MaxBufferCount ) + HandleBufferContents(buffer); + } + + // handle any remaining buffer contents + if ( buffer.size() > 0 ) + HandleBufferContents(buffer); + + // close reader & return success + inputReader.Close(); + return true; +} + +bool SortTool::SortToolPrivate::HandleBufferContents(vector& buffer ) { + // do sorting + SortBuffer(buffer); + + // write sorted contents to temp file, store success/fail + stringstream tempStr; + tempStr << m_tempFilenameStub << m_numberOfRuns; + bool success = WriteTempFile( buffer, tempStr.str() ); - return 0; + // save temp filename for merging later + m_tempFilenames.push_back(tempStr.str()); + + // clear buffer contents & update run counter + buffer.clear(); + ++m_numberOfRuns; + + // return success/fail of writing to temp file + return success; } + +// merges sorted temp BAM files into single sorted output BAM file +bool
SortTool::SortToolPrivate::MergeSortedRuns(void) { + + // open up multi reader for all of our temp files + // this might get broken up if we do a multi-pass system later ?? + BamMultiReader multiReader; + multiReader.Open(m_tempFilenames, false, true); + + // open writer for our completely sorted output BAM file + BamWriter mergedWriter; + mergedWriter.Open(m_settings->OutputBamFilename, m_headerText, m_references); + + // while data available in temp files + BamAlignment al; + while ( multiReader.GetNextAlignment(al) ) { + mergedWriter.SaveAlignment(al); + } + + // close readers + multiReader.Close(); + mergedWriter.Close(); + + // delete all temp files + vector::const_iterator tempIter = m_tempFilenames.begin(); + vector::const_iterator tempEnd = m_tempFilenames.end(); + for ( ; tempIter != tempEnd; ++tempIter ) { + const string& tempFilename = (*tempIter); + remove(tempFilename.c_str()); + } + + return true; +} + +bool SortTool::SortToolPrivate::Run(void) { + + // this does a single pass, chunking up the input file into smaller sorted temp files, + // then write out using BamMultiReader to handle merging + + if ( GenerateSortedRuns() ) + return MergeSortedRuns(); + else + return false; +} + +void SortTool::SortToolPrivate::SortBuffer(vector& buffer) { + + // ** add further custom sort options later ?? 
** + + // sort buffer by desired method + if ( m_settings->IsSortingByName ) + sort ( buffer.begin(), buffer.end(), SortLessThanName() ); + else + sort ( buffer.begin(), buffer.end(), SortLessThanPosition() ); +} + + +bool SortTool::SortToolPrivate::WriteTempFile(const vector& buffer, const string& tempFilename) { + + // open temp file for writing + BamWriter tempWriter; + tempWriter.Open(tempFilename, m_headerText, m_references); + + // write data + vector::const_iterator buffIter = buffer.begin(); + vector::const_iterator buffEnd = buffer.end(); + for ( ; buffIter != buffEnd; ++buffIter ) { + const BamAlignment& al = (*buffIter); + tempWriter.SaveAlignment(al); + } + + // close temp file & return success + tempWriter.Close(); + return true; +} \ No newline at end of file diff --git a/bamtools_sort.h b/bamtools_sort.h index 7d2ac5a..0241b02 100644 --- a/bamtools_sort.h +++ b/bamtools_sort.h @@ -3,7 +3,7 @@ // Marth Lab, Department of Biology, Boston College // All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 1 June 2010 +// Last modified: 21 June 2010 (DB) // --------------------------------------------------------------------------- // Sorts a BAM file. // *************************************************************************** @@ -28,6 +28,9 @@ class SortTool : public AbstractTool { private: struct SortSettings; SortSettings* m_settings; + + struct SortToolPrivate; + SortToolPrivate* m_impl; }; } // namespace BamTools