X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=src%2Ftoolkit%2Fbamtools_sort.cpp;h=e268bee748b7d8c726bec2a71ade943a871f7f5d;hb=76bb08a359f3974eee3e96327b965ccf7958ed1a;hp=83584a6c8bd9b258bca86a6b4094b2761ceddc82;hpb=0180d7256f5c781364b04559b39ed2db41585787;p=bamtools.git diff --git a/src/toolkit/bamtools_sort.cpp b/src/toolkit/bamtools_sort.cpp index 83584a6..e268bee 100644 --- a/src/toolkit/bamtools_sort.cpp +++ b/src/toolkit/bamtools_sort.cpp @@ -1,92 +1,44 @@ // *************************************************************************** // bamtools_sort.cpp (c) 2010 Derek Barnett, Erik Garrison // Marth Lab, Department of Biology, Boston College -// All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 21 June 2010 (DB) +// Last modified: 27 March 2012 (DB) // --------------------------------------------------------------------------- -// Sorts an input BAM file (default by position) and stores in a new BAM file. +// Sorts an input BAM file // *************************************************************************** +#include "bamtools_sort.h" + +#include +#include +#include +#include +#include +using namespace BamTools; +using namespace BamTools::Algorithms; + #include #include #include #include #include #include - -#include "bamtools_sort.h" -#include "bamtools_options.h" -#include "BamReader.h" -#include "BamMultiReader.h" -#include "BamWriter.h" - using namespace std; -using namespace BamTools; namespace BamTools { - // defaults - // - // ** These defaults should be tweaked & 'optimized' per testing ** // - // I say 'optimized' because each system will naturally perform - // differently. We will attempt to determine a sensible - // compromise that should perform well on average. - const unsigned int SORT_DEFAULT_MAX_BUFFER_COUNT = 10000; // max numberOfAlignments for buffer - const unsigned int SORT_DEFAULT_MAX_BUFFER_MEMORY = 1024; // Mb - - // ----------------------------------- - // comparison objects (for sorting) - - struct SortLessThanPosition { - bool operator() (const BamAlignment& lhs, const BamAlignment& rhs) { - if ( lhs.RefID != rhs.RefID ) - return lhs.RefID < rhs.RefID; - else - return lhs.Position < rhs.Position; - } - }; - - struct SortLessThanName { - bool operator() (const BamAlignment& lhs, const BamAlignment& rhs) { - return lhs.Name < rhs.Name; - } - }; +// defaults +// +// ** These defaults should be tweaked & 'optimized' per testing ** // +// +// I say 'optimized' because each system will naturally perform +// differently. We will attempt to determine a sensible +// compromise that should perform well on average. +const unsigned int SORT_DEFAULT_MAX_BUFFER_COUNT = 500000; // max numberOfAlignments for buffer +const unsigned int SORT_DEFAULT_MAX_BUFFER_MEMORY = 1024; // Mb } // namespace BamTools -// --------------------------------------------- -// SortToolPrivate declaration -class SortTool::SortToolPrivate { - - // ctor & dtor - public: - SortToolPrivate(SortTool::SortSettings* settings); - ~SortToolPrivate(void); - - // 'public' interface - public: - bool Run(void); - - // internal methods - private: - void ClearBuffer(vector& buffer); - bool GenerateSortedRuns(void); - bool HandleBufferContents(vector& buffer); - bool MergeSortedRuns(void); - bool WriteTempFile(const vector& buffer, const string& tempFilename); - void SortBuffer(vector& buffer); - - // data members - private: - SortTool::SortSettings* m_settings; - string m_tempFilenameStub; - int m_numberOfRuns; - string m_headerText; - RefVector m_references; - vector m_tempFilenames; -}; - // --------------------------------------------- // SortSettings implementation @@ -102,11 +54,11 @@ struct SortTool::SortSettings { // filenames string InputBamFilename; string OutputBamFilename; - + // parameters unsigned int MaxBufferCount; unsigned int MaxBufferMemory; - + // constructor SortSettings(void) : HasInputBamFilename(false) @@ -118,62 +70,41 @@ struct SortTool::SortSettings { , OutputBamFilename(Options::StandardOut()) , MaxBufferCount(SORT_DEFAULT_MAX_BUFFER_COUNT) , MaxBufferMemory(SORT_DEFAULT_MAX_BUFFER_MEMORY) - { } -}; - -// --------------------------------------------- -// SortTool implementation - -SortTool::SortTool(void) - : AbstractTool() - , m_settings(new SortSettings) - , m_impl(0) -{ - // set program details - Options::SetProgramInfo("bamtools sort", "sorts a BAM file", "[-in ] [-out ]"); - - // set up options - OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); - Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputBamFilename, m_settings->InputBamFilename, IO_Opts, Options::StandardIn()); - Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputBamFilename, IO_Opts, Options::StandardOut()); - - OptionGroup* SortOpts = Options::CreateOptionGroup("Sorting Methods"); - Options::AddOption("-byname", "sort by alignment name", m_settings->IsSortingByName, SortOpts); - - OptionGroup* MemOpts = Options::CreateOptionGroup("Memory Settings"); - Options::AddValueOption("-n", "count", "max number of alignments per tempfile", "", m_settings->HasMaxBufferCount, m_settings->MaxBufferCount, MemOpts, SORT_DEFAULT_MAX_BUFFER_COUNT); - Options::AddValueOption("-mem", "Mb", "max memory to use", "", m_settings->HasMaxBufferMemory, m_settings->MaxBufferMemory, MemOpts, SORT_DEFAULT_MAX_BUFFER_MEMORY); -} - -SortTool::~SortTool(void) { - - delete m_settings; - m_settings = 0; - - delete m_impl; - m_impl = 0; -} - -int SortTool::Help(void) { - Options::DisplayHelp(); - return 0; -} - -int SortTool::Run(int argc, char* argv[]) { - - // parse command line arguments - Options::Parse(argc, argv, 1); - - // run internal SortTool implementation, return success/fail - m_impl = new SortToolPrivate(m_settings); - - if ( m_impl->Run() ) return 0; - else return 1; -} + { } +}; // --------------------------------------------- // SortToolPrivate implementation +class SortTool::SortToolPrivate { + + // ctor & dtor + public: + SortToolPrivate(SortTool::SortSettings* settings); + ~SortToolPrivate(void) { } + + // 'public' interface + public: + bool Run(void); + + // internal methods + private: + bool CreateSortedTempFile(vector& buffer); + bool GenerateSortedRuns(void); + bool MergeSortedRuns(void); + bool WriteTempFile(const vector& buffer, const string& tempFilename); + void SortBuffer(vector& buffer); + + // data members + private: + SortTool::SortSettings* m_settings; + string m_tempFilenameStub; + int m_numberOfRuns; + string m_headerText; + RefVector m_references; + vector m_tempFilenames; +}; + // constructor SortTool::SortToolPrivate::SortToolPrivate(SortTool::SortSettings* settings) : m_settings(settings) @@ -183,52 +114,96 @@ SortTool::SortToolPrivate::SortToolPrivate(SortTool::SortSettings* settings) // that way multiple sort runs don't trip on each other's temp files if ( m_settings) { size_t extensionFound = m_settings->InputBamFilename.find(".bam"); - if (extensionFound != string::npos ) + if ( extensionFound != string::npos ) m_tempFilenameStub = m_settings->InputBamFilename.substr(0,extensionFound); m_tempFilenameStub.append(".sort.temp."); } } -// destructor -SortTool::SortToolPrivate::~SortToolPrivate(void) { } - // generates mutiple sorted temp BAM files from single unsorted BAM file bool SortTool::SortToolPrivate::GenerateSortedRuns(void) { // open input BAM file - BamReader inputReader; - inputReader.Open(m_settings->InputBamFilename); + BamReader reader; + if ( !reader.Open(m_settings->InputBamFilename) ) { + cerr << "bamtools sort ERROR: could not open " << m_settings->InputBamFilename + << " for reading... Aborting." << endl; + return false; + } // get basic data that will be shared by all temp/output files - m_headerText = inputReader.GetHeaderText(); - m_references = inputReader.GetReferenceData(); + SamHeader header = reader.GetHeader(); + if ( !header.HasVersion() ) + header.Version = Constants::SAM_CURRENT_VERSION; + header.SortOrder = ( m_settings->IsSortingByName + ? Constants::SAM_HD_SORTORDER_QUERYNAME + : Constants::SAM_HD_SORTORDER_COORDINATE ); + m_headerText = header.ToString(); + m_references = reader.GetReferenceData(); // set up alignments buffer - vector buffer; - buffer.reserve(m_settings->MaxBufferCount); - - // while data available BamAlignment al; - while ( inputReader.GetNextAlignmentCore(al)) { - - // store alignments in buffer - buffer.push_back(al); - - // if buffer is full, handle contents (sort & write to temp file) - if ( buffer.size() == m_settings->MaxBufferCount ) - HandleBufferContents(buffer); + vector buffer; + buffer.reserve( (size_t)(m_settings->MaxBufferCount*1.1) ); + bool bufferFull = false; + + // if sorting by name, we need to generate full char data + // so can't use GetNextAlignmentCore() + if ( m_settings->IsSortingByName ) { + + // iterate through file + while ( reader.GetNextAlignment(al)) { + + // check buffer's usage + bufferFull = ( buffer.size() >= m_settings->MaxBufferCount ); + + // store alignments until buffer is "full" + if ( !bufferFull ) + buffer.push_back(al); + + // if buffer is "full" + else { + // so create a sorted temp file with current buffer contents + // then push "al" into fresh buffer + CreateSortedTempFile(buffer); + buffer.push_back(al); + } + } } - - // handle any remaining buffer contents - if ( buffer.size() > 0 ) - HandleBufferContents(buffer); + + // sorting by position, can take advantage of GNACore() speedup + else { + + // iterate through file + while ( reader.GetNextAlignmentCore(al) ) { + + // check buffer's usage + bufferFull = ( buffer.size() >= m_settings->MaxBufferCount ); + + // store alignments until buffer is "full" + if ( !bufferFull ) + buffer.push_back(al); + + // if buffer is "full" + else { + // create a sorted temp file with current buffer contents + // then push "al" into fresh buffer + CreateSortedTempFile(buffer); + buffer.push_back(al); + } + } + } + + // handle any leftover buffer contents + if ( !buffer.empty() ) + CreateSortedTempFile(buffer); // close reader & return success - inputReader.Close(); + reader.Close(); return true; } -bool SortTool::SortToolPrivate::HandleBufferContents(vector& buffer ) { +bool SortTool::SortToolPrivate::CreateSortedTempFile(vector& buffer) { // do sorting SortBuffer(buffer); @@ -246,6 +221,7 @@ bool SortTool::SortToolPrivate::HandleBufferContents(vector& buffe ++m_numberOfRuns; // return success/fail of writing to temp file + // TODO: a failure returned here is not actually caught and handled anywhere return success; } @@ -255,19 +231,27 @@ bool SortTool::SortToolPrivate::MergeSortedRuns(void) { // open up multi reader for all of our temp files // this might get broken up if we do a multi-pass system later ?? BamMultiReader multiReader; - multiReader.Open(m_tempFilenames, false, true); - + if ( !multiReader.Open(m_tempFilenames) ) { + cerr << "bamtools sort ERROR: could not open BamMultiReader for merging temp files... Aborting." + << endl; + return false; + } + // open writer for our completely sorted output BAM file BamWriter mergedWriter; - mergedWriter.Open(m_settings->OutputBamFilename, m_headerText, m_references); + if ( !mergedWriter.Open(m_settings->OutputBamFilename, m_headerText, m_references) ) { + cerr << "bamtools sort ERROR: could not open " << m_settings->OutputBamFilename + << " for writing... Aborting." << endl; + multiReader.Close(); + return false; + } // while data available in temp files BamAlignment al; - while ( multiReader.GetNextAlignmentCore(al) ) { + while ( multiReader.GetNextAlignmentCore(al) ) mergedWriter.SaveAlignment(al); - } - // close readers + // close files multiReader.Close(); mergedWriter.Close(); @@ -279,6 +263,7 @@ bool SortTool::SortToolPrivate::MergeSortedRuns(void) { remove(tempFilename.c_str()); } + // return success return true; } @@ -299,17 +284,21 @@ void SortTool::SortToolPrivate::SortBuffer(vector& buffer) { // sort buffer by desired method if ( m_settings->IsSortingByName ) - sort ( buffer.begin(), buffer.end(), SortLessThanName() ); - else - sort ( buffer.begin(), buffer.end(), SortLessThanPosition() ); + std::stable_sort( buffer.begin(), buffer.end(), Sort::ByName() ); + else + std::stable_sort( buffer.begin(), buffer.end(), Sort::ByPosition() ); } - -bool SortTool::SortToolPrivate::WriteTempFile(const vector& buffer, const string& tempFilename) { - +bool SortTool::SortToolPrivate::WriteTempFile(const vector& buffer, + const string& tempFilename) +{ // open temp file for writing BamWriter tempWriter; - tempWriter.Open(tempFilename, m_headerText, m_references); + if ( !tempWriter.Open(tempFilename, m_headerText, m_references) ) { + cerr << "bamtools sort ERROR: could not open " << tempFilename + << " for writing." << endl; + return false; + } // write data vector::const_iterator buffIter = buffer.begin(); @@ -322,4 +311,65 @@ bool SortTool::SortToolPrivate::WriteTempFile(const vector& buffer // close temp file & return success tempWriter.Close(); return true; -} \ No newline at end of file +} + +// --------------------------------------------- +// SortTool implementation + +SortTool::SortTool(void) + : AbstractTool() + , m_settings(new SortSettings) + , m_impl(0) +{ + // set program details + Options::SetProgramInfo("bamtools sort", "sorts a BAM file", "[-in ] [-out ] [sortOptions]"); + + // set up options + OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); + Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", + m_settings->HasInputBamFilename, m_settings->InputBamFilename, + IO_Opts, Options::StandardIn()); + Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", + m_settings->HasOutputBamFilename, m_settings->OutputBamFilename, + IO_Opts, Options::StandardOut()); + + OptionGroup* SortOpts = Options::CreateOptionGroup("Sorting Methods"); + Options::AddOption("-byname", "sort by alignment name", m_settings->IsSortingByName, SortOpts); + + OptionGroup* MemOpts = Options::CreateOptionGroup("Memory Settings"); + Options::AddValueOption("-n", "count", "max number of alignments per tempfile", "", + m_settings->HasMaxBufferCount, m_settings->MaxBufferCount, + MemOpts, SORT_DEFAULT_MAX_BUFFER_COUNT); + Options::AddValueOption("-mem", "Mb", "max memory to use", "", + m_settings->HasMaxBufferMemory, m_settings->MaxBufferMemory, + MemOpts, SORT_DEFAULT_MAX_BUFFER_MEMORY); +} + +SortTool::~SortTool(void) { + + delete m_settings; + m_settings = 0; + + delete m_impl; + m_impl = 0; +} + +int SortTool::Help(void) { + Options::DisplayHelp(); + return 0; +} + +int SortTool::Run(int argc, char* argv[]) { + + // parse command line arguments + Options::Parse(argc, argv, 1); + + // initialize SortTool with settings + m_impl = new SortToolPrivate(m_settings); + + // run SortTool, return success/fail + if ( m_impl->Run() ) + return 0; + else + return 1; +}