X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=src%2Ftoolkit%2Fbamtools_sort.cpp;h=e268bee748b7d8c726bec2a71ade943a871f7f5d;hb=76bb08a359f3974eee3e96327b965ccf7958ed1a;hp=5cbeaba5f42a6f58ca5ff353a9a31ff5d8553d86;hpb=a357e1a5853b5bd5a43ceb673269863afc54a08e;p=bamtools.git diff --git a/src/toolkit/bamtools_sort.cpp b/src/toolkit/bamtools_sort.cpp index 5cbeaba..e268bee 100644 --- a/src/toolkit/bamtools_sort.cpp +++ b/src/toolkit/bamtools_sort.cpp @@ -1,9 +1,8 @@ // *************************************************************************** // bamtools_sort.cpp (c) 2010 Derek Barnett, Erik Garrison // Marth Lab, Department of Biology, Boston College -// All rights reserved. // --------------------------------------------------------------------------- -// Last modified: 7 April 2011 (DB) +// Last modified: 27 March 2012 (DB) // --------------------------------------------------------------------------- // Sorts an input BAM file // *************************************************************************** @@ -13,8 +12,10 @@ #include #include #include +#include #include using namespace BamTools; +using namespace BamTools::Algorithms; #include #include @@ -35,24 +36,6 @@ namespace BamTools { // compromise that should perform well on average. const unsigned int SORT_DEFAULT_MAX_BUFFER_COUNT = 500000; // max numberOfAlignments for buffer const unsigned int SORT_DEFAULT_MAX_BUFFER_MEMORY = 1024; // Mb - -// ----------------------------------- -// comparison objects (for sorting) - -struct SortLessThanPosition { - bool operator() (const BamAlignment& lhs, const BamAlignment& rhs) { - if ( lhs.RefID != rhs.RefID ) - return lhs.RefID < rhs.RefID; - else - return lhs.Position < rhs.Position; - } -}; - -struct SortLessThanName { - bool operator() (const BamAlignment& lhs, const BamAlignment& rhs) { - return lhs.Name < rhs.Name; - } -}; } // namespace BamTools @@ -106,9 +89,8 @@ class SortTool::SortToolPrivate { // internal methods private: - void ClearBuffer(vector& buffer); + bool CreateSortedTempFile(vector& buffer); bool GenerateSortedRuns(void); - bool HandleBufferContents(vector& buffer); bool MergeSortedRuns(void); bool WriteTempFile(const vector& buffer, const string& tempFilename); void SortBuffer(vector& buffer); @@ -123,7 +105,6 @@ class SortTool::SortToolPrivate { vector m_tempFilenames; }; - // constructor SortTool::SortToolPrivate::SortToolPrivate(SortTool::SortSettings* settings) : m_settings(settings) @@ -133,7 +114,7 @@ SortTool::SortToolPrivate::SortToolPrivate(SortTool::SortSettings* settings) // that way multiple sort runs don't trip on each other's temp files if ( m_settings) { size_t extensionFound = m_settings->InputBamFilename.find(".bam"); - if (extensionFound != string::npos ) + if ( extensionFound != string::npos ) m_tempFilenameStub = m_settings->InputBamFilename.substr(0,extensionFound); m_tempFilenameStub.append(".sort.temp."); } @@ -143,68 +124,86 @@ SortTool::SortToolPrivate::SortToolPrivate(SortTool::SortSettings* settings) bool SortTool::SortToolPrivate::GenerateSortedRuns(void) { // open input BAM file - BamReader inputReader; - if ( !inputReader.Open(m_settings->InputBamFilename) ) { + BamReader reader; + if ( !reader.Open(m_settings->InputBamFilename) ) { cerr << "bamtools sort ERROR: could not open " << m_settings->InputBamFilename << " for reading... Aborting." << endl; return false; } // get basic data that will be shared by all temp/output files - SamHeader header = inputReader.GetHeader(); + SamHeader header = reader.GetHeader(); + if ( !header.HasVersion() ) + header.Version = Constants::SAM_CURRENT_VERSION; header.SortOrder = ( m_settings->IsSortingByName ? Constants::SAM_HD_SORTORDER_QUERYNAME : Constants::SAM_HD_SORTORDER_COORDINATE ); m_headerText = header.ToString(); - m_references = inputReader.GetReferenceData(); + m_references = reader.GetReferenceData(); // set up alignments buffer BamAlignment al; vector buffer; - buffer.reserve(m_settings->MaxBufferCount); - + buffer.reserve( (size_t)(m_settings->MaxBufferCount*1.1) ); + bool bufferFull = false; + // if sorting by name, we need to generate full char data // so can't use GetNextAlignmentCore() if ( m_settings->IsSortingByName ) { // iterate through file - while ( inputReader.GetNextAlignment(al)) { - - // store alignments in buffer - buffer.push_back(al); - - // if buffer is full, handle contents (sort & write to temp file) - if ( buffer.size() == m_settings->MaxBufferCount ) - HandleBufferContents(buffer); + while ( reader.GetNextAlignment(al)) { + + // check buffer's usage + bufferFull = ( buffer.size() >= m_settings->MaxBufferCount ); + + // store alignments until buffer is "full" + if ( !bufferFull ) + buffer.push_back(al); + + // if buffer is "full" + else { + // so create a sorted temp file with current buffer contents + // then push "al" into fresh buffer + CreateSortedTempFile(buffer); + buffer.push_back(al); + } } - } // sorting by position, can take advantage of GNACore() speedup else { // iterate through file - while ( inputReader.GetNextAlignmentCore(al) ) { - - // store alignments in buffer - buffer.push_back(al); - - // if buffer is full, handle contents (sort & write to temp file) - if ( buffer.size() == m_settings->MaxBufferCount ) - HandleBufferContents(buffer); + while ( reader.GetNextAlignmentCore(al) ) { + + // check buffer's usage + bufferFull = ( buffer.size() >= m_settings->MaxBufferCount ); + + // store alignments until buffer is "full" + if ( !bufferFull ) + buffer.push_back(al); + + // if buffer is "full" + else { + // create a sorted temp file with current buffer contents + // then push "al" into fresh buffer + CreateSortedTempFile(buffer); + buffer.push_back(al); + } } } - // handle any remaining buffer contents - if ( buffer.size() > 0 ) - HandleBufferContents(buffer); + // handle any leftover buffer contents + if ( !buffer.empty() ) + CreateSortedTempFile(buffer); // close reader & return success - inputReader.Close(); + reader.Close(); return true; } -bool SortTool::SortToolPrivate::HandleBufferContents(vector& buffer ) { +bool SortTool::SortToolPrivate::CreateSortedTempFile(vector& buffer) { // do sorting SortBuffer(buffer); @@ -233,16 +232,11 @@ bool SortTool::SortToolPrivate::MergeSortedRuns(void) { // this might get broken up if we do a multi-pass system later ?? BamMultiReader multiReader; if ( !multiReader.Open(m_tempFilenames) ) { - cerr << "bamtools sort ERROR: could not open BamMultiReader for merging temp files... Aborting." << endl; + cerr << "bamtools sort ERROR: could not open BamMultiReader for merging temp files... Aborting." + << endl; return false; } - // set sort order for merge - if ( m_settings->IsSortingByName ) - multiReader.SetSortOrder(BamMultiReader::SortedByReadName); - else - multiReader.SetSortOrder(BamMultiReader::SortedByPosition); - // open writer for our completely sorted output BAM file BamWriter mergedWriter; if ( !mergedWriter.Open(m_settings->OutputBamFilename, m_headerText, m_references) ) { @@ -257,7 +251,7 @@ bool SortTool::SortToolPrivate::MergeSortedRuns(void) { while ( multiReader.GetNextAlignmentCore(al) ) mergedWriter.SaveAlignment(al); - // close readers + // close files multiReader.Close(); mergedWriter.Close(); @@ -269,6 +263,7 @@ bool SortTool::SortToolPrivate::MergeSortedRuns(void) { remove(tempFilename.c_str()); } + // return success return true; } @@ -289,14 +284,14 @@ void SortTool::SortToolPrivate::SortBuffer(vector& buffer) { // sort buffer by desired method if ( m_settings->IsSortingByName ) - sort ( buffer.begin(), buffer.end(), SortLessThanName() ); - else - sort ( buffer.begin(), buffer.end(), SortLessThanPosition() ); + std::stable_sort( buffer.begin(), buffer.end(), Sort::ByName() ); + else + std::stable_sort( buffer.begin(), buffer.end(), Sort::ByPosition() ); } - -bool SortTool::SortToolPrivate::WriteTempFile(const vector& buffer, const string& tempFilename) { - +bool SortTool::SortToolPrivate::WriteTempFile(const vector& buffer, + const string& tempFilename) +{ // open temp file for writing BamWriter tempWriter; if ( !tempWriter.Open(tempFilename, m_headerText, m_references) ) { @@ -331,15 +326,23 @@ SortTool::SortTool(void) // set up options OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); - Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputBamFilename, m_settings->InputBamFilename, IO_Opts, Options::StandardIn()); - Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputBamFilename, IO_Opts, Options::StandardOut()); + Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", + m_settings->HasInputBamFilename, m_settings->InputBamFilename, + IO_Opts, Options::StandardIn()); + Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", + m_settings->HasOutputBamFilename, m_settings->OutputBamFilename, + IO_Opts, Options::StandardOut()); OptionGroup* SortOpts = Options::CreateOptionGroup("Sorting Methods"); Options::AddOption("-byname", "sort by alignment name", m_settings->IsSortingByName, SortOpts); OptionGroup* MemOpts = Options::CreateOptionGroup("Memory Settings"); - Options::AddValueOption("-n", "count", "max number of alignments per tempfile", "", m_settings->HasMaxBufferCount, m_settings->MaxBufferCount, MemOpts, SORT_DEFAULT_MAX_BUFFER_COUNT); - Options::AddValueOption("-mem", "Mb", "max memory to use", "", m_settings->HasMaxBufferMemory, m_settings->MaxBufferMemory, MemOpts, SORT_DEFAULT_MAX_BUFFER_MEMORY); + Options::AddValueOption("-n", "count", "max number of alignments per tempfile", "", + m_settings->HasMaxBufferCount, m_settings->MaxBufferCount, + MemOpts, SORT_DEFAULT_MAX_BUFFER_COUNT); + Options::AddValueOption("-mem", "Mb", "max memory to use", "", + m_settings->HasMaxBufferMemory, m_settings->MaxBufferMemory, + MemOpts, SORT_DEFAULT_MAX_BUFFER_MEMORY); } SortTool::~SortTool(void) {