]> git.donarmstrong.com Git - bamtools.git/blobdiff - src/toolkit/bamtools_sort.cpp
MultiReader (&MultiMerger) now using Algorithms::Sort objects
[bamtools.git] / src / toolkit / bamtools_sort.cpp
index 5cbeaba5f42a6f58ca5ff353a9a31ff5d8553d86..37a74e50e13275c687dbf7b9d2a123b9f7c038fc 100644 (file)
@@ -3,7 +3,7 @@
 // Marth Lab, Department of Biology, Boston College
 // All rights reserved.
 // ---------------------------------------------------------------------------
-// Last modified: 7 April 2011 (DB)
+// Last modified: 3 October 2011 (DB)
 // ---------------------------------------------------------------------------
 // Sorts an input BAM file
 // ***************************************************************************
@@ -106,9 +106,8 @@ class SortTool::SortToolPrivate {
         
     // internal methods
     private:
-        void ClearBuffer(vector<BamAlignment>& buffer);
+        bool CreateSortedTempFile(vector<BamAlignment>& buffer);
         bool GenerateSortedRuns(void);
-        bool HandleBufferContents(vector<BamAlignment>& buffer);
         bool MergeSortedRuns(void);
         bool WriteTempFile(const vector<BamAlignment>& buffer, const string& tempFilename);
         void SortBuffer(vector<BamAlignment>& buffer);
@@ -123,7 +122,6 @@ class SortTool::SortToolPrivate {
         vector<string> m_tempFilenames;
 };
 
-
 // constructor
 SortTool::SortToolPrivate::SortToolPrivate(SortTool::SortSettings* settings) 
     : m_settings(settings)
@@ -143,68 +141,100 @@ SortTool::SortToolPrivate::SortToolPrivate(SortTool::SortSettings* settings)
 bool SortTool::SortToolPrivate::GenerateSortedRuns(void) {
     
     // open input BAM file
-    BamReader inputReader;
-    if ( !inputReader.Open(m_settings->InputBamFilename) ) {
+    BamReader reader;
+    if ( !reader.Open(m_settings->InputBamFilename) ) {
         cerr << "bamtools sort ERROR: could not open " << m_settings->InputBamFilename
              << " for reading... Aborting." << endl;
         return false;
     }
     
     // get basic data that will be shared by all temp/output files 
-    SamHeader header = inputReader.GetHeader();
+    SamHeader header = reader.GetHeader();
     header.SortOrder = ( m_settings->IsSortingByName
                        ? Constants::SAM_HD_SORTORDER_QUERYNAME
                        : Constants::SAM_HD_SORTORDER_COORDINATE );
     m_headerText = header.ToString();
-    m_references = inputReader.GetReferenceData();
+    m_references = reader.GetReferenceData();
     
     // set up alignments buffer
     BamAlignment al;
     vector<BamAlignment> buffer;
-    buffer.reserve(m_settings->MaxBufferCount);
-    
+    buffer.reserve( (size_t)(m_settings->MaxBufferCount*1.1) );
+    bool bufferFull = false;
+
     // if sorting by name, we need to generate full char data
     // so can't use GetNextAlignmentCore()
     if ( m_settings->IsSortingByName ) {
 
         // iterate through file
-        while ( inputReader.GetNextAlignment(al)) {
-
-            // store alignments in buffer
-            buffer.push_back(al);
-
-            // if buffer is full, handle contents (sort & write to temp file)
-            if ( buffer.size() == m_settings->MaxBufferCount )
-                HandleBufferContents(buffer);
+        while ( reader.GetNextAlignment(al)) {
+
+            // check buffer's usage
+            bufferFull = ( buffer.size() >= m_settings->MaxBufferCount );
+
+            // store alignments until buffer is "full"
+            if ( !bufferFull )
+                buffer.push_back(al);
+
+            // if buffer is "full"
+            else {
+
+                // push any unmapped reads into buffer,
+                // don't want to split these into a separate temp file
+                if ( !al.IsMapped() )
+                    buffer.push_back(al);
+
+                // "al" is mapped, so create a sorted temp file with current buffer contents
+                // then push "al" into fresh buffer
+                else {
+                    CreateSortedTempFile(buffer);
+                    buffer.push_back(al);
+                }
+            }
         }
-
     }
 
     // sorting by position, can take advantage of GNACore() speedup
     else {
 
         // iterate through file
-        while ( inputReader.GetNextAlignmentCore(al) ) {
-
-            // store alignments in buffer
-            buffer.push_back(al);
-
-            // if buffer is full, handle contents (sort & write to temp file)
-            if ( buffer.size() == m_settings->MaxBufferCount )
-                HandleBufferContents(buffer);
+        while ( reader.GetNextAlignmentCore(al) ) {
+
+            // check buffer's usage
+            bufferFull = ( buffer.size() >= m_settings->MaxBufferCount );
+
+            // store alignments until buffer is "full"
+            if ( !bufferFull )
+                buffer.push_back(al);
+
+            // if buffer is "full"
+            else {
+
+                // push any unmapped reads into buffer,
+                // don't want to split these into a separate temp file
+                if ( !al.IsMapped() )
+                    buffer.push_back(al);
+
+                // "al" is mapped, so create a sorted temp file with current buffer contents
+                // then push "al" into fresh buffer
+                else {
+                    CreateSortedTempFile(buffer);
+                    buffer.push_back(al);
+                }
+            }
         }
     }
 
-    // handle any remaining buffer contents
-    if ( buffer.size() > 0 )
-        HandleBufferContents(buffer);
+    // handle any leftover buffer contents
+    if ( !buffer.empty() )
+        CreateSortedTempFile(buffer);
     
     // close reader & return success
-    inputReader.Close();
+    reader.Close();
     return true;
 }
 
-bool SortTool::SortToolPrivate::HandleBufferContents(vector<BamAlignment>& buffer ) {
+bool SortTool::SortToolPrivate::CreateSortedTempFile(vector<BamAlignment>& buffer) {
  
     // do sorting
     SortBuffer(buffer);
@@ -233,16 +263,11 @@ bool SortTool::SortToolPrivate::MergeSortedRuns(void) {
     // this might get broken up if we do a multi-pass system later ??
     BamMultiReader multiReader;
     if ( !multiReader.Open(m_tempFilenames) ) {
-        cerr << "bamtools sort ERROR: could not open BamMultiReader for merging temp files... Aborting." << endl;
+        cerr << "bamtools sort ERROR: could not open BamMultiReader for merging temp files... Aborting."
+             << endl;
         return false;
     }
 
-    // set sort order for merge
-    if ( m_settings->IsSortingByName )
-        multiReader.SetSortOrder(BamMultiReader::SortedByReadName);
-    else
-        multiReader.SetSortOrder(BamMultiReader::SortedByPosition);
-    
     // open writer for our completely sorted output BAM file
     BamWriter mergedWriter;
     if ( !mergedWriter.Open(m_settings->OutputBamFilename, m_headerText, m_references) ) {
@@ -257,7 +282,7 @@ bool SortTool::SortToolPrivate::MergeSortedRuns(void) {
     while ( multiReader.GetNextAlignmentCore(al) )
         mergedWriter.SaveAlignment(al);
   
-    // close readers
+    // close files
     multiReader.Close();
     mergedWriter.Close();
     
@@ -269,6 +294,7 @@ bool SortTool::SortToolPrivate::MergeSortedRuns(void) {
         remove(tempFilename.c_str());
     }
   
+    // return success
     return true;
 }
 
@@ -289,14 +315,14 @@ void SortTool::SortToolPrivate::SortBuffer(vector<BamAlignment>& buffer) {
     
     // sort buffer by desired method
     if ( m_settings->IsSortingByName )
-        sort ( buffer.begin(), buffer.end(), SortLessThanName() );
+        std::stable_sort ( buffer.begin(), buffer.end(), SortLessThanName() );
     else 
-        sort ( buffer.begin(), buffer.end(), SortLessThanPosition() );
+        std::stable_sort ( buffer.begin(), buffer.end(), SortLessThanPosition() );
 }
     
-    
-bool SortTool::SortToolPrivate::WriteTempFile(const vector<BamAlignment>& buffer, const string& tempFilename) {
-
+bool SortTool::SortToolPrivate::WriteTempFile(const vector<BamAlignment>& buffer,
+                                              const string& tempFilename)
+{
     // open temp file for writing
     BamWriter tempWriter;
     if ( !tempWriter.Open(tempFilename, m_headerText, m_references) ) {
@@ -331,15 +357,23 @@ SortTool::SortTool(void)
 
     // set up options
     OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
-    Options::AddValueOption("-in",  "BAM filename", "the input BAM file",  "", m_settings->HasInputBamFilename,  m_settings->InputBamFilename,  IO_Opts, Options::StandardIn());
-    Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputBamFilename, IO_Opts, Options::StandardOut());
+    Options::AddValueOption("-in",  "BAM filename", "the input BAM file",  "",
+                            m_settings->HasInputBamFilename,  m_settings->InputBamFilename,
+                            IO_Opts, Options::StandardIn());
+    Options::AddValueOption("-out", "BAM filename", "the output BAM file", "",
+                            m_settings->HasOutputBamFilename, m_settings->OutputBamFilename,
+                            IO_Opts, Options::StandardOut());
 
     OptionGroup* SortOpts = Options::CreateOptionGroup("Sorting Methods");
     Options::AddOption("-byname", "sort by alignment name", m_settings->IsSortingByName, SortOpts);
 
     OptionGroup* MemOpts = Options::CreateOptionGroup("Memory Settings");
-    Options::AddValueOption("-n",   "count", "max number of alignments per tempfile", "", m_settings->HasMaxBufferCount,  m_settings->MaxBufferCount,  MemOpts, SORT_DEFAULT_MAX_BUFFER_COUNT);
-    Options::AddValueOption("-mem", "Mb",    "max memory to use",                     "", m_settings->HasMaxBufferMemory, m_settings->MaxBufferMemory, MemOpts, SORT_DEFAULT_MAX_BUFFER_MEMORY);
+    Options::AddValueOption("-n",   "count", "max number of alignments per tempfile", "",
+                            m_settings->HasMaxBufferCount,  m_settings->MaxBufferCount,
+                            MemOpts, SORT_DEFAULT_MAX_BUFFER_COUNT);
+    Options::AddValueOption("-mem", "Mb", "max memory to use", "",
+                            m_settings->HasMaxBufferMemory, m_settings->MaxBufferMemory,
+                            MemOpts, SORT_DEFAULT_MAX_BUFFER_MEMORY);
 }
 
 SortTool::~SortTool(void) {