]> git.donarmstrong.com Git - bamtools.git/blobdiff - src/toolkit/bamtools_sort.cpp
Fixed: sorting order lost during merge step of sort tool, if input BAM
[bamtools.git] / src / toolkit / bamtools_sort.cpp
index 83584a6c8bd9b258bca86a6b4094b2761ceddc82..e268bee748b7d8c726bec2a71ade943a871f7f5d 100644 (file)
@@ -1,92 +1,44 @@
 // ***************************************************************************
 // bamtools_sort.cpp (c) 2010 Derek Barnett, Erik Garrison
 // Marth Lab, Department of Biology, Boston College
-// All rights reserved.
 // ---------------------------------------------------------------------------
-// Last modified: 21 June 2010 (DB)
+// Last modified: 27 March 2012 (DB)
 // ---------------------------------------------------------------------------
-// Sorts an input BAM file (default by position) and stores in a new BAM file.
+// Sorts an input BAM file
 // ***************************************************************************
 
+#include "bamtools_sort.h"
+
+#include <api/SamConstants.h>
+#include <api/BamMultiReader.h>
+#include <api/BamWriter.h>
+#include <api/algorithms/Sort.h>
+#include <utils/bamtools_options.h>
+using namespace BamTools;
+using namespace BamTools::Algorithms;
+
 #include <cstdio>
 #include <algorithm>
 #include <iostream>
 #include <sstream>
 #include <string>
 #include <vector>
-
-#include "bamtools_sort.h"
-#include "bamtools_options.h"
-#include "BamReader.h"
-#include "BamMultiReader.h"
-#include "BamWriter.h"
-
 using namespace std;
-using namespace BamTools;
 
 namespace BamTools {
   
-    // defaults
-    //
-    // ** These defaults should be tweaked & 'optimized' per testing ** //
-    //    I say 'optimized' because each system will naturally perform
-    //    differently.  We will attempt to determine a sensible 
-    //    compromise that should perform well on average.
-    const unsigned int SORT_DEFAULT_MAX_BUFFER_COUNT  = 10000; // max numberOfAlignments for buffer
-    const unsigned int SORT_DEFAULT_MAX_BUFFER_MEMORY = 1024;  // Mb
-
-    // -----------------------------------
-    // comparison objects (for sorting) 
-
-    struct SortLessThanPosition {
-        bool operator() (const BamAlignment& lhs, const BamAlignment& rhs) {
-            if ( lhs.RefID != rhs.RefID )
-                return lhs.RefID < rhs.RefID;
-            else 
-                return lhs.Position < rhs.Position;
-        }
-    };
-    
-    struct SortLessThanName {
-        bool operator() (const BamAlignment& lhs, const BamAlignment& rhs) {
-            return lhs.Name < rhs.Name;
-        }
-    };
+// defaults
+//
+// ** These defaults should be tweaked & 'optimized' per testing ** //
+//
+//    I say 'optimized' because each system will naturally perform
+//    differently.  We will attempt to determine a sensible
+//    compromise that should perform well on average.
+const unsigned int SORT_DEFAULT_MAX_BUFFER_COUNT  = 500000;  // max numberOfAlignments for buffer
+const unsigned int SORT_DEFAULT_MAX_BUFFER_MEMORY = 1024;    // Mb
     
 } // namespace BamTools
 
-// ---------------------------------------------
-// SortToolPrivate declaration
-class SortTool::SortToolPrivate {
-      
-    // ctor & dtor
-    public:
-        SortToolPrivate(SortTool::SortSettings* settings);
-        ~SortToolPrivate(void);
-        
-    // 'public' interface
-    public:
-        bool Run(void);
-        
-    // internal methods
-    private:
-        void ClearBuffer(vector<BamAlignment>& buffer);
-        bool GenerateSortedRuns(void);
-        bool HandleBufferContents(vector<BamAlignment>& buffer);
-        bool MergeSortedRuns(void);
-        bool WriteTempFile(const vector<BamAlignment>& buffer, const string& tempFilename);
-        void SortBuffer(vector<BamAlignment>& buffer);
-        
-    // data members
-    private:
-        SortTool::SortSettings* m_settings;
-        string m_tempFilenameStub;
-        int m_numberOfRuns;
-        string m_headerText;
-        RefVector m_references;
-        vector<string> m_tempFilenames;
-};
-
 // ---------------------------------------------
 // SortSettings implementation
 
@@ -102,11 +54,11 @@ struct SortTool::SortSettings {
     // filenames
     string InputBamFilename;
     string OutputBamFilename;
-    
+
     // parameters
     unsigned int MaxBufferCount;
     unsigned int MaxBufferMemory;
-    
+
     // constructor
     SortSettings(void)
         : HasInputBamFilename(false)
@@ -118,62 +70,41 @@ struct SortTool::SortSettings {
         , OutputBamFilename(Options::StandardOut())
         , MaxBufferCount(SORT_DEFAULT_MAX_BUFFER_COUNT)
         , MaxBufferMemory(SORT_DEFAULT_MAX_BUFFER_MEMORY)
-    { } 
-};  
-
-// ---------------------------------------------
-// SortTool implementation
-
-SortTool::SortTool(void)
-    : AbstractTool()
-    , m_settings(new SortSettings)
-    , m_impl(0)
-{
-    // set program details
-    Options::SetProgramInfo("bamtools sort", "sorts a BAM file", "[-in <filename>] [-out <filename>]");
-    
-    // set up options 
-    OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
-    Options::AddValueOption("-in",  "BAM filename", "the input BAM file",  "", m_settings->HasInputBamFilename,  m_settings->InputBamFilename,  IO_Opts, Options::StandardIn());
-    Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputBamFilename, IO_Opts, Options::StandardOut());
-    
-    OptionGroup* SortOpts = Options::CreateOptionGroup("Sorting Methods");
-    Options::AddOption("-byname", "sort by alignment name", m_settings->IsSortingByName, SortOpts);
-    
-    OptionGroup* MemOpts = Options::CreateOptionGroup("Memory Settings");
-    Options::AddValueOption("-n",   "count", "max number of alignments per tempfile", "", m_settings->HasMaxBufferCount,  m_settings->MaxBufferCount,  MemOpts, SORT_DEFAULT_MAX_BUFFER_COUNT);
-    Options::AddValueOption("-mem", "Mb",    "max memory to use",                     "", m_settings->HasMaxBufferMemory, m_settings->MaxBufferMemory, MemOpts, SORT_DEFAULT_MAX_BUFFER_MEMORY);
-}
-
-SortTool::~SortTool(void) {
-    
-    delete m_settings;
-    m_settings = 0;
-    
-    delete m_impl;
-    m_impl = 0;
-}
-
-int SortTool::Help(void) {
-    Options::DisplayHelp();
-    return 0;
-}
-
-int SortTool::Run(int argc, char* argv[]) {
-  
-    // parse command line arguments
-    Options::Parse(argc, argv, 1);
-    
-    // run internal SortTool implementation, return success/fail
-    m_impl = new SortToolPrivate(m_settings);
-    
-    if ( m_impl->Run() ) return 0;
-    else return 1;
-}
+    { }
+};
 
 // ---------------------------------------------
 // SortToolPrivate implementation
 
+class SortTool::SortToolPrivate {
+      
+    // ctor & dtor
+    public:
+        SortToolPrivate(SortTool::SortSettings* settings);
+        ~SortToolPrivate(void) { }
+        
+    // 'public' interface
+    public:
+        bool Run(void);
+        
+    // internal methods
+    private:
+        bool CreateSortedTempFile(vector<BamAlignment>& buffer);
+        bool GenerateSortedRuns(void);
+        bool MergeSortedRuns(void);
+        bool WriteTempFile(const vector<BamAlignment>& buffer, const string& tempFilename);
+        void SortBuffer(vector<BamAlignment>& buffer);
+        
+    // data members
+    private:
+        SortTool::SortSettings* m_settings;
+        string m_tempFilenameStub;
+        int m_numberOfRuns;
+        string m_headerText;
+        RefVector m_references;
+        vector<string> m_tempFilenames;
+};
+
 // constructor
 SortTool::SortToolPrivate::SortToolPrivate(SortTool::SortSettings* settings) 
     : m_settings(settings)
@@ -183,52 +114,96 @@ SortTool::SortToolPrivate::SortToolPrivate(SortTool::SortSettings* settings)
     // that way multiple sort runs don't trip on each other's temp files
     if ( m_settings) {
         size_t extensionFound = m_settings->InputBamFilename.find(".bam");
-        if (extensionFound != string::npos )
+        if ( extensionFound != string::npos )
             m_tempFilenameStub = m_settings->InputBamFilename.substr(0,extensionFound);
         m_tempFilenameStub.append(".sort.temp.");
     }
 }
 
-// destructor
-SortTool::SortToolPrivate::~SortToolPrivate(void) { }
-
 // generates mutiple sorted temp BAM files from single unsorted BAM file
 bool SortTool::SortToolPrivate::GenerateSortedRuns(void) {
     
     // open input BAM file
-    BamReader inputReader;
-    inputReader.Open(m_settings->InputBamFilename);
+    BamReader reader;
+    if ( !reader.Open(m_settings->InputBamFilename) ) {
+        cerr << "bamtools sort ERROR: could not open " << m_settings->InputBamFilename
+             << " for reading... Aborting." << endl;
+        return false;
+    }
     
     // get basic data that will be shared by all temp/output files 
-    m_headerText = inputReader.GetHeaderText();
-    m_references = inputReader.GetReferenceData();
+    SamHeader header = reader.GetHeader();
+    if ( !header.HasVersion() )
+        header.Version = Constants::SAM_CURRENT_VERSION;
+    header.SortOrder = ( m_settings->IsSortingByName
+                       ? Constants::SAM_HD_SORTORDER_QUERYNAME
+                       : Constants::SAM_HD_SORTORDER_COORDINATE );
+    m_headerText = header.ToString();
+    m_references = reader.GetReferenceData();
     
     // set up alignments buffer
-    vector<BamAlignment> buffer;
-    buffer.reserve(m_settings->MaxBufferCount);
-    
-    // while data available
     BamAlignment al;
-    while ( inputReader.GetNextAlignmentCore(al)) {
-        
-        // store alignments in buffer
-        buffer.push_back(al);
-        
-        // if buffer is full, handle contents (sort & write to temp file)
-        if ( buffer.size() == m_settings->MaxBufferCount )
-            HandleBufferContents(buffer);
+    vector<BamAlignment> buffer;
+    buffer.reserve( (size_t)(m_settings->MaxBufferCount*1.1) );
+    bool bufferFull = false;
+
+    // if sorting by name, we need to generate full char data
+    // so can't use GetNextAlignmentCore()
+    if ( m_settings->IsSortingByName ) {
+
+        // iterate through file
+        while ( reader.GetNextAlignment(al)) {
+
+            // check buffer's usage
+            bufferFull = ( buffer.size() >= m_settings->MaxBufferCount );
+
+            // store alignments until buffer is "full"
+            if ( !bufferFull )
+                buffer.push_back(al);
+
+            // if buffer is "full"
+            else {
+                // so create a sorted temp file with current buffer contents
+                // then push "al" into fresh buffer
+                CreateSortedTempFile(buffer);
+                buffer.push_back(al);
+            }
+        }
     }
-    
-    // handle any remaining buffer contents
-    if ( buffer.size() > 0 )
-        HandleBufferContents(buffer);
+
+    // sorting by position, can take advantage of GNACore() speedup
+    else {
+
+        // iterate through file
+        while ( reader.GetNextAlignmentCore(al) ) {
+
+            // check buffer's usage
+            bufferFull = ( buffer.size() >= m_settings->MaxBufferCount );
+
+            // store alignments until buffer is "full"
+            if ( !bufferFull )
+                buffer.push_back(al);
+
+            // if buffer is "full"
+            else {
+                // create a sorted temp file with current buffer contents
+                // then push "al" into fresh buffer
+                CreateSortedTempFile(buffer);
+                buffer.push_back(al);
+            }
+        }
+    }
+
+    // handle any leftover buffer contents
+    if ( !buffer.empty() )
+        CreateSortedTempFile(buffer);
     
     // close reader & return success
-    inputReader.Close();
+    reader.Close();
     return true;
 }
 
-bool SortTool::SortToolPrivate::HandleBufferContents(vector<BamAlignment>& buffer ) {
+bool SortTool::SortToolPrivate::CreateSortedTempFile(vector<BamAlignment>& buffer) {
  
     // do sorting
     SortBuffer(buffer);
@@ -246,6 +221,7 @@ bool SortTool::SortToolPrivate::HandleBufferContents(vector<BamAlignment>& buffe
     ++m_numberOfRuns;
     
     // return success/fail of writing to temp file
+    // TODO: a failure returned here is not actually caught and handled anywhere
     return success;
 }
 
@@ -255,19 +231,27 @@ bool SortTool::SortToolPrivate::MergeSortedRuns(void) {
     // open up multi reader for all of our temp files
     // this might get broken up if we do a multi-pass system later ??
     BamMultiReader multiReader;
-    multiReader.Open(m_tempFilenames, false, true);
-    
+    if ( !multiReader.Open(m_tempFilenames) ) {
+        cerr << "bamtools sort ERROR: could not open BamMultiReader for merging temp files... Aborting."
+             << endl;
+        return false;
+    }
+
     // open writer for our completely sorted output BAM file
     BamWriter mergedWriter;
-    mergedWriter.Open(m_settings->OutputBamFilename, m_headerText, m_references);
+    if ( !mergedWriter.Open(m_settings->OutputBamFilename, m_headerText, m_references) ) {
+        cerr << "bamtools sort ERROR: could not open " << m_settings->OutputBamFilename
+             << " for writing... Aborting." << endl;
+        multiReader.Close();
+        return false;
+    }
     
     // while data available in temp files
     BamAlignment al;
-    while ( multiReader.GetNextAlignmentCore(al) ) {
+    while ( multiReader.GetNextAlignmentCore(al) )
         mergedWriter.SaveAlignment(al);
-    }
   
-    // close readers
+    // close files
     multiReader.Close();
     mergedWriter.Close();
     
@@ -279,6 +263,7 @@ bool SortTool::SortToolPrivate::MergeSortedRuns(void) {
         remove(tempFilename.c_str());
     }
   
+    // return success
     return true;
 }
 
@@ -299,17 +284,21 @@ void SortTool::SortToolPrivate::SortBuffer(vector<BamAlignment>& buffer) {
     
     // sort buffer by desired method
     if ( m_settings->IsSortingByName )
-        sort ( buffer.begin(), buffer.end(), SortLessThanName() );
-    else 
-        sort ( buffer.begin(), buffer.end(), SortLessThanPosition() );
+        std::stable_sort( buffer.begin(), buffer.end(), Sort::ByName() );
+    else
+        std::stable_sort( buffer.begin(), buffer.end(), Sort::ByPosition() );
 }
     
-    
-bool SortTool::SortToolPrivate::WriteTempFile(const vector<BamAlignment>& buffer, const string& tempFilename) {
-
+bool SortTool::SortToolPrivate::WriteTempFile(const vector<BamAlignment>& buffer,
+                                              const string& tempFilename)
+{
     // open temp file for writing
     BamWriter tempWriter;
-    tempWriter.Open(tempFilename, m_headerText, m_references);
+    if ( !tempWriter.Open(tempFilename, m_headerText, m_references) ) {
+        cerr << "bamtools sort ERROR: could not open " << tempFilename
+             << " for writing." << endl;
+        return false;
+    }
   
     // write data
     vector<BamAlignment>::const_iterator buffIter = buffer.begin();
@@ -322,4 +311,65 @@ bool SortTool::SortToolPrivate::WriteTempFile(const vector<BamAlignment>& buffer
     // close temp file & return success
     tempWriter.Close();
     return true;
-}
\ No newline at end of file
+}
+
+// ---------------------------------------------
+// SortTool implementation
+
+SortTool::SortTool(void)
+    : AbstractTool()
+    , m_settings(new SortSettings)
+    , m_impl(0)
+{
+    // set program details
+    Options::SetProgramInfo("bamtools sort", "sorts a BAM file", "[-in <filename>] [-out <filename>] [sortOptions]");
+
+    // set up options
+    OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
+    Options::AddValueOption("-in",  "BAM filename", "the input BAM file",  "",
+                            m_settings->HasInputBamFilename,  m_settings->InputBamFilename,
+                            IO_Opts, Options::StandardIn());
+    Options::AddValueOption("-out", "BAM filename", "the output BAM file", "",
+                            m_settings->HasOutputBamFilename, m_settings->OutputBamFilename,
+                            IO_Opts, Options::StandardOut());
+
+    OptionGroup* SortOpts = Options::CreateOptionGroup("Sorting Methods");
+    Options::AddOption("-byname", "sort by alignment name", m_settings->IsSortingByName, SortOpts);
+
+    OptionGroup* MemOpts = Options::CreateOptionGroup("Memory Settings");
+    Options::AddValueOption("-n",   "count", "max number of alignments per tempfile", "",
+                            m_settings->HasMaxBufferCount,  m_settings->MaxBufferCount,
+                            MemOpts, SORT_DEFAULT_MAX_BUFFER_COUNT);
+    Options::AddValueOption("-mem", "Mb", "max memory to use", "",
+                            m_settings->HasMaxBufferMemory, m_settings->MaxBufferMemory,
+                            MemOpts, SORT_DEFAULT_MAX_BUFFER_MEMORY);
+}
+
+SortTool::~SortTool(void) {
+
+    delete m_settings;
+    m_settings = 0;
+
+    delete m_impl;
+    m_impl = 0;
+}
+
+int SortTool::Help(void) {
+    Options::DisplayHelp();
+    return 0;
+}
+
+int SortTool::Run(int argc, char* argv[]) {
+
+    // parse command line arguments
+    Options::Parse(argc, argv, 1);
+
+    // initialize SortTool with settings
+    m_impl = new SortToolPrivate(m_settings);
+
+    // run SortTool, return success/fail
+    if ( m_impl->Run() )
+        return 0;
+    else
+        return 1;
+}