1 // ***************************************************************************
2 // bamtools_sort.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 21 June 2010 (DB)
7 // ---------------------------------------------------------------------------
8 // Sorts an input BAM file (default by position) and stores in a new BAM file.
9 // ***************************************************************************
18 #include "bamtools_sort.h"
19 #include "bamtools_options.h"
20 #include "BamReader.h"
21 #include "BamMultiReader.h"
22 #include "BamWriter.h"
25 using namespace BamTools;
31 // ** These defaults should be tweaked & 'optimized' per testing ** //
32 // I say 'optimized' because each system will naturally perform
33 // differently. We will attempt to determine a sensible
34 // compromise that should perform well on average.
// Default buffer limits. They initialise SortSettings::MaxBufferCount /
// MaxBufferMemory and are also handed to the "-n" and "-mem" option
// registrations as their default values.
// NOTE(review): in the run-generation loop only the alignment-count limit is
// compared against the buffer size; confirm whether the memory limit is
// enforced anywhere.
35 const unsigned int SORT_DEFAULT_MAX_BUFFER_COUNT = 10000; // max numberOfAlignments for buffer
36 const unsigned int SORT_DEFAULT_MAX_BUFFER_MEMORY = 1024; // Mb
38 // -----------------------------------
39 // comparison objects (for sorting)
// Comparison functor handed to sort() by SortBuffer for position ordering.
// Alignments are ordered by RefID first; Position is consulted only when both
// alignments carry the same RefID.
// NOTE(review): operator() is not const-qualified, so it cannot be invoked
// through a const functor instance.
41 struct SortLessThanPosition {
42 bool operator() (const BamAlignment& lhs, const BamAlignment& rhs) {
// different references: the reference ID alone decides the order
43 if ( lhs.RefID != rhs.RefID )
44 return lhs.RefID < rhs.RefID;
// same reference: order by Position
46 return lhs.Position < rhs.Position;
// Comparison functor handed to sort() by SortBuffer when IsSortingByName is
// set. Orders alignments by applying operator< to their Name members.
// NOTE(review): GenerateSortedRuns fills the buffer via GetNextAlignmentCore();
// presumably that call leaves string fields such as Name unpopulated. If so,
// every comparison here sees empty names - confirm against the BamReader API.
50 struct SortLessThanName {
51 bool operator() (const BamAlignment& lhs, const BamAlignment& rhs) {
52 return lhs.Name < rhs.Name;
56 } // namespace BamTools
58 // ---------------------------------------------
59 // SortToolPrivate declaration
// Private implementation of the sort tool. Its Run() chunks the input into
// sorted temp BAM files (GenerateSortedRuns) and then merges them into the
// output file (MergeSortedRuns).
60 class SortTool::SortToolPrivate {
// stores the settings pointer and derives the temp-file name stub from the input filename
64 SortToolPrivate(SortTool::SortSettings* settings);
65 ~SortToolPrivate(void);
73 void ClearBuffer(vector<BamAlignment>& buffer);
// reads the input and emits one sorted temp file per full buffer (plus one for any remainder)
74 bool GenerateSortedRuns(void);
// writes the buffer to the next numbered temp file and records that filename
75 bool HandleBufferContents(vector<BamAlignment>& buffer);
// merges every recorded temp file into the output BAM, then removes the temp files
76 bool MergeSortedRuns(void);
// saves each alignment in the buffer to a BAM file named tempFilename
77 bool WriteTempFile(const vector<BamAlignment>& buffer, const string& tempFilename);
// sorts the buffer by name or by position according to the settings
78 void SortBuffer(vector<BamAlignment>& buffer);
// settings supplied by the owning SortTool; the destructor of this class does not delete it
82 SortTool::SortSettings* m_settings;
// temp filename prefix; the run counter is appended to form each temp filename
83 string m_tempFilenameStub;
// reference data read from the input file and passed to every writer
86 RefVector m_references;
// names of the temp files written so far, in creation order
87 vector<string> m_tempFilenames;
90 // ---------------------------------------------
91 // SortSettings implementation
// Holds the command-line state of the sort tool. Each field is registered with
// Options in the SortTool constructor.
93 struct SortTool::SortSettings {
// flags registered alongside the matching option
// (presumably set by the parser when the option is supplied - confirm in Options)
96 bool HasInputBamFilename;
97 bool HasMaxBufferCount;
98 bool HasMaxBufferMemory;
99 bool HasOutputBamFilename;
100 bool IsSortingByName;
// filenames registered for "-in" and "-out"
103 string InputBamFilename;
104 string OutputBamFilename;
// buffer limits registered for "-n" (alignment count) and "-mem" (Mb)
107 unsigned int MaxBufferCount;
108 unsigned int MaxBufferMemory;
// defaults: every flag false, I/O on Options::StandardIn()/StandardOut(),
// buffer limits taken from the SORT_DEFAULT_* constants
112 : HasInputBamFilename(false)
113 , HasMaxBufferCount(false)
114 , HasMaxBufferMemory(false)
115 , HasOutputBamFilename(false)
116 , IsSortingByName(false)
117 , InputBamFilename(Options::StandardIn())
118 , OutputBamFilename(Options::StandardOut())
119 , MaxBufferCount(SORT_DEFAULT_MAX_BUFFER_COUNT)
120 , MaxBufferMemory(SORT_DEFAULT_MAX_BUFFER_MEMORY)
124 // ---------------------------------------------
125 // SortTool implementation
// Builds the command-line interface for "bamtools sort". Allocates the
// settings object, sets the program description, and registers three option
// groups (input/output, sorting method, memory settings). Each option is
// registered against a field of *m_settings.
127 SortTool::SortTool(void)
129 , m_settings(new SortSettings)
132 // set program details
// NOTE(review): the usage string lists only -in/-out, although -byname, -n and -mem are registered below
133 Options::SetProgramInfo("bamtools sort", "sorts a BAM file", "[-in <filename>] [-out <filename>]");
// input/output filenames; defaults are Options::StandardIn() / Options::StandardOut()
136 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
137 Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputBamFilename, m_settings->InputBamFilename, IO_Opts, Options::StandardIn());
138 Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputBamFilename, IO_Opts, Options::StandardOut());
// sort order switch: "-byname" is tied to IsSortingByName
140 OptionGroup* SortOpts = Options::CreateOptionGroup("Sorting Methods");
141 Options::AddOption("-byname", "sort by alignment name", m_settings->IsSortingByName, SortOpts);
// buffer limits; defaults are the SORT_DEFAULT_* constants
143 OptionGroup* MemOpts = Options::CreateOptionGroup("Memory Settings");
144 Options::AddValueOption("-n", "count", "max number of alignments per tempfile", "", m_settings->HasMaxBufferCount, m_settings->MaxBufferCount, MemOpts, SORT_DEFAULT_MAX_BUFFER_COUNT);
145 Options::AddValueOption("-mem", "Mb", "max memory to use", "", m_settings->HasMaxBufferMemory, m_settings->MaxBufferMemory, MemOpts, SORT_DEFAULT_MAX_BUFFER_MEMORY);
// Destructor.
// NOTE(review): the constructor allocates m_settings with new and Run()
// allocates m_impl with new; verify that both are released here.
148 SortTool::~SortTool(void) {
// Help entry point for the tool: delegates to Options::DisplayHelp().
157 int SortTool::Help(void) {
158 Options::DisplayHelp();
// Tool entry point: parses the command line into the registered settings,
// creates the private implementation and runs it.
// Returns 0 when the implementation's Run() reports success.
162 int SortTool::Run(int argc, char* argv[]) {
164 // parse command line arguments
165 Options::Parse(argc, argv, 1);
167 // run internal SortTool implementation, return success/fail
// NOTE(review): m_impl is overwritten on every call without releasing a
// previous instance; a second Run() on the same tool would leak the first one.
168 m_impl = new SortToolPrivate(m_settings);
170 if ( m_impl->Run() ) return 0;
174 // ---------------------------------------------
175 // SortToolPrivate implementation
// Stores the settings pointer and builds the temp-file name stub:
// <input filename up to ".bam"> + ".sort.temp."
178 SortTool::SortToolPrivate::SortToolPrivate(SortTool::SortSettings* settings)
179 : m_settings(settings)
182 // set filename stub depending on inputfile path
183 // that way multiple sort runs don't trip on each other's temp files
// NOTE(review): find() locates the FIRST ".bam" in the whole path, so a
// directory or basename containing ".bam" earlier on truncates the stub there.
185 size_t extensionFound = m_settings->InputBamFilename.find(".bam");
186 if (extensionFound != string::npos )
187 m_tempFilenameStub = m_settings->InputBamFilename.substr(0,extensionFound);
// appended unconditionally: when the input name has no ".bam" the stub is only
// this suffix, so temp filenames then do not depend on the input filename
188 m_tempFilenameStub.append(".sort.temp.");
// Empty destructor: m_settings is not deleted here, so it stays with whoever created it.
193 SortTool::SortToolPrivate::~SortToolPrivate(void) { }
195 // generates multiple sorted temp BAM files from single unsorted BAM file
// Phase 1 of the sort. Reads the input BAM file alignment by alignment into a
// buffer. Each time the buffer holds MaxBufferCount alignments it is passed to
// HandleBufferContents(), as is any remainder once the input is exhausted.
// Also caches the input's header text and reference data for the writers.
196 bool SortTool::SortToolPrivate::GenerateSortedRuns(void) {
198 // open input BAM file
199 BamReader inputReader;
// NOTE(review): the result of Open() is discarded; an unreadable input is not detected at this point
200 inputReader.Open(m_settings->InputBamFilename);
202 // get basic data that will be shared by all temp/output files
203 m_headerText = inputReader.GetHeaderText();
204 m_references = inputReader.GetReferenceData();
206 // set up alignments buffer
207 vector<BamAlignment> buffer;
// allocate the buffer's full capacity up front
208 buffer.reserve(m_settings->MaxBufferCount);
210 // while data available
// NOTE(review): presumably the "Core" read skips string data such as Name;
// if so, -byname sorting compares unpopulated names - confirm against BamReader.
212 while ( inputReader.GetNextAlignmentCore(al)) {
214 // store alignments in buffer
215 buffer.push_back(al);
217 // if buffer is full, handle contents (sort & write to temp file)
// NOTE(review): equality test - with MaxBufferCount == 0 the size is already
// 1 after the first push_back, so this never fires and all input is buffered.
// The bool returned by HandleBufferContents() is discarded here.
218 if ( buffer.size() == m_settings->MaxBufferCount )
219 HandleBufferContents(buffer);
222 // handle any remaining buffer contents
// the bool returned by HandleBufferContents() is discarded here as well
223 if ( buffer.size() > 0 )
224 HandleBufferContents(buffer);
226 // close reader & return success
// Turns one buffer of alignments into one temp file. The filename is the stub
// followed by the current run counter. The filename is then recorded for the
// merge phase.
231 bool SortTool::SortToolPrivate::HandleBufferContents(vector<BamAlignment>& buffer ) {
236 // write sorted contents to temp file, store success/fail
// temp filename = m_tempFilenameStub + m_numberOfRuns
237 stringstream tempStr;
238 tempStr << m_tempFilenameStub << m_numberOfRuns;
239 bool success = WriteTempFile( buffer, tempStr.str() );
241 // save temp filename for merging later
// NOTE(review): the name is recorded whether or not the write succeeded
242 m_tempFilenames.push_back(tempStr.str());
244 // clear buffer contents & update run counter
248 // return success/fail of writing to temp file
252 // merges sorted temp BAM files into single sorted output BAM file
// Phase 2 of the sort. Opens all recorded temp files through a single
// BamMultiReader and copies every alignment it yields into the output BAM
// file. The output is written with the header text and reference data cached
// from the input. Afterwards the temp files are removed.
253 bool SortTool::SortToolPrivate::MergeSortedRuns(void) {
255 // open up multi reader for all of our temp files
256 // this might get broken up if we do a multi-pass system later ??
257 BamMultiReader multiReader;
// NOTE(review): the meaning of the two boolean arguments is defined by
// BamMultiReader::Open - confirm there. The result of Open() is discarded.
258 multiReader.Open(m_tempFilenames, false, true);
260 // open writer for our completely sorted output BAM file
261 BamWriter mergedWriter;
// result of Open() is discarded here too
262 mergedWriter.Open(m_settings->OutputBamFilename, m_headerText, m_references);
264 // while data available in temp files
// alignments are saved in exactly the order the multi reader yields them
266 while ( multiReader.GetNextAlignmentCore(al) ) {
267 mergedWriter.SaveAlignment(al);
272 mergedWriter.Close();
274 // delete all temp files
275 vector<string>::const_iterator tempIter = m_tempFilenames.begin();
276 vector<string>::const_iterator tempEnd = m_tempFilenames.end();
277 for ( ; tempIter != tempEnd; ++tempIter ) {
278 const string& tempFilename = (*tempIter);
// the result of remove() is ignored, so a temp file that cannot be deleted is left behind silently
279 remove(tempFilename.c_str());
// Two-phase driver. The merge is attempted only when run generation returned
// true, and in that case the merge result is what gets returned.
285 bool SortTool::SortToolPrivate::Run(void) {
287 // this does a single pass, chunking up the input file into smaller sorted temp files,
288 // then write out using BamMultiReader to handle merging
290 if ( GenerateSortedRuns() )
291 return MergeSortedRuns();
// Sorts the buffer in place. Uses SortLessThanName when the settings request
// name ordering; SortLessThanPosition is the position-ordering comparator.
// NOTE(review): sort() is presumably std::sort, which does not keep the
// relative order of alignments that compare equal - confirm this is acceptable.
296 void SortTool::SortToolPrivate::SortBuffer(vector<BamAlignment>& buffer) {
298 // ** add further custom sort options later ?? **
300 // sort buffer by desired method
301 if ( m_settings->IsSortingByName )
302 sort ( buffer.begin(), buffer.end(), SortLessThanName() );
304 sort ( buffer.begin(), buffer.end(), SortLessThanPosition() );
308 bool SortTool::SortToolPrivate::WriteTempFile(const vector<BamAlignment>& buffer, const string& tempFilename) {
310 // open temp file for writing
311 BamWriter tempWriter;
312 tempWriter.Open(tempFilename, m_headerText, m_references);
315 vector<BamAlignment>::const_iterator buffIter = buffer.begin();
316 vector<BamAlignment>::const_iterator buffEnd = buffer.end();
317 for ( ; buffIter != buffEnd; ++buffIter ) {
318 const BamAlignment& al = (*buffIter);
319 tempWriter.SaveAlignment(al);
322 // close temp file & return success