1 // ***************************************************************************
2 // bamtools_sort.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 27 March 2012 (DB)
6 // ---------------------------------------------------------------------------
7 // Sorts an input BAM file
8 // ***************************************************************************
10 #include "bamtools_sort.h"
12 #include <api/SamConstants.h>
13 #include <api/BamMultiReader.h>
14 #include <api/BamWriter.h>
15 #include <api/algorithms/Sort.h>
16 #include <utils/bamtools_options.h>
17 using namespace BamTools;
18 using namespace BamTools::Algorithms;
32 // ** These defaults should be tweaked & 'optimized' per testing ** //
34 // I say 'optimized' because each system will naturally perform
35 // differently. We will attempt to determine a sensible
36 // compromise that should perform well on average.
37 const unsigned int SORT_DEFAULT_MAX_BUFFER_COUNT = 500000; // max numberOfAlignments for buffer
38 const unsigned int SORT_DEFAULT_MAX_BUFFER_MEMORY = 1024; // Mb
40 } // namespace BamTools
42 // ---------------------------------------------
43 // SortSettings implementation
45 struct SortTool::SortSettings {
48 bool HasInputBamFilename;
49 bool HasMaxBufferCount;
50 bool HasMaxBufferMemory;
51 bool HasOutputBamFilename;
55 string InputBamFilename;
56 string OutputBamFilename;
59 unsigned int MaxBufferCount;
60 unsigned int MaxBufferMemory;
64 : HasInputBamFilename(false)
65 , HasMaxBufferCount(false)
66 , HasMaxBufferMemory(false)
67 , HasOutputBamFilename(false)
68 , IsSortingByName(false)
69 , InputBamFilename(Options::StandardIn())
70 , OutputBamFilename(Options::StandardOut())
71 , MaxBufferCount(SORT_DEFAULT_MAX_BUFFER_COUNT)
72 , MaxBufferMemory(SORT_DEFAULT_MAX_BUFFER_MEMORY)
76 // ---------------------------------------------
77 // SortToolPrivate implementation
79 class SortTool::SortToolPrivate {
83 SortToolPrivate(SortTool::SortSettings* settings);
84 ~SortToolPrivate(void) { }
92 bool CreateSortedTempFile(vector<BamAlignment>& buffer);
93 bool GenerateSortedRuns(void);
94 bool MergeSortedRuns(void);
95 bool WriteTempFile(const vector<BamAlignment>& buffer, const string& tempFilename);
96 void SortBuffer(vector<BamAlignment>& buffer);
100 SortTool::SortSettings* m_settings;
101 string m_tempFilenameStub;
104 RefVector m_references;
105 vector<string> m_tempFilenames;
109 SortTool::SortToolPrivate::SortToolPrivate(SortTool::SortSettings* settings)
110 : m_settings(settings)
113 // set filename stub depending on inputfile path
114 // that way multiple sort runs don't trip on each other's temp files
116 size_t extensionFound = m_settings->InputBamFilename.find(".bam");
117 if ( extensionFound != string::npos )
118 m_tempFilenameStub = m_settings->InputBamFilename.substr(0,extensionFound);
119 m_tempFilenameStub.append(".sort.temp.");
123 // generates mutiple sorted temp BAM files from single unsorted BAM file
124 bool SortTool::SortToolPrivate::GenerateSortedRuns(void) {
126 // open input BAM file
128 if ( !reader.Open(m_settings->InputBamFilename) ) {
129 cerr << "bamtools sort ERROR: could not open " << m_settings->InputBamFilename
130 << " for reading... Aborting." << endl;
134 // get basic data that will be shared by all temp/output files
135 SamHeader header = reader.GetHeader();
136 if ( !header.HasVersion() )
137 header.Version = Constants::SAM_CURRENT_VERSION;
138 header.SortOrder = ( m_settings->IsSortingByName
139 ? Constants::SAM_HD_SORTORDER_QUERYNAME
140 : Constants::SAM_HD_SORTORDER_COORDINATE );
141 m_headerText = header.ToString();
142 m_references = reader.GetReferenceData();
144 // set up alignments buffer
146 vector<BamAlignment> buffer;
147 buffer.reserve( (size_t)(m_settings->MaxBufferCount*1.1) );
148 bool bufferFull = false;
150 // if sorting by name, we need to generate full char data
151 // so can't use GetNextAlignmentCore()
152 if ( m_settings->IsSortingByName ) {
154 // iterate through file
155 while ( reader.GetNextAlignment(al)) {
157 // check buffer's usage
158 bufferFull = ( buffer.size() >= m_settings->MaxBufferCount );
160 // store alignments until buffer is "full"
162 buffer.push_back(al);
164 // if buffer is "full"
166 // so create a sorted temp file with current buffer contents
167 // then push "al" into fresh buffer
168 CreateSortedTempFile(buffer);
169 buffer.push_back(al);
174 // sorting by position, can take advantage of GNACore() speedup
177 // iterate through file
178 while ( reader.GetNextAlignmentCore(al) ) {
180 // check buffer's usage
181 bufferFull = ( buffer.size() >= m_settings->MaxBufferCount );
183 // store alignments until buffer is "full"
185 buffer.push_back(al);
187 // if buffer is "full"
189 // create a sorted temp file with current buffer contents
190 // then push "al" into fresh buffer
191 CreateSortedTempFile(buffer);
192 buffer.push_back(al);
197 // handle any leftover buffer contents
198 if ( !buffer.empty() )
199 CreateSortedTempFile(buffer);
201 // close reader & return success
206 bool SortTool::SortToolPrivate::CreateSortedTempFile(vector<BamAlignment>& buffer) {
211 // write sorted contents to temp file, store success/fail
212 stringstream tempStr;
213 tempStr << m_tempFilenameStub << m_numberOfRuns;
214 bool success = WriteTempFile( buffer, tempStr.str() );
216 // save temp filename for merging later
217 m_tempFilenames.push_back(tempStr.str());
219 // clear buffer contents & update run counter
223 // return success/fail of writing to temp file
224 // TODO: a failure returned here is not actually caught and handled anywhere
228 // merges sorted temp BAM files into single sorted output BAM file
229 bool SortTool::SortToolPrivate::MergeSortedRuns(void) {
231 // open up multi reader for all of our temp files
232 // this might get broken up if we do a multi-pass system later ??
233 BamMultiReader multiReader;
234 if ( !multiReader.Open(m_tempFilenames) ) {
235 cerr << "bamtools sort ERROR: could not open BamMultiReader for merging temp files... Aborting."
240 // open writer for our completely sorted output BAM file
241 BamWriter mergedWriter;
242 if ( !mergedWriter.Open(m_settings->OutputBamFilename, m_headerText, m_references) ) {
243 cerr << "bamtools sort ERROR: could not open " << m_settings->OutputBamFilename
244 << " for writing... Aborting." << endl;
249 // while data available in temp files
251 while ( multiReader.GetNextAlignmentCore(al) )
252 mergedWriter.SaveAlignment(al);
256 mergedWriter.Close();
258 // delete all temp files
259 vector<string>::const_iterator tempIter = m_tempFilenames.begin();
260 vector<string>::const_iterator tempEnd = m_tempFilenames.end();
261 for ( ; tempIter != tempEnd; ++tempIter ) {
262 const string& tempFilename = (*tempIter);
263 remove(tempFilename.c_str());
270 bool SortTool::SortToolPrivate::Run(void) {
272 // this does a single pass, chunking up the input file into smaller sorted temp files,
273 // then write out using BamMultiReader to handle merging
275 if ( GenerateSortedRuns() )
276 return MergeSortedRuns();
281 void SortTool::SortToolPrivate::SortBuffer(vector<BamAlignment>& buffer) {
283 // ** add further custom sort options later ?? **
285 // sort buffer by desired method
286 if ( m_settings->IsSortingByName )
287 std::stable_sort( buffer.begin(), buffer.end(), Sort::ByName() );
289 std::stable_sort( buffer.begin(), buffer.end(), Sort::ByPosition() );
292 bool SortTool::SortToolPrivate::WriteTempFile(const vector<BamAlignment>& buffer,
293 const string& tempFilename)
295 // open temp file for writing
296 BamWriter tempWriter;
297 if ( !tempWriter.Open(tempFilename, m_headerText, m_references) ) {
298 cerr << "bamtools sort ERROR: could not open " << tempFilename
299 << " for writing." << endl;
304 vector<BamAlignment>::const_iterator buffIter = buffer.begin();
305 vector<BamAlignment>::const_iterator buffEnd = buffer.end();
306 for ( ; buffIter != buffEnd; ++buffIter ) {
307 const BamAlignment& al = (*buffIter);
308 tempWriter.SaveAlignment(al);
311 // close temp file & return success
316 // ---------------------------------------------
317 // SortTool implementation
319 SortTool::SortTool(void)
321 , m_settings(new SortSettings)
324 // set program details
325 Options::SetProgramInfo("bamtools sort", "sorts a BAM file", "[-in <filename>] [-out <filename>] [sortOptions]");
328 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
329 Options::AddValueOption("-in", "BAM filename", "the input BAM file", "",
330 m_settings->HasInputBamFilename, m_settings->InputBamFilename,
331 IO_Opts, Options::StandardIn());
332 Options::AddValueOption("-out", "BAM filename", "the output BAM file", "",
333 m_settings->HasOutputBamFilename, m_settings->OutputBamFilename,
334 IO_Opts, Options::StandardOut());
336 OptionGroup* SortOpts = Options::CreateOptionGroup("Sorting Methods");
337 Options::AddOption("-byname", "sort by alignment name", m_settings->IsSortingByName, SortOpts);
339 OptionGroup* MemOpts = Options::CreateOptionGroup("Memory Settings");
340 Options::AddValueOption("-n", "count", "max number of alignments per tempfile", "",
341 m_settings->HasMaxBufferCount, m_settings->MaxBufferCount,
342 MemOpts, SORT_DEFAULT_MAX_BUFFER_COUNT);
343 Options::AddValueOption("-mem", "Mb", "max memory to use", "",
344 m_settings->HasMaxBufferMemory, m_settings->MaxBufferMemory,
345 MemOpts, SORT_DEFAULT_MAX_BUFFER_MEMORY);
348 SortTool::~SortTool(void) {
357 int SortTool::Help(void) {
358 Options::DisplayHelp();
362 int SortTool::Run(int argc, char* argv[]) {
364 // parse command line arguments
365 Options::Parse(argc, argv, 1);
367 // initialize SortTool with settings
368 m_impl = new SortToolPrivate(m_settings);
370 // run SortTool, return success/fail