1 // ***************************************************************************
2 // bamtools_sort.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 21 March 2011 (DB)
7 // ---------------------------------------------------------------------------
8 // Sorts an input BAM file (default by position) and stores in a new BAM file.
9 // ***************************************************************************
11 #include "bamtools_sort.h"
13 #include <api/SamConstants.h>
14 #include <api/BamMultiReader.h>
15 #include <api/BamWriter.h>
16 #include <utils/bamtools_options.h>
17 using namespace BamTools;
31 // ** These defaults should be tweaked & 'optimized' per testing ** //
33 // I say 'optimized' because each system will naturally perform
34 // differently. We will attempt to determine a sensible
35 // compromise that should perform well on average.
// Default cap on the number of alignments held in the in-memory buffer before
// it is sorted and flushed to a temp file; also the default for the "-n" option.
36 const unsigned int SORT_DEFAULT_MAX_BUFFER_COUNT = 500000; // max numberOfAlignments for buffer
// Default memory cap, in Mb; the default for the "-mem" option.
// NOTE(review): no code visible in this file reads the memory setting when
// sizing the buffer — confirm whether it is enforced elsewhere.
37 const unsigned int SORT_DEFAULT_MAX_BUFFER_MEMORY = 1024; // Mb
39 // -----------------------------------
40 // comparison objects (for sorting)
42 struct SortLessThanPosition {
43 bool operator() (const BamAlignment& lhs, const BamAlignment& rhs) {
44 if ( lhs.RefID != rhs.RefID )
45 return lhs.RefID < rhs.RefID;
47 return lhs.Position < rhs.Position;
51 struct SortLessThanName {
52 bool operator() (const BamAlignment& lhs, const BamAlignment& rhs) {
53 return lhs.Name < rhs.Name;
57 } // namespace BamTools
59 // ---------------------------------------------
60 // SortToolPrivate declaration
// Private implementation class for SortTool. SortTool::Run() constructs one
// with the tool's settings and then calls its Run().
// NOTE(review): access specifiers and some members used by the method
// definitions in this file (Run, m_numberOfRuns, m_headerText) are not visible
// in this excerpt — confirm against the full declaration.
61 class SortTool::SortToolPrivate {
// stores the settings pointer it is given; the settings are not copied
65 SortToolPrivate(SortTool::SortSettings* settings);
66 ~SortToolPrivate(void);
// helpers for the two phases: building sorted temp files, then merging them
74 void ClearBuffer(vector<BamAlignment>& buffer);
75 bool GenerateSortedRuns(void);
76 bool HandleBufferContents(vector<BamAlignment>& buffer);
77 bool MergeSortedRuns(void);
78 bool WriteTempFile(const vector<BamAlignment>& buffer, const string& tempFilename);
79 void SortBuffer(vector<BamAlignment>& buffer);
// settings pointer received in the constructor
83 SortTool::SortSettings* m_settings;
// prefix for temp filenames; a run number is appended to it per temp file
84 string m_tempFilenameStub;
// reference data read from the input file, passed to every writer
87 RefVector m_references;
// names of all temp files written so far; opened together for the merge
88 vector<string> m_tempFilenames;
91 // ---------------------------------------------
92 // SortSettings implementation
// Command-line settings for the sort tool. The flag and value fields are
// passed to Options::AddOption / Options::AddValueOption in the SortTool
// constructor.
94 struct SortTool::SortSettings {
// flags
97 bool HasInputBamFilename;
98 bool HasMaxBufferCount;
99 bool HasMaxBufferMemory;
100 bool HasOutputBamFilename;
// selects name ordering instead of position ordering
101 bool IsSortingByName;
// filenames
104 string InputBamFilename;
105 string OutputBamFilename;
// buffer limits
108 unsigned int MaxBufferCount;
109 unsigned int MaxBufferMemory;
// initializer list: all flags start false, filenames default to
// Options::StandardIn() / Options::StandardOut(), and the buffer limits
// default to the SORT_DEFAULT_* constants
// NOTE(review): the constructor header line is not visible in this excerpt
113 : HasInputBamFilename(false)
114 , HasMaxBufferCount(false)
115 , HasMaxBufferMemory(false)
116 , HasOutputBamFilename(false)
117 , IsSortingByName(false)
118 , InputBamFilename(Options::StandardIn())
119 , OutputBamFilename(Options::StandardOut())
120 , MaxBufferCount(SORT_DEFAULT_MAX_BUFFER_COUNT)
121 , MaxBufferMemory(SORT_DEFAULT_MAX_BUFFER_MEMORY)
125 // ---------------------------------------------
126 // SortTool implementation
// Constructs the tool: allocates the settings object and registers the
// program description and all command-line options with Options.
// NOTE(review): parts of the initializer list are not visible in this excerpt.
128 SortTool::SortTool(void)
// settings object is allocated with new here
130 , m_settings(new SortSettings)
133 // set program details
134 Options::SetProgramInfo("bamtools sort", "sorts a BAM file", "[-in <filename>] [-out <filename>] [sortOptions]");
// input/output filename options; defaults are Options::StandardIn()/StandardOut()
137 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
138 Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputBamFilename, m_settings->InputBamFilename, IO_Opts, Options::StandardIn());
139 Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputBamFilename, IO_Opts, Options::StandardOut());
// sort-method switch: "-byname" is tied to IsSortingByName
141 OptionGroup* SortOpts = Options::CreateOptionGroup("Sorting Methods");
142 Options::AddOption("-byname", "sort by alignment name", m_settings->IsSortingByName, SortOpts);
// buffer-limit options; defaults are the SORT_DEFAULT_* constants
144 OptionGroup* MemOpts = Options::CreateOptionGroup("Memory Settings");
145 Options::AddValueOption("-n", "count", "max number of alignments per tempfile", "", m_settings->HasMaxBufferCount, m_settings->MaxBufferCount, MemOpts, SORT_DEFAULT_MAX_BUFFER_COUNT);
146 Options::AddValueOption("-mem", "Mb", "max memory to use", "", m_settings->HasMaxBufferMemory, m_settings->MaxBufferMemory, MemOpts, SORT_DEFAULT_MAX_BUFFER_MEMORY);
// Destructor.
// NOTE(review): the body is not visible in this excerpt. m_settings (in the
// constructor) and m_impl (in Run) are both allocated with new — confirm that
// both are released here.
149 SortTool::~SortTool(void) {
// Help entry point for the tool: delegates to Options::DisplayHelp().
// NOTE(review): the return statement is not visible in this excerpt.
158 int SortTool::Help(void) {
159 Options::DisplayHelp();
// Parses the command line, then runs the sort through a SortToolPrivate.
// Returns 0 when the implementation's Run() reports success.
// NOTE(review): the failure return path is not visible in this excerpt.
163 int SortTool::Run(int argc, char* argv[]) {
165 // parse command line arguments
166 Options::Parse(argc, argv, 1);
168 // run internal SortTool implementation, return success/fail
// NOTE(review): a new SortToolPrivate is allocated on every call and assigned
// to m_impl without releasing a previous one — confirm Run() is invoked at
// most once per SortTool.
169 m_impl = new SortToolPrivate(m_settings);
171 if ( m_impl->Run() ) return 0;
175 // ---------------------------------------------
176 // SortToolPrivate implementation
// Stores the settings pointer and derives the temp-file name stub from the
// input filename.
// NOTE(review): parts of the initializer list and body are not visible in
// this excerpt.
179 SortTool::SortToolPrivate::SortToolPrivate(SortTool::SortSettings* settings)
180 : m_settings(settings)
183 // set filename stub depending on inputfile path
184 // that way multiple sort runs don't trip on each other's temp files
// find() returns the FIRST occurrence of ".bam", so a path containing ".bam"
// before its final extension is cut at that earlier match.
// NOTE(review): consider rfind() if only a trailing extension should be stripped.
186 size_t extensionFound = m_settings->InputBamFilename.find(".bam");
187 if (extensionFound != string::npos )
188 m_tempFilenameStub = m_settings->InputBamFilename.substr(0,extensionFound);
// the suffix is appended whether or not ".bam" was found
189 m_tempFilenameStub.append(".sort.temp.");
// Empty destructor: the settings pointer is not deleted here.
194 SortTool::SortToolPrivate::~SortToolPrivate(void) { }
196 // generates multiple sorted temp BAM files from single unsorted BAM file
// Reads the input BAM file in chunks of at most MaxBufferCount alignments and
// passes each full chunk (and any final partial chunk) to HandleBufferContents.
// Also captures the header text (with its sort order updated) and the
// reference data for use by the temp-file and output writers.
// NOTE(review): the declaration of 'al', the else branch, the early return
// after the open failure, and the final close/return are not visible in this
// excerpt.
197 bool SortTool::SortToolPrivate::GenerateSortedRuns(void) {
199 // open input BAM file
200 BamReader inputReader;
201 if ( !inputReader.Open(m_settings->InputBamFilename) ) {
202 cerr << "bamtools sort ERROR: could not open " << m_settings->InputBamFilename
203 << " for reading... Aborting." << endl;
207 // get basic data that will be shared by all temp/output files
208 SamHeader header = inputReader.GetHeader();
// the header's sort order is overwritten to match the requested sort method
209 header.SortOrder = ( m_settings->IsSortingByName
210 ? Constants::SAM_HD_SORTORDER_QUERYNAME
211 : Constants::SAM_HD_SORTORDER_COORDINATE );
212 m_headerText = header.ToString();
213 m_references = inputReader.GetReferenceData();
215 // set up alignments buffer
217 vector<BamAlignment> buffer;
// capacity for a full chunk is requested up front
218 buffer.reserve(m_settings->MaxBufferCount);
220 // if sorting by name, we need to generate full char data
221 // so can't use GetNextAlignmentCore()
222 if ( m_settings->IsSortingByName ) {
224 // iterate through file
225 while ( inputReader.GetNextAlignment(al)) {
227 // store alignments in buffer
228 buffer.push_back(al);
230 // if buffer is full, handle contents (sort & write to temp file)
// NOTE(review): the test is '==' and runs after push_back, so a
// MaxBufferCount of 0 never matches and the buffer is only flushed at the end.
// The bool result of HandleBufferContents is discarded here.
231 if ( buffer.size() == m_settings->MaxBufferCount )
232 HandleBufferContents(buffer);
237 // sorting by position, can take advantage of GNACore() speedup
240 // iterate through file
241 while ( inputReader.GetNextAlignmentCore(al) ) {
243 // store alignments in buffer
244 buffer.push_back(al);
246 // if buffer is full, handle contents (sort & write to temp file)
// same flush test as the by-name loop; the result is discarded here as well
247 if ( buffer.size() == m_settings->MaxBufferCount )
248 HandleBufferContents(buffer);
252 // handle any remaining buffer contents
253 if ( buffer.size() > 0 )
254 HandleBufferContents(buffer);
256 // close reader & return success
// Writes the buffer to a new temp file and records that file's name for the
// later merge.
// NOTE(review): the sort call, the buffer clear, the run-counter increment and
// the return statement referred to by the comments are not visible in this
// excerpt.
261 bool SortTool::SortToolPrivate::HandleBufferContents(vector<BamAlignment>& buffer ) {
266 // write sorted contents to temp file, store success/fail
// temp filename = filename stub followed by the current run number
267 stringstream tempStr;
268 tempStr << m_tempFilenameStub << m_numberOfRuns;
269 bool success = WriteTempFile( buffer, tempStr.str() );
271 // save temp filename for merging later
// NOTE(review): the name is recorded even when 'success' is false, so the
// merge step will still try to open a temp file that failed to write.
272 m_tempFilenames.push_back(tempStr.str());
274 // clear buffer contents & update run counter
278 // return success/fail of writing to temp file
279 // TODO: a failure returned here is not actually caught and handled anywhere
283 // merges sorted temp BAM files into single sorted output BAM file
// Opens every recorded temp file through one BamMultiReader, streams the
// alignments into the output BAM file, then deletes the temp files.
// NOTE(review): the early returns after the error messages, the 'else', the
// declaration of 'al' and the final return are not visible in this excerpt.
284 bool SortTool::SortToolPrivate::MergeSortedRuns(void) {
286 // open up multi reader for all of our temp files
287 // this might get broken up if we do a multi-pass system later ??
288 BamMultiReader multiReader;
289 if ( !multiReader.Open(m_tempFilenames) ) {
290 cerr << "bamtools sort ERROR: could not open BamMultiReader for merging temp files... Aborting." << endl;
294 // set sort order for merge
// the reader's sort order is chosen from the same flag SortBuffer uses
295 if ( m_settings->IsSortingByName )
296 multiReader.SetSortOrder(BamMultiReader::SortedByReadName);
298 multiReader.SetSortOrder(BamMultiReader::SortedByPosition);
300 // open writer for our completely sorted output BAM file
// uses the header text and reference data captured in GenerateSortedRuns
301 BamWriter mergedWriter;
302 if ( !mergedWriter.Open(m_settings->OutputBamFilename, m_headerText, m_references) ) {
303 cerr << "bamtools sort ERROR: could not open " << m_settings->OutputBamFilename
304 << " for writing... Aborting." << endl;
309 // while data available in temp files
// any result of SaveAlignment is not checked
311 while ( multiReader.GetNextAlignmentCore(al) )
312 mergedWriter.SaveAlignment(al);
316 mergedWriter.Close();
318 // delete all temp files
// the return value of remove() is ignored, so a failed delete goes unreported
319 vector<string>::const_iterator tempIter = m_tempFilenames.begin();
320 vector<string>::const_iterator tempEnd = m_tempFilenames.end();
321 for ( ; tempIter != tempEnd; ++tempIter ) {
322 const string& tempFilename = (*tempIter);
323 remove(tempFilename.c_str());
// Runs the whole sort: generates the sorted temp files and, if that reports
// success, returns the result of merging them.
// NOTE(review): the return taken when GenerateSortedRuns() fails is not
// visible in this excerpt.
329 bool SortTool::SortToolPrivate::Run(void) {
331 // this does a single pass, chunking up the input file into smaller sorted temp files,
332 // then write out using BamMultiReader to handle merging
334 if ( GenerateSortedRuns() )
335 return MergeSortedRuns();
// Sorts the buffer in place with the comparator that matches the requested
// sort method.
// NOTE(review): the branch separating the two sort calls (presumably an
// 'else') is not visible in this excerpt.
340 void SortTool::SortToolPrivate::SortBuffer(vector<BamAlignment>& buffer) {
342 // ** add further custom sort options later ?? **
344 // sort buffer by desired method
345 if ( m_settings->IsSortingByName )
// by name: compares the alignments' Name fields
346 sort ( buffer.begin(), buffer.end(), SortLessThanName() );
// by position: compares RefID first, then Position
348 sort ( buffer.begin(), buffer.end(), SortLessThanPosition() );
// Writes every alignment in 'buffer', in its current order, to a new BAM file
// named 'tempFilename', using the stored header text and reference data.
// NOTE(review): this definition continues past the end of this excerpt; the
// failure return and the close/return at the end are not visible.
352 bool SortTool::SortToolPrivate::WriteTempFile(const vector<BamAlignment>& buffer, const string& tempFilename) {
354 // open temp file for writing
355 BamWriter tempWriter;
356 if ( !tempWriter.Open(tempFilename, m_headerText, m_references) ) {
357 cerr << "bamtools sort ERROR: could not open " << tempFilename
358 << " for writing." << endl;
// write alignments one by one; any result of SaveAlignment is not checked
363 vector<BamAlignment>::const_iterator buffIter = buffer.begin();
364 vector<BamAlignment>::const_iterator buffEnd = buffer.end();
365 for ( ; buffIter != buffEnd; ++buffIter ) {
366 const BamAlignment& al = (*buffIter);
367 tempWriter.SaveAlignment(al);
370 // close temp file & return success