1 // ***************************************************************************
2 // bamtools_sort.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 14 June 2011 (DB)
6 // ---------------------------------------------------------------------------
7 // Sorts an input BAM file
8 // ***************************************************************************
10 #include "bamtools_sort.h"
12 #include <api/SamConstants.h>
13 #include <api/BamMultiReader.h>
14 #include <api/BamWriter.h>
15 #include <utils/bamtools_options.h>
16 using namespace BamTools;
30 // ** These defaults should be tweaked & 'optimized' per testing ** //
32 // I say 'optimized' because each system will naturally perform
33 // differently. We will attempt to determine a sensible
34 // compromise that should perform well on average.
35 const unsigned int SORT_DEFAULT_MAX_BUFFER_COUNT = 500000; // max numberOfAlignments for buffer
36 const unsigned int SORT_DEFAULT_MAX_BUFFER_MEMORY = 1024; // Mb
38 // -----------------------------------
39 // comparison objects (for sorting)
41 struct SortLessThanPosition {
42 bool operator() (const BamAlignment& lhs, const BamAlignment& rhs) {
43 if ( lhs.RefID != rhs.RefID )
44 return lhs.RefID < rhs.RefID;
46 return lhs.Position < rhs.Position;
50 struct SortLessThanName {
51 bool operator() (const BamAlignment& lhs, const BamAlignment& rhs) {
52 return lhs.Name < rhs.Name;
56 } // namespace BamTools
58 // ---------------------------------------------
59 // SortSettings implementation
61 struct SortTool::SortSettings {
64 bool HasInputBamFilename;
65 bool HasMaxBufferCount;
66 bool HasMaxBufferMemory;
67 bool HasOutputBamFilename;
71 string InputBamFilename;
72 string OutputBamFilename;
75 unsigned int MaxBufferCount;
76 unsigned int MaxBufferMemory;
80 : HasInputBamFilename(false)
81 , HasMaxBufferCount(false)
82 , HasMaxBufferMemory(false)
83 , HasOutputBamFilename(false)
84 , IsSortingByName(false)
85 , InputBamFilename(Options::StandardIn())
86 , OutputBamFilename(Options::StandardOut())
87 , MaxBufferCount(SORT_DEFAULT_MAX_BUFFER_COUNT)
88 , MaxBufferMemory(SORT_DEFAULT_MAX_BUFFER_MEMORY)
92 // ---------------------------------------------
93 // SortToolPrivate implementation
95 class SortTool::SortToolPrivate {
99 SortToolPrivate(SortTool::SortSettings* settings);
100 ~SortToolPrivate(void) { }
102 // 'public' interface
108 bool CreateSortedTempFile(vector<BamAlignment>& buffer);
109 bool GenerateSortedRuns(void);
110 bool MergeSortedRuns(void);
111 bool WriteTempFile(const vector<BamAlignment>& buffer, const string& tempFilename);
112 void SortBuffer(vector<BamAlignment>& buffer);
116 SortTool::SortSettings* m_settings;
117 string m_tempFilenameStub;
120 RefVector m_references;
121 vector<string> m_tempFilenames;
125 SortTool::SortToolPrivate::SortToolPrivate(SortTool::SortSettings* settings)
126 : m_settings(settings)
129 // set filename stub depending on inputfile path
130 // that way multiple sort runs don't trip on each other's temp files
132 size_t extensionFound = m_settings->InputBamFilename.find(".bam");
133 if (extensionFound != string::npos )
134 m_tempFilenameStub = m_settings->InputBamFilename.substr(0,extensionFound);
135 m_tempFilenameStub.append(".sort.temp.");
139 // generates mutiple sorted temp BAM files from single unsorted BAM file
140 bool SortTool::SortToolPrivate::GenerateSortedRuns(void) {
142 // open input BAM file
144 if ( !reader.Open(m_settings->InputBamFilename) ) {
145 cerr << "bamtools sort ERROR: could not open " << m_settings->InputBamFilename
146 << " for reading... Aborting." << endl;
150 // get basic data that will be shared by all temp/output files
151 SamHeader header = reader.GetHeader();
152 header.SortOrder = ( m_settings->IsSortingByName
153 ? Constants::SAM_HD_SORTORDER_QUERYNAME
154 : Constants::SAM_HD_SORTORDER_COORDINATE );
155 m_headerText = header.ToString();
156 m_references = reader.GetReferenceData();
158 // set up alignments buffer
160 vector<BamAlignment> buffer;
161 buffer.reserve( (size_t)(m_settings->MaxBufferCount*1.1) );
162 bool bufferFull = false;
164 // if sorting by name, we need to generate full char data
165 // so can't use GetNextAlignmentCore()
166 if ( m_settings->IsSortingByName ) {
168 // iterate through file
169 while ( reader.GetNextAlignment(al)) {
171 // check buffer's usage
172 bufferFull = ( buffer.size() >= m_settings->MaxBufferCount );
174 // store alignments until buffer is "full"
176 buffer.push_back(al);
178 // if buffer is "full"
181 // push any unmapped reads into buffer,
182 // don't want to split these into a separate temp file
183 if ( !al.IsMapped() )
184 buffer.push_back(al);
186 // "al" is mapped, so create a sorted temp file with current buffer contents
187 // then push "al" into fresh buffer
189 CreateSortedTempFile(buffer);
190 buffer.push_back(al);
196 // sorting by position, can take advantage of GNACore() speedup
199 // iterate through file
200 while ( reader.GetNextAlignmentCore(al) ) {
202 // check buffer's usage
203 bufferFull = ( buffer.size() >= m_settings->MaxBufferCount );
205 // store alignments until buffer is "full"
207 buffer.push_back(al);
209 // if buffer is "full"
212 // push any unmapped reads into buffer,
213 // don't want to split these into a separate temp file
214 if ( !al.IsMapped() )
215 buffer.push_back(al);
217 // "al" is mapped, so create a sorted temp file with current buffer contents
218 // then push "al" into fresh buffer
220 CreateSortedTempFile(buffer);
221 buffer.push_back(al);
227 // handle any leftover buffer contents
228 if ( !buffer.empty() )
229 CreateSortedTempFile(buffer);
231 // close reader & return success
236 bool SortTool::SortToolPrivate::CreateSortedTempFile(vector<BamAlignment>& buffer) {
241 // write sorted contents to temp file, store success/fail
242 stringstream tempStr;
243 tempStr << m_tempFilenameStub << m_numberOfRuns;
244 bool success = WriteTempFile( buffer, tempStr.str() );
246 // save temp filename for merging later
247 m_tempFilenames.push_back(tempStr.str());
249 // clear buffer contents & update run counter
253 // return success/fail of writing to temp file
254 // TODO: a failure returned here is not actually caught and handled anywhere
258 // merges sorted temp BAM files into single sorted output BAM file
259 bool SortTool::SortToolPrivate::MergeSortedRuns(void) {
261 // open up multi reader for all of our temp files
262 // this might get broken up if we do a multi-pass system later ??
263 BamMultiReader multiReader;
264 if ( !multiReader.Open(m_tempFilenames) ) {
265 cerr << "bamtools sort ERROR: could not open BamMultiReader for merging temp files... Aborting."
270 // set sort order for merge
271 if ( m_settings->IsSortingByName )
272 multiReader.SetSortOrder(BamMultiReader::SortedByReadName);
274 multiReader.SetSortOrder(BamMultiReader::SortedByPosition);
276 // open writer for our completely sorted output BAM file
277 BamWriter mergedWriter;
278 if ( !mergedWriter.Open(m_settings->OutputBamFilename, m_headerText, m_references) ) {
279 cerr << "bamtools sort ERROR: could not open " << m_settings->OutputBamFilename
280 << " for writing... Aborting." << endl;
285 // while data available in temp files
287 while ( multiReader.GetNextAlignmentCore(al) )
288 mergedWriter.SaveAlignment(al);
292 mergedWriter.Close();
294 // delete all temp files
295 vector<string>::const_iterator tempIter = m_tempFilenames.begin();
296 vector<string>::const_iterator tempEnd = m_tempFilenames.end();
297 for ( ; tempIter != tempEnd; ++tempIter ) {
298 const string& tempFilename = (*tempIter);
299 remove(tempFilename.c_str());
305 bool SortTool::SortToolPrivate::Run(void) {
307 // this does a single pass, chunking up the input file into smaller sorted temp files,
308 // then write out using BamMultiReader to handle merging
310 if ( GenerateSortedRuns() )
311 return MergeSortedRuns();
316 void SortTool::SortToolPrivate::SortBuffer(vector<BamAlignment>& buffer) {
318 // ** add further custom sort options later ?? **
320 // sort buffer by desired method
321 if ( m_settings->IsSortingByName )
322 std::stable_sort ( buffer.begin(), buffer.end(), SortLessThanName() );
324 std::stable_sort ( buffer.begin(), buffer.end(), SortLessThanPosition() );
327 bool SortTool::SortToolPrivate::WriteTempFile(const vector<BamAlignment>& buffer,
328 const string& tempFilename)
330 // open temp file for writing
331 BamWriter tempWriter;
332 if ( !tempWriter.Open(tempFilename, m_headerText, m_references) ) {
333 cerr << "bamtools sort ERROR: could not open " << tempFilename
334 << " for writing." << endl;
339 vector<BamAlignment>::const_iterator buffIter = buffer.begin();
340 vector<BamAlignment>::const_iterator buffEnd = buffer.end();
341 for ( ; buffIter != buffEnd; ++buffIter ) {
342 const BamAlignment& al = (*buffIter);
343 tempWriter.SaveAlignment(al);
346 // close temp file & return success
351 // ---------------------------------------------
352 // SortTool implementation
354 SortTool::SortTool(void)
356 , m_settings(new SortSettings)
359 // set program details
360 Options::SetProgramInfo("bamtools sort", "sorts a BAM file", "[-in <filename>] [-out <filename>] [sortOptions]");
363 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
364 Options::AddValueOption("-in", "BAM filename", "the input BAM file", "",
365 m_settings->HasInputBamFilename, m_settings->InputBamFilename,
366 IO_Opts, Options::StandardIn());
367 Options::AddValueOption("-out", "BAM filename", "the output BAM file", "",
368 m_settings->HasOutputBamFilename, m_settings->OutputBamFilename,
369 IO_Opts, Options::StandardOut());
371 OptionGroup* SortOpts = Options::CreateOptionGroup("Sorting Methods");
372 Options::AddOption("-byname", "sort by alignment name", m_settings->IsSortingByName, SortOpts);
374 OptionGroup* MemOpts = Options::CreateOptionGroup("Memory Settings");
375 Options::AddValueOption("-n", "count", "max number of alignments per tempfile", "",
376 m_settings->HasMaxBufferCount, m_settings->MaxBufferCount,
377 MemOpts, SORT_DEFAULT_MAX_BUFFER_COUNT);
378 Options::AddValueOption("-mem", "Mb", "max memory to use", "",
379 m_settings->HasMaxBufferMemory, m_settings->MaxBufferMemory,
380 MemOpts, SORT_DEFAULT_MAX_BUFFER_MEMORY);
383 SortTool::~SortTool(void) {
392 int SortTool::Help(void) {
393 Options::DisplayHelp();
397 int SortTool::Run(int argc, char* argv[]) {
399 // parse command line arguments
400 Options::Parse(argc, argv, 1);
402 // initialize SortTool with settings
403 m_impl = new SortToolPrivate(m_settings);
405 // run SortTool, return success/fail