1 // ***************************************************************************
2 // bamtools_sort.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 7 April 2011 (DB)
7 // ---------------------------------------------------------------------------
8 // Sorts an input BAM file
9 // ***************************************************************************
11 #include "bamtools_sort.h"
13 #include <api/SamConstants.h>
14 #include <api/BamMultiReader.h>
15 #include <api/BamWriter.h>
16 #include <utils/bamtools_options.h>
17 using namespace BamTools;
31 // ** These defaults should be tweaked & 'optimized' per testing ** //
33 // I say 'optimized' because each system will naturally perform
34 // differently. We will attempt to determine a sensible
35 // compromise that should perform well on average.
36 const unsigned int SORT_DEFAULT_MAX_BUFFER_COUNT = 500000; // max numberOfAlignments for buffer
37 const unsigned int SORT_DEFAULT_MAX_BUFFER_MEMORY = 1024; // Mb
39 // -----------------------------------
40 // comparison objects (for sorting)
42 struct SortLessThanPosition {
43 bool operator() (const BamAlignment& lhs, const BamAlignment& rhs) {
44 if ( lhs.RefID != rhs.RefID )
45 return lhs.RefID < rhs.RefID;
47 return lhs.Position < rhs.Position;
51 struct SortLessThanName {
52 bool operator() (const BamAlignment& lhs, const BamAlignment& rhs) {
53 return lhs.Name < rhs.Name;
57 } // namespace BamTools
59 // ---------------------------------------------
60 // SortSettings implementation
62 struct SortTool::SortSettings {
65 bool HasInputBamFilename;
66 bool HasMaxBufferCount;
67 bool HasMaxBufferMemory;
68 bool HasOutputBamFilename;
72 string InputBamFilename;
73 string OutputBamFilename;
76 unsigned int MaxBufferCount;
77 unsigned int MaxBufferMemory;
81 : HasInputBamFilename(false)
82 , HasMaxBufferCount(false)
83 , HasMaxBufferMemory(false)
84 , HasOutputBamFilename(false)
85 , IsSortingByName(false)
86 , InputBamFilename(Options::StandardIn())
87 , OutputBamFilename(Options::StandardOut())
88 , MaxBufferCount(SORT_DEFAULT_MAX_BUFFER_COUNT)
89 , MaxBufferMemory(SORT_DEFAULT_MAX_BUFFER_MEMORY)
93 // ---------------------------------------------
94 // SortToolPrivate implementation
96 class SortTool::SortToolPrivate {
100 SortToolPrivate(SortTool::SortSettings* settings);
101 ~SortToolPrivate(void) { }
103 // 'public' interface
109 void ClearBuffer(vector<BamAlignment>& buffer);
110 bool GenerateSortedRuns(void);
111 bool HandleBufferContents(vector<BamAlignment>& buffer);
112 bool MergeSortedRuns(void);
113 bool WriteTempFile(const vector<BamAlignment>& buffer, const string& tempFilename);
114 void SortBuffer(vector<BamAlignment>& buffer);
118 SortTool::SortSettings* m_settings;
119 string m_tempFilenameStub;
122 RefVector m_references;
123 vector<string> m_tempFilenames;
128 SortTool::SortToolPrivate::SortToolPrivate(SortTool::SortSettings* settings)
129 : m_settings(settings)
132 // set filename stub depending on inputfile path
133 // that way multiple sort runs don't trip on each other's temp files
135 size_t extensionFound = m_settings->InputBamFilename.find(".bam");
136 if (extensionFound != string::npos )
137 m_tempFilenameStub = m_settings->InputBamFilename.substr(0,extensionFound);
138 m_tempFilenameStub.append(".sort.temp.");
142 // generates mutiple sorted temp BAM files from single unsorted BAM file
143 bool SortTool::SortToolPrivate::GenerateSortedRuns(void) {
145 // open input BAM file
146 BamReader inputReader;
147 if ( !inputReader.Open(m_settings->InputBamFilename) ) {
148 cerr << "bamtools sort ERROR: could not open " << m_settings->InputBamFilename
149 << " for reading... Aborting." << endl;
153 // get basic data that will be shared by all temp/output files
154 SamHeader header = inputReader.GetHeader();
155 header.SortOrder = ( m_settings->IsSortingByName
156 ? Constants::SAM_HD_SORTORDER_QUERYNAME
157 : Constants::SAM_HD_SORTORDER_COORDINATE );
158 m_headerText = header.ToString();
159 m_references = inputReader.GetReferenceData();
161 // set up alignments buffer
163 vector<BamAlignment> buffer;
164 buffer.reserve(m_settings->MaxBufferCount);
166 // if sorting by name, we need to generate full char data
167 // so can't use GetNextAlignmentCore()
168 if ( m_settings->IsSortingByName ) {
170 // iterate through file
171 while ( inputReader.GetNextAlignment(al)) {
173 // store alignments in buffer
174 buffer.push_back(al);
176 // if buffer is full, handle contents (sort & write to temp file)
177 if ( buffer.size() == m_settings->MaxBufferCount )
178 HandleBufferContents(buffer);
183 // sorting by position, can take advantage of GNACore() speedup
186 // iterate through file
187 while ( inputReader.GetNextAlignmentCore(al) ) {
189 // store alignments in buffer
190 buffer.push_back(al);
192 // if buffer is full, handle contents (sort & write to temp file)
193 if ( buffer.size() == m_settings->MaxBufferCount )
194 HandleBufferContents(buffer);
198 // handle any remaining buffer contents
199 if ( buffer.size() > 0 )
200 HandleBufferContents(buffer);
202 // close reader & return success
207 bool SortTool::SortToolPrivate::HandleBufferContents(vector<BamAlignment>& buffer ) {
212 // write sorted contents to temp file, store success/fail
213 stringstream tempStr;
214 tempStr << m_tempFilenameStub << m_numberOfRuns;
215 bool success = WriteTempFile( buffer, tempStr.str() );
217 // save temp filename for merging later
218 m_tempFilenames.push_back(tempStr.str());
220 // clear buffer contents & update run counter
224 // return success/fail of writing to temp file
225 // TODO: a failure returned here is not actually caught and handled anywhere
229 // merges sorted temp BAM files into single sorted output BAM file
230 bool SortTool::SortToolPrivate::MergeSortedRuns(void) {
232 // open up multi reader for all of our temp files
233 // this might get broken up if we do a multi-pass system later ??
234 BamMultiReader multiReader;
235 if ( !multiReader.Open(m_tempFilenames) ) {
236 cerr << "bamtools sort ERROR: could not open BamMultiReader for merging temp files... Aborting." << endl;
240 // set sort order for merge
241 if ( m_settings->IsSortingByName )
242 multiReader.SetSortOrder(BamMultiReader::SortedByReadName);
244 multiReader.SetSortOrder(BamMultiReader::SortedByPosition);
246 // open writer for our completely sorted output BAM file
247 BamWriter mergedWriter;
248 if ( !mergedWriter.Open(m_settings->OutputBamFilename, m_headerText, m_references) ) {
249 cerr << "bamtools sort ERROR: could not open " << m_settings->OutputBamFilename
250 << " for writing... Aborting." << endl;
255 // while data available in temp files
257 while ( multiReader.GetNextAlignmentCore(al) )
258 mergedWriter.SaveAlignment(al);
262 mergedWriter.Close();
264 // delete all temp files
265 vector<string>::const_iterator tempIter = m_tempFilenames.begin();
266 vector<string>::const_iterator tempEnd = m_tempFilenames.end();
267 for ( ; tempIter != tempEnd; ++tempIter ) {
268 const string& tempFilename = (*tempIter);
269 remove(tempFilename.c_str());
275 bool SortTool::SortToolPrivate::Run(void) {
277 // this does a single pass, chunking up the input file into smaller sorted temp files,
278 // then write out using BamMultiReader to handle merging
280 if ( GenerateSortedRuns() )
281 return MergeSortedRuns();
286 void SortTool::SortToolPrivate::SortBuffer(vector<BamAlignment>& buffer) {
288 // ** add further custom sort options later ?? **
290 // sort buffer by desired method
291 if ( m_settings->IsSortingByName )
292 sort ( buffer.begin(), buffer.end(), SortLessThanName() );
294 sort ( buffer.begin(), buffer.end(), SortLessThanPosition() );
298 bool SortTool::SortToolPrivate::WriteTempFile(const vector<BamAlignment>& buffer, const string& tempFilename) {
300 // open temp file for writing
301 BamWriter tempWriter;
302 if ( !tempWriter.Open(tempFilename, m_headerText, m_references) ) {
303 cerr << "bamtools sort ERROR: could not open " << tempFilename
304 << " for writing." << endl;
309 vector<BamAlignment>::const_iterator buffIter = buffer.begin();
310 vector<BamAlignment>::const_iterator buffEnd = buffer.end();
311 for ( ; buffIter != buffEnd; ++buffIter ) {
312 const BamAlignment& al = (*buffIter);
313 tempWriter.SaveAlignment(al);
316 // close temp file & return success
321 // ---------------------------------------------
322 // SortTool implementation
324 SortTool::SortTool(void)
326 , m_settings(new SortSettings)
329 // set program details
330 Options::SetProgramInfo("bamtools sort", "sorts a BAM file", "[-in <filename>] [-out <filename>] [sortOptions]");
333 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
334 Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputBamFilename, m_settings->InputBamFilename, IO_Opts, Options::StandardIn());
335 Options::AddValueOption("-out", "BAM filename", "the output BAM file", "", m_settings->HasOutputBamFilename, m_settings->OutputBamFilename, IO_Opts, Options::StandardOut());
337 OptionGroup* SortOpts = Options::CreateOptionGroup("Sorting Methods");
338 Options::AddOption("-byname", "sort by alignment name", m_settings->IsSortingByName, SortOpts);
340 OptionGroup* MemOpts = Options::CreateOptionGroup("Memory Settings");
341 Options::AddValueOption("-n", "count", "max number of alignments per tempfile", "", m_settings->HasMaxBufferCount, m_settings->MaxBufferCount, MemOpts, SORT_DEFAULT_MAX_BUFFER_COUNT);
342 Options::AddValueOption("-mem", "Mb", "max memory to use", "", m_settings->HasMaxBufferMemory, m_settings->MaxBufferMemory, MemOpts, SORT_DEFAULT_MAX_BUFFER_MEMORY);
345 SortTool::~SortTool(void) {
354 int SortTool::Help(void) {
355 Options::DisplayHelp();
359 int SortTool::Run(int argc, char* argv[]) {
361 // parse command line arguments
362 Options::Parse(argc, argv, 1);
364 // initialize SortTool with settings
365 m_impl = new SortToolPrivate(m_settings);
367 // run SortTool, return success/fail