1 // ***************************************************************************
2 // bamtools_split.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 7 April 2011 (DB)
6 // ---------------------------------------------------------------------------
7 // Splits a BAM file on user-specified property, creating a new BAM output
8 // file for each value found
9 // ***************************************************************************
11 #include "bamtools_split.h"
13 #include <api/BamConstants.h>
14 #include <api/BamReader.h>
15 #include <api/BamWriter.h>
16 #include <utils/bamtools_options.h>
17 #include <utils/bamtools_variant.h>
18 using namespace BamTools;
// filename tokens inserted between the output stub and ".bam" to label each split category
static const std::string SPLIT_MAPPED_TOKEN(".MAPPED");
static const std::string SPLIT_UNMAPPED_TOKEN(".UNMAPPED");
static const std::string SPLIT_PAIRED_TOKEN(".PAIRED_END");
static const std::string SPLIT_SINGLE_TOKEN(".SINGLE_END");
static const std::string SPLIT_REFERENCE_TOKEN(".REF_");
// Returns the current time as ctime()-style text that is usable in a filename:
// the trailing newline ctime() appends is removed and every space becomes '_'.
// Returns an empty string if ctime() cannot format the time.
std::string GetTimestampString(void) {

    // get human readable timestamp
    time_t currentTime = 0;
    time(&currentTime);
    const char* rawTimestamp = ctime(&currentTime);
    std::string timeString( rawTimestamp ? rawTimestamp : "" );

    // ctime() terminates its text with '\n'; strip it (and any other trailing
    // whitespace) so it does not end up inside an output filename
    const std::size_t lastKept = timeString.find_last_not_of(" \t\r\n");
    timeString.erase( lastKept == std::string::npos ? 0 : lastKept + 1 );

    // convert whitespace to '_'
    std::size_t found = timeString.find(" ");
    while ( found != std::string::npos ) {
        timeString.replace(found, 1, "_");
        found = timeString.find(" ", found+1);
    }
    return timeString;
}
// return copy of filename without extension
// (so /path/to/file.txt becomes /path/to/file )
// Only a '.' located after the last path separator ('/' or '\') is treated as the
// start of an extension, so /data.v2/reads is returned unchanged.
// A filename with no '.' at all is returned unchanged.
std::string RemoveFilenameExtension(const std::string& filename) {

    const std::size_t lastDot = filename.rfind('.');
    if ( lastDot == std::string::npos )
        return filename;

    // a dot that belongs to a directory name is not an extension separator
    const std::size_t lastSeparator = filename.find_last_of("/\\");
    if ( lastSeparator != std::string::npos && lastDot < lastSeparator )
        return filename;

    return filename.substr(0, lastDot);
}
62 } // namespace BamTools
64 // ---------------------------------------------
65 // SplitSettings implementation
67 struct SplitTool::SplitSettings {
70 bool HasInputFilename;
71 bool HasCustomOutputStub;
72 bool IsSplittingMapped;
73 bool IsSplittingPaired;
74 bool IsSplittingReference;
78 string CustomOutputStub;
84 : HasInputFilename(false)
85 , HasCustomOutputStub(false)
86 , IsSplittingMapped(false)
87 , IsSplittingPaired(false)
88 , IsSplittingReference(false)
89 , IsSplittingTag(false)
90 , CustomOutputStub("")
91 , InputFilename(Options::StandardIn())
96 // ---------------------------------------------
97 // SplitToolPrivate declaration
99 class SplitTool::SplitToolPrivate {
103 SplitToolPrivate(SplitTool::SplitSettings* settings)
104 : m_settings(settings)
107 ~SplitToolPrivate(void) {
111 // 'public' interface
117 // close & delete BamWriters in map
119 void CloseWriters(map<T, BamWriter*>& writers);
120 // calculate output stub based on IO args given
121 void DetermineOutputFilenameStub(void);
122 // open our BamReader
123 bool OpenReader(void);
124 // split alignments in BAM file based on isMapped property
125 bool SplitMapped(void);
126 // split alignments in BAM file based on isPaired property
127 bool SplitPaired(void);
128 // split alignments in BAM file based on refID property
129 bool SplitReference(void);
130 // finds first alignment and calls corresponding SplitTagImpl<>
131 // depending on tag type
133 // templated split tag implementation
134 // handle the various types that are possible for tags
136 bool SplitTagImpl(BamAlignment& al);
140 SplitTool::SplitSettings* m_settings;
141 string m_outputFilenameStub;
144 RefVector m_references;
147 void SplitTool::SplitToolPrivate::DetermineOutputFilenameStub(void) {
149 // if user supplied output filename stub, use that
150 if ( m_settings->HasCustomOutputStub )
151 m_outputFilenameStub = m_settings->CustomOutputStub;
153 // else if user supplied input BAM filename, use that (minus ".bam" extension) as stub
154 else if ( m_settings->HasInputFilename )
155 m_outputFilenameStub = RemoveFilenameExtension(m_settings->InputFilename);
157 // otherwise, user did not specify -stub, and input is coming from STDIN
158 // generate stub from timestamp
159 else m_outputFilenameStub = GetTimestampString();
162 bool SplitTool::SplitToolPrivate::OpenReader(void) {
164 // attempt to open BAM file
165 if ( !m_reader.Open(m_settings->InputFilename) ) {
166 cerr << "bamtools split ERROR: could not open BAM file: " << m_settings->InputFilename << endl;
170 // save file 'metadata' & return success
171 m_header = m_reader.GetHeaderText();
172 m_references = m_reader.GetReferenceData();
176 bool SplitTool::SplitToolPrivate::Run(void) {
178 // determine output stub
179 DetermineOutputFilenameStub();
185 // determine split type from settings
186 if ( m_settings->IsSplittingMapped ) return SplitMapped();
187 if ( m_settings->IsSplittingPaired ) return SplitPaired();
188 if ( m_settings->IsSplittingReference ) return SplitReference();
189 if ( m_settings->IsSplittingTag ) return SplitTag();
191 // if we get here, no property was specified
192 cerr << "bamtools split ERROR: no property given to split on... " << endl
193 << "Please use -mapped, -paired, -reference, or -tag TAG to specifiy desired split behavior." << endl;
197 bool SplitTool::SplitToolPrivate::SplitMapped(void) {
199 // set up splitting data structure
200 map<bool, BamWriter*> outputFiles;
201 map<bool, BamWriter*>::iterator writerIter;
203 // iterate through alignments
206 bool isCurrentAlignmentMapped;
207 while ( m_reader.GetNextAlignment(al) ) {
209 // see if bool value exists
210 isCurrentAlignmentMapped = al.IsMapped();
211 writerIter = outputFiles.find(isCurrentAlignmentMapped);
213 // if no writer associated with this value
214 if ( writerIter == outputFiles.end() ) {
216 // open new BamWriter
217 const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentMapped
219 : SPLIT_UNMAPPED_TOKEN ) + ".bam";
220 writer = new BamWriter;
221 if ( !writer->Open(outputFilename, m_header, m_references) ) {
222 cerr << "bamtools split ERROR: could not open " << outputFilename
223 << " for writing." << endl;
228 outputFiles.insert( make_pair(isCurrentAlignmentMapped, writer) );
231 // else grab corresponding writer
232 else writer = (*writerIter).second;
234 // store alignment in proper BAM output file
236 writer->SaveAlignment(al);
239 // clean up BamWriters
240 CloseWriters(outputFiles);
246 bool SplitTool::SplitToolPrivate::SplitPaired(void) {
248 // set up splitting data structure
249 map<bool, BamWriter*> outputFiles;
250 map<bool, BamWriter*>::iterator writerIter;
252 // iterate through alignments
255 bool isCurrentAlignmentPaired;
256 while ( m_reader.GetNextAlignment(al) ) {
258 // see if bool value exists
259 isCurrentAlignmentPaired = al.IsPaired();
260 writerIter = outputFiles.find(isCurrentAlignmentPaired);
262 // if no writer associated with this value
263 if ( writerIter == outputFiles.end() ) {
265 // open new BamWriter
266 const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentPaired
268 : SPLIT_SINGLE_TOKEN ) + ".bam";
269 writer = new BamWriter;
270 if ( !writer->Open(outputFilename, m_header, m_references) ) {
271 cerr << "bamtool split ERROR: could not open " << outputFilename
272 << " for writing." << endl;
277 outputFiles.insert( make_pair(isCurrentAlignmentPaired, writer) );
280 // else grab corresponding writer
281 else writer = (*writerIter).second;
283 // store alignment in proper BAM output file
285 writer->SaveAlignment(al);
288 // clean up BamWriters
289 CloseWriters(outputFiles);
295 bool SplitTool::SplitToolPrivate::SplitReference(void) {
297 // set up splitting data structure
298 map<int32_t, BamWriter*> outputFiles;
299 map<int32_t, BamWriter*>::iterator writerIter;
301 // iterate through alignments
304 int32_t currentRefId;
305 while ( m_reader.GetNextAlignment(al) ) {
307 // see if bool value exists
308 currentRefId = al.RefID;
309 writerIter = outputFiles.find(currentRefId);
311 // if no writer associated with this value
312 if ( writerIter == outputFiles.end() ) {
314 // open new BamWriter
315 const string refName = m_references.at(currentRefId).RefName;
316 const string outputFilename = m_outputFilenameStub + SPLIT_REFERENCE_TOKEN + refName + ".bam";
317 writer = new BamWriter;
318 if ( !writer->Open(outputFilename, m_header, m_references) ) {
319 cerr << "bamtools split ERROR: could not open " << outputFilename
320 << " for writing." << endl;
325 outputFiles.insert( make_pair(currentRefId, writer) );
328 // else grab corresponding writer
329 else writer = (*writerIter).second;
331 // store alignment in proper BAM output file
333 writer->SaveAlignment(al);
336 // clean up BamWriters
337 CloseWriters(outputFiles);
343 // finds first alignment and calls corresponding SplitTagImpl<>() depending on tag type
344 bool SplitTool::SplitToolPrivate::SplitTag(void) {
346 // iterate through alignments, until we hit TAG
348 while ( m_reader.GetNextAlignment(al) ) {
350 // look for tag in this alignment and get tag type
352 if ( !al.GetTagType(m_settings->TagToSplit, tagType) )
355 // request split method based on tag type
356 // pass it the current alignment found
359 case (Constants::BAM_TAG_TYPE_INT8) :
360 case (Constants::BAM_TAG_TYPE_INT16) :
361 case (Constants::BAM_TAG_TYPE_INT32) :
362 return SplitTagImpl<int32_t>(al);
364 case (Constants::BAM_TAG_TYPE_UINT8) :
365 case (Constants::BAM_TAG_TYPE_UINT16) :
366 case (Constants::BAM_TAG_TYPE_UINT32) :
367 return SplitTagImpl<uint32_t>(al);
369 case (Constants::BAM_TAG_TYPE_FLOAT) :
370 return SplitTagImpl<float>(al);
372 case (Constants::BAM_TAG_TYPE_ASCII) :
373 case (Constants::BAM_TAG_TYPE_STRING) :
374 case (Constants::BAM_TAG_TYPE_HEX) :
375 return SplitTagImpl<string>(al);
378 fprintf(stderr, "bamtools split ERROR: unknown tag type encountered: [%c]\n", tagType);
383 // tag not found, but that's not an error - return success
387 // --------------------------------------------------------------------------------
388 // template method implementation
389 // *Technical Note* - use of template methods declared & defined in ".cpp" file
390 // goes against normal practices, but works here because these
391 // are purely internal (no one can call from outside this file)
393 // close BamWriters & delete pointers
395 void SplitTool::SplitToolPrivate::CloseWriters(map<T, BamWriter*>& writers) {
397 typedef map<T, BamWriter*> WriterMap;
398 typedef typename WriterMap::iterator WriterMapIterator;
400 // iterate over writers
401 WriterMapIterator writerIter = writers.begin();
402 WriterMapIterator writerEnd = writers.end();
403 for ( ; writerIter != writerEnd; ++writerIter ) {
404 BamWriter* writer = (*writerIter).second;
405 if ( writer == 0 ) continue;
415 // clear the container (destroying the items doesn't remove them)
419 // handle the various types that are possible for tags
421 bool SplitTool::SplitToolPrivate::SplitTagImpl(BamAlignment& al) {
423 typedef T TagValueType;
424 typedef map<TagValueType, BamWriter*> WriterMap;
425 typedef typename WriterMap::iterator WriterMapIterator;
427 // set up splitting data structure
428 WriterMap outputFiles;
429 WriterMapIterator writerIter;
432 const string tag = m_settings->TagToSplit;
434 stringstream outputFilenameStream("");
435 TagValueType currentValue;
437 // retrieve first alignment tag value
438 if ( al.GetTag(tag, currentValue) ) {
440 // open new BamWriter, save first alignment
441 outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
442 writer = new BamWriter;
443 if ( !writer->Open(outputFilenameStream.str(), m_header, m_references) ) {
444 cerr << "bamtools split ERROR: could not open " << outputFilenameStream.str()
445 << " for writing." << endl;
448 writer->SaveAlignment(al);
451 outputFiles.insert( make_pair(currentValue, writer) );
454 outputFilenameStream.str("");
457 // iterate through remaining alignments
458 while ( m_reader.GetNextAlignment(al) ) {
460 // skip if this alignment doesn't have TAG
461 if ( !al.GetTag(tag, currentValue) ) continue;
463 // look up tag value in map
464 writerIter = outputFiles.find(currentValue);
466 // if no writer associated with this value
467 if ( writerIter == outputFiles.end() ) {
469 // open new BamWriter
470 outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
471 writer = new BamWriter;
472 if ( !writer->Open(outputFilenameStream.str(), m_header, m_references) ) {
473 cerr << "bamtool split ERROR: could not open " << outputFilenameStream.str()
474 << " for writing." << endl;
479 outputFiles.insert( make_pair(currentValue, writer) );
482 outputFilenameStream.str("");
485 // else grab corresponding writer
486 else writer = (*writerIter).second;
488 // store alignment in proper BAM output file
490 writer->SaveAlignment(al);
493 // clean up BamWriters
494 CloseWriters(outputFiles);
500 // ---------------------------------------------
501 // SplitTool implementation
// Constructor: allocates the SplitSettings object, then registers the program
// description and every command-line option with Options, binding each option to
// the corresponding SplitSettings field.
// NOTE(review): parts of the initializer list and the braces of this constructor
// are not present in this listing; only the visible lines are reproduced here.
503 SplitTool::SplitTool(void)
505 , m_settings(new SplitSettings)
508 // set program details
509 Options::SetProgramInfo("bamtools split", "splits a BAM file on user-specified property, creating a new BAM output file for each value found", "[-in <filename>] [-stub <filename stub>] < -mapped | -paired | -reference | -tag <TAG> > ");
// input/output options: -in falls back to Options::StandardIn() when not given
512 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
513 Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputFilename, m_settings->InputFilename, IO_Opts, Options::StandardIn());
514 Options::AddValueOption("-stub", "filename stub", "prefix stub for output BAM files (default behavior is to use input filename, without .bam extension, as stub). If input is stdin and no stub provided, a timestamp is generated as the stub.", "", m_settings->HasCustomOutputStub, m_settings->CustomOutputStub, IO_Opts);
// split-mode options: each one sets the matching IsSplitting* flag in the settings
516 OptionGroup* SplitOpts = Options::CreateOptionGroup("Split Options");
517 Options::AddOption("-mapped", "split mapped/unmapped alignments", m_settings->IsSplittingMapped, SplitOpts);
518 Options::AddOption("-paired", "split single-end/paired-end alignments", m_settings->IsSplittingPaired, SplitOpts);
519 Options::AddOption("-reference", "split alignments by reference", m_settings->IsSplittingReference, SplitOpts);
520 Options::AddValueOption("-tag", "tag name", "splits alignments based on all values of TAG encountered (i.e. -tag RG creates a BAM file for each read group in original BAM file)", "",
521 m_settings->IsSplittingTag, m_settings->TagToSplit, SplitOpts);
// Destructor.
// NOTE(review): the body is not present in this listing. m_settings is allocated with
// 'new' in the constructor and m_impl with 'new' in Run(), so presumably both are
// released here - confirm against the full file.
524 SplitTool::~SplitTool(void) {
// Shows the tool's help by delegating to Options::DisplayHelp().
// NOTE(review): the return statement is not present in this listing.
533 int SplitTool::Help(void) {
534 Options::DisplayHelp();
538 int SplitTool::Run(int argc, char* argv[]) {
540 // parse command line arguments
541 Options::Parse(argc, argv, 1);
543 // initialize SplitTool with settings
544 m_impl = new SplitToolPrivate(m_settings);
546 // run SplitTool, return success/fail