1 // ***************************************************************************
2 // bamtools_split.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 8 December 2011 (DB)
6 // ---------------------------------------------------------------------------
7 // Splits a BAM file on user-specified property, creating a new BAM output
8 // file for each value found
9 // ***************************************************************************
11 #include "bamtools_split.h"
13 #include <api/BamConstants.h>
14 #include <api/BamReader.h>
15 #include <api/BamWriter.h>
16 #include <utils/bamtools_options.h>
17 #include <utils/bamtools_variant.h>
18 using namespace BamTools;
31 static const string SPLIT_MAPPED_TOKEN = ".MAPPED";
32 static const string SPLIT_UNMAPPED_TOKEN = ".UNMAPPED";
33 static const string SPLIT_PAIRED_TOKEN = ".PAIRED_END";
34 static const string SPLIT_SINGLE_TOKEN = ".SINGLE_END";
35 static const string SPLIT_REFERENCE_TOKEN = ".REF_";
37 string GetTimestampString(void) {
39 // get human readable timestamp
42 stringstream timeStream("");
43 timeStream << ctime(¤tTime);
45 // convert whitespace to '_'
46 string timeString = timeStream.str();
47 size_t found = timeString.find(" ");
48 while (found != string::npos) {
49 timeString.replace(found, 1, "_");
50 found = timeString.find(" ", found+1);
55 // remove copy of filename without extension
56 // (so /path/to/file.txt becomes /path/to/file )
57 string RemoveFilenameExtension(const string& filename) {
58 size_t found = filename.rfind(".");
59 return filename.substr(0, found);
62 } // namespace BamTools
64 // ---------------------------------------------
65 // SplitSettings implementation
67 struct SplitTool::SplitSettings {
70 bool HasInputFilename;
71 bool HasCustomOutputStub;
72 bool HasCustomRefPrefix;
73 bool IsSplittingMapped;
74 bool IsSplittingPaired;
75 bool IsSplittingReference;
79 string CustomOutputStub;
80 string CustomRefPrefix;
86 : HasInputFilename(false)
87 , HasCustomOutputStub(false)
88 , HasCustomRefPrefix(false)
89 , IsSplittingMapped(false)
90 , IsSplittingPaired(false)
91 , IsSplittingReference(false)
92 , IsSplittingTag(false)
93 , CustomOutputStub("")
95 , InputFilename(Options::StandardIn())
100 // ---------------------------------------------
101 // SplitToolPrivate declaration
103 class SplitTool::SplitToolPrivate {
107 SplitToolPrivate(SplitTool::SplitSettings* settings)
108 : m_settings(settings)
111 ~SplitToolPrivate(void) {
115 // 'public' interface
121 // close & delete BamWriters in map
123 void CloseWriters(map<T, BamWriter*>& writers);
124 // calculate output stub based on IO args given
125 void DetermineOutputFilenameStub(void);
126 // open our BamReader
127 bool OpenReader(void);
128 // split alignments in BAM file based on isMapped property
129 bool SplitMapped(void);
130 // split alignments in BAM file based on isPaired property
131 bool SplitPaired(void);
132 // split alignments in BAM file based on refID property
133 bool SplitReference(void);
134 // finds first alignment and calls corresponding SplitTagImpl<>
135 // depending on tag type
137 // templated split tag implementation
138 // handle the various types that are possible for tags
140 bool SplitTagImpl(BamAlignment& al);
144 SplitTool::SplitSettings* m_settings;
145 string m_outputFilenameStub;
148 RefVector m_references;
151 void SplitTool::SplitToolPrivate::DetermineOutputFilenameStub(void) {
153 // if user supplied output filename stub, use that
154 if ( m_settings->HasCustomOutputStub )
155 m_outputFilenameStub = m_settings->CustomOutputStub;
157 // else if user supplied input BAM filename, use that (minus ".bam" extension) as stub
158 else if ( m_settings->HasInputFilename )
159 m_outputFilenameStub = RemoveFilenameExtension(m_settings->InputFilename);
161 // otherwise, user did not specify -stub, and input is coming from STDIN
162 // generate stub from timestamp
163 else m_outputFilenameStub = GetTimestampString();
166 bool SplitTool::SplitToolPrivate::OpenReader(void) {
168 // attempt to open BAM file
169 if ( !m_reader.Open(m_settings->InputFilename) ) {
170 cerr << "bamtools split ERROR: could not open BAM file: " << m_settings->InputFilename << endl;
174 // save file 'metadata' & return success
175 m_header = m_reader.GetHeaderText();
176 m_references = m_reader.GetReferenceData();
180 bool SplitTool::SplitToolPrivate::Run(void) {
182 // determine output stub
183 DetermineOutputFilenameStub();
189 // determine split type from settings
190 if ( m_settings->IsSplittingMapped ) return SplitMapped();
191 if ( m_settings->IsSplittingPaired ) return SplitPaired();
192 if ( m_settings->IsSplittingReference ) return SplitReference();
193 if ( m_settings->IsSplittingTag ) return SplitTag();
195 // if we get here, no property was specified
196 cerr << "bamtools split ERROR: no property given to split on... " << endl
197 << "Please use -mapped, -paired, -reference, or -tag TAG to specifiy desired split behavior." << endl;
201 bool SplitTool::SplitToolPrivate::SplitMapped(void) {
203 // set up splitting data structure
204 map<bool, BamWriter*> outputFiles;
205 map<bool, BamWriter*>::iterator writerIter;
207 // iterate through alignments
210 bool isCurrentAlignmentMapped;
211 while ( m_reader.GetNextAlignment(al) ) {
213 // see if bool value exists
214 isCurrentAlignmentMapped = al.IsMapped();
215 writerIter = outputFiles.find(isCurrentAlignmentMapped);
217 // if no writer associated with this value
218 if ( writerIter == outputFiles.end() ) {
220 // open new BamWriter
221 const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentMapped
223 : SPLIT_UNMAPPED_TOKEN ) + ".bam";
224 writer = new BamWriter;
225 if ( !writer->Open(outputFilename, m_header, m_references) ) {
226 cerr << "bamtools split ERROR: could not open " << outputFilename
227 << " for writing." << endl;
232 outputFiles.insert( make_pair(isCurrentAlignmentMapped, writer) );
235 // else grab corresponding writer
236 else writer = (*writerIter).second;
238 // store alignment in proper BAM output file
240 writer->SaveAlignment(al);
243 // clean up BamWriters
244 CloseWriters(outputFiles);
250 bool SplitTool::SplitToolPrivate::SplitPaired(void) {
252 // set up splitting data structure
253 map<bool, BamWriter*> outputFiles;
254 map<bool, BamWriter*>::iterator writerIter;
256 // iterate through alignments
259 bool isCurrentAlignmentPaired;
260 while ( m_reader.GetNextAlignment(al) ) {
262 // see if bool value exists
263 isCurrentAlignmentPaired = al.IsPaired();
264 writerIter = outputFiles.find(isCurrentAlignmentPaired);
266 // if no writer associated with this value
267 if ( writerIter == outputFiles.end() ) {
269 // open new BamWriter
270 const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentPaired
272 : SPLIT_SINGLE_TOKEN ) + ".bam";
273 writer = new BamWriter;
274 if ( !writer->Open(outputFilename, m_header, m_references) ) {
275 cerr << "bamtool split ERROR: could not open " << outputFilename
276 << " for writing." << endl;
281 outputFiles.insert( make_pair(isCurrentAlignmentPaired, writer) );
284 // else grab corresponding writer
285 else writer = (*writerIter).second;
287 // store alignment in proper BAM output file
289 writer->SaveAlignment(al);
292 // clean up BamWriters
293 CloseWriters(outputFiles);
299 bool SplitTool::SplitToolPrivate::SplitReference(void) {
301 // set up splitting data structure
302 map<int32_t, BamWriter*> outputFiles;
303 map<int32_t, BamWriter*>::iterator writerIter;
305 // determine reference prefix
306 string refPrefix = SPLIT_REFERENCE_TOKEN;
307 if ( m_settings->HasCustomRefPrefix )
308 refPrefix = m_settings->CustomRefPrefix;
310 // make sure prefix starts with '.'
311 const size_t dotFound = refPrefix.find('.');
313 refPrefix = string(".") + refPrefix;
315 // iterate through alignments
318 int32_t currentRefId;
319 while ( m_reader.GetNextAlignment(al) ) {
321 // see if bool value exists
322 currentRefId = al.RefID;
323 writerIter = outputFiles.find(currentRefId);
325 // if no writer associated with this value
326 if ( writerIter == outputFiles.end() ) {
328 // fetch reference name for ID
330 if ( currentRefId == -1 )
331 refName = "unmapped";
333 refName = m_references.at(currentRefId).RefName;
335 // construct new output filename
336 const string outputFilename = m_outputFilenameStub + refPrefix + refName + ".bam";
338 // open new BamWriter
339 writer = new BamWriter;
340 if ( !writer->Open(outputFilename, m_header, m_references) ) {
341 cerr << "bamtools split ERROR: could not open " << outputFilename
342 << " for writing." << endl;
347 outputFiles.insert( make_pair(currentRefId, writer) );
350 // else grab corresponding writer
351 else writer = (*writerIter).second;
353 // store alignment in proper BAM output file
355 writer->SaveAlignment(al);
358 // clean up BamWriters
359 CloseWriters(outputFiles);
365 // finds first alignment and calls corresponding SplitTagImpl<>() depending on tag type
366 bool SplitTool::SplitToolPrivate::SplitTag(void) {
368 // iterate through alignments, until we hit TAG
370 while ( m_reader.GetNextAlignment(al) ) {
372 // look for tag in this alignment and get tag type
374 if ( !al.GetTagType(m_settings->TagToSplit, tagType) )
377 // request split method based on tag type
378 // pass it the current alignment found
381 case (Constants::BAM_TAG_TYPE_INT8) :
382 case (Constants::BAM_TAG_TYPE_INT16) :
383 case (Constants::BAM_TAG_TYPE_INT32) :
384 return SplitTagImpl<int32_t>(al);
386 case (Constants::BAM_TAG_TYPE_UINT8) :
387 case (Constants::BAM_TAG_TYPE_UINT16) :
388 case (Constants::BAM_TAG_TYPE_UINT32) :
389 return SplitTagImpl<uint32_t>(al);
391 case (Constants::BAM_TAG_TYPE_FLOAT) :
392 return SplitTagImpl<float>(al);
394 case (Constants::BAM_TAG_TYPE_ASCII) :
395 case (Constants::BAM_TAG_TYPE_STRING) :
396 case (Constants::BAM_TAG_TYPE_HEX) :
397 return SplitTagImpl<string>(al);
399 case (Constants::BAM_TAG_TYPE_ARRAY) :
400 cerr << "bamtools split ERROR: array tag types are not supported" << endl;
404 cerr << "bamtools split ERROR: unknown tag type encountered: " << tagType << endl;
409 // tag not found, but that's not an error - return success
413 // --------------------------------------------------------------------------------
414 // template method implementation
415 // *Technical Note* - use of template methods declared & defined in ".cpp" file
416 // goes against normal practices, but works here because these
417 // are purely internal (no one can call from outside this file)
419 // close BamWriters & delete pointers
421 void SplitTool::SplitToolPrivate::CloseWriters(map<T, BamWriter*>& writers) {
423 typedef map<T, BamWriter*> WriterMap;
424 typedef typename WriterMap::iterator WriterMapIterator;
426 // iterate over writers
427 WriterMapIterator writerIter = writers.begin();
428 WriterMapIterator writerEnd = writers.end();
429 for ( ; writerIter != writerEnd; ++writerIter ) {
430 BamWriter* writer = (*writerIter).second;
431 if ( writer == 0 ) continue;
441 // clear the container (destroying the items doesn't remove them)
445 // handle the various types that are possible for tags
447 bool SplitTool::SplitToolPrivate::SplitTagImpl(BamAlignment& al) {
449 typedef T TagValueType;
450 typedef map<TagValueType, BamWriter*> WriterMap;
451 typedef typename WriterMap::iterator WriterMapIterator;
453 // set up splitting data structure
454 WriterMap outputFiles;
455 WriterMapIterator writerIter;
458 const string tag = m_settings->TagToSplit;
460 stringstream outputFilenameStream("");
461 TagValueType currentValue;
463 // retrieve first alignment tag value
464 if ( al.GetTag(tag, currentValue) ) {
466 // open new BamWriter, save first alignment
467 outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
468 writer = new BamWriter;
469 if ( !writer->Open(outputFilenameStream.str(), m_header, m_references) ) {
470 cerr << "bamtools split ERROR: could not open " << outputFilenameStream.str()
471 << " for writing." << endl;
474 writer->SaveAlignment(al);
477 outputFiles.insert( make_pair(currentValue, writer) );
480 outputFilenameStream.str("");
483 // iterate through remaining alignments
484 while ( m_reader.GetNextAlignment(al) ) {
486 // skip if this alignment doesn't have TAG
487 if ( !al.GetTag(tag, currentValue) ) continue;
489 // look up tag value in map
490 writerIter = outputFiles.find(currentValue);
492 // if no writer associated with this value
493 if ( writerIter == outputFiles.end() ) {
495 // open new BamWriter
496 outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
497 writer = new BamWriter;
498 if ( !writer->Open(outputFilenameStream.str(), m_header, m_references) ) {
499 cerr << "bamtool split ERROR: could not open " << outputFilenameStream.str()
500 << " for writing." << endl;
505 outputFiles.insert( make_pair(currentValue, writer) );
508 outputFilenameStream.str("");
511 // else grab corresponding writer
512 else writer = (*writerIter).second;
514 // store alignment in proper BAM output file
516 writer->SaveAlignment(al);
519 // clean up BamWriters
520 CloseWriters(outputFiles);
526 // ---------------------------------------------
527 // SplitTool implementation
529 SplitTool::SplitTool(void)
531 , m_settings(new SplitSettings)
534 // set program details
535 const string name = "bamtools split";
536 const string description = "splits a BAM file on user-specified property, creating a new BAM output file for each value found";
537 const string args = "[-in <filename>] [-stub <filename stub>] < -mapped | -paired | -reference [-refPrefix <prefix>] | -tag <TAG> > ";
538 Options::SetProgramInfo(name, description, args);
541 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
542 Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputFilename, m_settings->InputFilename, IO_Opts, Options::StandardIn());
543 Options::AddValueOption("-refPrefix", "string", "custom prefix for splitting by references. Currently files end with REF_<refName>.bam. This option allows you to replace \"REF_\" with a prefix of your choosing.", "",
544 m_settings->HasCustomRefPrefix, m_settings->CustomRefPrefix, IO_Opts);
545 Options::AddValueOption("-stub", "filename stub", "prefix stub for output BAM files (default behavior is to use input filename, without .bam extension, as stub). If input is stdin and no stub provided, a timestamp is generated as the stub.", "",
546 m_settings->HasCustomOutputStub, m_settings->CustomOutputStub, IO_Opts);
548 OptionGroup* SplitOpts = Options::CreateOptionGroup("Split Options");
549 Options::AddOption("-mapped", "split mapped/unmapped alignments", m_settings->IsSplittingMapped, SplitOpts);
550 Options::AddOption("-paired", "split single-end/paired-end alignments", m_settings->IsSplittingPaired, SplitOpts);
551 Options::AddOption("-reference", "split alignments by reference", m_settings->IsSplittingReference, SplitOpts);
552 Options::AddValueOption("-tag", "tag name", "splits alignments based on all values of TAG encountered (i.e. -tag RG creates a BAM file for each read group in original BAM file)", "",
553 m_settings->IsSplittingTag, m_settings->TagToSplit, SplitOpts);
556 SplitTool::~SplitTool(void) {
565 int SplitTool::Help(void) {
566 Options::DisplayHelp();
570 int SplitTool::Run(int argc, char* argv[]) {
572 // parse command line arguments
573 Options::Parse(argc, argv, 1);
575 // initialize SplitTool with settings
576 m_impl = new SplitToolPrivate(m_settings);
578 // run SplitTool, return success/fail