1 // ***************************************************************************
2 // bamtools_split.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 24 July 2013 (DB)
6 // ---------------------------------------------------------------------------
7 // Splits a BAM file on user-specified property, creating a new BAM output
8 // file for each value found
9 // ***************************************************************************
11 #include "bamtools_split.h"
13 #include <api/BamConstants.h>
14 #include <api/BamReader.h>
15 #include <api/BamWriter.h>
16 #include <utils/bamtools_options.h>
17 #include <utils/bamtools_variant.h>
18 using namespace BamTools;
31 static const string SPLIT_MAPPED_TOKEN = ".MAPPED";
32 static const string SPLIT_UNMAPPED_TOKEN = ".UNMAPPED";
33 static const string SPLIT_PAIRED_TOKEN = ".PAIRED_END";
34 static const string SPLIT_SINGLE_TOKEN = ".SINGLE_END";
35 static const string SPLIT_REFERENCE_TOKEN = ".REF_";
36 static const string SPLIT_TAG_TOKEN = ".TAG_";
38 string GetTimestampString(void) {
40 // get human readable timestamp
43 stringstream timeStream("");
44 timeStream << ctime(¤tTime);
46 // convert whitespace to '_'
47 string timeString = timeStream.str();
48 size_t found = timeString.find(" ");
49 while (found != string::npos) {
50 timeString.replace(found, 1, "_");
51 found = timeString.find(" ", found+1);
56 // remove copy of filename without extension
57 // (so /path/to/file.txt becomes /path/to/file )
58 string RemoveFilenameExtension(const string& filename) {
59 size_t found = filename.rfind(".");
60 return filename.substr(0, found);
63 } // namespace BamTools
65 // ---------------------------------------------
66 // SplitSettings implementation
68 struct SplitTool::SplitSettings {
71 bool HasInputFilename;
72 bool HasCustomOutputStub;
73 bool HasCustomRefPrefix;
74 bool HasCustomTagPrefix;
75 bool IsSplittingMapped;
76 bool IsSplittingPaired;
77 bool IsSplittingReference;
81 string CustomOutputStub;
82 string CustomRefPrefix;
83 string CustomTagPrefix;
89 : HasInputFilename(false)
90 , HasCustomOutputStub(false)
91 , HasCustomRefPrefix(false)
92 , HasCustomTagPrefix(false)
93 , IsSplittingMapped(false)
94 , IsSplittingPaired(false)
95 , IsSplittingReference(false)
96 , IsSplittingTag(false)
97 , CustomOutputStub("")
100 , InputFilename(Options::StandardIn())
105 // ---------------------------------------------
106 // SplitToolPrivate declaration
108 class SplitTool::SplitToolPrivate {
112 SplitToolPrivate(SplitTool::SplitSettings* settings)
113 : m_settings(settings)
116 ~SplitToolPrivate(void) {
120 // 'public' interface
126 // close & delete BamWriters in map
128 void CloseWriters(map<T, BamWriter*>& writers);
129 // calculate output stub based on IO args given
130 void DetermineOutputFilenameStub(void);
131 // open our BamReader
132 bool OpenReader(void);
133 // split alignments in BAM file based on isMapped property
134 bool SplitMapped(void);
135 // split alignments in BAM file based on isPaired property
136 bool SplitPaired(void);
137 // split alignments in BAM file based on refID property
138 bool SplitReference(void);
139 // finds first alignment and calls corresponding SplitTagImpl<>
140 // depending on tag type
142 // templated split tag implementation
143 // handle the various types that are possible for tags
145 bool SplitTagImpl(BamAlignment& al);
149 SplitTool::SplitSettings* m_settings;
150 string m_outputFilenameStub;
153 RefVector m_references;
156 void SplitTool::SplitToolPrivate::DetermineOutputFilenameStub(void) {
158 // if user supplied output filename stub, use that
159 if ( m_settings->HasCustomOutputStub )
160 m_outputFilenameStub = m_settings->CustomOutputStub;
162 // else if user supplied input BAM filename, use that (minus ".bam" extension) as stub
163 else if ( m_settings->HasInputFilename )
164 m_outputFilenameStub = RemoveFilenameExtension(m_settings->InputFilename);
166 // otherwise, user did not specify -stub, and input is coming from STDIN
167 // generate stub from timestamp
168 else m_outputFilenameStub = GetTimestampString();
171 bool SplitTool::SplitToolPrivate::OpenReader(void) {
173 // attempt to open BAM file
174 if ( !m_reader.Open(m_settings->InputFilename) ) {
175 cerr << "bamtools split ERROR: could not open BAM file: " << m_settings->InputFilename << endl;
179 // save file 'metadata' & return success
180 m_header = m_reader.GetHeaderText();
181 m_references = m_reader.GetReferenceData();
185 bool SplitTool::SplitToolPrivate::Run(void) {
187 // determine output stub
188 DetermineOutputFilenameStub();
194 // determine split type from settings
195 if ( m_settings->IsSplittingMapped ) return SplitMapped();
196 if ( m_settings->IsSplittingPaired ) return SplitPaired();
197 if ( m_settings->IsSplittingReference ) return SplitReference();
198 if ( m_settings->IsSplittingTag ) return SplitTag();
200 // if we get here, no property was specified
201 cerr << "bamtools split ERROR: no property given to split on... " << endl
202 << "Please use -mapped, -paired, -reference, or -tag TAG to specifiy desired split behavior." << endl;
206 bool SplitTool::SplitToolPrivate::SplitMapped(void) {
208 // set up splitting data structure
209 map<bool, BamWriter*> outputFiles;
210 map<bool, BamWriter*>::iterator writerIter;
212 // iterate through alignments
215 bool isCurrentAlignmentMapped;
216 while ( m_reader.GetNextAlignment(al) ) {
218 // see if bool value exists
219 isCurrentAlignmentMapped = al.IsMapped();
220 writerIter = outputFiles.find(isCurrentAlignmentMapped);
222 // if no writer associated with this value
223 if ( writerIter == outputFiles.end() ) {
225 // open new BamWriter
226 const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentMapped
228 : SPLIT_UNMAPPED_TOKEN ) + ".bam";
229 writer = new BamWriter;
230 if ( !writer->Open(outputFilename, m_header, m_references) ) {
231 cerr << "bamtools split ERROR: could not open " << outputFilename
232 << " for writing." << endl;
237 outputFiles.insert( make_pair(isCurrentAlignmentMapped, writer) );
240 // else grab corresponding writer
241 else writer = (*writerIter).second;
243 // store alignment in proper BAM output file
245 writer->SaveAlignment(al);
248 // clean up BamWriters
249 CloseWriters(outputFiles);
255 bool SplitTool::SplitToolPrivate::SplitPaired(void) {
257 // set up splitting data structure
258 map<bool, BamWriter*> outputFiles;
259 map<bool, BamWriter*>::iterator writerIter;
261 // iterate through alignments
264 bool isCurrentAlignmentPaired;
265 while ( m_reader.GetNextAlignment(al) ) {
267 // see if bool value exists
268 isCurrentAlignmentPaired = al.IsPaired();
269 writerIter = outputFiles.find(isCurrentAlignmentPaired);
271 // if no writer associated with this value
272 if ( writerIter == outputFiles.end() ) {
274 // open new BamWriter
275 const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentPaired
277 : SPLIT_SINGLE_TOKEN ) + ".bam";
278 writer = new BamWriter;
279 if ( !writer->Open(outputFilename, m_header, m_references) ) {
280 cerr << "bamtool split ERROR: could not open " << outputFilename
281 << " for writing." << endl;
286 outputFiles.insert( make_pair(isCurrentAlignmentPaired, writer) );
289 // else grab corresponding writer
290 else writer = (*writerIter).second;
292 // store alignment in proper BAM output file
294 writer->SaveAlignment(al);
297 // clean up BamWriters
298 CloseWriters(outputFiles);
304 bool SplitTool::SplitToolPrivate::SplitReference(void) {
306 // set up splitting data structure
307 map<int32_t, BamWriter*> outputFiles;
308 map<int32_t, BamWriter*>::iterator writerIter;
310 // determine reference prefix
311 string refPrefix = SPLIT_REFERENCE_TOKEN;
312 if ( m_settings->HasCustomRefPrefix )
313 refPrefix = m_settings->CustomRefPrefix;
315 // make sure prefix starts with '.'
316 const size_t dotFound = refPrefix.find('.');
318 refPrefix = string(".") + refPrefix;
320 // iterate through alignments
323 int32_t currentRefId;
324 while ( m_reader.GetNextAlignment(al) ) {
326 // see if bool value exists
327 currentRefId = al.RefID;
328 writerIter = outputFiles.find(currentRefId);
330 // if no writer associated with this value
331 if ( writerIter == outputFiles.end() ) {
333 // fetch reference name for ID
335 if ( currentRefId == -1 )
336 refName = "unmapped";
338 refName = m_references.at(currentRefId).RefName;
340 // construct new output filename
341 const string outputFilename = m_outputFilenameStub + refPrefix + refName + ".bam";
343 // open new BamWriter
344 writer = new BamWriter;
345 if ( !writer->Open(outputFilename, m_header, m_references) ) {
346 cerr << "bamtools split ERROR: could not open " << outputFilename
347 << " for writing." << endl;
352 outputFiles.insert( make_pair(currentRefId, writer) );
355 // else grab corresponding writer
356 else writer = (*writerIter).second;
358 // store alignment in proper BAM output file
360 writer->SaveAlignment(al);
363 // clean up BamWriters
364 CloseWriters(outputFiles);
370 // finds first alignment and calls corresponding SplitTagImpl<>() depending on tag type
371 bool SplitTool::SplitToolPrivate::SplitTag(void) {
373 // iterate through alignments, until we hit TAG
375 while ( m_reader.GetNextAlignment(al) ) {
377 // look for tag in this alignment and get tag type
379 if ( !al.GetTagType(m_settings->TagToSplit, tagType) )
382 // request split method based on tag type
383 // pass it the current alignment found
386 case (Constants::BAM_TAG_TYPE_INT8) :
387 case (Constants::BAM_TAG_TYPE_INT16) :
388 case (Constants::BAM_TAG_TYPE_INT32) :
389 return SplitTagImpl<int32_t>(al);
391 case (Constants::BAM_TAG_TYPE_UINT8) :
392 case (Constants::BAM_TAG_TYPE_UINT16) :
393 case (Constants::BAM_TAG_TYPE_UINT32) :
394 return SplitTagImpl<uint32_t>(al);
396 case (Constants::BAM_TAG_TYPE_FLOAT) :
397 return SplitTagImpl<float>(al);
399 case (Constants::BAM_TAG_TYPE_ASCII) :
400 case (Constants::BAM_TAG_TYPE_STRING) :
401 case (Constants::BAM_TAG_TYPE_HEX) :
402 return SplitTagImpl<string>(al);
404 case (Constants::BAM_TAG_TYPE_ARRAY) :
405 cerr << "bamtools split ERROR: array tag types are not supported" << endl;
409 cerr << "bamtools split ERROR: unknown tag type encountered: " << tagType << endl;
414 // tag not found, but that's not an error - return success
418 // --------------------------------------------------------------------------------
419 // template method implementation
420 // *Technical Note* - use of template methods declared & defined in ".cpp" file
421 // goes against normal practices, but works here because these
422 // are purely internal (no one can call from outside this file)
424 // close BamWriters & delete pointers
426 void SplitTool::SplitToolPrivate::CloseWriters(map<T, BamWriter*>& writers) {
428 typedef map<T, BamWriter*> WriterMap;
429 typedef typename WriterMap::iterator WriterMapIterator;
431 // iterate over writers
432 WriterMapIterator writerIter = writers.begin();
433 WriterMapIterator writerEnd = writers.end();
434 for ( ; writerIter != writerEnd; ++writerIter ) {
435 BamWriter* writer = (*writerIter).second;
436 if ( writer == 0 ) continue;
446 // clear the container (destroying the items doesn't remove them)
450 // handle the various types that are possible for tags
452 bool SplitTool::SplitToolPrivate::SplitTagImpl(BamAlignment& al) {
454 typedef T TagValueType;
455 typedef map<TagValueType, BamWriter*> WriterMap;
456 typedef typename WriterMap::iterator WriterMapIterator;
458 // set up splitting data structure
459 WriterMap outputFiles;
460 WriterMapIterator writerIter;
462 // determine tag prefix
463 string tagPrefix = SPLIT_TAG_TOKEN;
464 if ( m_settings->HasCustomTagPrefix )
465 tagPrefix = m_settings->CustomTagPrefix;
467 // make sure prefix starts with '.'
468 const size_t dotFound = tagPrefix.find('.');
470 tagPrefix = string(".") + tagPrefix;
473 const string tag = m_settings->TagToSplit;
475 stringstream outputFilenameStream("");
476 TagValueType currentValue;
478 // retrieve first alignment tag value
479 if ( al.GetTag(tag, currentValue) ) {
481 // open new BamWriter, save first alignment
482 outputFilenameStream << m_outputFilenameStub << tagPrefix << tag << "_" << currentValue << ".bam";
483 writer = new BamWriter;
484 if ( !writer->Open(outputFilenameStream.str(), m_header, m_references) ) {
485 cerr << "bamtools split ERROR: could not open " << outputFilenameStream.str()
486 << " for writing." << endl;
489 writer->SaveAlignment(al);
492 outputFiles.insert( make_pair(currentValue, writer) );
495 outputFilenameStream.str("");
498 // iterate through remaining alignments
499 while ( m_reader.GetNextAlignment(al) ) {
501 // skip if this alignment doesn't have TAG
502 if ( !al.GetTag(tag, currentValue) ) continue;
504 // look up tag value in map
505 writerIter = outputFiles.find(currentValue);
507 // if no writer associated with this value
508 if ( writerIter == outputFiles.end() ) {
510 // open new BamWriter
511 outputFilenameStream << m_outputFilenameStub << tagPrefix << tag << "_" << currentValue << ".bam";
512 writer = new BamWriter;
513 if ( !writer->Open(outputFilenameStream.str(), m_header, m_references) ) {
514 cerr << "bamtool split ERROR: could not open " << outputFilenameStream.str()
515 << " for writing." << endl;
520 outputFiles.insert( make_pair(currentValue, writer) );
523 outputFilenameStream.str("");
526 // else grab corresponding writer
527 else writer = (*writerIter).second;
529 // store alignment in proper BAM output file
531 writer->SaveAlignment(al);
534 // clean up BamWriters
535 CloseWriters(outputFiles);
541 // ---------------------------------------------
542 // SplitTool implementation
544 SplitTool::SplitTool(void)
546 , m_settings(new SplitSettings)
549 // set program details
550 const string name = "bamtools split";
551 const string description = "splits a BAM file on user-specified property, creating a new BAM output file for each value found";
552 const string args = "[-in <filename>] [-stub <filename stub>] < -mapped | -paired | -reference [-refPrefix <prefix>] | -tag <TAG> > ";
553 Options::SetProgramInfo(name, description, args);
556 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
557 Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputFilename, m_settings->InputFilename, IO_Opts, Options::StandardIn());
558 Options::AddValueOption("-refPrefix", "string", "custom prefix for splitting by references. Currently files end with REF_<refName>.bam. This option allows you to replace \"REF_\" with a prefix of your choosing.", "",
559 m_settings->HasCustomRefPrefix, m_settings->CustomRefPrefix, IO_Opts);
560 Options::AddValueOption("-tagPrefix", "string", "custom prefix for splitting by tags. Current files end with TAG_<tagname>_<tagvalue>.bam. This option allows you to replace \"TAG_\" with a prefix of your choosing.", "",
561 m_settings->HasCustomTagPrefix, m_settings->CustomTagPrefix, IO_Opts);
562 Options::AddValueOption("-stub", "filename stub", "prefix stub for output BAM files (default behavior is to use input filename, without .bam extension, as stub). If input is stdin and no stub provided, a timestamp is generated as the stub.", "",
563 m_settings->HasCustomOutputStub, m_settings->CustomOutputStub, IO_Opts);
565 OptionGroup* SplitOpts = Options::CreateOptionGroup("Split Options");
566 Options::AddOption("-mapped", "split mapped/unmapped alignments", m_settings->IsSplittingMapped, SplitOpts);
567 Options::AddOption("-paired", "split single-end/paired-end alignments", m_settings->IsSplittingPaired, SplitOpts);
568 Options::AddOption("-reference", "split alignments by reference", m_settings->IsSplittingReference, SplitOpts);
569 Options::AddValueOption("-tag", "tag name", "splits alignments based on all values of TAG encountered (i.e. -tag RG creates a BAM file for each read group in original BAM file)", "",
570 m_settings->IsSplittingTag, m_settings->TagToSplit, SplitOpts);
573 SplitTool::~SplitTool(void) {
582 int SplitTool::Help(void) {
583 Options::DisplayHelp();
587 int SplitTool::Run(int argc, char* argv[]) {
589 // parse command line arguments
590 Options::Parse(argc, argv, 1);
592 // initialize SplitTool with settings
593 m_impl = new SplitToolPrivate(m_settings);
595 // run SplitTool, return success/fail