1 // ***************************************************************************
2 // bamtools_split.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 7 April 2011 (DB)
7 // ---------------------------------------------------------------------------
8 // Splits a BAM file on user-specified property, creating a new BAM output
9 // file for each value found
10 // ***************************************************************************
12 #include "bamtools_split.h"
14 #include <api/BamConstants.h>
15 #include <api/BamReader.h>
16 #include <api/BamWriter.h>
17 #include <utils/bamtools_options.h>
18 #include <utils/bamtools_variant.h>
19 using namespace BamTools;
32 static const string SPLIT_MAPPED_TOKEN = ".MAPPED";
33 static const string SPLIT_UNMAPPED_TOKEN = ".UNMAPPED";
34 static const string SPLIT_PAIRED_TOKEN = ".PAIRED_END";
35 static const string SPLIT_SINGLE_TOKEN = ".SINGLE_END";
36 static const string SPLIT_REFERENCE_TOKEN = ".REF_";
38 string GetTimestampString(void) {
40 // get human readable timestamp
43 stringstream timeStream("");
44 timeStream << ctime(¤tTime);
46 // convert whitespace to '_'
47 string timeString = timeStream.str();
48 size_t found = timeString.find(" ");
49 while (found != string::npos) {
50 timeString.replace(found, 1, "_");
51 found = timeString.find(" ", found+1);
56 // remove copy of filename without extension
57 // (so /path/to/file.txt becomes /path/to/file )
58 string RemoveFilenameExtension(const string& filename) {
59 size_t found = filename.rfind(".");
60 return filename.substr(0, found);
63 } // namespace BamTools
65 // ---------------------------------------------
66 // SplitSettings implementation
68 struct SplitTool::SplitSettings {
71 bool HasInputFilename;
72 bool HasCustomOutputStub;
73 bool IsSplittingMapped;
74 bool IsSplittingPaired;
75 bool IsSplittingReference;
79 string CustomOutputStub;
85 : HasInputFilename(false)
86 , HasCustomOutputStub(false)
87 , IsSplittingMapped(false)
88 , IsSplittingPaired(false)
89 , IsSplittingReference(false)
90 , IsSplittingTag(false)
91 , CustomOutputStub("")
92 , InputFilename(Options::StandardIn())
97 // ---------------------------------------------
98 // SplitToolPrivate declaration
100 class SplitTool::SplitToolPrivate {
104 SplitToolPrivate(SplitTool::SplitSettings* settings)
105 : m_settings(settings)
108 ~SplitToolPrivate(void) {
112 // 'public' interface
118 // close & delete BamWriters in map
120 void CloseWriters(map<T, BamWriter*>& writers);
121 // calculate output stub based on IO args given
122 void DetermineOutputFilenameStub(void);
123 // open our BamReader
124 bool OpenReader(void);
125 // split alignments in BAM file based on isMapped property
126 bool SplitMapped(void);
127 // split alignments in BAM file based on isPaired property
128 bool SplitPaired(void);
129 // split alignments in BAM file based on refID property
130 bool SplitReference(void);
131 // finds first alignment and calls corresponding SplitTagImpl<>
132 // depending on tag type
134 // templated split tag implementation
135 // handle the various types that are possible for tags
137 bool SplitTagImpl(BamAlignment& al);
141 SplitTool::SplitSettings* m_settings;
142 string m_outputFilenameStub;
145 RefVector m_references;
148 void SplitTool::SplitToolPrivate::DetermineOutputFilenameStub(void) {
150 // if user supplied output filename stub, use that
151 if ( m_settings->HasCustomOutputStub )
152 m_outputFilenameStub = m_settings->CustomOutputStub;
154 // else if user supplied input BAM filename, use that (minus ".bam" extension) as stub
155 else if ( m_settings->HasInputFilename )
156 m_outputFilenameStub = RemoveFilenameExtension(m_settings->InputFilename);
158 // otherwise, user did not specify -stub, and input is coming from STDIN
159 // generate stub from timestamp
160 else m_outputFilenameStub = GetTimestampString();
163 bool SplitTool::SplitToolPrivate::OpenReader(void) {
165 // attempt to open BAM file
166 if ( !m_reader.Open(m_settings->InputFilename) ) {
167 cerr << "bamtools split ERROR: could not open BAM file: " << m_settings->InputFilename << endl;
171 // save file 'metadata' & return success
172 m_header = m_reader.GetHeaderText();
173 m_references = m_reader.GetReferenceData();
177 bool SplitTool::SplitToolPrivate::Run(void) {
179 // determine output stub
180 DetermineOutputFilenameStub();
186 // determine split type from settings
187 if ( m_settings->IsSplittingMapped ) return SplitMapped();
188 if ( m_settings->IsSplittingPaired ) return SplitPaired();
189 if ( m_settings->IsSplittingReference ) return SplitReference();
190 if ( m_settings->IsSplittingTag ) return SplitTag();
192 // if we get here, no property was specified
193 cerr << "bamtools split ERROR: no property given to split on... " << endl
194 << "Please use -mapped, -paired, -reference, or -tag TAG to specifiy desired split behavior." << endl;
198 bool SplitTool::SplitToolPrivate::SplitMapped(void) {
200 // set up splitting data structure
201 map<bool, BamWriter*> outputFiles;
202 map<bool, BamWriter*>::iterator writerIter;
204 // iterate through alignments
207 bool isCurrentAlignmentMapped;
208 while ( m_reader.GetNextAlignment(al) ) {
210 // see if bool value exists
211 isCurrentAlignmentMapped = al.IsMapped();
212 writerIter = outputFiles.find(isCurrentAlignmentMapped);
214 // if no writer associated with this value
215 if ( writerIter == outputFiles.end() ) {
217 // open new BamWriter
218 const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentMapped
220 : SPLIT_UNMAPPED_TOKEN ) + ".bam";
221 writer = new BamWriter;
222 if ( !writer->Open(outputFilename, m_header, m_references) ) {
223 cerr << "bamtools split ERROR: could not open " << outputFilename
224 << " for writing." << endl;
229 outputFiles.insert( make_pair(isCurrentAlignmentMapped, writer) );
232 // else grab corresponding writer
233 else writer = (*writerIter).second;
235 // store alignment in proper BAM output file
237 writer->SaveAlignment(al);
240 // clean up BamWriters
241 CloseWriters(outputFiles);
247 bool SplitTool::SplitToolPrivate::SplitPaired(void) {
249 // set up splitting data structure
250 map<bool, BamWriter*> outputFiles;
251 map<bool, BamWriter*>::iterator writerIter;
253 // iterate through alignments
256 bool isCurrentAlignmentPaired;
257 while ( m_reader.GetNextAlignment(al) ) {
259 // see if bool value exists
260 isCurrentAlignmentPaired = al.IsPaired();
261 writerIter = outputFiles.find(isCurrentAlignmentPaired);
263 // if no writer associated with this value
264 if ( writerIter == outputFiles.end() ) {
266 // open new BamWriter
267 const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentPaired
269 : SPLIT_SINGLE_TOKEN ) + ".bam";
270 writer = new BamWriter;
271 if ( !writer->Open(outputFilename, m_header, m_references) ) {
272 cerr << "bamtool split ERROR: could not open " << outputFilename
273 << " for writing." << endl;
278 outputFiles.insert( make_pair(isCurrentAlignmentPaired, writer) );
281 // else grab corresponding writer
282 else writer = (*writerIter).second;
284 // store alignment in proper BAM output file
286 writer->SaveAlignment(al);
289 // clean up BamWriters
290 CloseWriters(outputFiles);
296 bool SplitTool::SplitToolPrivate::SplitReference(void) {
298 // set up splitting data structure
299 map<int32_t, BamWriter*> outputFiles;
300 map<int32_t, BamWriter*>::iterator writerIter;
302 // iterate through alignments
305 int32_t currentRefId;
306 while ( m_reader.GetNextAlignment(al) ) {
308 // see if bool value exists
309 currentRefId = al.RefID;
310 writerIter = outputFiles.find(currentRefId);
312 // if no writer associated with this value
313 if ( writerIter == outputFiles.end() ) {
315 // open new BamWriter
316 const string refName = m_references.at(currentRefId).RefName;
317 const string outputFilename = m_outputFilenameStub + SPLIT_REFERENCE_TOKEN + refName + ".bam";
318 writer = new BamWriter;
319 if ( !writer->Open(outputFilename, m_header, m_references) ) {
320 cerr << "bamtools split ERROR: could not open " << outputFilename
321 << " for writing." << endl;
326 outputFiles.insert( make_pair(currentRefId, writer) );
329 // else grab corresponding writer
330 else writer = (*writerIter).second;
332 // store alignment in proper BAM output file
334 writer->SaveAlignment(al);
337 // clean up BamWriters
338 CloseWriters(outputFiles);
344 // finds first alignment and calls corresponding SplitTagImpl<>() depending on tag type
345 bool SplitTool::SplitToolPrivate::SplitTag(void) {
347 // iterate through alignments, until we hit TAG
349 while ( m_reader.GetNextAlignment(al) ) {
351 // look for tag in this alignment and get tag type
353 if ( !al.GetTagType(m_settings->TagToSplit, tagType) )
356 // request split method based on tag type
357 // pass it the current alignment found
360 case (Constants::BAM_TAG_TYPE_INT8) :
361 case (Constants::BAM_TAG_TYPE_INT16) :
362 case (Constants::BAM_TAG_TYPE_INT32) :
363 return SplitTagImpl<int32_t>(al);
365 case (Constants::BAM_TAG_TYPE_UINT8) :
366 case (Constants::BAM_TAG_TYPE_UINT16) :
367 case (Constants::BAM_TAG_TYPE_UINT32) :
368 return SplitTagImpl<uint32_t>(al);
370 case (Constants::BAM_TAG_TYPE_FLOAT) :
371 return SplitTagImpl<float>(al);
373 case (Constants::BAM_TAG_TYPE_ASCII) :
374 case (Constants::BAM_TAG_TYPE_STRING) :
375 case (Constants::BAM_TAG_TYPE_HEX) :
376 return SplitTagImpl<string>(al);
379 fprintf(stderr, "bamtools split ERROR: unknown tag type encountered: [%c]\n", tagType);
384 // tag not found, but that's not an error - return success
388 // --------------------------------------------------------------------------------
389 // template method implementation
390 // *Technical Note* - use of template methods declared & defined in ".cpp" file
391 // goes against normal practices, but works here because these
392 // are purely internal (no one can call from outside this file)
394 // close BamWriters & delete pointers
396 void SplitTool::SplitToolPrivate::CloseWriters(map<T, BamWriter*>& writers) {
398 typedef map<T, BamWriter*> WriterMap;
399 typedef typename WriterMap::iterator WriterMapIterator;
401 // iterate over writers
402 WriterMapIterator writerIter = writers.begin();
403 WriterMapIterator writerEnd = writers.end();
404 for ( ; writerIter != writerEnd; ++writerIter ) {
405 BamWriter* writer = (*writerIter).second;
406 if ( writer == 0 ) continue;
416 // clear the container (destroying the items doesn't remove them)
420 // handle the various types that are possible for tags
422 bool SplitTool::SplitToolPrivate::SplitTagImpl(BamAlignment& al) {
424 typedef T TagValueType;
425 typedef map<TagValueType, BamWriter*> WriterMap;
426 typedef typename WriterMap::iterator WriterMapIterator;
428 // set up splitting data structure
429 WriterMap outputFiles;
430 WriterMapIterator writerIter;
433 const string tag = m_settings->TagToSplit;
435 stringstream outputFilenameStream("");
436 TagValueType currentValue;
438 // retrieve first alignment tag value
439 if ( al.GetTag(tag, currentValue) ) {
441 // open new BamWriter, save first alignment
442 outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
443 writer = new BamWriter;
444 if ( !writer->Open(outputFilenameStream.str(), m_header, m_references) ) {
445 cerr << "bamtools split ERROR: could not open " << outputFilenameStream.str()
446 << " for writing." << endl;
449 writer->SaveAlignment(al);
452 outputFiles.insert( make_pair(currentValue, writer) );
455 outputFilenameStream.str("");
458 // iterate through remaining alignments
459 while ( m_reader.GetNextAlignment(al) ) {
461 // skip if this alignment doesn't have TAG
462 if ( !al.GetTag(tag, currentValue) ) continue;
464 // look up tag value in map
465 writerIter = outputFiles.find(currentValue);
467 // if no writer associated with this value
468 if ( writerIter == outputFiles.end() ) {
470 // open new BamWriter
471 outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
472 writer = new BamWriter;
473 if ( !writer->Open(outputFilenameStream.str(), m_header, m_references) ) {
474 cerr << "bamtool split ERROR: could not open " << outputFilenameStream.str()
475 << " for writing." << endl;
480 outputFiles.insert( make_pair(currentValue, writer) );
483 outputFilenameStream.str("");
486 // else grab corresponding writer
487 else writer = (*writerIter).second;
489 // store alignment in proper BAM output file
491 writer->SaveAlignment(al);
494 // clean up BamWriters
495 CloseWriters(outputFiles);
501 // ---------------------------------------------
502 // SplitTool implementation
504 SplitTool::SplitTool(void)
506 , m_settings(new SplitSettings)
509 // set program details
510 Options::SetProgramInfo("bamtools split", "splits a BAM file on user-specified property, creating a new BAM output file for each value found", "[-in <filename>] [-stub <filename stub>] < -mapped | -paired | -reference | -tag <TAG> > ");
513 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
514 Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputFilename, m_settings->InputFilename, IO_Opts, Options::StandardIn());
515 Options::AddValueOption("-stub", "filename stub", "prefix stub for output BAM files (default behavior is to use input filename, without .bam extension, as stub). If input is stdin and no stub provided, a timestamp is generated as the stub.", "", m_settings->HasCustomOutputStub, m_settings->CustomOutputStub, IO_Opts);
517 OptionGroup* SplitOpts = Options::CreateOptionGroup("Split Options");
518 Options::AddOption("-mapped", "split mapped/unmapped alignments", m_settings->IsSplittingMapped, SplitOpts);
519 Options::AddOption("-paired", "split single-end/paired-end alignments", m_settings->IsSplittingPaired, SplitOpts);
520 Options::AddOption("-reference", "split alignments by reference", m_settings->IsSplittingReference, SplitOpts);
521 Options::AddValueOption("-tag", "tag name", "splits alignments based on all values of TAG encountered (i.e. -tag RG creates a BAM file for each read group in original BAM file)", "",
522 m_settings->IsSplittingTag, m_settings->TagToSplit, SplitOpts);
525 SplitTool::~SplitTool(void) {
534 int SplitTool::Help(void) {
535 Options::DisplayHelp();
539 int SplitTool::Run(int argc, char* argv[]) {
541 // parse command line arguments
542 Options::Parse(argc, argv, 1);
544 // initialize SplitTool with settings
545 m_impl = new SplitToolPrivate(m_settings);
547 // run SplitTool, return success/fail