1 // ***************************************************************************
2 // bamtools_split.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // All rights reserved.
5 // ---------------------------------------------------------------------------
6 // Last modified: 21 March 2011 (DB)
7 // ---------------------------------------------------------------------------
8 // Splits a BAM file on user-specified property, creating a new BAM output
9 // file for each value found.
10 // ***************************************************************************
12 #include "bamtools_split.h"
14 #include <api/BamConstants.h>
15 #include <api/BamReader.h>
16 #include <api/BamWriter.h>
17 #include <utils/bamtools_options.h>
18 #include <utils/bamtools_variant.h>
19 using namespace BamTools;
32 static const string SPLIT_MAPPED_TOKEN = ".MAPPED";
33 static const string SPLIT_UNMAPPED_TOKEN = ".UNMAPPED";
34 static const string SPLIT_PAIRED_TOKEN = ".PAIRED_END";
35 static const string SPLIT_SINGLE_TOKEN = ".SINGLE_END";
36 static const string SPLIT_REFERENCE_TOKEN = ".REF_";
38 string GetTimestampString(void) {
40 // get human readable timestamp
43 stringstream timeStream("");
44 timeStream << ctime(¤tTime);
46 // convert whitespace to '_'
47 string timeString = timeStream.str();
48 size_t found = timeString.find(" ");
49 while (found != string::npos) {
50 timeString.replace(found, 1, "_");
51 found = timeString.find(" ", found+1);
56 // remove copy of filename without extension
57 // (so /path/to/file.txt becomes /path/to/file )
58 string RemoveFilenameExtension(const string& filename) {
59 size_t found = filename.rfind(".");
60 return filename.substr(0, found);
63 } // namespace BamTools
65 // ---------------------------------------------
66 // SplitSettings implementation
68 struct SplitTool::SplitSettings {
71 bool HasInputFilename;
72 bool HasCustomOutputStub;
73 bool IsSplittingMapped;
74 bool IsSplittingPaired;
75 bool IsSplittingReference;
79 string CustomOutputStub;
85 : HasInputFilename(false)
86 , HasCustomOutputStub(false)
87 , IsSplittingMapped(false)
88 , IsSplittingPaired(false)
89 , IsSplittingReference(false)
90 , IsSplittingTag(false)
91 , CustomOutputStub("")
92 , InputFilename(Options::StandardIn())
97 // ---------------------------------------------
98 // SplitToolPrivate declaration
100 class SplitTool::SplitToolPrivate {
104 SplitToolPrivate(SplitTool::SplitSettings* settings);
105 ~SplitToolPrivate(void);
107 // 'public' interface
113 // close & delete BamWriters in map
115 void CloseWriters(map<T, BamWriter*>& writers);
116 // calculate output stub based on IO args given
117 void DetermineOutputFilenameStub(void);
118 // open our BamReader
119 bool OpenReader(void);
120 // split alignments in BAM file based on isMapped property
121 bool SplitMapped(void);
122 // split alignments in BAM file based on isPaired property
123 bool SplitPaired(void);
124 // split alignments in BAM file based on refID property
125 bool SplitReference(void);
126 // finds first alignment and calls corresponding SplitTagImpl<>
127 // depending on tag type
129 // templated split tag implementation
130 // handle the various types that are possible for tags
132 bool SplitTagImpl(BamAlignment& al);
136 SplitTool::SplitSettings* m_settings;
137 string m_outputFilenameStub;
140 RefVector m_references;
144 SplitTool::SplitToolPrivate::SplitToolPrivate(SplitTool::SplitSettings* settings)
145 : m_settings(settings)
149 SplitTool::SplitToolPrivate::~SplitToolPrivate(void) {
153 void SplitTool::SplitToolPrivate::DetermineOutputFilenameStub(void) {
155 // if user supplied output filename stub, use that
156 if ( m_settings->HasCustomOutputStub )
157 m_outputFilenameStub = m_settings->CustomOutputStub;
159 // else if user supplied input BAM filename, use that (minus ".bam" extension) as stub
160 else if ( m_settings->HasInputFilename )
161 m_outputFilenameStub = RemoveFilenameExtension(m_settings->InputFilename);
163 // otherwise, user did not specify -stub, and input is coming from STDIN
164 // generate stub from timestamp
165 else m_outputFilenameStub = GetTimestampString();
168 bool SplitTool::SplitToolPrivate::OpenReader(void) {
170 // attempt to open BAM file
171 if ( !m_reader.Open(m_settings->InputFilename) ) {
172 cerr << "bamtools split ERROR: could not open BAM file: " << m_settings->InputFilename << endl;
176 // save file 'metadata' & return success
177 m_header = m_reader.GetHeaderText();
178 m_references = m_reader.GetReferenceData();
182 bool SplitTool::SplitToolPrivate::Run(void) {
184 // determine output stub
185 DetermineOutputFilenameStub();
191 // determine split type from settings
192 if ( m_settings->IsSplittingMapped ) return SplitMapped();
193 if ( m_settings->IsSplittingPaired ) return SplitPaired();
194 if ( m_settings->IsSplittingReference ) return SplitReference();
195 if ( m_settings->IsSplittingTag ) return SplitTag();
197 // if we get here, no property was specified
198 cerr << "bamtools split ERROR: no property given to split on... " << endl
199 << "Please use -mapped, -paired, -reference, or -tag TAG to specifiy desired split behavior." << endl;
203 bool SplitTool::SplitToolPrivate::SplitMapped(void) {
205 // set up splitting data structure
206 map<bool, BamWriter*> outputFiles;
207 map<bool, BamWriter*>::iterator writerIter;
209 // iterate through alignments
212 bool isCurrentAlignmentMapped;
213 while ( m_reader.GetNextAlignment(al) ) {
215 // see if bool value exists
216 isCurrentAlignmentMapped = al.IsMapped();
217 writerIter = outputFiles.find(isCurrentAlignmentMapped);
219 // if no writer associated with this value
220 if ( writerIter == outputFiles.end() ) {
222 // open new BamWriter
223 const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentMapped
225 : SPLIT_UNMAPPED_TOKEN ) + ".bam";
226 writer = new BamWriter;
227 if ( !writer->Open(outputFilename, m_header, m_references) ) {
228 cerr << "bamtools split ERROR: could not open " << outputFilename
229 << " for writing." << endl;
234 outputFiles.insert( make_pair(isCurrentAlignmentMapped, writer) );
237 // else grab corresponding writer
238 else writer = (*writerIter).second;
240 // store alignment in proper BAM output file
242 writer->SaveAlignment(al);
245 // clean up BamWriters
246 CloseWriters(outputFiles);
252 bool SplitTool::SplitToolPrivate::SplitPaired(void) {
254 // set up splitting data structure
255 map<bool, BamWriter*> outputFiles;
256 map<bool, BamWriter*>::iterator writerIter;
258 // iterate through alignments
261 bool isCurrentAlignmentPaired;
262 while ( m_reader.GetNextAlignment(al) ) {
264 // see if bool value exists
265 isCurrentAlignmentPaired = al.IsPaired();
266 writerIter = outputFiles.find(isCurrentAlignmentPaired);
268 // if no writer associated with this value
269 if ( writerIter == outputFiles.end() ) {
271 // open new BamWriter
272 const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentPaired
274 : SPLIT_SINGLE_TOKEN ) + ".bam";
275 writer = new BamWriter;
276 if ( !writer->Open(outputFilename, m_header, m_references) ) {
277 cerr << "bamtool split ERROR: could not open " << outputFilename
278 << " for writing." << endl;
283 outputFiles.insert( make_pair(isCurrentAlignmentPaired, writer) );
286 // else grab corresponding writer
287 else writer = (*writerIter).second;
289 // store alignment in proper BAM output file
291 writer->SaveAlignment(al);
294 // clean up BamWriters
295 CloseWriters(outputFiles);
301 bool SplitTool::SplitToolPrivate::SplitReference(void) {
303 // set up splitting data structure
304 map<int32_t, BamWriter*> outputFiles;
305 map<int32_t, BamWriter*>::iterator writerIter;
307 // iterate through alignments
310 int32_t currentRefId;
311 while ( m_reader.GetNextAlignment(al) ) {
313 // see if bool value exists
314 currentRefId = al.RefID;
315 writerIter = outputFiles.find(currentRefId);
317 // if no writer associated with this value
318 if ( writerIter == outputFiles.end() ) {
320 // open new BamWriter
321 const string refName = m_references.at(currentRefId).RefName;
322 const string outputFilename = m_outputFilenameStub + SPLIT_REFERENCE_TOKEN + refName + ".bam";
323 writer = new BamWriter;
324 if ( !writer->Open(outputFilename, m_header, m_references) ) {
325 cerr << "bamtools split ERROR: could not open " << outputFilename
326 << " for writing." << endl;
331 outputFiles.insert( make_pair(currentRefId, writer) );
334 // else grab corresponding writer
335 else writer = (*writerIter).second;
337 // store alignment in proper BAM output file
339 writer->SaveAlignment(al);
342 // clean up BamWriters
343 CloseWriters(outputFiles);
349 // finds first alignment and calls corresponding SplitTagImpl<>() depending on tag type
350 bool SplitTool::SplitToolPrivate::SplitTag(void) {
352 // iterate through alignments, until we hit TAG
354 while ( m_reader.GetNextAlignment(al) ) {
356 // look for tag in this alignment and get tag type
358 if ( !al.GetTagType(m_settings->TagToSplit, tagType) )
361 // request split method based on tag type
362 // pass it the current alignment found
365 case (Constants::BAM_TAG_TYPE_INT8) :
366 case (Constants::BAM_TAG_TYPE_INT16) :
367 case (Constants::BAM_TAG_TYPE_INT32) :
368 return SplitTagImpl<int32_t>(al);
370 case (Constants::BAM_TAG_TYPE_UINT8) :
371 case (Constants::BAM_TAG_TYPE_UINT16) :
372 case (Constants::BAM_TAG_TYPE_UINT32) :
373 return SplitTagImpl<uint32_t>(al);
375 case (Constants::BAM_TAG_TYPE_FLOAT) :
376 return SplitTagImpl<float>(al);
378 case (Constants::BAM_TAG_TYPE_ASCII) :
379 case (Constants::BAM_TAG_TYPE_STRING) :
380 case (Constants::BAM_TAG_TYPE_HEX) :
381 return SplitTagImpl<string>(al);
384 fprintf(stderr, "bamtools split ERROR: unknown tag type encountered: [%c]\n", tagType);
389 // tag not found, but that's not an error - return success
393 // --------------------------------------------------------------------------------
394 // template method implementation
395 // N.B. - *technical note* - use of template methods defined in ".cpp" goes against normal practices
396 // but works here because these are purely internal (no one can call from outside this file)
398 // close BamWriters & delete pointers
400 void SplitTool::SplitToolPrivate::CloseWriters(map<T, BamWriter*>& writers) {
402 typedef map<T, BamWriter*> WriterMap;
403 typedef typename WriterMap::iterator WriterMapIterator;
405 // iterate over writers
406 WriterMapIterator writerIter = writers.begin();
407 WriterMapIterator writerEnd = writers.end();
408 for ( ; writerIter != writerEnd; ++writerIter ) {
409 BamWriter* writer = (*writerIter).second;
410 if (writer == 0 ) continue;
412 // close & delete writer
420 // handle the various types that are possible for tags
422 bool SplitTool::SplitToolPrivate::SplitTagImpl(BamAlignment& al) {
424 typedef T TagValueType;
425 typedef map<TagValueType, BamWriter*> WriterMap;
426 typedef typename WriterMap::iterator WriterMapIterator;
428 // set up splitting data structure
429 WriterMap outputFiles;
430 WriterMapIterator writerIter;
433 const string tag = m_settings->TagToSplit;
435 stringstream outputFilenameStream("");
436 TagValueType currentValue;
438 // retrieve first alignment tag value
439 if ( al.GetTag(tag, currentValue) ) {
441 // open new BamWriter, save first alignment
442 outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
443 writer = new BamWriter;
444 if ( !writer->Open(outputFilenameStream.str(), m_header, m_references) ) {
445 cerr << "bamtools split ERROR: could not open " << outputFilenameStream.str()
446 << " for writing." << endl;
449 writer->SaveAlignment(al);
452 outputFiles.insert( make_pair(currentValue, writer) );
455 outputFilenameStream.str("");
458 // iterate through remaining alignments
459 while ( m_reader.GetNextAlignment(al) ) {
461 // skip if this alignment doesn't have TAG
462 if ( !al.GetTag(tag, currentValue) ) continue;
464 // look up tag value in map
465 writerIter = outputFiles.find(currentValue);
467 // if no writer associated with this value
468 if ( writerIter == outputFiles.end() ) {
470 // open new BamWriter
471 outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
472 writer = new BamWriter;
473 if ( !writer->Open(outputFilenameStream.str(), m_header, m_references) ) {
474 cerr << "bamtool split ERROR: could not open " << outputFilenameStream.str()
475 << " for writing." << endl;
480 outputFiles.insert( make_pair(currentValue, writer) );
483 outputFilenameStream.str("");
486 // else grab corresponding writer
487 else writer = (*writerIter).second;
489 // store alignment in proper BAM output file
491 writer->SaveAlignment(al);
494 // clean up BamWriters
495 CloseWriters(outputFiles);
501 // ---------------------------------------------
502 // SplitTool implementation
504 SplitTool::SplitTool(void)
506 , m_settings(new SplitSettings)
509 // set program details
510 Options::SetProgramInfo("bamtools split", "splits a BAM file on user-specified property, creating a new BAM output file for each value found", "[-in <filename>] [-stub <filename stub>] < -mapped | -paired | -reference | -tag <TAG> > ");
513 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
514 Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputFilename, m_settings->InputFilename, IO_Opts, Options::StandardIn());
515 Options::AddValueOption("-stub", "filename stub", "prefix stub for output BAM files (default behavior is to use input filename, without .bam extension, as stub). If input is stdin and no stub provided, a timestamp is generated as the stub.", "", m_settings->HasCustomOutputStub, m_settings->CustomOutputStub, IO_Opts);
517 OptionGroup* SplitOpts = Options::CreateOptionGroup("Split Options");
518 Options::AddOption("-mapped", "split mapped/unmapped alignments", m_settings->IsSplittingMapped, SplitOpts);
519 Options::AddOption("-paired", "split single-end/paired-end alignments", m_settings->IsSplittingPaired, SplitOpts);
520 Options::AddOption("-reference", "split alignments by reference", m_settings->IsSplittingReference, SplitOpts);
521 Options::AddValueOption("-tag", "tag name", "splits alignments based on all values of TAG encountered (i.e. -tag RG creates a BAM file for each read group in original BAM file)", "",
522 m_settings->IsSplittingTag, m_settings->TagToSplit, SplitOpts);
525 SplitTool::~SplitTool(void) {
534 int SplitTool::Help(void) {
535 Options::DisplayHelp();
539 int SplitTool::Run(int argc, char* argv[]) {
541 // parse command line arguments
542 Options::Parse(argc, argv, 1);
544 // initialize internal implementation
545 m_impl = new SplitToolPrivate(m_settings);
547 // run tool, return success/fail