1 // ***************************************************************************
2 // bamtools_split.cpp (c) 2010 Derek Barnett, Erik Garrison
3 // Marth Lab, Department of Biology, Boston College
4 // ---------------------------------------------------------------------------
5 // Last modified: 7 April 2011 (DB)
6 // ---------------------------------------------------------------------------
7 // Splits a BAM file on user-specified property, creating a new BAM output
8 // file for each value found
9 // ***************************************************************************
11 #include "bamtools_split.h"
13 #include <api/BamConstants.h>
14 #include <api/BamReader.h>
15 #include <api/BamWriter.h>
16 #include <utils/bamtools_options.h>
17 #include <utils/bamtools_variant.h>
18 using namespace BamTools;
31 static const string SPLIT_MAPPED_TOKEN = ".MAPPED";
32 static const string SPLIT_UNMAPPED_TOKEN = ".UNMAPPED";
33 static const string SPLIT_PAIRED_TOKEN = ".PAIRED_END";
34 static const string SPLIT_SINGLE_TOKEN = ".SINGLE_END";
35 static const string SPLIT_REFERENCE_TOKEN = ".REF_";
37 string GetTimestampString(void) {
39 // get human readable timestamp
42 stringstream timeStream("");
43 timeStream << ctime(¤tTime);
45 // convert whitespace to '_'
46 string timeString = timeStream.str();
47 size_t found = timeString.find(" ");
48 while (found != string::npos) {
49 timeString.replace(found, 1, "_");
50 found = timeString.find(" ", found+1);
55 // remove copy of filename without extension
56 // (so /path/to/file.txt becomes /path/to/file )
57 string RemoveFilenameExtension(const string& filename) {
58 size_t found = filename.rfind(".");
59 return filename.substr(0, found);
62 } // namespace BamTools
64 // ---------------------------------------------
65 // SplitSettings implementation
67 struct SplitTool::SplitSettings {
70 bool HasInputFilename;
71 bool HasCustomOutputStub;
72 bool IsSplittingMapped;
73 bool IsSplittingPaired;
74 bool IsSplittingReference;
78 string CustomOutputStub;
84 : HasInputFilename(false)
85 , HasCustomOutputStub(false)
86 , IsSplittingMapped(false)
87 , IsSplittingPaired(false)
88 , IsSplittingReference(false)
89 , IsSplittingTag(false)
90 , CustomOutputStub("")
91 , InputFilename(Options::StandardIn())
96 // ---------------------------------------------
97 // SplitToolPrivate declaration
99 class SplitTool::SplitToolPrivate {
103 SplitToolPrivate(SplitTool::SplitSettings* settings)
104 : m_settings(settings)
107 ~SplitToolPrivate(void) {
111 // 'public' interface
117 // close & delete BamWriters in map
119 void CloseWriters(map<T, BamWriter*>& writers);
120 // calculate output stub based on IO args given
121 void DetermineOutputFilenameStub(void);
122 // open our BamReader
123 bool OpenReader(void);
124 // split alignments in BAM file based on isMapped property
125 bool SplitMapped(void);
126 // split alignments in BAM file based on isPaired property
127 bool SplitPaired(void);
128 // split alignments in BAM file based on refID property
129 bool SplitReference(void);
130 // finds first alignment and calls corresponding SplitTagImpl<>
131 // depending on tag type
133 // templated split tag implementation
134 // handle the various types that are possible for tags
136 bool SplitTagImpl(BamAlignment& al);
140 SplitTool::SplitSettings* m_settings;
141 string m_outputFilenameStub;
144 RefVector m_references;
147 void SplitTool::SplitToolPrivate::DetermineOutputFilenameStub(void) {
149 // if user supplied output filename stub, use that
150 if ( m_settings->HasCustomOutputStub )
151 m_outputFilenameStub = m_settings->CustomOutputStub;
153 // else if user supplied input BAM filename, use that (minus ".bam" extension) as stub
154 else if ( m_settings->HasInputFilename )
155 m_outputFilenameStub = RemoveFilenameExtension(m_settings->InputFilename);
157 // otherwise, user did not specify -stub, and input is coming from STDIN
158 // generate stub from timestamp
159 else m_outputFilenameStub = GetTimestampString();
162 bool SplitTool::SplitToolPrivate::OpenReader(void) {
164 // attempt to open BAM file
165 if ( !m_reader.Open(m_settings->InputFilename) ) {
166 cerr << "bamtools split ERROR: could not open BAM file: " << m_settings->InputFilename << endl;
170 // save file 'metadata' & return success
171 m_header = m_reader.GetHeaderText();
172 m_references = m_reader.GetReferenceData();
176 bool SplitTool::SplitToolPrivate::Run(void) {
178 // determine output stub
179 DetermineOutputFilenameStub();
185 // determine split type from settings
186 if ( m_settings->IsSplittingMapped ) return SplitMapped();
187 if ( m_settings->IsSplittingPaired ) return SplitPaired();
188 if ( m_settings->IsSplittingReference ) return SplitReference();
189 if ( m_settings->IsSplittingTag ) return SplitTag();
191 // if we get here, no property was specified
192 cerr << "bamtools split ERROR: no property given to split on... " << endl
193 << "Please use -mapped, -paired, -reference, or -tag TAG to specifiy desired split behavior." << endl;
197 bool SplitTool::SplitToolPrivate::SplitMapped(void) {
199 // set up splitting data structure
200 map<bool, BamWriter*> outputFiles;
201 map<bool, BamWriter*>::iterator writerIter;
203 // iterate through alignments
206 bool isCurrentAlignmentMapped;
207 while ( m_reader.GetNextAlignment(al) ) {
209 // see if bool value exists
210 isCurrentAlignmentMapped = al.IsMapped();
211 writerIter = outputFiles.find(isCurrentAlignmentMapped);
213 // if no writer associated with this value
214 if ( writerIter == outputFiles.end() ) {
216 // open new BamWriter
217 const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentMapped
219 : SPLIT_UNMAPPED_TOKEN ) + ".bam";
220 writer = new BamWriter;
221 if ( !writer->Open(outputFilename, m_header, m_references) ) {
222 cerr << "bamtools split ERROR: could not open " << outputFilename
223 << " for writing." << endl;
228 outputFiles.insert( make_pair(isCurrentAlignmentMapped, writer) );
231 // else grab corresponding writer
232 else writer = (*writerIter).second;
234 // store alignment in proper BAM output file
236 writer->SaveAlignment(al);
239 // clean up BamWriters
240 CloseWriters(outputFiles);
246 bool SplitTool::SplitToolPrivate::SplitPaired(void) {
248 // set up splitting data structure
249 map<bool, BamWriter*> outputFiles;
250 map<bool, BamWriter*>::iterator writerIter;
252 // iterate through alignments
255 bool isCurrentAlignmentPaired;
256 while ( m_reader.GetNextAlignment(al) ) {
258 // see if bool value exists
259 isCurrentAlignmentPaired = al.IsPaired();
260 writerIter = outputFiles.find(isCurrentAlignmentPaired);
262 // if no writer associated with this value
263 if ( writerIter == outputFiles.end() ) {
265 // open new BamWriter
266 const string outputFilename = m_outputFilenameStub + ( isCurrentAlignmentPaired
268 : SPLIT_SINGLE_TOKEN ) + ".bam";
269 writer = new BamWriter;
270 if ( !writer->Open(outputFilename, m_header, m_references) ) {
271 cerr << "bamtool split ERROR: could not open " << outputFilename
272 << " for writing." << endl;
277 outputFiles.insert( make_pair(isCurrentAlignmentPaired, writer) );
280 // else grab corresponding writer
281 else writer = (*writerIter).second;
283 // store alignment in proper BAM output file
285 writer->SaveAlignment(al);
288 // clean up BamWriters
289 CloseWriters(outputFiles);
295 bool SplitTool::SplitToolPrivate::SplitReference(void) {
297 // set up splitting data structure
298 map<int32_t, BamWriter*> outputFiles;
299 map<int32_t, BamWriter*>::iterator writerIter;
301 // iterate through alignments
304 int32_t currentRefId;
305 while ( m_reader.GetNextAlignment(al) ) {
307 // see if bool value exists
308 currentRefId = al.RefID;
309 writerIter = outputFiles.find(currentRefId);
311 // if no writer associated with this value
312 if ( writerIter == outputFiles.end() ) {
314 // fetch reference name for ID
316 if ( currentRefId == -1 )
317 refName = "unmapped";
319 refName = m_references.at(currentRefId).RefName;
321 // construct new output filename
322 const string outputFilename = m_outputFilenameStub + SPLIT_REFERENCE_TOKEN + refName + ".bam";
324 // open new BamWriter
325 writer = new BamWriter;
326 if ( !writer->Open(outputFilename, m_header, m_references) ) {
327 cerr << "bamtools split ERROR: could not open " << outputFilename
328 << " for writing." << endl;
333 outputFiles.insert( make_pair(currentRefId, writer) );
336 // else grab corresponding writer
337 else writer = (*writerIter).second;
339 // store alignment in proper BAM output file
341 writer->SaveAlignment(al);
344 // clean up BamWriters
345 CloseWriters(outputFiles);
351 // finds first alignment and calls corresponding SplitTagImpl<>() depending on tag type
352 bool SplitTool::SplitToolPrivate::SplitTag(void) {
354 // iterate through alignments, until we hit TAG
356 while ( m_reader.GetNextAlignment(al) ) {
358 // look for tag in this alignment and get tag type
360 if ( !al.GetTagType(m_settings->TagToSplit, tagType) )
363 // request split method based on tag type
364 // pass it the current alignment found
367 case (Constants::BAM_TAG_TYPE_INT8) :
368 case (Constants::BAM_TAG_TYPE_INT16) :
369 case (Constants::BAM_TAG_TYPE_INT32) :
370 return SplitTagImpl<int32_t>(al);
372 case (Constants::BAM_TAG_TYPE_UINT8) :
373 case (Constants::BAM_TAG_TYPE_UINT16) :
374 case (Constants::BAM_TAG_TYPE_UINT32) :
375 return SplitTagImpl<uint32_t>(al);
377 case (Constants::BAM_TAG_TYPE_FLOAT) :
378 return SplitTagImpl<float>(al);
380 case (Constants::BAM_TAG_TYPE_ASCII) :
381 case (Constants::BAM_TAG_TYPE_STRING) :
382 case (Constants::BAM_TAG_TYPE_HEX) :
383 return SplitTagImpl<string>(al);
385 case (Constants::BAM_TAG_TYPE_ARRAY) :
386 cerr << "bamtools split ERROR: array tag types are not supported" << endl;
390 cerr << "bamtools split ERROR: unknown tag type encountered: " << tagType << endl;
395 // tag not found, but that's not an error - return success
399 // --------------------------------------------------------------------------------
400 // template method implementation
401 // *Technical Note* - use of template methods declared & defined in ".cpp" file
402 // goes against normal practices, but works here because these
403 // are purely internal (no one can call from outside this file)
405 // close BamWriters & delete pointers
407 void SplitTool::SplitToolPrivate::CloseWriters(map<T, BamWriter*>& writers) {
409 typedef map<T, BamWriter*> WriterMap;
410 typedef typename WriterMap::iterator WriterMapIterator;
412 // iterate over writers
413 WriterMapIterator writerIter = writers.begin();
414 WriterMapIterator writerEnd = writers.end();
415 for ( ; writerIter != writerEnd; ++writerIter ) {
416 BamWriter* writer = (*writerIter).second;
417 if ( writer == 0 ) continue;
427 // clear the container (destroying the items doesn't remove them)
431 // handle the various types that are possible for tags
433 bool SplitTool::SplitToolPrivate::SplitTagImpl(BamAlignment& al) {
435 typedef T TagValueType;
436 typedef map<TagValueType, BamWriter*> WriterMap;
437 typedef typename WriterMap::iterator WriterMapIterator;
439 // set up splitting data structure
440 WriterMap outputFiles;
441 WriterMapIterator writerIter;
444 const string tag = m_settings->TagToSplit;
446 stringstream outputFilenameStream("");
447 TagValueType currentValue;
449 // retrieve first alignment tag value
450 if ( al.GetTag(tag, currentValue) ) {
452 // open new BamWriter, save first alignment
453 outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
454 writer = new BamWriter;
455 if ( !writer->Open(outputFilenameStream.str(), m_header, m_references) ) {
456 cerr << "bamtools split ERROR: could not open " << outputFilenameStream.str()
457 << " for writing." << endl;
460 writer->SaveAlignment(al);
463 outputFiles.insert( make_pair(currentValue, writer) );
466 outputFilenameStream.str("");
469 // iterate through remaining alignments
470 while ( m_reader.GetNextAlignment(al) ) {
472 // skip if this alignment doesn't have TAG
473 if ( !al.GetTag(tag, currentValue) ) continue;
475 // look up tag value in map
476 writerIter = outputFiles.find(currentValue);
478 // if no writer associated with this value
479 if ( writerIter == outputFiles.end() ) {
481 // open new BamWriter
482 outputFilenameStream << m_outputFilenameStub << ".TAG_" << tag << "_" << currentValue << ".bam";
483 writer = new BamWriter;
484 if ( !writer->Open(outputFilenameStream.str(), m_header, m_references) ) {
485 cerr << "bamtool split ERROR: could not open " << outputFilenameStream.str()
486 << " for writing." << endl;
491 outputFiles.insert( make_pair(currentValue, writer) );
494 outputFilenameStream.str("");
497 // else grab corresponding writer
498 else writer = (*writerIter).second;
500 // store alignment in proper BAM output file
502 writer->SaveAlignment(al);
505 // clean up BamWriters
506 CloseWriters(outputFiles);
512 // ---------------------------------------------
513 // SplitTool implementation
515 SplitTool::SplitTool(void)
517 , m_settings(new SplitSettings)
520 // set program details
521 Options::SetProgramInfo("bamtools split", "splits a BAM file on user-specified property, creating a new BAM output file for each value found", "[-in <filename>] [-stub <filename stub>] < -mapped | -paired | -reference | -tag <TAG> > ");
524 OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output");
525 Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputFilename, m_settings->InputFilename, IO_Opts, Options::StandardIn());
526 Options::AddValueOption("-stub", "filename stub", "prefix stub for output BAM files (default behavior is to use input filename, without .bam extension, as stub). If input is stdin and no stub provided, a timestamp is generated as the stub.", "", m_settings->HasCustomOutputStub, m_settings->CustomOutputStub, IO_Opts);
528 OptionGroup* SplitOpts = Options::CreateOptionGroup("Split Options");
529 Options::AddOption("-mapped", "split mapped/unmapped alignments", m_settings->IsSplittingMapped, SplitOpts);
530 Options::AddOption("-paired", "split single-end/paired-end alignments", m_settings->IsSplittingPaired, SplitOpts);
531 Options::AddOption("-reference", "split alignments by reference", m_settings->IsSplittingReference, SplitOpts);
532 Options::AddValueOption("-tag", "tag name", "splits alignments based on all values of TAG encountered (i.e. -tag RG creates a BAM file for each read group in original BAM file)", "",
533 m_settings->IsSplittingTag, m_settings->TagToSplit, SplitOpts);
536 SplitTool::~SplitTool(void) {
545 int SplitTool::Help(void) {
546 Options::DisplayHelp();
550 int SplitTool::Run(int argc, char* argv[]) {
552 // parse command line arguments
553 Options::Parse(argc, argv, 1);
555 // initialize SplitTool with settings
556 m_impl = new SplitToolPrivate(m_settings);
558 // run SplitTool, return success/fail