--- /dev/null
+#include "BamMultiReader.h"\r
+#include "BamWriter.h"\r
+#include <boost/algorithm/string.hpp>\r
+#include <iostream>\r
+\r
+using namespace BamTools;\r
+using namespace std;\r
+\r
+int main(int argc, char** argv) {\r
+\r
+ if (argc == 1) {\r
+ cerr << "USAGE: ./BamMultiMerge <output file> [input files]" << endl;\r
+ exit(0);\r
+ }\r
+\r
+ string outputFilename = argv[1];\r
+\r
+ BamMultiReader reader;\r
+ vector<string> filenames;\r
+ for (int i = 2; i<argc; ++i) {\r
+ filenames.push_back(argv[i]);\r
+ }\r
+\r
+ reader.Open(filenames);\r
+\r
+ //cerr << "merging to " << outputFilename << endl;\r
+ string mergedHeader = reader.GetUnifiedHeaderText();\r
+ //cerr << "mergedHeader = " << endl << mergedHeader << endl;\r
+\r
+ // check that we are merging files which have the same sets of references\r
+ RefVector references;\r
+ int referencesSize = 0; bool first = true;\r
+ for (int i = 2; i<argc; ++i) {\r
+ BamReader areader;\r
+ areader.Open( argv[i] );\r
+ if (first) {\r
+ references = areader.GetReferenceData();\r
+ referencesSize = references.size();\r
+ first = false;\r
+ } else {\r
+ RefVector newreferences = areader.GetReferenceData();\r
+ int i = 0;\r
+ for (RefVector::const_iterator it = references.begin(); it != references.end(); it++) {\r
+ if (newreferences.at(i++).RefName != it->RefName) {\r
+ cerr << "BAM FILES ALIGNED AGAINST DIFFERING SETS OF REFERENCES, NOT MERGING" << endl;\r
+ exit(1);\r
+ }\r
+ }\r
+ }\r
+ }\r
+\r
+ // open BamWriter\r
+ BamWriter writer;\r
+ writer.Open( outputFilename.c_str(), mergedHeader, references);\r
+\r
+ BamAlignment bAlignment;\r
+ while (reader.GetNextAlignment(bAlignment)) {\r
+ // write to output file\r
+ writer.SaveAlignment(bAlignment);\r
+ }\r
+\r
+ // close output file\r
+ writer.Close();\r
+ // close input files\r
+ reader.Close();\r
+\r
+ //cerr << "done" << endl;\r
+\r
+ return 0;\r
+}\r
--- /dev/null
+// ***************************************************************************
+// BamMultiReader.cpp (c) 2010 Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 Februrary 2010 (EG)
+// ---------------------------------------------------------------------------
+// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad
+// Institute.
+// ---------------------------------------------------------------------------
+// Functionality for simultaneously reading multiple BAM files.
+//
+// This functionality allows applications to work on very large sets of files
+// without requiring intermediate merge, sort, and index steps for each file
+// subset. It also improves the performance of our merge system as it
+// precludes the need to sort merged files.
+// ***************************************************************************
+
+// C++ includes
+#include <algorithm>
+#include <iterator>
+#include <string>
+#include <vector>
+#include <iostream>
+#include <boost/algorithm/string.hpp>
+
+// BamTools includes
+#include "BGZF.h"
+#include "BamMultiReader.h"
+using namespace BamTools;
+using namespace std;
+
+// -----------------------------------------------------
+// BamMultiReader implementation
+// -----------------------------------------------------
+
+// constructor
+BamMultiReader::BamMultiReader(void)
+ : CurrentRefID(0)
+ , CurrentLeft(0)
+{ }
+
+// destructor
+BamMultiReader::~BamMultiReader(void) {
+ Close(); // close the bam files
+ // clean up reader objects
+ for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
+ delete *br;
+ }
+}
+
+// close the BAM files
+void BamMultiReader::Close(void) {
+ int index = 0;
+ for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
+ BamReader* reader = *br;
+ reader->Close(); // close the reader
+ readerStates.at(index++) = CLOSED; // track its state
+ }
+}
+
+// updates the reference id stored in the BamMultiReader
+// to reflect the current state of the readers
+void BamMultiReader::UpdateReferenceID(void) {
+ bool updateRefID = true;
+ int i = 0;
+ for (vector<BamAlignment*>::iterator it = alignments.begin(); it != alignments.end(); ++it) {
+ BamAlignment* alignment = *it;
+ if (readerStates.at(i++) != END && alignment->RefID == CurrentRefID) {
+ updateRefID = false;
+ break;
+ }
+ }
+ if (updateRefID) {
+ // get the next reference id
+ // while there aren't any readers at the next ref id
+ // increment the ref id
+ int nextRefID = CurrentRefID;
+ bool ok = false;
+ while (!ok) {
+ ++nextRefID;
+ int i = 0;
+ for (vector<BamAlignment*>::iterator it = alignments.begin(); it != alignments.end(); ++it) {
+ BamAlignment* alignment = *it;
+ if (readerStates.at(i++) != END && alignment->RefID == nextRefID) {
+ ok = true;
+ break;
+ }
+ }
+ }
+ cerr << "updating reference id from " << CurrentRefID << " to " << nextRefID << endl;
+ CurrentRefID = nextRefID;
+ }
+}
+
+// checks if any readers still have alignments
+bool BamMultiReader::HasOpenReaders() {
+ for (vector<BamReaderState>::iterator it = readerStates.begin(); it != readerStates.end(); ++it) {
+ BamReaderState readerState = *it;
+ if (readerState != END)
+ return true;
+ }
+ return false;
+}
+
+// get next alignment among all files (from specified region, if given)
+bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) {
+
+ // bail out if we are at EOF in all files, means no more alignments to process
+ if (!HasOpenReaders())
+ return false;
+
+ // when all alignments have stepped into a new target sequence, update our
+ // current reference sequence id
+ UpdateReferenceID();
+
+ // find the alignment with the lowest position that is also in the current
+ // reference sequence, getnextalignment on its bam reader to step it
+ // forward, and return it
+ int i = 0, lowestPosition = -1, lowestAlignmentIndex = -1;
+ BamAlignment* lowestAlignment = NULL;
+ for (vector<BamAlignment*>::iterator it = alignments.begin(); it != alignments.end(); ++it) {
+ BamAlignment* ba = *it;
+ if (readerStates.at(i) != END &&
+ ba->RefID == CurrentRefID &&
+ (lowestAlignment == NULL || ba->Position < lowestPosition)) {
+ lowestPosition = ba->Position;
+ lowestAlignmentIndex = i;
+ lowestAlignment = ba;
+ }
+ ++i;
+ }
+
+ // now that we have the lowest alignment in the set, save it by copy to our argument
+ nextAlignment = BamAlignment(*lowestAlignment);
+
+ // else continue and load the next alignment
+ bool r = (readers.at(lowestAlignmentIndex))->GetNextAlignment(*alignments.at(lowestAlignmentIndex));
+ if (!r) {
+ cerr << "reached end of file " << readers.at(lowestAlignmentIndex)->GetFilename() << endl;
+ readerStates.at(lowestAlignmentIndex) = END; // set flag for end of file
+ }
+
+ return true;
+}
+
+// jumps to specified region(refID, leftBound) in BAM files, returns success/fail
+bool BamMultiReader::Jump(int refID, int position) {
+
+ //if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) {
+ CurrentRefID = refID;
+ CurrentLeft = position;
+
+ bool result = true;
+ for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
+ BamReader* reader = *br;
+ result &= reader->Jump(refID, position);
+ }
+ if (result)
+ UpdateAlignments();
+ return result;
+}
+
+// opens BAM files
+void BamMultiReader::Open(const vector<string> filenames) {
+ // for filename in filenames
+ fileNames = filenames; // save filenames in our multireader
+ for (vector<string>::const_iterator it = filenames.begin(); it != filenames.end(); ++it) {
+ string filename = *it;
+ BamReader* reader = new BamReader;
+ reader->Open(filename, filename + ".bai");
+ BamAlignment* alignment = new BamAlignment;
+ reader->GetNextAlignment(*alignment);
+ readers.push_back(reader); // tracks readers
+ alignments.push_back(alignment); // tracks current alignments of the readers
+ BamReaderState readerState = START;
+ readerStates.push_back(readerState);
+ }
+}
+
+// Runs GetNextAlignment for all BAM readers; used during jumps
+void BamMultiReader::UpdateAlignments(void) {
+ int i = 0;
+ for (vector<BamAlignment*>::iterator it = alignments.begin(); it != alignments.end(); ++it) {
+ BamAlignment* ba = *it;
+ readers.at(i++)->GetNextAlignment(*ba);
+ }
+}
+
+void BamMultiReader::PrintFilenames(void) {
+ for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
+ BamReader* reader = *br;
+ cout << reader->GetFilename() << endl;
+ }
+}
+
+// returns BAM file pointers to beginning of alignment data
+bool BamMultiReader::Rewind(void) {
+ bool result = true;
+ int index = 0;
+ for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
+ BamReader* reader = *br;
+ result &= reader->Rewind();
+ readerStates.at(index++) = START;
+ }
+ return result;
+}
+
+// saves index data to BAM index files (".bai") where necessary, returns success/fail
+bool BamMultiReader::CreateIndexes(void) {
+ bool result = true;
+ for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
+ BamReader* reader = *br;
+ result &= reader->CreateIndex();
+ }
+ return result;
+}
+
+const string BamMultiReader::GetUnifiedHeaderText(void) const {
+
+ string mergedHeader = "";
+
+ // foreach extraction entry (each BAM file)
+ bool isFirstTime = true;
+ for (vector<BamReader*>::const_iterator it = readers.begin(); it != readers.end(); ++it) {
+
+ BamReader* reader = *it;
+
+ string header = reader->GetHeaderText();
+
+ vector<string> lines;
+ boost::split(lines, header, boost::is_any_of("\n"));
+
+ for (vector<string>::const_iterator it = lines.begin(); it != lines.end(); ++it) {
+
+ // get next line from header, skip if empty
+ string headerLine = *it;
+ if ( headerLine.empty() ) { continue; }
+
+ // if first file, save HD & SQ entries
+ if ( isFirstTime ) {
+ if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) {
+ mergedHeader.append(headerLine.c_str());
+ mergedHeader.append(1, '\n');
+ }
+ }
+
+ // (for all files) append RG entries
+ if ( headerLine.find("@RG") == 0 ) {
+ mergedHeader.append(headerLine.c_str() );
+ mergedHeader.append(1, '\n');
+ }
+
+ }
+
+ // set iteration flag
+ isFirstTime = false;
+ }
+
+ // return merged header text
+ return mergedHeader;
+}
+
--- /dev/null
+// ***************************************************************************\r
+// BamMultiReader.h (c) 2010 Erik Garrison\r
+// Marth Lab, Department of Biology, Boston College\r
+// All rights reserved.\r
+// ---------------------------------------------------------------------------\r
+// Last modified: 22 February 2010 (EG)\r
+// ---------------------------------------------------------------------------\r
+// Functionality for simultaneously reading multiple BAM files\r
+// ***************************************************************************\r
+\r
+#ifndef BAMMULTIREADER_H\r
+#define BAMMULTIREADER_H\r
+\r
+// C++ includes\r
+#include <string>\r
+\r
+using namespace std;\r
+\r
+// BamTools includes\r
+#include "BamAux.h"\r
+#include "BamReader.h"\r
+\r
+namespace BamTools {\r
+\r
+enum BamReaderState { START, END, CLOSED };\r
+\r
+class BamMultiReader {\r
+\r
+ // constructor / destructor\r
+ public:\r
+ BamMultiReader(void);\r
+ ~BamMultiReader(void);\r
+\r
+ // public interface\r
+ public:\r
+\r
+ // positioning\r
+ int CurrentRefID;\r
+ int CurrentLeft;\r
+\r
+ // ----------------------\r
+ // BAM file operations\r
+ // ----------------------\r
+\r
+ // close BAM files\r
+ void Close(void);\r
+ // performs random-access jump to reference, position\r
+ bool Jump(int refID, int position = 0);\r
+ // opens BAM files (and optional BAM index files, if provided)\r
+ //void Open(const vector<std::string&> filenames, const vector<std::string&> indexFilenames);\r
+ void Open(const vector<string> filenames);\r
+ // returns file pointers to beginning of alignments\r
+ bool Rewind(void);\r
+\r
+ // ----------------------\r
+ // access alignment data\r
+ // ----------------------\r
+ // updates the reference id marker to match the lower limit of our readers\r
+ void UpdateReferenceID(void);\r
+\r
+ // retrieves next available alignment (returns success/fail) from all files\r
+ bool GetNextAlignment(BamAlignment&);\r
+ // ... should this be private?\r
+ bool HasOpenReaders(void);\r
+\r
+ // ----------------------\r
+ // access auxiliary data\r
+ // ----------------------\r
+\r
+ // returns unified SAM header text for all files\r
+ const string GetUnifiedHeaderText(void) const;\r
+ // returns number of reference sequences\r
+ const int GetReferenceCount(void) const;\r
+ // returns vector of reference objects\r
+ const BamTools::RefVector GetReferenceData(void) const;\r
+ // returns reference id (used for BamMultiReader::Jump()) for the given reference name\r
+ //const int GetReferenceID(const std::string& refName) const;\r
+\r
+ // ----------------------\r
+ // BAM index operations\r
+ // ----------------------\r
+\r
+ // creates index for BAM files which lack them, saves to files (default = bamFilename + ".bai")\r
+ bool CreateIndexes(void);\r
+\r
+ //const int GetReferenceID(const string& refName) const;\r
+\r
+ // utility\r
+ void PrintFilenames(void);\r
+ void UpdateAlignments(void);\r
+\r
+\r
+ // private implementation\r
+ private:\r
+ // TODO perhaps, for legibility, I should use a struct to wrap them all up\r
+ // But this may actually make things more confusing, as I'm only\r
+ // operating on them all simultaneously during GetNextAlignment\r
+ // calls.\r
+ // all these vectors are ordered the same\r
+ // readers.at(N) refers to the same reader as alignments.at(N) and readerStates.at(N)\r
+ vector<BamReader*> readers; // the set of readers which we operate on\r
+ vector<BamAlignment*> alignments; // the equivalent set of alignments we use to step through the files\r
+ vector<BamReaderState> readerStates; // states of the various readers\r
+ // alignment position?\r
+ vector<string> fileNames;\r
+};\r
+\r
+} // namespace BamTools\r
+\r
+#endif // BAMMULTIREADER_H\r