+++ /dev/null
-#include "BamMultiReader.h"\r
-#include "BamWriter.h"\r
-#include <boost/algorithm/string.hpp>\r
-#include <iostream>\r
-\r
-using namespace BamTools;\r
-using namespace std;\r
-\r
-int main(int argc, char** argv) {\r
-\r
- if (argc == 1) {\r
- cerr << "USAGE: ./BamMultiMerge <output file> [input files]" << endl;\r
- exit(0);\r
- }\r
-\r
- string outputFilename = argv[1];\r
-\r
- BamMultiReader reader;\r
- vector<string> filenames;\r
- for (int i = 2; i<argc; ++i) {\r
- filenames.push_back(argv[i]);\r
- }\r
-\r
- reader.Open(filenames);\r
-\r
- //cerr << "merging to " << outputFilename << endl;\r
- string mergedHeader = reader.GetUnifiedHeaderText();\r
- //cerr << "mergedHeader = " << endl << mergedHeader << endl;\r
-\r
- // check that we are merging files which have the same sets of references\r
- RefVector references;\r
- int referencesSize = 0; bool first = true;\r
- for (int i = 2; i<argc; ++i) {\r
- BamReader areader;\r
- areader.Open( argv[i] );\r
- if (first) {\r
- references = areader.GetReferenceData();\r
- referencesSize = references.size();\r
- first = false;\r
- } else {\r
- RefVector newreferences = areader.GetReferenceData();\r
- int i = 0;\r
- for (RefVector::const_iterator it = references.begin(); it != references.end(); it++) {\r
- if (newreferences.at(i++).RefName != it->RefName) {\r
- cerr << "BAM FILES ALIGNED AGAINST DIFFERING SETS OF REFERENCES, NOT MERGING" << endl;\r
- exit(1);\r
- }\r
- }\r
- }\r
- }\r
-\r
- // open BamWriter\r
- BamWriter writer;\r
- writer.Open( outputFilename.c_str(), mergedHeader, references);\r
-\r
- BamAlignment bAlignment;\r
- while (reader.GetNextAlignment(bAlignment)) {\r
- // write to output file\r
- writer.SaveAlignment(bAlignment);\r
- }\r
-\r
- // close output file\r
- writer.Close();\r
- // close input files\r
- reader.Close();\r
-\r
- //cerr << "done" << endl;\r
-\r
- return 0;\r
-}\r
}
}
}
- cerr << "updating reference id from " << CurrentRefID << " to " << nextRefID << endl;
+ //cerr << "updating reference id from " << CurrentRefID << " to " << nextRefID << endl;
CurrentRefID = nextRefID;
}
}
// forward, and return it
int i = 0, lowestPosition = -1, lowestAlignmentIndex = -1;
BamAlignment* lowestAlignment = NULL;
- for (vector<BamAlignment*>::iterator it = alignments.begin(); it != alignments.end(); ++it) {
+ for (vector<BamAlignment*>::const_iterator it = alignments.begin(); it != alignments.end(); ++it) {
BamAlignment* ba = *it;
if (readerStates.at(i) != END &&
ba->RefID == CurrentRefID &&
// else continue and load the next alignment
bool r = (readers.at(lowestAlignmentIndex))->GetNextAlignment(*alignments.at(lowestAlignmentIndex));
if (!r) {
- cerr << "reached end of file " << readers.at(lowestAlignmentIndex)->GetFilename() << endl;
+ //cerr << "reached end of file " << readers.at(lowestAlignmentIndex)->GetFilename() << endl;
readerStates.at(lowestAlignmentIndex) = END; // set flag for end of file
}
}
// opens BAM files
-void BamMultiReader::Open(const vector<string> filenames) {
+void BamMultiReader::Open(const vector<string> filenames, bool openIndexes) {
// for filename in filenames
fileNames = filenames; // save filenames in our multireader
for (vector<string>::const_iterator it = filenames.begin(); it != filenames.end(); ++it) {
string filename = *it;
BamReader* reader = new BamReader;
- reader->Open(filename, filename + ".bai");
+ if (openIndexes) {
+ reader->Open(filename, filename + ".bai");
+ } else {
+ reader->Open(filename); // for merging, jumping is disallowed
+ }
BamAlignment* alignment = new BamAlignment;
reader->GetNextAlignment(*alignment);
readers.push_back(reader); // tracks readers
BamReaderState readerState = START;
readerStates.push_back(readerState);
}
+ ValidateReaders();
}
// Runs GetNextAlignment for all BAM readers; used during jumps
}
// makes a virtual, unified header for all the bam files in the multireader
-const string BamMultiReader::GetUnifiedHeaderText(void) const {
+const string BamMultiReader::GetHeaderText(void) const {
string mergedHeader = "";
return mergedHeader;
}
+// ValidateReaders checks that all the readers point to BAM files representing
+// alignments against the same set of reference sequences, and that the
+// sequences are identically ordered. If these checks fail the operation of
+// the multireader is undefined, so we force program exit.
+void BamMultiReader::ValidateReaders(void) const {
+ int firstRefCount = readers.front()->GetReferenceCount();
+ BamTools::RefVector firstRefData = readers.front()->GetReferenceData();
+ for (vector<BamReader*>::const_iterator it = readers.begin(); it != readers.end(); ++it) {
+ BamTools::RefVector currentRefData = (*it)->GetReferenceData();
+ BamTools::RefVector::const_iterator f = firstRefData.begin();
+ BamTools::RefVector::const_iterator c = currentRefData.begin();
+ if ((*it)->GetReferenceCount() != firstRefCount || firstRefData.size() != currentRefData.size()) {
+ cerr << "ERROR: mismatched number of references in " << (*it)->GetFilename()
+ << " expected " << firstRefCount
+ << " reference sequences but only found " << (*it)->GetReferenceCount() << endl;
+ exit(1);
+ }
+ // this will be ok; we just checked above that we have identically-sized sets of references
+ // here we simply check if they are all, in fact, equal in content
+ while (f != firstRefData.end()) {
+ if (f->RefName != c->RefName || f->RefLength != c->RefLength) {
+ cerr << "ERROR: mismatched references found in " << (*it)->GetFilename()
+ << " expected: " << endl;
+ for (BamTools::RefVector::const_iterator a = firstRefData.begin(); a != firstRefData.end(); ++a)
+ cerr << a->RefName << " " << a->RefLength << endl;
+ cerr << "but found: " << endl;
+ for (BamTools::RefVector::const_iterator a = currentRefData.begin(); a != currentRefData.end(); ++a)
+ cerr << a->RefName << " " << a->RefLength << endl;
+ exit(1);
+ }
+ ++f; ++c;
+ }
+ }
+}
+
+// NB: The following functions assume that we have identical references for all
+// BAM files. We enforce this by invoking the above validation function
+// (ValidateReaders) to verify that our reference data is the same across all
+// files on Open, so we will not encounter a situation in which there is a
+// mismatch and we are still live.
+
+// returns the number of reference sequences
+const int BamMultiReader::GetReferenceCount(void) const {
+ return readers.front()->GetReferenceCount();
+}
+
+// returns vector of reference objects
+const BamTools::RefVector BamMultiReader::GetReferenceData(void) const {
+ return readers.front()->GetReferenceData();
+}
+
+const int BamMultiReader::GetReferenceID(const string& refName) const {
+ return readers.front()->GetReferenceID(refName);
+}
bool Jump(int refID, int position = 0);\r
// opens BAM files (and optional BAM index files, if provided)\r
//void Open(const vector<std::string&> filenames, const vector<std::string&> indexFilenames);\r
- void Open(const vector<string> filenames);\r
+ void Open(const vector<string> filenames, bool openIndexes = true);\r
// returns file pointers to beginning of alignments\r
bool Rewind(void);\r
\r
// ----------------------\r
\r
// returns unified SAM header text for all files\r
- const string GetUnifiedHeaderText(void) const;\r
+ const string GetHeaderText(void) const;\r
// returns number of reference sequences\r
const int GetReferenceCount(void) const;\r
// returns vector of reference objects\r
const BamTools::RefVector GetReferenceData(void) const;\r
// returns reference id (used for BamMultiReader::Jump()) for the given reference name\r
- //const int GetReferenceID(const std::string& refName) const;\r
+ const int GetReferenceID(const std::string& refName) const;\r
+ // validates that we have a congruent set of BAM files that are aligned against the same reference sequences\r
+ void ValidateReaders() const;\r
\r
// ----------------------\r
// BAM index operations\r
CXX= g++\r
CXXFLAGS= -Wall -O3\r
-PROG= BamConversion BamDump BamTrim BamMultiMerge\r
+PROG= BamConversion BamDump BamTrim bamtools\r
LIBS= -lz\r
\r
all: $(PROG)\r
\r
+bamtools: BGZF.o BamReader.o BamWriter.o BamMultiReader.o bamtools.o\r
+ $(CXX) $(CXXFLAGS) -o $@ BGZF.o BamReader.o BamWriter.o BamMultiReader.o bamtools.o $(LIBS)\r
+\r
BamConversion: BGZF.o BamReader.o BamWriter.o BamConversionMain.o\r
$(CXX) $(CXXFLAGS) -o $@ BGZF.o BamReader.o BamWriter.o BamConversionMain.o $(LIBS)\r
\r
BamTrim: BGZF.o BamReader.o BamWriter.o BamTrimMain.o\r
$(CXX) $(CXXFLAGS) -o $@ BGZF.o BamReader.o BamWriter.o BamTrimMain.o $(LIBS)\r
\r
-BamMultiMerge: BGZF.o BamMultiReader.o BamReader.o BamWriter.o BamMultiMergeMain.o\r
- $(CXX) $(CXXFLAGS) -o $@ BGZF.o BamMultiReader.o BamReader.o BamWriter.o BamMultiMergeMain.o $(LIBS)\r
-\r
clean:\r
- rm -fr gmon.out *.o *.a a.out *~\r
+ rm -fr gmon.out *.o *.a a.out *~ $(PROG)\r
// Std C/C++ includes
#include <cstdlib>
#include <iostream>
+#include <fstream>
#include <string>
-#include <boost/algorithm/string.hpp>
using namespace std;
// BamTools includes
#include "BamMultiReader.h"
using namespace BamTools;
-void usageSummary() {
- cerr << "usage: bamtools <command> [options]" << endl
+void usageSummary(string cmdname) {
+ cerr << "usage: " << cmdname << " <command> [options]" << endl
<< "actions:" << endl
- << " index <bam file>" << endl
- << " merge <merged BAM file> [<BAM file> <BAM file> ...]" << endl
- << " dump [<BAM file> <BAM file> ...]" << endl;
+ << " index <BAM file> # generates BAM index <BAM file>.bai" << endl
+ << " merge <merged BAM file> [<BAM file> <BAM file> ...] # merges BAM files into a single file" << endl
+ << " dump [<BAM file> <BAM file> ...] # dumps alignment summaries to stdout" << endl
+ << " header [<BAM file> <BAM file> ...] # prints header, or unified header for BAM file or files" << endl;
//<< " trim <input BAM file> <input BAM index file> <output BAM file> <reference name> <leftBound> <rightBound>" << endl;
}
BamMultiReader reader;
- reader.Open(filenames);
+ reader.Open(filenames, false); // opens the files without checking for indexes
string mergedHeader = reader.GetHeaderText();
}
+void BamDumpHeader(vector<string> files) {
+
+ BamMultiReader reader;
+ reader.Open(files, false);
+
+ cout << reader.GetHeaderText() << endl;
+
+ reader.Close();
+
+}
+
int main(int argc, char* argv[]) {
// validate argument count
- if( argc < 2 ) {
- usageSummary();
+ if( argc < 3 ) {
+ usageSummary(argv[0]);
exit(1);
}
string command = argv[1];
if (command == "index") {
+
BamCreateIndex(argv[2]);
+
} else if (command == "merge") {
+
vector<string> files;
string outputFile = argv[2];
+
+ // check if our output exists, and exit if so
+ ifstream output(outputFile.c_str());
+ if (output.good()) {
+ cerr << "ERROR: output file " << outputFile << " exists, exiting." << endl;
+ exit(1);
+ } else {
+ output.close();
+ }
+
for (int i = 3; i<argc; ++i) {
files.push_back(argv[i]);
}
BamMerge(outputFile, files);
+
} else if (command == "dump") {
+
vector<string> files;
for (int i = 2; i<argc; ++i) {
files.push_back(argv[i]);
}
BamDump(files);
+
+ } else if (command == "header") {
+
+ vector<string> files;
+ for (int i = 2; i<argc; ++i) {
+ files.push_back(argv[i]);
+ }
+ BamDumpHeader(files);
+
}