From 320649eff43c8092ce9ef548179905dccfeb1c42 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Fri, 21 May 2010 17:07:31 -0400 Subject: [PATCH] Complete prior commit In this commit, addition of verification that reference sequences are identical among readers opened by the BamMultiReader. Without this check the behavior of the MultiReader is undefined. --- BamMultiMergeMain.cpp | 70 ------------------------------------------ BamMultiReader.cpp | 71 +++++++++++++++++++++++++++++++++++++++---- BamMultiReader.h | 8 +++-- Makefile | 10 +++--- bamtools.cpp | 54 ++++++++++++++++++++++++++------ 5 files changed, 120 insertions(+), 93 deletions(-) delete mode 100644 BamMultiMergeMain.cpp diff --git a/BamMultiMergeMain.cpp b/BamMultiMergeMain.cpp deleted file mode 100644 index a4bed55..0000000 --- a/BamMultiMergeMain.cpp +++ /dev/null @@ -1,70 +0,0 @@ -#include "BamMultiReader.h" -#include "BamWriter.h" -#include -#include - -using namespace BamTools; -using namespace std; - -int main(int argc, char** argv) { - - if (argc == 1) { - cerr << "USAGE: ./BamMultiMerge [input files]" << endl; - exit(0); - } - - string outputFilename = argv[1]; - - BamMultiReader reader; - vector filenames; - for (int i = 2; i::iterator it = alignments.begin(); it != alignments.end(); ++it) { + for (vector::const_iterator it = alignments.begin(); it != alignments.end(); ++it) { BamAlignment* ba = *it; if (readerStates.at(i) != END && ba->RefID == CurrentRefID && @@ -137,7 +137,7 @@ bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) { // else continue and load the next alignment bool r = (readers.at(lowestAlignmentIndex))->GetNextAlignment(*alignments.at(lowestAlignmentIndex)); if (!r) { - cerr << "reached end of file " << readers.at(lowestAlignmentIndex)->GetFilename() << endl; + //cerr << "reached end of file " << readers.at(lowestAlignmentIndex)->GetFilename() << endl; readerStates.at(lowestAlignmentIndex) = END; // set flag for end of file } @@ -162,13 +162,17 @@ bool BamMultiReader::Jump(int refID, int position) { } // opens BAM files -void BamMultiReader::Open(const vector filenames) { +void BamMultiReader::Open(const vector filenames, bool openIndexes) { // for filename in filenames fileNames = filenames; // save filenames in our multireader for (vector::const_iterator it = filenames.begin(); it != filenames.end(); ++it) { string filename = *it; BamReader* reader = new BamReader; - reader->Open(filename, filename + ".bai"); + if (openIndexes) { + reader->Open(filename, filename + ".bai"); + } else { + reader->Open(filename); // for merging, jumping is disallowed + } BamAlignment* alignment = new BamAlignment; reader->GetNextAlignment(*alignment); readers.push_back(reader); // tracks readers @@ -176,6 +180,7 @@ void BamMultiReader::Open(const vector filenames) { BamReaderState readerState = START; readerStates.push_back(readerState); } + ValidateReaders(); } // Runs GetNextAlignment for all BAM readers; used during jumps @@ -217,7 +222,7 @@ bool BamMultiReader::CreateIndexes(void) { } // makes a virtual, unified header for all the bam files in the multireader -const string BamMultiReader::GetUnifiedHeaderText(void) const { +const string BamMultiReader::GetHeaderText(void) const { string mergedHeader = ""; @@ -263,3 +268,57 @@ const string BamMultiReader::GetUnifiedHeaderText(void) const { return mergedHeader; } +// ValidateReaders checks that all the readers point to BAM files representing +// alignments against the same set of reference sequences, and that the +// sequences are identically ordered. If these checks fail the operation of +// the multireader is undefined, so we force program exit. +void BamMultiReader::ValidateReaders(void) const { + int firstRefCount = readers.front()->GetReferenceCount(); + BamTools::RefVector firstRefData = readers.front()->GetReferenceData(); + for (vector::const_iterator it = readers.begin(); it != readers.end(); ++it) { + BamTools::RefVector currentRefData = (*it)->GetReferenceData(); + BamTools::RefVector::const_iterator f = firstRefData.begin(); + BamTools::RefVector::const_iterator c = currentRefData.begin(); + if ((*it)->GetReferenceCount() != firstRefCount || firstRefData.size() != currentRefData.size()) { + cerr << "ERROR: mismatched number of references in " << (*it)->GetFilename() + << " expected " << firstRefCount + << " reference sequences but only found " << (*it)->GetReferenceCount() << endl; + exit(1); + } + // this will be ok; we just checked above that we have identically-sized sets of references + // here we simply check if they are all, in fact, equal in content + while (f != firstRefData.end()) { + if (f->RefName != c->RefName || f->RefLength != c->RefLength) { + cerr << "ERROR: mismatched references found in " << (*it)->GetFilename() + << " expected: " << endl; + for (BamTools::RefVector::const_iterator a = firstRefData.begin(); a != firstRefData.end(); ++a) + cerr << a->RefName << " " << a->RefLength << endl; + cerr << "but found: " << endl; + for (BamTools::RefVector::const_iterator a = currentRefData.begin(); a != currentRefData.end(); ++a) + cerr << a->RefName << " " << a->RefLength << endl; + exit(1); + } + ++f; ++c; + } + } +} + +// NB: The following functions assume that we have identical references for all +// BAM files. We enforce this by invoking the above validation function +// (ValidateReaders) to verify that our reference data is the same across all +// files on Open, so we will not encounter a situation in which there is a +// mismatch and we are still live. + +// returns the number of reference sequences +const int BamMultiReader::GetReferenceCount(void) const { + return readers.front()->GetReferenceCount(); +} + +// returns vector of reference objects +const BamTools::RefVector BamMultiReader::GetReferenceData(void) const { + return readers.front()->GetReferenceData(); +} + +const int BamMultiReader::GetReferenceID(const string& refName) const { + return readers.front()->GetReferenceID(refName); +} diff --git a/BamMultiReader.h b/BamMultiReader.h index 319d327..0464256 100644 --- a/BamMultiReader.h +++ b/BamMultiReader.h @@ -48,7 +48,7 @@ class BamMultiReader { bool Jump(int refID, int position = 0); // opens BAM files (and optional BAM index files, if provided) //void Open(const vector filenames, const vector indexFilenames); - void Open(const vector filenames); + void Open(const vector filenames, bool openIndexes = true); // returns file pointers to beginning of alignments bool Rewind(void); @@ -68,13 +68,15 @@ class BamMultiReader { // ---------------------- // returns unified SAM header text for all files - const string GetUnifiedHeaderText(void) const; + const string GetHeaderText(void) const; // returns number of reference sequences const int GetReferenceCount(void) const; // returns vector of reference objects const BamTools::RefVector GetReferenceData(void) const; // returns reference id (used for BamMultiReader::Jump()) for the given reference name - //const int GetReferenceID(const std::string& refName) const; + const int GetReferenceID(const std::string& refName) const; + // validates that we have a congruent set of BAM files that are aligned against the same reference sequences + void ValidateReaders() const; // ---------------------- // BAM index operations diff --git a/Makefile b/Makefile index 7abd53b..fd08519 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,13 @@ CXX= g++ CXXFLAGS= -Wall -O3 -PROG= BamConversion BamDump BamTrim BamMultiMerge +PROG= BamConversion BamDump BamTrim bamtools LIBS= -lz all: $(PROG) +bamtools: BGZF.o BamReader.o BamWriter.o BamMultiReader.o bamtools.o + $(CXX) $(CXXFLAGS) -o $@ BGZF.o BamReader.o BamWriter.o BamMultiReader.o bamtools.o $(LIBS) + BamConversion: BGZF.o BamReader.o BamWriter.o BamConversionMain.o $(CXX) $(CXXFLAGS) -o $@ BGZF.o BamReader.o BamWriter.o BamConversionMain.o $(LIBS) @@ -14,8 +17,5 @@ BamDump: BGZF.o BamReader.o BamDumpMain.o BamTrim: BGZF.o BamReader.o BamWriter.o BamTrimMain.o $(CXX) $(CXXFLAGS) -o $@ BGZF.o BamReader.o BamWriter.o BamTrimMain.o $(LIBS) -BamMultiMerge: BGZF.o BamMultiReader.o BamReader.o BamWriter.o BamMultiMergeMain.o - $(CXX) $(CXXFLAGS) -o $@ BGZF.o BamMultiReader.o BamReader.o BamWriter.o BamMultiMergeMain.o $(LIBS) - clean: - rm -fr gmon.out *.o *.a a.out *~ + rm -fr gmon.out *.o *.a a.out *~ $(PROG) diff --git a/bamtools.cpp b/bamtools.cpp index e4d7769..ae3deed 100644 --- a/bamtools.cpp +++ b/bamtools.cpp @@ -9,8 +9,8 @@ // Std C/C++ includes #include #include +#include #include -#include using namespace std; // BamTools includes @@ -19,12 +19,13 @@ using namespace std; #include "BamMultiReader.h" using namespace BamTools; -void usageSummary() { - cerr << "usage: bamtools [options]" << endl +void usageSummary(string cmdname) { + cerr << "usage: " << cmdname << " [options]" << endl << "actions:" << endl - << " index " << endl - << " merge [ ...]" << endl - << " dump [ ...]" << endl; + << " index # generates BAM index .bai" << endl + << " merge [ ...] # merges BAM files into a single file" << endl + << " dump [ ...] # dumps alignment summaries to stdout" << endl + << " header [ ...] # prints header, or unified header for BAM file or files" << endl; //<< " trim " << endl; } @@ -33,7 +34,7 @@ void BamMerge(string outputFilename, vector filenames) { BamMultiReader reader; - reader.Open(filenames); + reader.Open(filenames, false); // opens the files without checking for indexes string mergedHeader = reader.GetHeaderText(); @@ -93,32 +94,67 @@ void BamDump(vector files) { } +void BamDumpHeader(vector files) { + + BamMultiReader reader; + reader.Open(files, false); + + cout << reader.GetHeaderText() << endl; + + reader.Close(); + +} + int main(int argc, char* argv[]) { // validate argument count - if( argc < 2 ) { - usageSummary(); + if( argc < 3 ) { + usageSummary(argv[0]); exit(1); } string command = argv[1]; if (command == "index") { + BamCreateIndex(argv[2]); + } else if (command == "merge") { + vector files; string outputFile = argv[2]; + + // check if our output exists, and exit if so + ifstream output(outputFile.c_str()); + if (output.good()) { + cerr << "ERROR: output file " << outputFile << " exists, exiting." << endl; + exit(1); + } else { + output.close(); + } + for (int i = 3; i files; for (int i = 2; i files; + for (int i = 2; i