From: Erik Garrison Date: Fri, 21 May 2010 21:07:31 +0000 (-0400) Subject: Complete prior commit X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;ds=sidebyside;h=320649eff43c8092ce9ef548179905dccfeb1c42;p=bamtools.git Complete prior commit In this commit, addition of verification that reference sequences are identical among readers opened by the BamMultiReader. Without this check the behavior of the MultiReader is undefined. --- diff --git a/BamMultiMergeMain.cpp b/BamMultiMergeMain.cpp deleted file mode 100644 index a4bed55..0000000 --- a/BamMultiMergeMain.cpp +++ /dev/null @@ -1,70 +0,0 @@ -#include "BamMultiReader.h" -#include "BamWriter.h" -#include -#include - -using namespace BamTools; -using namespace std; - -int main(int argc, char** argv) { - - if (argc == 1) { - cerr << "USAGE: ./BamMultiMerge [input files]" << endl; - exit(0); - } - - string outputFilename = argv[1]; - - BamMultiReader reader; - vector filenames; - for (int i = 2; i::iterator it = alignments.begin(); it != alignments.end(); ++it) { + for (vector::const_iterator it = alignments.begin(); it != alignments.end(); ++it) { BamAlignment* ba = *it; if (readerStates.at(i) != END && ba->RefID == CurrentRefID && @@ -137,7 +137,7 @@ bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) { // else continue and load the next alignment bool r = (readers.at(lowestAlignmentIndex))->GetNextAlignment(*alignments.at(lowestAlignmentIndex)); if (!r) { - cerr << "reached end of file " << readers.at(lowestAlignmentIndex)->GetFilename() << endl; + //cerr << "reached end of file " << readers.at(lowestAlignmentIndex)->GetFilename() << endl; readerStates.at(lowestAlignmentIndex) = END; // set flag for end of file } @@ -162,13 +162,17 @@ bool BamMultiReader::Jump(int refID, int position) { } // opens BAM files -void BamMultiReader::Open(const vector filenames) { +void BamMultiReader::Open(const vector filenames, bool openIndexes) { // for filename in filenames fileNames = filenames; // save filenames in our multireader for (vector::const_iterator it = filenames.begin(); it != filenames.end(); ++it) { string filename = *it; BamReader* reader = new BamReader; - reader->Open(filename, filename + ".bai"); + if (openIndexes) { + reader->Open(filename, filename + ".bai"); + } else { + reader->Open(filename); // for merging, jumping is disallowed + } BamAlignment* alignment = new BamAlignment; reader->GetNextAlignment(*alignment); readers.push_back(reader); // tracks readers @@ -176,6 +180,7 @@ void BamMultiReader::Open(const vector filenames) { BamReaderState readerState = START; readerStates.push_back(readerState); } + ValidateReaders(); } // Runs GetNextAlignment for all BAM readers; used during jumps @@ -217,7 +222,7 @@ bool BamMultiReader::CreateIndexes(void) { } // makes a virtual, unified header for all the bam files in the multireader -const string BamMultiReader::GetUnifiedHeaderText(void) const { +const string BamMultiReader::GetHeaderText(void) const { string mergedHeader = ""; @@ -263,3 +268,57 @@ const string BamMultiReader::GetUnifiedHeaderText(void) const { return mergedHeader; } +// ValidateReaders checks that all the readers point to BAM files representing +// alignments against the same set of reference sequences, and that the +// sequences are identically ordered. If these checks fail the operation of +// the multireader is undefined, so we force program exit. +void BamMultiReader::ValidateReaders(void) const { + int firstRefCount = readers.front()->GetReferenceCount(); + BamTools::RefVector firstRefData = readers.front()->GetReferenceData(); + for (vector::const_iterator it = readers.begin(); it != readers.end(); ++it) { + BamTools::RefVector currentRefData = (*it)->GetReferenceData(); + BamTools::RefVector::const_iterator f = firstRefData.begin(); + BamTools::RefVector::const_iterator c = currentRefData.begin(); + if ((*it)->GetReferenceCount() != firstRefCount || firstRefData.size() != currentRefData.size()) { + cerr << "ERROR: mismatched number of references in " << (*it)->GetFilename() + << " expected " << firstRefCount + << " reference sequences but only found " << (*it)->GetReferenceCount() << endl; + exit(1); + } + // this will be ok; we just checked above that we have identically-sized sets of references + // here we simply check if they are all, in fact, equal in content + while (f != firstRefData.end()) { + if (f->RefName != c->RefName || f->RefLength != c->RefLength) { + cerr << "ERROR: mismatched references found in " << (*it)->GetFilename() + << " expected: " << endl; + for (BamTools::RefVector::const_iterator a = firstRefData.begin(); a != firstRefData.end(); ++a) + cerr << a->RefName << " " << a->RefLength << endl; + cerr << "but found: " << endl; + for (BamTools::RefVector::const_iterator a = currentRefData.begin(); a != currentRefData.end(); ++a) + cerr << a->RefName << " " << a->RefLength << endl; + exit(1); + } + ++f; ++c; + } + } +} + +// NB: The following functions assume that we have identical references for all +// BAM files. We enforce this by invoking the above validation function +// (ValidateReaders) to verify that our reference data is the same across all +// files on Open, so we will not encounter a situation in which there is a +// mismatch and we are still live. + +// returns the number of reference sequences +const int BamMultiReader::GetReferenceCount(void) const { + return readers.front()->GetReferenceCount(); +} + +// returns vector of reference objects +const BamTools::RefVector BamMultiReader::GetReferenceData(void) const { + return readers.front()->GetReferenceData(); +} + +const int BamMultiReader::GetReferenceID(const string& refName) const { + return readers.front()->GetReferenceID(refName); +} diff --git a/BamMultiReader.h b/BamMultiReader.h index 319d327..0464256 100644 --- a/BamMultiReader.h +++ b/BamMultiReader.h @@ -48,7 +48,7 @@ class BamMultiReader { bool Jump(int refID, int position = 0); // opens BAM files (and optional BAM index files, if provided) //void Open(const vector filenames, const vector indexFilenames); - void Open(const vector filenames); + void Open(const vector filenames, bool openIndexes = true); // returns file pointers to beginning of alignments bool Rewind(void); @@ -68,13 +68,15 @@ class BamMultiReader { // ---------------------- // returns unified SAM header text for all files - const string GetUnifiedHeaderText(void) const; + const string GetHeaderText(void) const; // returns number of reference sequences const int GetReferenceCount(void) const; // returns vector of reference objects const BamTools::RefVector GetReferenceData(void) const; // returns reference id (used for BamMultiReader::Jump()) for the given reference name - //const int GetReferenceID(const std::string& refName) const; + const int GetReferenceID(const std::string& refName) const; + // validates that we have a congruent set of BAM files that are aligned against the same reference sequences + void ValidateReaders() const; // ---------------------- // BAM index operations diff --git a/Makefile b/Makefile index 7abd53b..fd08519 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,13 @@ CXX= g++ CXXFLAGS= -Wall -O3 -PROG= BamConversion BamDump BamTrim BamMultiMerge +PROG= BamConversion BamDump BamTrim bamtools LIBS= -lz all: $(PROG) +bamtools: BGZF.o BamReader.o BamWriter.o BamMultiReader.o bamtools.o + $(CXX) $(CXXFLAGS) -o $@ BGZF.o BamReader.o BamWriter.o BamMultiReader.o bamtools.o $(LIBS) + BamConversion: BGZF.o BamReader.o BamWriter.o BamConversionMain.o $(CXX) $(CXXFLAGS) -o $@ BGZF.o BamReader.o BamWriter.o BamConversionMain.o $(LIBS) @@ -14,8 +17,5 @@ BamDump: BGZF.o BamReader.o BamDumpMain.o BamTrim: BGZF.o BamReader.o BamWriter.o BamTrimMain.o $(CXX) $(CXXFLAGS) -o $@ BGZF.o BamReader.o BamWriter.o BamTrimMain.o $(LIBS) -BamMultiMerge: BGZF.o BamMultiReader.o BamReader.o BamWriter.o BamMultiMergeMain.o - $(CXX) $(CXXFLAGS) -o $@ BGZF.o BamMultiReader.o BamReader.o BamWriter.o BamMultiMergeMain.o $(LIBS) - clean: - rm -fr gmon.out *.o *.a a.out *~ + rm -fr gmon.out *.o *.a a.out *~ $(PROG) diff --git a/bamtools.cpp b/bamtools.cpp index e4d7769..ae3deed 100644 --- a/bamtools.cpp +++ b/bamtools.cpp @@ -9,8 +9,8 @@ // Std C/C++ includes #include #include +#include #include -#include using namespace std; // BamTools includes @@ -19,12 +19,13 @@ using namespace std; #include "BamMultiReader.h" using namespace BamTools; -void usageSummary() { - cerr << "usage: bamtools [options]" << endl +void usageSummary(string cmdname) { + cerr << "usage: " << cmdname << " [options]" << endl << "actions:" << endl - << " index " << endl - << " merge [ ...]" << endl - << " dump [ ...]" << endl; + << " index # generates BAM index .bai" << endl + << " merge [ ...] # merges BAM files into a single file" << endl + << " dump [ ...] # dumps alignment summaries to stdout" << endl + << " header [ ...] # prints header, or unified header for BAM file or files" << endl; //<< " trim " << endl; } @@ -33,7 +34,7 @@ void BamMerge(string outputFilename, vector filenames) { BamMultiReader reader; - reader.Open(filenames); + reader.Open(filenames, false); // opens the files without checking for indexes string mergedHeader = reader.GetHeaderText(); @@ -93,32 +94,67 @@ void BamDump(vector files) { } +void BamDumpHeader(vector files) { + + BamMultiReader reader; + reader.Open(files, false); + + cout << reader.GetHeaderText() << endl; + + reader.Close(); + +} + int main(int argc, char* argv[]) { // validate argument count - if( argc < 2 ) { - usageSummary(); + if( argc < 3 ) { + usageSummary(argv[0]); exit(1); } string command = argv[1]; if (command == "index") { + BamCreateIndex(argv[2]); + } else if (command == "merge") { + vector files; string outputFile = argv[2]; + + // check if our output exists, and exit if so + ifstream output(outputFile.c_str()); + if (output.good()) { + cerr << "ERROR: output file " << outputFile << " exists, exiting." << endl; + exit(1); + } else { + output.close(); + } + for (int i = 3; i files; for (int i = 2; i files; + for (int i = 2; i