From: barnett Date: Fri, 7 May 2010 03:10:38 +0000 (+0000) Subject: Didnt actually add BamMultiMerge and BamMultiReader with last commit. Adding to repos... X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=da8310a92e91d3682857b8f9affbe19621ea7bae;p=bamtools.git Didnt actually add BamMultiMerge and BamMultiReader with last commit. Adding to repository this time, seriously. git-svn-id: svn+ssh://gene.bc.edu/home/subversion/Derek/BamTools/trunk@51 9efb377e-2e27-44b9-b91a-ec4abb80ed8b --- diff --git a/BamMultiMergeMain.cpp b/BamMultiMergeMain.cpp new file mode 100644 index 0000000..a4bed55 --- /dev/null +++ b/BamMultiMergeMain.cpp @@ -0,0 +1,70 @@ +#include "BamMultiReader.h" +#include "BamWriter.h" +#include +#include + +using namespace BamTools; +using namespace std; + +int main(int argc, char** argv) { + + if (argc == 1) { + cerr << "USAGE: ./BamMultiMerge [input files]" << endl; + exit(0); + } + + string outputFilename = argv[1]; + + BamMultiReader reader; + vector filenames; + for (int i = 2; i +#include +#include +#include +#include +#include + +// BamTools includes +#include "BGZF.h" +#include "BamMultiReader.h" +using namespace BamTools; +using namespace std; + +// ----------------------------------------------------- +// BamMultiReader implementation +// ----------------------------------------------------- + +// constructor +BamMultiReader::BamMultiReader(void) + : CurrentRefID(0) + , CurrentLeft(0) +{ } + +// destructor +BamMultiReader::~BamMultiReader(void) { + Close(); // close the bam files + // clean up reader objects + for (vector::iterator br = readers.begin(); br != readers.end(); ++br) { + delete *br; + } +} + +// close the BAM files +void BamMultiReader::Close(void) { + int index = 0; + for (vector::iterator br = readers.begin(); br != readers.end(); ++br) { + BamReader* reader = *br; + reader->Close(); // close the reader + readerStates.at(index++) = CLOSED; // track its state + } +} + +// updates the reference id stored in the BamMultiReader +// to reflect the current state of the readers +void BamMultiReader::UpdateReferenceID(void) { + bool updateRefID = true; + int i = 0; + for (vector::iterator it = alignments.begin(); it != alignments.end(); ++it) { + BamAlignment* alignment = *it; + if (readerStates.at(i++) != END && alignment->RefID == CurrentRefID) { + updateRefID = false; + break; + } + } + if (updateRefID) { + // get the next reference id + // while there aren't any readers at the next ref id + // increment the ref id + int nextRefID = CurrentRefID; + bool ok = false; + while (!ok) { + ++nextRefID; + int i = 0; + for (vector::iterator it = alignments.begin(); it != alignments.end(); ++it) { + BamAlignment* alignment = *it; + if (readerStates.at(i++) != END && alignment->RefID == nextRefID) { + ok = true; + break; + } + } + } + cerr << "updating reference id from " << CurrentRefID << " to " << nextRefID << endl; + CurrentRefID = nextRefID; + } +} + +// checks if any readers still have alignments +bool BamMultiReader::HasOpenReaders() { + for (vector::iterator it = readerStates.begin(); it != readerStates.end(); ++it) { + BamReaderState readerState = *it; + if (readerState != END) + return true; + } + return false; +} + +// get next alignment among all files (from specified region, if given) +bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) { + + // bail out if we are at EOF in all files, means no more alignments to process + if (!HasOpenReaders()) + return false; + + // when all alignments have stepped into a new target sequence, update our + // current reference sequence id + UpdateReferenceID(); + + // find the alignment with the lowest position that is also in the current + // reference sequence, getnextalignment on its bam reader to step it + // forward, and return it + int i = 0, lowestPosition = -1, lowestAlignmentIndex = -1; + BamAlignment* lowestAlignment = NULL; + for (vector::iterator it = alignments.begin(); it != alignments.end(); ++it) { + BamAlignment* ba = *it; + if (readerStates.at(i) != END && + ba->RefID == CurrentRefID && + (lowestAlignment == NULL || ba->Position < lowestPosition)) { + lowestPosition = ba->Position; + lowestAlignmentIndex = i; + lowestAlignment = ba; + } + ++i; + } + + // now that we have the lowest alignment in the set, save it by copy to our argument + nextAlignment = BamAlignment(*lowestAlignment); + + // else continue and load the next alignment + bool r = (readers.at(lowestAlignmentIndex))->GetNextAlignment(*alignments.at(lowestAlignmentIndex)); + if (!r) { + cerr << "reached end of file " << readers.at(lowestAlignmentIndex)->GetFilename() << endl; + readerStates.at(lowestAlignmentIndex) = END; // set flag for end of file + } + + return true; +} + +// jumps to specified region(refID, leftBound) in BAM files, returns success/fail +bool BamMultiReader::Jump(int refID, int position) { + + //if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) { + CurrentRefID = refID; + CurrentLeft = position; + + bool result = true; + for (vector::iterator br = readers.begin(); br != readers.end(); ++br) { + BamReader* reader = *br; + result &= reader->Jump(refID, position); + } + if (result) + UpdateAlignments(); + return result; +} + +// opens BAM files +void BamMultiReader::Open(const vector filenames) { + // for filename in filenames + fileNames = filenames; // save filenames in our multireader + for (vector::const_iterator it = filenames.begin(); it != filenames.end(); ++it) { + string filename = *it; + BamReader* reader = new BamReader; + reader->Open(filename, filename + ".bai"); + BamAlignment* alignment = new BamAlignment; + reader->GetNextAlignment(*alignment); + readers.push_back(reader); // tracks readers + alignments.push_back(alignment); // tracks current alignments of the readers + BamReaderState readerState = START; + readerStates.push_back(readerState); + } +} + +// Runs GetNextAlignment for all BAM readers; used during jumps +void BamMultiReader::UpdateAlignments(void) { + int i = 0; + for (vector::iterator it = alignments.begin(); it != alignments.end(); ++it) { + BamAlignment* ba = *it; + readers.at(i++)->GetNextAlignment(*ba); + } +} + +void BamMultiReader::PrintFilenames(void) { + for (vector::iterator br = readers.begin(); br != readers.end(); ++br) { + BamReader* reader = *br; + cout << reader->GetFilename() << endl; + } +} + +// returns BAM file pointers to beginning of alignment data +bool BamMultiReader::Rewind(void) { + bool result = true; + int index = 0; + for (vector::iterator br = readers.begin(); br != readers.end(); ++br) { + BamReader* reader = *br; + result &= reader->Rewind(); + readerStates.at(index++) = START; + } + return result; +} + +// saves index data to BAM index files (".bai") where necessary, returns success/fail +bool BamMultiReader::CreateIndexes(void) { + bool result = true; + for (vector::iterator br = readers.begin(); br != readers.end(); ++br) { + BamReader* reader = *br; + result &= reader->CreateIndex(); + } + return result; +} + +const string BamMultiReader::GetUnifiedHeaderText(void) const { + + string mergedHeader = ""; + + // foreach extraction entry (each BAM file) + bool isFirstTime = true; + for (vector::const_iterator it = readers.begin(); it != readers.end(); ++it) { + + BamReader* reader = *it; + + string header = reader->GetHeaderText(); + + vector lines; + boost::split(lines, header, boost::is_any_of("\n")); + + for (vector::const_iterator it = lines.begin(); it != lines.end(); ++it) { + + // get next line from header, skip if empty + string headerLine = *it; + if ( headerLine.empty() ) { continue; } + + // if first file, save HD & SQ entries + if ( isFirstTime ) { + if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) { + mergedHeader.append(headerLine.c_str()); + mergedHeader.append(1, '\n'); + } + } + + // (for all files) append RG entries + if ( headerLine.find("@RG") == 0 ) { + mergedHeader.append(headerLine.c_str() ); + mergedHeader.append(1, '\n'); + } + + } + + // set iteration flag + isFirstTime = false; + } + + // return merged header text + return mergedHeader; +} + diff --git a/BamMultiReader.h b/BamMultiReader.h new file mode 100644 index 0000000..319d327 --- /dev/null +++ b/BamMultiReader.h @@ -0,0 +1,110 @@ +// *************************************************************************** +// BamMultiReader.h (c) 2010 Erik Garrison +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 22 February 2010 (EG) +// --------------------------------------------------------------------------- +// Functionality for simultaneously reading multiple BAM files +// *************************************************************************** + +#ifndef BAMMULTIREADER_H +#define BAMMULTIREADER_H + +// C++ includes +#include + +using namespace std; + +// BamTools includes +#include "BamAux.h" +#include "BamReader.h" + +namespace BamTools { + +enum BamReaderState { START, END, CLOSED }; + +class BamMultiReader { + + // constructor / destructor + public: + BamMultiReader(void); + ~BamMultiReader(void); + + // public interface + public: + + // positioning + int CurrentRefID; + int CurrentLeft; + + // ---------------------- + // BAM file operations + // ---------------------- + + // close BAM files + void Close(void); + // performs random-access jump to reference, position + bool Jump(int refID, int position = 0); + // opens BAM files (and optional BAM index files, if provided) + //void Open(const vector filenames, const vector indexFilenames); + void Open(const vector filenames); + // returns file pointers to beginning of alignments + bool Rewind(void); + + // ---------------------- + // access alignment data + // ---------------------- + // updates the reference id marker to match the lower limit of our readers + void UpdateReferenceID(void); + + // retrieves next available alignment (returns success/fail) from all files + bool GetNextAlignment(BamAlignment&); + // ... should this be private? + bool HasOpenReaders(void); + + // ---------------------- + // access auxiliary data + // ---------------------- + + // returns unified SAM header text for all files + const string GetUnifiedHeaderText(void) const; + // returns number of reference sequences + const int GetReferenceCount(void) const; + // returns vector of reference objects + const BamTools::RefVector GetReferenceData(void) const; + // returns reference id (used for BamMultiReader::Jump()) for the given reference name + //const int GetReferenceID(const std::string& refName) const; + + // ---------------------- + // BAM index operations + // ---------------------- + + // creates index for BAM files which lack them, saves to files (default = bamFilename + ".bai") + bool CreateIndexes(void); + + //const int GetReferenceID(const string& refName) const; + + // utility + void PrintFilenames(void); + void UpdateAlignments(void); + + + // private implementation + private: + // TODO perhaps, for legibility, I should use a struct to wrap them all up + // But this may actually make things more confusing, as I'm only + // operating on them all simultaneously during GetNextAlignment + // calls. + // all these vectors are ordered the same + // readers.at(N) refers to the same reader as alignments.at(N) and readerStates.at(N) + vector readers; // the set of readers which we operate on + vector alignments; // the equivalent set of alignments we use to step through the files + vector readerStates; // states of the various readers + // alignment position? + vector fileNames; +}; + +} // namespace BamTools + +#endif // BAMMULTIREADER_H