From da8310a92e91d3682857b8f9affbe19621ea7bae Mon Sep 17 00:00:00 2001
From: barnett <barnett@9efb377e-2e27-44b9-b91a-ec4abb80ed8b>
Date: Fri, 7 May 2010 03:10:38 +0000
Subject: [PATCH] Didnt actually add BamMultiMerge and BamMultiReader with last
 commit. Adding to repository this time, seriously.

git-svn-id: svn+ssh://gene.bc.edu/home/subversion/Derek/BamTools/trunk@51 9efb377e-2e27-44b9-b91a-ec4abb80ed8b
---
 BamMultiMergeMain.cpp |  70 +++++++++++
 BamMultiReader.cpp    | 263 ++++++++++++++++++++++++++++++++++++++++++
 BamMultiReader.h      | 110 ++++++++++++++++++
 3 files changed, 443 insertions(+)
 create mode 100644 BamMultiMergeMain.cpp
 create mode 100644 BamMultiReader.cpp
 create mode 100644 BamMultiReader.h

diff --git a/BamMultiMergeMain.cpp b/BamMultiMergeMain.cpp
new file mode 100644
index 0000000..a4bed55
--- /dev/null
+++ b/BamMultiMergeMain.cpp
@@ -0,0 +1,70 @@
+#include "BamMultiReader.h"
+#include "BamWriter.h"
+#include <boost/algorithm/string.hpp>
+#include <iostream>
+
+using namespace BamTools;
+using namespace std;
+
+int main(int argc, char** argv) {
+
+    if (argc == 1) {
+        cerr << "USAGE: ./BamMultiMerge <output file> [input files]" << endl;
+        exit(0);
+    }
+
+    string outputFilename = argv[1];
+
+    BamMultiReader reader;
+    vector<string> filenames;
+    for (int i = 2; i<argc; ++i) {
+        filenames.push_back(argv[i]);
+    }
+
+    reader.Open(filenames);
+
+    //cerr << "merging to " << outputFilename << endl;
+    string mergedHeader = reader.GetUnifiedHeaderText();
+    //cerr << "mergedHeader = " << endl << mergedHeader << endl;
+
+    // check that we are merging files which have the same sets of references
+    RefVector references;
+    int referencesSize = 0; bool first = true;
+    for (int i = 2; i<argc; ++i) {
+        BamReader areader;
+        areader.Open( argv[i] );
+        if (first) {
+            references = areader.GetReferenceData();
+            referencesSize = references.size();
+            first = false;
+        } else {
+            RefVector newreferences = areader.GetReferenceData();
+            int i = 0;
+            for (RefVector::const_iterator it = references.begin(); it != references.end(); it++) {
+                if (newreferences.at(i++).RefName != it->RefName) {
+                    cerr << "BAM FILES ALIGNED AGAINST DIFFERING SETS OF REFERENCES, NOT MERGING" << endl;
+                    exit(1);
+                }
+            }
+        }
+    }
+
+    // open BamWriter
+    BamWriter writer;
+    writer.Open( outputFilename.c_str(), mergedHeader, references);
+
+    BamAlignment bAlignment;
+    while (reader.GetNextAlignment(bAlignment)) {
+        // write to output file
+        writer.SaveAlignment(bAlignment);
+    }
+
+    // close output file
+    writer.Close();
+    // close input files
+    reader.Close();
+
+    //cerr << "done" << endl;
+
+    return 0;
+}
diff --git a/BamMultiReader.cpp b/BamMultiReader.cpp
new file mode 100644
index 0000000..77f4b81
--- /dev/null
+++ b/BamMultiReader.cpp
@@ -0,0 +1,263 @@
+// ***************************************************************************
+// BamMultiReader.cpp (c) 2010 Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 23 Februrary 2010 (EG)
+// ---------------------------------------------------------------------------
+// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad
+// Institute.
+// ---------------------------------------------------------------------------
+// Functionality for simultaneously reading multiple BAM files.
+//
+// This functionality allows applications to work on very large sets of files
+// without requiring intermediate merge, sort, and index steps for each file
+// subset.  It also improves the performance of our merge system as it
+// precludes the need to sort merged files.
+// ***************************************************************************
+
+// C++ includes
+#include <algorithm>
+#include <iterator>
+#include <string>
+#include <vector>
+#include <iostream>
+#include <boost/algorithm/string.hpp>
+
+// BamTools includes
+#include "BGZF.h"
+#include "BamMultiReader.h"
+using namespace BamTools;
+using namespace std;
+
+// -----------------------------------------------------
+// BamMultiReader implementation
+// -----------------------------------------------------
+
+// constructor
+BamMultiReader::BamMultiReader(void)
+    : CurrentRefID(0)
+    , CurrentLeft(0)
+{ }
+
+// destructor
+BamMultiReader::~BamMultiReader(void) {
+    Close(); // close the bam files
+    // clean up reader objects
+    for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
+        delete *br;
+    }
+}
+
+// close the BAM files
+void BamMultiReader::Close(void) {
+    int index = 0;
+    for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
+        BamReader* reader = *br;
+        reader->Close();  // close the reader
+        readerStates.at(index++) = CLOSED;  // track its state
+    }
+}
+
+// updates the reference id stored in the BamMultiReader
+// to reflect the current state of the readers
+void BamMultiReader::UpdateReferenceID(void) {
+    bool updateRefID = true;
+    int i = 0;
+    for (vector<BamAlignment*>::iterator it = alignments.begin(); it != alignments.end(); ++it) {
+        BamAlignment* alignment = *it;
+        if (readerStates.at(i++) != END && alignment->RefID == CurrentRefID) {
+            updateRefID = false;
+            break;
+        }
+    }
+    if (updateRefID) {
+        // get the next reference id
+        // while there aren't any readers at the next ref id
+        // increment the ref id
+        int nextRefID = CurrentRefID;
+        bool ok = false;
+        while (!ok) {
+            ++nextRefID;
+            int i = 0;
+            for (vector<BamAlignment*>::iterator it = alignments.begin(); it != alignments.end(); ++it) {
+                BamAlignment* alignment = *it;
+                if (readerStates.at(i++) != END && alignment->RefID == nextRefID) {
+                    ok = true;
+                    break;
+                }
+            }
+        }
+        cerr << "updating reference id from " << CurrentRefID << " to " << nextRefID << endl;
+        CurrentRefID = nextRefID;
+    }
+}
+
+// checks if any readers still have alignments
+bool BamMultiReader::HasOpenReaders() {
+    for (vector<BamReaderState>::iterator it = readerStates.begin(); it != readerStates.end(); ++it) {
+        BamReaderState readerState = *it;
+        if (readerState != END)
+            return true;
+    }
+    return false;
+}
+
+// get next alignment among all files (from specified region, if given)
+bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) {
+
+    // bail out if we are at EOF in all files, means no more alignments to process
+    if (!HasOpenReaders())
+        return false;
+
+    // when all alignments have stepped into a new target sequence, update our
+    // current reference sequence id
+    UpdateReferenceID();
+
+    // find the alignment with the lowest position that is also in the current
+    // reference sequence, getnextalignment on its bam reader to step it
+    // forward, and return it
+    int i = 0, lowestPosition = -1, lowestAlignmentIndex = -1;
+    BamAlignment* lowestAlignment = NULL;
+    for (vector<BamAlignment*>::iterator it = alignments.begin(); it != alignments.end(); ++it) {
+        BamAlignment* ba = *it;
+        if (readerStates.at(i) != END && 
+                ba->RefID == CurrentRefID && 
+                (lowestAlignment == NULL || ba->Position < lowestPosition)) {
+            lowestPosition = ba->Position;
+            lowestAlignmentIndex = i;
+            lowestAlignment = ba;
+        }
+        ++i;
+    }
+
+    // now that we have the lowest alignment in the set, save it by copy to our argument
+    nextAlignment = BamAlignment(*lowestAlignment);
+
+    // else continue and load the next alignment
+    bool r = (readers.at(lowestAlignmentIndex))->GetNextAlignment(*alignments.at(lowestAlignmentIndex));
+    if (!r) {
+        cerr << "reached end of file " << readers.at(lowestAlignmentIndex)->GetFilename() << endl;
+        readerStates.at(lowestAlignmentIndex) = END;  // set flag for end of file
+    }
+
+    return true;
+}
+
+// jumps to specified region(refID, leftBound) in BAM files, returns success/fail
+bool BamMultiReader::Jump(int refID, int position) {
+
+    //if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) {
+    CurrentRefID = refID;
+    CurrentLeft  = position;
+
+    bool result = true;
+    for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
+        BamReader* reader = *br;
+        result &= reader->Jump(refID, position);
+    }
+    if (result)
+        UpdateAlignments();
+    return result;
+}
+
+// opens BAM files
+void BamMultiReader::Open(const vector<string> filenames) {
+    // for filename in filenames
+    fileNames = filenames; // save filenames in our multireader
+    for (vector<string>::const_iterator it = filenames.begin(); it != filenames.end(); ++it) {
+        string filename = *it;
+        BamReader* reader = new BamReader;
+        reader->Open(filename, filename + ".bai");
+        BamAlignment* alignment = new BamAlignment;
+        reader->GetNextAlignment(*alignment);
+        readers.push_back(reader); // tracks readers
+        alignments.push_back(alignment); // tracks current alignments of the readers
+        BamReaderState readerState = START;
+        readerStates.push_back(readerState);
+    }
+}
+
+// Runs GetNextAlignment for all BAM readers; used during jumps
+void BamMultiReader::UpdateAlignments(void) {
+    int i = 0;
+    for (vector<BamAlignment*>::iterator it = alignments.begin(); it != alignments.end(); ++it) {
+        BamAlignment* ba = *it;
+        readers.at(i++)->GetNextAlignment(*ba);
+    }
+}
+
+void BamMultiReader::PrintFilenames(void) {
+    for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
+        BamReader* reader = *br;
+        cout << reader->GetFilename() << endl;
+    }
+}
+
+// returns BAM file pointers to beginning of alignment data
+bool BamMultiReader::Rewind(void) { 
+    bool result = true;
+    int index = 0;
+    for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
+        BamReader* reader = *br;
+        result &= reader->Rewind();
+        readerStates.at(index++) = START;
+    }
+    return result;
+}
+
+// saves index data to BAM index files (".bai") where necessary, returns success/fail
+bool BamMultiReader::CreateIndexes(void) {
+    bool result = true;
+    for (vector<BamReader*>::iterator br = readers.begin(); br != readers.end(); ++br) {
+        BamReader* reader = *br;
+        result &= reader->CreateIndex();
+    }
+    return result;
+}
+
+const string BamMultiReader::GetUnifiedHeaderText(void) const {
+
+    string mergedHeader = "";
+
+    // foreach extraction entry (each BAM file)
+    bool isFirstTime = true;
+    for (vector<BamReader*>::const_iterator it = readers.begin(); it != readers.end(); ++it) {
+
+        BamReader* reader = *it;
+
+        string header = reader->GetHeaderText();
+
+        vector<string> lines;
+        boost::split(lines, header, boost::is_any_of("\n"));
+
+        for (vector<string>::const_iterator it = lines.begin(); it != lines.end(); ++it) {
+
+            // get next line from header, skip if empty
+            string headerLine = *it;
+            if ( headerLine.empty() ) { continue; }
+
+            // if first file, save HD & SQ entries
+            if ( isFirstTime ) {
+                if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) {
+                    mergedHeader.append(headerLine.c_str());
+                    mergedHeader.append(1, '\n');
+                }
+            }
+
+            // (for all files) append RG entries
+            if ( headerLine.find("@RG") == 0 ) {
+                mergedHeader.append(headerLine.c_str() );
+                mergedHeader.append(1, '\n');
+            }
+
+        }
+
+        // set iteration flag
+        isFirstTime = false;
+    }
+
+    // return merged header text
+    return mergedHeader;
+}
+
diff --git a/BamMultiReader.h b/BamMultiReader.h
new file mode 100644
index 0000000..319d327
--- /dev/null
+++ b/BamMultiReader.h
@@ -0,0 +1,110 @@
+// ***************************************************************************
+// BamMultiReader.h (c) 2010 Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 22 February 2010 (EG)
+// ---------------------------------------------------------------------------
+// Functionality for simultaneously reading multiple BAM files
+// ***************************************************************************
+
+#ifndef BAMMULTIREADER_H
+#define BAMMULTIREADER_H
+
+// C++ includes
+#include <string>
+
+using namespace std;
+
+// BamTools includes
+#include "BamAux.h"
+#include "BamReader.h"
+
+namespace BamTools {
+
+enum BamReaderState { START, END, CLOSED };
+
+class BamMultiReader {
+
+    // constructor / destructor
+    public:
+        BamMultiReader(void);
+        ~BamMultiReader(void);
+
+    // public interface
+    public:
+
+        // positioning
+        int CurrentRefID;
+        int CurrentLeft;
+
+        // ----------------------
+        // BAM file operations
+        // ----------------------
+
+        // close BAM files
+        void Close(void);
+        // performs random-access jump to reference, position
+        bool Jump(int refID, int position = 0);
+        // opens BAM files (and optional BAM index files, if provided)
+        //void Open(const vector<std::string&> filenames, const vector<std::string&> indexFilenames);
+        void Open(const vector<string> filenames);
+        // returns file pointers to beginning of alignments
+        bool Rewind(void);
+
+        // ----------------------
+        // access alignment data
+        // ----------------------
+        // updates the reference id marker to match the lower limit of our readers
+        void UpdateReferenceID(void);
+
+        // retrieves next available alignment (returns success/fail) from all files
+        bool GetNextAlignment(BamAlignment&);
+        // ... should this be private?
+        bool HasOpenReaders(void);
+
+        // ----------------------
+        // access auxiliary data
+        // ----------------------
+
+        // returns unified SAM header text for all files
+        const string GetUnifiedHeaderText(void) const;
+        // returns number of reference sequences
+        const int GetReferenceCount(void) const;
+        // returns vector of reference objects
+        const BamTools::RefVector GetReferenceData(void) const;
+        // returns reference id (used for BamMultiReader::Jump()) for the given reference name
+        //const int GetReferenceID(const std::string& refName) const;
+
+        // ----------------------
+        // BAM index operations
+        // ----------------------
+
+        // creates index for BAM files which lack them, saves to files (default = bamFilename + ".bai")
+        bool CreateIndexes(void);
+
+        //const int GetReferenceID(const string& refName) const;
+
+        // utility
+        void PrintFilenames(void);
+        void UpdateAlignments(void);
+
+
+    // private implementation
+    private:
+        // TODO perhaps, for legibility, I should use a struct to wrap them all up
+        //      But this may actually make things more confusing, as I'm only
+        //      operating on them all simultaneously during GetNextAlignment
+        //      calls.
+        // all these vectors are ordered the same
+        // readers.at(N) refers to the same reader as alignments.at(N) and readerStates.at(N)
+        vector<BamReader*> readers; // the set of readers which we operate on
+        vector<BamAlignment*> alignments; // the equivalent set of alignments we use to step through the files
+        vector<BamReaderState> readerStates; // states of the various readers
+        // alignment position?
+        vector<string> fileNames;
+};
+
+} // namespace BamTools
+
+#endif // BAMMULTIREADER_H
-- 
2.39.5