]> git.donarmstrong.com Git - bamtools.git/commitdiff
Added Chip's BamMerge to the repository
authorbarnett <barnett@9efb377e-2e27-44b9-b91a-ec4abb80ed8b>
Mon, 13 Apr 2009 13:45:48 +0000 (13:45 +0000)
committerbarnett <barnett@9efb377e-2e27-44b9-b91a-ec4abb80ed8b>
Mon, 13 Apr 2009 13:45:48 +0000 (13:45 +0000)
git-svn-id: svn+ssh://gene.bc.edu/home/subversion/Derek/BamTools/trunk@14 9efb377e-2e27-44b9-b91a-ec4abb80ed8b

BamMerge.cpp [new file with mode: 0644]
Makefile

diff --git a/BamMerge.cpp b/BamMerge.cpp
new file mode 100644 (file)
index 0000000..43dc821
--- /dev/null
@@ -0,0 +1,200 @@
+#include <iostream>\r
+#include <map>\r
+#include "BamReader.h"\r
+#include "BamWriter.h"\r
+\r
+using namespace std;\r
+\r
+int main(int argc, char* argv[]) {\r
+\r
+       for (int a=0; a<argc; a++) {\r
+               cout <<  argv[a] << " ";\r
+  }\r
+  cout << endl;\r
+  \r
+  if(argc != 4) {\r
+               cout << "USAGE: " << argv[0] << " <input BAM file 1> <input BAM file 1> <output BAM file>" << endl;\r
+               exit(1);\r
+       }\r
+\r
+       // localize our arguments\r
+       const char* inputFilename1  = argv[1];\r
+       const char* inputFilename2  = argv[2];\r
+       const char* outputFilename = argv[3];\r
+\r
+       // open our BAM reader1 for input file 1\r
+       BamReader reader1;\r
+       reader1.SetFilename(inputFilename1);\r
+\r
+       // open our BAM reader2 for input file 2\r
+       BamReader reader2;\r
+       reader2.SetFilename(inputFilename2);\r
+\r
+       // check reader1\r
+       if(!reader1.Open()) {\r
+               cout << "ERROR: Unable to open the BAM file 1 (" << inputFilename1 << ")." << endl;\r
+               exit(1);\r
+       }\r
+\r
+       // check reader2\r
+       if(!reader2.Open()) {\r
+               cout << "ERROR: Unable to open the BAM file 1 (" << inputFilename2 << ")." << endl;\r
+               exit(1);\r
+       }\r
+       \r
+       // retrieve the header text from both files\r
+       string samHeader1 = reader1.GetHeaderText();\r
+       string samHeader2 = reader2.GetHeaderText();\r
+\r
+       // retrieve the reference sequence vectors\r
+       RefVector referenceSequences1 = reader1.GetReferenceData();\r
+       RefVector referenceSequences2 = reader2.GetReferenceData();\r
+\r
+       // process reference sequences from file 1\r
+       map<string, unsigned int, less<string> > refLengthMap;\r
+       map<string, unsigned int, less<string> > refIdMap;\r
+       map<string, RefData, less<string> > refDataMap;\r
\r
+       cerr << "Reference list from file 1" << endl;\r
+       unsigned int refCounter = 0;\r
+       for (RefVector::const_iterator refIter = referenceSequences1.begin();\r
+            refIter != referenceSequences1.end(); refIter++) {\r
+\r
+         // retrieve\r
+         RefData rd = *refIter;\r
+\r
+         // get member data\r
+         string refName = rd.RefName;\r
+         unsigned int refLength = rd.RefLength;\r
+         int refId = reader1.GetRefID(refName);\r
+\r
+         // report\r
+         cerr << "  refName=" << refName << " refId=" << refId << endl;\r
+\r
+         // store in maps\r
+         refLengthMap[refName] = refLength;\r
+         refIdMap[refName] = refCounter;\r
+         refDataMap[refName] = rd;\r
+       \r
+         // increment ref count\r
+         refCounter++;\r
+       }\r
+\r
+       // process reference sequences from file 1\r
+       map<unsigned int, unsigned int, less<unsigned int> > recodedRefId;\r
+\r
+       cerr << "Reference list from file 2" << endl;\r
+       for (RefVector::const_iterator refIter = referenceSequences2.begin();\r
+            refIter != referenceSequences2.end(); refIter++) {\r
+\r
+         // retrieve\r
+         RefData rd = *refIter;\r
+\r
+         // get member data\r
+         string refName = rd.RefName;\r
+         unsigned int refLength = rd.RefLength;\r
+         int refId = reader2.GetRefID(refName);\r
+\r
+         // report\r
+         cerr << "  refName=" << refName << " refId=" << refId << endl;\r
+\r
+         // if refName already in map, check ref length\r
+         if (refLengthMap.count(refName) > 0) {\r
+           \r
+           // check if length is the same\r
+           if (refLengthMap[refName] != refLength) {\r
+             cerr << "Refernce name in two files with inconsistent lengths: " << refName << " ... exiting." << endl;\r
+             exit(1);\r
+           }\r
+\r
+           // make ref id recoding entry\r
+           unsigned int refIdNew = refIdMap[refName];\r
+           recodedRefId[refId] = refIdNew;\r
+         }\r
+         // otherwise make new refId and RefData\r
+         else {\r
+           // store in maps\r
+           refLengthMap[refName] = refLength;\r
+           refIdMap[refName] = refCounter;\r
+           refDataMap[refName] = rd;\r
+       \r
+           // make ref id recoding entry\r
+           recodedRefId[refId] = refCounter;\r
+\r
+           // increment ref count\r
+           refCounter++;\r
+         }\r
+       }\r
+\r
+       // make new referenceSequences vector\r
+       RefVector referenceSequences3;\r
+       unsigned rc = 0;\r
+       for (map<string, RefData, less<string> >::const_iterator rdIter = refDataMap.begin();\r
+            rdIter != refDataMap.end(); rdIter++) {\r
+         RefData rd = rdIter->second;\r
+         referenceSequences3.push_back(rd);\r
+         cerr << "Adding RefData for refName=" << rd.RefName << " refId=" << rc << endl;\r
+         rc++;\r
+       }\r
+       cerr << "Length of referenceSequences vector=" << referenceSequences3.size() << endl;\r
+       \r
+\r
+       // open output bam file and write:\r
+       //   empty header\r
+       //   new referenceSequences3 vector\r
+       string samHeader3 = "";\r
+\r
+       BamWriter writer;\r
+       writer.Open(outputFilename, samHeader3, referenceSequences3);\r
+\r
+       // iterate through alignments from file 1 and write them unchanged\r
+       BamAlignment ba;\r
+       \r
+       unsigned int ac1 = 0;\r
+       while(reader1.GetNextAlignment(ba) && ac1 < 400) {\r
+         writer.SaveAlignment(ba);\r
+         ac1++;\r
+       }\r
+       \r
+       cerr << "Write " << ac1 << " " << inputFilename1 << " records to " << outputFilename << endl;\r
+\r
+       // close input file 1\r
+       reader1.Close();\r
+\r
+       // iterate through alignments from file 2 and:\r
+       //   assign re-coded refId\r
+       //   write alignment\r
+       int ac2 = 0;\r
+       while(reader2.GetNextAlignment(ba) && ac2 < 400) {\r
+         ac2++;\r
+\r
+         // retrieve original refId\r
+         unsigned int refId = ba.RefID;\r
+         \r
+         // check if original refId is in recoding table... bomb if not\r
+         if (recodedRefId.count(refId) <= 0) {\r
+           cerr << "Alignment in file 2 has inconsistent reference sequence ID... exiting." << endl;\r
+           exit(1);\r
+         }\r
+\r
+         // assign new refId\r
+         ba.RefID = recodedRefId[refId];\r
+\r
+         //      cerr << "  recoding refId=" << refId << " to refId=" << recodedRefId[refId] << endl;\r
+\r
+         writer.SaveAlignment(ba);\r
+       }\r
+\r
+       cerr << "Write " << ac2 << " " << inputFilename2 << " records to " << outputFilename << endl;\r
+       \r
+       // close input file 2\r
+       reader2.Close();\r
+\r
+       // close output file\r
+       writer.Close();\r
+\r
+       cerr << "Total " << ac1+ac2 <<  " records to " << outputFilename << endl;\r
+\r
+       // return\r
+       return 0;\r
+}\r
index a61b64bcabd286df25f604e61c6b98189ab4cec8..8862226d50cf01812083549ca1d71f41dc29cde0 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,35 +1,39 @@
-CC=            gcc
-CXX=           g++
-CFLAGS=                -Wall -O3
-CXXFLAGS=      $(CFLAGS)
-DFLAGS=                -D_IOLIB=2 #-D_NDEBUG
-OBJS=          BamReader.o bgzf.o
-PROG=          BamReaderTest
-INCLUDES=       
-ARFLAGS=       -crs
-LIBS=          -lz
-SUBDIRS=       .
-
-.SUFFIXES:.c .cpp .o
-
-.c.o:
-               $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
-
-.cpp.o:
-               $(CXX) -c $(CXXFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
-
-all: $(PROG) BamConversion
-
-lib:libbambc.a
-
-libbambc.a:$(OBJS)
-               $(AR) $(ARFLAGS) $@ $(OBJS)
-
-BamReaderTest:lib BamReaderMain.o
-               $(CXX) $(CXXFLAGS) -o $@ BamReaderMain.o $(LIBS) -L. -lbambc
-
-BamConversion: lib BamWriter.o BamConversionMain.o
-       $(CXX) $(CXXFLAGS) -o $@ BamWriter.o BamConversionMain.o $(LIBS) -L. -lbambc
-
-clean:
-               rm -fr gmon.out *.o *.a a.out $(PROG) BamConversion *~
+CC=            gcc\r
+CXX=           g++\r
+CFLAGS=                -Wall -O3\r
+CXXFLAGS=      $(CFLAGS)\r
+DFLAGS=                -D_IOLIB=2 #-D_NDEBUG\r
+OBJS=          BamReader.o bgzf.o\r
+PROG=          BamReaderTest\r
+INCLUDES=       \r
+ARFLAGS=       -crs\r
+LIBS=          -lz\r
+SUBDIRS=       .\r
+\r
+.SUFFIXES:.c .cpp .o\r
+\r
+.c.o:\r
+               $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@\r
+\r
+.cpp.o:\r
+               $(CXX) -c $(CXXFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@\r
+\r
+all: $(PROG) BamConversion\r
+\r
+lib:libbambc.a\r
+\r
+libbambc.a:$(OBJS)\r
+               $(AR) $(ARFLAGS) $@ $(OBJS)\r
+\r
+BamReaderTest:lib BamReaderMain.o\r
+               $(CXX) $(CXXFLAGS) -o $@ BamReaderMain.o $(LIBS) -L. -lbambc\r
+\r
+BamConversion: lib BamWriter.o BamConversionMain.o\r
+       $(CXX) $(CXXFLAGS) -o $@ BamWriter.o BamConversionMain.o $(LIBS) -L. -lbambc\r
+\r
+BamMerge: lib BamMerge.o\r
+               $(CXX) $(CXXFLAGS) -o $@ BamWriter.o BamMerge.o $(LIBS) -L. -lbambc\r
+\r
+\r
+clean:\r
+               rm -fr gmon.out *.o *.a a.out $(PROG) BamConversion *~\r