]> git.donarmstrong.com Git - bamtools.git/commitdiff
Bug fix in BamMerge.cpp by Chip
authorbarnett <barnett@9efb377e-2e27-44b9-b91a-ec4abb80ed8b>
Sun, 19 Apr 2009 20:39:29 +0000 (20:39 +0000)
committerbarnett <barnett@9efb377e-2e27-44b9-b91a-ec4abb80ed8b>
Sun, 19 Apr 2009 20:39:29 +0000 (20:39 +0000)
git-svn-id: svn+ssh://gene.bc.edu/home/subversion/Derek/BamTools/trunk@16 9efb377e-2e27-44b9-b91a-ec4abb80ed8b

BamMerge.cpp

index 7f4383348bdc87b6e814ae9285053d396eb2f3ac..bfa7908f47ee618e87bbaf4106a9a99f7a966927 100644 (file)
-#include <iostream>\r
-#include <map>\r
-#include "BamReader.h"\r
-#include "BamWriter.h"\r
-\r
-using namespace std;\r
-\r
-int main(int argc, char* argv[]) {\r
-\r
-       for (int a=0; a<argc; a++) {\r
-               cout <<  argv[a] << " ";\r
-  }\r
-  cout << endl;\r
-  \r
-  if(argc != 4) {\r
-               cout << "USAGE: " << argv[0] << " <input BAM file 1> <input BAM file 1> <output BAM file>" << endl;\r
-               exit(1);\r
-       }\r
-\r
-       // localize our arguments\r
-       const char* inputFilename1  = argv[1];\r
-       const char* inputFilename2  = argv[2];\r
-       const char* outputFilename = argv[3];\r
-\r
-       // open our BAM reader1 for input file 1\r
-       BamReader reader1;\r
-       reader1.SetFilename(inputFilename1);\r
-\r
-       // open our BAM reader2 for input file 2\r
-       BamReader reader2;\r
-       reader2.SetFilename(inputFilename2);\r
-\r
-       // check reader1\r
-       if(!reader1.Open()) {\r
-               cout << "ERROR: Unable to open the BAM file 1 (" << inputFilename1 << ")." << endl;\r
-               exit(1);\r
-       }\r
-\r
-       // check reader2\r
-       if(!reader2.Open()) {\r
-               cout << "ERROR: Unable to open the BAM file 1 (" << inputFilename2 << ")." << endl;\r
-               exit(1);\r
-       }\r
-       \r
-       // retrieve the header text from both files\r
-       string samHeader1 = reader1.GetHeaderText();\r
-       string samHeader2 = reader2.GetHeaderText();\r
-\r
-       // retrieve the reference sequence vectors\r
-       RefVector referenceSequences1 = reader1.GetReferenceData();\r
-       RefVector referenceSequences2 = reader2.GetReferenceData();\r
-\r
-       // process reference sequences from file 1\r
-       map<string, unsigned int, less<string> > refLengthMap;\r
-       map<string, unsigned int, less<string> > refIdMap;\r
-       map<string, RefData, less<string> > refDataMap;\r
\r
-       cerr << "Reference list from file 1" << endl;\r
-       unsigned int refCounter = 0;\r
-       for (RefVector::const_iterator refIter = referenceSequences1.begin();\r
-            refIter != referenceSequences1.end(); refIter++) {\r
-\r
-         // retrieve\r
-         RefData rd = *refIter;\r
-\r
-         // get member data\r
-         string refName = rd.RefName;\r
-         unsigned int refLength = rd.RefLength;\r
-         int refId = reader1.GetRefID(refName);\r
-\r
-         // report\r
-         cerr << "  refName=" << refName << " refId=" << refId << endl;\r
-\r
-         // store in maps\r
-         refLengthMap[refName] = refLength;\r
-         refIdMap[refName] = refCounter;\r
-         refDataMap[refName] = rd;\r
-       \r
-         // increment ref count\r
-         refCounter++;\r
-       }\r
-\r
-       // process reference sequences from file 1\r
-       map<unsigned int, unsigned int, less<unsigned int> > recodedRefId;\r
-\r
-       cerr << "Reference list from file 2" << endl;\r
-       for (RefVector::const_iterator refIter = referenceSequences2.begin();\r
-            refIter != referenceSequences2.end(); refIter++) {\r
-\r
-         // retrieve\r
-         RefData rd = *refIter;\r
-\r
-         // get member data\r
-         string refName = rd.RefName;\r
-         unsigned int refLength = rd.RefLength;\r
-         int refId = reader2.GetRefID(refName);\r
-\r
-         // report\r
-         cerr << "  refName=" << refName << " refId=" << refId << endl;\r
-\r
-         // if refName already in map, check ref length\r
-         if (refLengthMap.count(refName) > 0) {\r
-           \r
-           // check if length is the same\r
-           if (refLengthMap[refName] != refLength) {\r
-             cerr << "Refernce name in two files with inconsistent lengths: " << refName << " ... exiting." << endl;\r
-             exit(1);\r
-           }\r
-\r
-           // make ref id recoding entry\r
-           unsigned int refIdNew = refIdMap[refName];\r
-           recodedRefId[refId] = refIdNew;\r
-         }\r
-         // otherwise make new refId and RefData\r
-         else {\r
-           // store in maps\r
-           refLengthMap[refName] = refLength;\r
-           refIdMap[refName] = refCounter;\r
-           refDataMap[refName] = rd;\r
-       \r
-           // make ref id recoding entry\r
-           recodedRefId[refId] = refCounter;\r
-\r
-           // increment ref count\r
-           refCounter++;\r
-         }\r
-       }\r
-\r
-       // make new referenceSequences vector\r
-       RefVector referenceSequences3;\r
-       unsigned rc = 0;\r
-       for (map<string, RefData, less<string> >::const_iterator rdIter = refDataMap.begin();\r
-            rdIter != refDataMap.end(); rdIter++) {\r
-         RefData rd = rdIter->second;\r
-         referenceSequences3.push_back(rd);\r
-         cerr << "Adding RefData for refName=" << rd.RefName << " refId=" << rc << endl;\r
-         rc++;\r
-       }\r
-       cerr << "Length of referenceSequences vector=" << referenceSequences3.size() << endl;\r
-       \r
-\r
-       // open output bam file and write:\r
-       //   empty header\r
-       //   new referenceSequences3 vector\r
-       string samHeader3 = "";\r
-\r
-       BamWriter writer;\r
-       writer.Open(outputFilename, samHeader3, referenceSequences3);\r
-\r
-       // iterate through alignments from file 1 and write them unchanged\r
-       BamAlignment ba;\r
-       \r
-       unsigned int ac1 = 0;\r
-       while(reader1.GetNextAlignment(ba)) {\r
-         writer.SaveAlignment(ba);\r
-         ac1++;\r
-       }\r
-       \r
-       cerr << "Write " << ac1 << " " << inputFilename1 << " records to " << outputFilename << endl;\r
-\r
-       // close input file 1\r
-       reader1.Close();\r
-\r
-       // iterate through alignments from file 2 and:\r
-       //   assign re-coded refId\r
-       //   write alignment\r
-       int ac2 = 0;\r
-       while(reader2.GetNextAlignment(ba) ) {\r
-         ac2++;\r
-\r
-         // retrieve original refId\r
-         unsigned int refId = ba.RefID;\r
-         \r
-         // check if original refId is in recoding table... bomb if not\r
-         if (recodedRefId.count(refId) <= 0) {\r
-           cerr << "Alignment in file 2 has inconsistent reference sequence ID... exiting." << endl;\r
-           exit(1);\r
-         }\r
-\r
-         // assign new refId\r
-         ba.RefID = recodedRefId[refId];\r
-\r
-         //      cerr << "  recoding refId=" << refId << " to refId=" << recodedRefId[refId] << endl;\r
-\r
-         writer.SaveAlignment(ba);\r
-       }\r
-\r
-       cerr << "Write " << ac2 << " " << inputFilename2 << " records to " << outputFilename << endl;\r
-       \r
-       // close input file 2\r
-       reader2.Close();\r
-\r
-       // close output file\r
-       writer.Close();\r
-\r
-       cerr << "Total " << ac1+ac2 <<  " records to " << outputFilename << endl;\r
-\r
-       // return\r
-       return 0;\r
-}\r
+
+#include <iostream>
+#include <map>
+#include "BamReader.h"
+#include "BamWriter.h"
+
+using namespace std;
+
+int main(int argc, char* argv[]) {
+
+       for (int a=0; a<argc; a++) {
+               cout << argv[a] << " ";
+       }
+       cout << endl;
+  
+       if(argc < 4) {
+               cout << "USAGE: " << argv[0] << " <input BAM file 1> <input BAM file 1> <output BAM file> [<maxReads>]" << endl;
+               exit(1);
+       }
+
+       // localize our arguments
+       const char* inputFilename1 = argv[1];
+       const char* inputFilename2 = argv[2];
+       const char* outputFilename = argv[3];
+       unsigned int Nmax=0;
+       if (argc>4) {
+               const char* cmaxRead = argv[4];
+               Nmax = atoi(cmaxRead);
+       }
+      
+
+       // open our BAM reader1 for input file 1
+       BamReader reader1;
+       reader1.SetFilename(inputFilename1);
+
+       // open our BAM reader2 for input file 2
+       BamReader reader2;
+       reader2.SetFilename(inputFilename2);
+
+       // check reader1
+       if(!reader1.Open()) {
+               cout << "ERROR: Unable to open the BAM file 1 (" << inputFilename1 << ")." << endl;
+               exit(1);
+       }
+
+       // check reader2
+       if(!reader2.Open()) {
+               cout << "ERROR: Unable to open the BAM file 1 (" << inputFilename2 << ")." << endl;
+               exit(1);
+       }
+
+       // retrieve the header text from both files
+       string samHeader1 = reader1.GetHeaderText();
+       string samHeader2 = reader2.GetHeaderText();
+
+       // retrieve the reference sequence vectors
+       RefVector referenceSequences1 = reader1.GetReferenceData();
+       RefVector referenceSequences2 = reader2.GetReferenceData();
+
+  // merged referenceSequences
+  RefVector referenceSequences12;
+  
+       // process reference sequences from file 1
+       map<string, unsigned int, less<string> > refLengthMap;
+       map<string, unsigned int, less<string> > refIdMap;
+       map<string, RefData, less<string> > refDataMap;
+       
+       cerr << "Reference list from file 1" << endl;
+       unsigned int refCounter = 0;
+       for (RefVector::const_iterator refIter = referenceSequences1.begin(); efIter != referenceSequences1.end(); refIter++) {
+
+               // retrieve
+               RefData rd = *refIter;
+
+               // get member data
+               string refName = rd.RefName;
+               unsigned int refLength = rd.RefLength;
+               int refId = reader1.GetRefID(refName);
+
+               // report
+               cerr << " refName=" << refName << " refId=" << refId << endl;
+
+               // store in maps
+               refLengthMap[refName] = refLength;
+               refIdMap[refName] = refCounter;
+               refDataMap[refName] = rd;
+
+               // add this to merged refVector
+               referenceSequences12.push_back(rd);
+
+               // increment ref count
+               refCounter++;
+       }
+
+       // map file 2 refId to merged refID
+       map<unsigned int, unsigned int, less<unsigned int> > mapRefId12;
+
+       cerr << "Reference list from file 2" << endl;
+       for (RefVector::const_iterator refIter = referenceSequences2.begin(); refIter != referenceSequences2.end(); refIter++) {
+
+               // retrieve
+               RefData rd = *refIter;
+
+               // get member data
+               string refName = rd.RefName;
+               unsigned int refLength = rd.RefLength;
+               int refId = reader2.GetRefID(refName);
+
+               // report
+               cerr << " refName=" << refName << " refId=" << refId << endl;
+
+               // if refName already in map, check ref length
+               if (refLengthMap.count(refName) > 0) {
+
+                       // check if length is the same
+                       if (refLengthMap[refName] != refLength) {
+                               cerr << "Refernce name in two files with inconsistent lengths: " << refName << " ... exiting." << endl;
+                               exit(1);
+                       }
+
+                       // make ref id recoding entry
+                       unsigned int refIdNew = refIdMap[refName];
+                       mapRefId12[refId] = refIdNew;
+
+               } else {
+                       // otherwise make new refId and RefData
+                       // store in maps
+                       refLengthMap[refName] = refLength;
+                       refIdMap[refName] = refCounter;
+                       refDataMap[refName] = rd;
+
+                       // make ref id recoding entry
+                       mapRefId12[refId] = refCounter;
+
+                       // add this to merged refVector
+                       referenceSequences12.push_back(rd);
+
+                       // increment ref count
+                       refCounter++;
+               }
+       }
+
+       cerr << "Length of referenceSequences vector=" << referenceSequences12.size() << endl;
+
+       // open output bam file and write:
+       // empty header
+       string samHeader3 = "";
+
+       BamWriter writer;
+       writer.Open(outputFilename, samHeader3, referenceSequences12);
+
+       // iterate through alignments from file 1 and write them unchanged
+       BamAlignment ba;
+
+       bool keepgoing=true;
+       unsigned int ac1 = 0;
+       while(reader1.GetNextAlignment(ba) && keepgoing ) {
+               writer.SaveAlignment(ba);
+               ac1++;
+               keepgoing=(Nmax<1)||(ac1<Nmax);
+       }
+
+       cerr << "Write " << ac1 << " " << inputFilename1 << " records to " << outputFilename << endl;
+
+       // close input file 1
+       reader1.Close();
+
+       // iterate through alignments from file 2 and:
+       // assign re-coded refId
+       // write alignment
+       keepgoing=true;
+       unsigned int ac2 = 0;
+       while(reader2.GetNextAlignment(ba)&&keepgoing ) {
+               ac2++;
+               keepgoing=(Nmax<1)||(ac2<Nmax);
+
+               // retrieve original refId
+               unsigned int refId = ba.RefID;
+
+               // check if original refId is in recoding table... bomb if not
+               if (mapRefId12.count(refId) <= 0) {
+                       cerr << "Alignment in file 2 has inconsistent reference sequence ID... exiting." << endl;
+                       exit(1);
+               }
+
+               // assign new refId
+               ba.RefID = mapRefId12[refId];
+
+               // cerr << " recoding refId=" << refId << " to refId=" << recodedRefId[refId] << endl;
+
+               writer.SaveAlignment(ba);
+       }
+
+       cerr << "Write " << ac2 << " " << inputFilename2 << " records to " << outputFilename << endl;
+
+       // close input file 2
+       reader2.Close();
+
+       // close output file
+       writer.Close();
+
+       cerr << "Total " << ac1+ac2 << " records to " << outputFilename << endl;
+
+       // return
+       return 0;
+}