-#include <iostream>\r
-#include <map>\r
-#include "BamReader.h"\r
-#include "BamWriter.h"\r
-\r
-using namespace std;\r
-\r
-int main(int argc, char* argv[]) {\r
-\r
- for (int a=0; a<argc; a++) {\r
- cout << argv[a] << " ";\r
- }\r
- cout << endl;\r
- \r
- if(argc != 4) {\r
- cout << "USAGE: " << argv[0] << " <input BAM file 1> <input BAM file 1> <output BAM file>" << endl;\r
- exit(1);\r
- }\r
-\r
- // localize our arguments\r
- const char* inputFilename1 = argv[1];\r
- const char* inputFilename2 = argv[2];\r
- const char* outputFilename = argv[3];\r
-\r
- // open our BAM reader1 for input file 1\r
- BamReader reader1;\r
- reader1.SetFilename(inputFilename1);\r
-\r
- // open our BAM reader2 for input file 2\r
- BamReader reader2;\r
- reader2.SetFilename(inputFilename2);\r
-\r
- // check reader1\r
- if(!reader1.Open()) {\r
- cout << "ERROR: Unable to open the BAM file 1 (" << inputFilename1 << ")." << endl;\r
- exit(1);\r
- }\r
-\r
- // check reader2\r
- if(!reader2.Open()) {\r
- cout << "ERROR: Unable to open the BAM file 1 (" << inputFilename2 << ")." << endl;\r
- exit(1);\r
- }\r
- \r
- // retrieve the header text from both files\r
- string samHeader1 = reader1.GetHeaderText();\r
- string samHeader2 = reader2.GetHeaderText();\r
-\r
- // retrieve the reference sequence vectors\r
- RefVector referenceSequences1 = reader1.GetReferenceData();\r
- RefVector referenceSequences2 = reader2.GetReferenceData();\r
-\r
- // process reference sequences from file 1\r
- map<string, unsigned int, less<string> > refLengthMap;\r
- map<string, unsigned int, less<string> > refIdMap;\r
- map<string, RefData, less<string> > refDataMap;\r
- \r
- cerr << "Reference list from file 1" << endl;\r
- unsigned int refCounter = 0;\r
- for (RefVector::const_iterator refIter = referenceSequences1.begin();\r
- refIter != referenceSequences1.end(); refIter++) {\r
-\r
- // retrieve\r
- RefData rd = *refIter;\r
-\r
- // get member data\r
- string refName = rd.RefName;\r
- unsigned int refLength = rd.RefLength;\r
- int refId = reader1.GetRefID(refName);\r
-\r
- // report\r
- cerr << " refName=" << refName << " refId=" << refId << endl;\r
-\r
- // store in maps\r
- refLengthMap[refName] = refLength;\r
- refIdMap[refName] = refCounter;\r
- refDataMap[refName] = rd;\r
- \r
- // increment ref count\r
- refCounter++;\r
- }\r
-\r
- // process reference sequences from file 1\r
- map<unsigned int, unsigned int, less<unsigned int> > recodedRefId;\r
-\r
- cerr << "Reference list from file 2" << endl;\r
- for (RefVector::const_iterator refIter = referenceSequences2.begin();\r
- refIter != referenceSequences2.end(); refIter++) {\r
-\r
- // retrieve\r
- RefData rd = *refIter;\r
-\r
- // get member data\r
- string refName = rd.RefName;\r
- unsigned int refLength = rd.RefLength;\r
- int refId = reader2.GetRefID(refName);\r
-\r
- // report\r
- cerr << " refName=" << refName << " refId=" << refId << endl;\r
-\r
- // if refName already in map, check ref length\r
- if (refLengthMap.count(refName) > 0) {\r
- \r
- // check if length is the same\r
- if (refLengthMap[refName] != refLength) {\r
- cerr << "Refernce name in two files with inconsistent lengths: " << refName << " ... exiting." << endl;\r
- exit(1);\r
- }\r
-\r
- // make ref id recoding entry\r
- unsigned int refIdNew = refIdMap[refName];\r
- recodedRefId[refId] = refIdNew;\r
- }\r
- // otherwise make new refId and RefData\r
- else {\r
- // store in maps\r
- refLengthMap[refName] = refLength;\r
- refIdMap[refName] = refCounter;\r
- refDataMap[refName] = rd;\r
- \r
- // make ref id recoding entry\r
- recodedRefId[refId] = refCounter;\r
-\r
- // increment ref count\r
- refCounter++;\r
- }\r
- }\r
-\r
- // make new referenceSequences vector\r
- RefVector referenceSequences3;\r
- unsigned rc = 0;\r
- for (map<string, RefData, less<string> >::const_iterator rdIter = refDataMap.begin();\r
- rdIter != refDataMap.end(); rdIter++) {\r
- RefData rd = rdIter->second;\r
- referenceSequences3.push_back(rd);\r
- cerr << "Adding RefData for refName=" << rd.RefName << " refId=" << rc << endl;\r
- rc++;\r
- }\r
- cerr << "Length of referenceSequences vector=" << referenceSequences3.size() << endl;\r
- \r
-\r
- // open output bam file and write:\r
- // empty header\r
- // new referenceSequences3 vector\r
- string samHeader3 = "";\r
-\r
- BamWriter writer;\r
- writer.Open(outputFilename, samHeader3, referenceSequences3);\r
-\r
- // iterate through alignments from file 1 and write them unchanged\r
- BamAlignment ba;\r
- \r
- unsigned int ac1 = 0;\r
- while(reader1.GetNextAlignment(ba)) {\r
- writer.SaveAlignment(ba);\r
- ac1++;\r
- }\r
- \r
- cerr << "Write " << ac1 << " " << inputFilename1 << " records to " << outputFilename << endl;\r
-\r
- // close input file 1\r
- reader1.Close();\r
-\r
- // iterate through alignments from file 2 and:\r
- // assign re-coded refId\r
- // write alignment\r
- int ac2 = 0;\r
- while(reader2.GetNextAlignment(ba) ) {\r
- ac2++;\r
-\r
- // retrieve original refId\r
- unsigned int refId = ba.RefID;\r
- \r
- // check if original refId is in recoding table... bomb if not\r
- if (recodedRefId.count(refId) <= 0) {\r
- cerr << "Alignment in file 2 has inconsistent reference sequence ID... exiting." << endl;\r
- exit(1);\r
- }\r
-\r
- // assign new refId\r
- ba.RefID = recodedRefId[refId];\r
-\r
- // cerr << " recoding refId=" << refId << " to refId=" << recodedRefId[refId] << endl;\r
-\r
- writer.SaveAlignment(ba);\r
- }\r
-\r
- cerr << "Write " << ac2 << " " << inputFilename2 << " records to " << outputFilename << endl;\r
- \r
- // close input file 2\r
- reader2.Close();\r
-\r
- // close output file\r
- writer.Close();\r
-\r
- cerr << "Total " << ac1+ac2 << " records to " << outputFilename << endl;\r
-\r
- // return\r
- return 0;\r
-}\r
+
+#include <iostream>
+#include <map>
+#include "BamReader.h"
+#include "BamWriter.h"
+
+using namespace std;
+
+int main(int argc, char* argv[]) {
+
+ for (int a=0; a<argc; a++) {
+ cout << argv[a] << " ";
+ }
+ cout << endl;
+
+ if(argc < 4) {
+ cout << "USAGE: " << argv[0] << " <input BAM file 1> <input BAM file 1> <output BAM file> [<maxReads>]" << endl;
+ exit(1);
+ }
+
+ // localize our arguments
+ const char* inputFilename1 = argv[1];
+ const char* inputFilename2 = argv[2];
+ const char* outputFilename = argv[3];
+ unsigned int Nmax=0;
+ if (argc>4) {
+ const char* cmaxRead = argv[4];
+ Nmax = atoi(cmaxRead);
+ }
+
+
+ // open our BAM reader1 for input file 1
+ BamReader reader1;
+ reader1.SetFilename(inputFilename1);
+
+ // open our BAM reader2 for input file 2
+ BamReader reader2;
+ reader2.SetFilename(inputFilename2);
+
+ // check reader1
+ if(!reader1.Open()) {
+ cout << "ERROR: Unable to open the BAM file 1 (" << inputFilename1 << ")." << endl;
+ exit(1);
+ }
+
+ // check reader2
+ if(!reader2.Open()) {
+ cout << "ERROR: Unable to open the BAM file 1 (" << inputFilename2 << ")." << endl;
+ exit(1);
+ }
+
+ // retrieve the header text from both files
+ string samHeader1 = reader1.GetHeaderText();
+ string samHeader2 = reader2.GetHeaderText();
+
+ // retrieve the reference sequence vectors
+ RefVector referenceSequences1 = reader1.GetReferenceData();
+ RefVector referenceSequences2 = reader2.GetReferenceData();
+
+ // merged referenceSequences
+ RefVector referenceSequences12;
+
+ // process reference sequences from file 1
+ map<string, unsigned int, less<string> > refLengthMap;
+ map<string, unsigned int, less<string> > refIdMap;
+ map<string, RefData, less<string> > refDataMap;
+
+ cerr << "Reference list from file 1" << endl;
+ unsigned int refCounter = 0;
+ for (RefVector::const_iterator refIter = referenceSequences1.begin(); efIter != referenceSequences1.end(); refIter++) {
+
+ // retrieve
+ RefData rd = *refIter;
+
+ // get member data
+ string refName = rd.RefName;
+ unsigned int refLength = rd.RefLength;
+ int refId = reader1.GetRefID(refName);
+
+ // report
+ cerr << " refName=" << refName << " refId=" << refId << endl;
+
+ // store in maps
+ refLengthMap[refName] = refLength;
+ refIdMap[refName] = refCounter;
+ refDataMap[refName] = rd;
+
+ // add this to merged refVector
+ referenceSequences12.push_back(rd);
+
+ // increment ref count
+ refCounter++;
+ }
+
+ // map file 2 refId to merged refID
+ map<unsigned int, unsigned int, less<unsigned int> > mapRefId12;
+
+ cerr << "Reference list from file 2" << endl;
+ for (RefVector::const_iterator refIter = referenceSequences2.begin(); refIter != referenceSequences2.end(); refIter++) {
+
+ // retrieve
+ RefData rd = *refIter;
+
+ // get member data
+ string refName = rd.RefName;
+ unsigned int refLength = rd.RefLength;
+ int refId = reader2.GetRefID(refName);
+
+ // report
+ cerr << " refName=" << refName << " refId=" << refId << endl;
+
+ // if refName already in map, check ref length
+ if (refLengthMap.count(refName) > 0) {
+
+ // check if length is the same
+ if (refLengthMap[refName] != refLength) {
+ cerr << "Refernce name in two files with inconsistent lengths: " << refName << " ... exiting." << endl;
+ exit(1);
+ }
+
+ // make ref id recoding entry
+ unsigned int refIdNew = refIdMap[refName];
+ mapRefId12[refId] = refIdNew;
+
+ } else {
+ // otherwise make new refId and RefData
+ // store in maps
+ refLengthMap[refName] = refLength;
+ refIdMap[refName] = refCounter;
+ refDataMap[refName] = rd;
+
+ // make ref id recoding entry
+ mapRefId12[refId] = refCounter;
+
+ // add this to merged refVector
+ referenceSequences12.push_back(rd);
+
+ // increment ref count
+ refCounter++;
+ }
+ }
+
+ cerr << "Length of referenceSequences vector=" << referenceSequences12.size() << endl;
+
+ // open output bam file and write:
+ // empty header
+ string samHeader3 = "";
+
+ BamWriter writer;
+ writer.Open(outputFilename, samHeader3, referenceSequences12);
+
+ // iterate through alignments from file 1 and write them unchanged
+ BamAlignment ba;
+
+ bool keepgoing=true;
+ unsigned int ac1 = 0;
+ while(reader1.GetNextAlignment(ba) && keepgoing ) {
+ writer.SaveAlignment(ba);
+ ac1++;
+ keepgoing=(Nmax<1)||(ac1<Nmax);
+ }
+
+ cerr << "Write " << ac1 << " " << inputFilename1 << " records to " << outputFilename << endl;
+
+ // close input file 1
+ reader1.Close();
+
+ // iterate through alignments from file 2 and:
+ // assign re-coded refId
+ // write alignment
+ keepgoing=true;
+ unsigned int ac2 = 0;
+ while(reader2.GetNextAlignment(ba)&&keepgoing ) {
+ ac2++;
+ keepgoing=(Nmax<1)||(ac2<Nmax);
+
+ // retrieve original refId
+ unsigned int refId = ba.RefID;
+
+ // check if original refId is in recoding table... bomb if not
+ if (mapRefId12.count(refId) <= 0) {
+ cerr << "Alignment in file 2 has inconsistent reference sequence ID... exiting." << endl;
+ exit(1);
+ }
+
+ // assign new refId
+ ba.RefID = mapRefId12[refId];
+
+ // cerr << " recoding refId=" << refId << " to refId=" << recodedRefId[refId] << endl;
+
+ writer.SaveAlignment(ba);
+ }
+
+ cerr << "Write " << ac2 << " " << inputFilename2 << " records to " << outputFilename << endl;
+
+ // close input file 2
+ reader2.Close();
+
+ // close output file
+ writer.Close();
+
+ cerr << "Total " << ac1+ac2 << " records to " << outputFilename << endl;
+
+ // return
+ return 0;
+}