]> git.donarmstrong.com Git - bamtools.git/blob - BamMerge.cpp
bfa7908f47ee618e87bbaf4106a9a99f7a966927
[bamtools.git] / BamMerge.cpp
1
2 #include <iostream>
3 #include <map>
4 #include "BamReader.h"
5 #include "BamWriter.h"
6
7 using namespace std;
8
9 int main(int argc, char* argv[]) {
10
11         for (int a=0; a<argc; a++) {
12                 cout << argv[a] << " ";
13         }
14         cout << endl;
15   
16         if(argc < 4) {
17                 cout << "USAGE: " << argv[0] << " <input BAM file 1> <input BAM file 1> <output BAM file> [<maxReads>]" << endl;
18                 exit(1);
19         }
20
21         // localize our arguments
22         const char* inputFilename1 = argv[1];
23         const char* inputFilename2 = argv[2];
24         const char* outputFilename = argv[3];
25         unsigned int Nmax=0;
26         if (argc>4) {
27                 const char* cmaxRead = argv[4];
28                 Nmax = atoi(cmaxRead);
29         }
30       
31
32         // open our BAM reader1 for input file 1
33         BamReader reader1;
34         reader1.SetFilename(inputFilename1);
35
36         // open our BAM reader2 for input file 2
37         BamReader reader2;
38         reader2.SetFilename(inputFilename2);
39
40         // check reader1
41         if(!reader1.Open()) {
42                 cout << "ERROR: Unable to open the BAM file 1 (" << inputFilename1 << ")." << endl;
43                 exit(1);
44         }
45
46         // check reader2
47         if(!reader2.Open()) {
48                 cout << "ERROR: Unable to open the BAM file 1 (" << inputFilename2 << ")." << endl;
49                 exit(1);
50         }
51
52         // retrieve the header text from both files
53         string samHeader1 = reader1.GetHeaderText();
54         string samHeader2 = reader2.GetHeaderText();
55
56         // retrieve the reference sequence vectors
57         RefVector referenceSequences1 = reader1.GetReferenceData();
58         RefVector referenceSequences2 = reader2.GetReferenceData();
59
60   // merged referenceSequences
61   RefVector referenceSequences12;
62   
63         // process reference sequences from file 1
64         map<string, unsigned int, less<string> > refLengthMap;
65         map<string, unsigned int, less<string> > refIdMap;
66         map<string, RefData, less<string> > refDataMap;
67         
68         cerr << "Reference list from file 1" << endl;
69         unsigned int refCounter = 0;
70         for (RefVector::const_iterator refIter = referenceSequences1.begin(); efIter != referenceSequences1.end(); refIter++) {
71
72                 // retrieve
73                 RefData rd = *refIter;
74
75                 // get member data
76                 string refName = rd.RefName;
77                 unsigned int refLength = rd.RefLength;
78                 int refId = reader1.GetRefID(refName);
79
80                 // report
81                 cerr << " refName=" << refName << " refId=" << refId << endl;
82
83                 // store in maps
84                 refLengthMap[refName] = refLength;
85                 refIdMap[refName] = refCounter;
86                 refDataMap[refName] = rd;
87
88                 // add this to merged refVector
89                 referenceSequences12.push_back(rd);
90
91                 // increment ref count
92                 refCounter++;
93         }
94
95         // map file 2 refId to merged refID
96         map<unsigned int, unsigned int, less<unsigned int> > mapRefId12;
97
98         cerr << "Reference list from file 2" << endl;
99         for (RefVector::const_iterator refIter = referenceSequences2.begin(); refIter != referenceSequences2.end(); refIter++) {
100
101                 // retrieve
102                 RefData rd = *refIter;
103
104                 // get member data
105                 string refName = rd.RefName;
106                 unsigned int refLength = rd.RefLength;
107                 int refId = reader2.GetRefID(refName);
108
109                 // report
110                 cerr << " refName=" << refName << " refId=" << refId << endl;
111
112                 // if refName already in map, check ref length
113                 if (refLengthMap.count(refName) > 0) {
114
115                         // check if length is the same
116                         if (refLengthMap[refName] != refLength) {
117                                 cerr << "Refernce name in two files with inconsistent lengths: " << refName << " ... exiting." << endl;
118                                 exit(1);
119                         }
120
121                         // make ref id recoding entry
122                         unsigned int refIdNew = refIdMap[refName];
123                         mapRefId12[refId] = refIdNew;
124
125                 } else {
126                         // otherwise make new refId and RefData
127                         // store in maps
128                         refLengthMap[refName] = refLength;
129                         refIdMap[refName] = refCounter;
130                         refDataMap[refName] = rd;
131
132                         // make ref id recoding entry
133                         mapRefId12[refId] = refCounter;
134
135                         // add this to merged refVector
136                         referenceSequences12.push_back(rd);
137
138                         // increment ref count
139                         refCounter++;
140                 }
141         }
142
143         cerr << "Length of referenceSequences vector=" << referenceSequences12.size() << endl;
144
145         // open output bam file and write:
146         // empty header
147         string samHeader3 = "";
148
149         BamWriter writer;
150         writer.Open(outputFilename, samHeader3, referenceSequences12);
151
152         // iterate through alignments from file 1 and write them unchanged
153         BamAlignment ba;
154
155         bool keepgoing=true;
156         unsigned int ac1 = 0;
157         while(reader1.GetNextAlignment(ba) && keepgoing ) {
158                 writer.SaveAlignment(ba);
159                 ac1++;
160                 keepgoing=(Nmax<1)||(ac1<Nmax);
161         }
162
163         cerr << "Write " << ac1 << " " << inputFilename1 << " records to " << outputFilename << endl;
164
165         // close input file 1
166         reader1.Close();
167
168         // iterate through alignments from file 2 and:
169         // assign re-coded refId
170         // write alignment
171         keepgoing=true;
172         unsigned int ac2 = 0;
173         while(reader2.GetNextAlignment(ba)&&keepgoing ) {
174                 ac2++;
175                 keepgoing=(Nmax<1)||(ac2<Nmax);
176
177                 // retrieve original refId
178                 unsigned int refId = ba.RefID;
179
180                 // check if original refId is in recoding table... bomb if not
181                 if (mapRefId12.count(refId) <= 0) {
182                         cerr << "Alignment in file 2 has inconsistent reference sequence ID... exiting." << endl;
183                         exit(1);
184                 }
185
186                 // assign new refId
187                 ba.RefID = mapRefId12[refId];
188
189                 // cerr << " recoding refId=" << refId << " to refId=" << recodedRefId[refId] << endl;
190
191                 writer.SaveAlignment(ba);
192         }
193
194         cerr << "Write " << ac2 << " " << inputFilename2 << " records to " << outputFilename << endl;
195
196         // close input file 2
197         reader2.Close();
198
199         // close output file
200         writer.Close();
201
202         cerr << "Total " << ac1+ac2 << " records to " << outputFilename << endl;
203
204         // return
205         return 0;
206 }