]> git.donarmstrong.com Git - mothur.git/blob - bellerophon.cpp
working on chimeras
[mothur.git] / bellerophon.cpp
1 /*
2  *  bellerophon.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 7/9/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "bellerophon.h"
11 #include "eachgapdist.h"
12 #include "ignoregaps.h"
13 #include "onegapdist.h"
14
15
16 //***************************************************************************************************************
17
18 Bellerophon::Bellerophon(string name, string o)  {
19         try {
20                 fastafile = name;
21                 outputDir = o;
22         }
23         catch(exception& e) {
24                 errorOut(e, "Bellerophon", "Bellerophon");
25                 exit(1);
26         }
27 }
28
29 //***************************************************************************************************************
30 void Bellerophon::print(ostream& out) {
31         try {
32                 int above1 = 0;
33                 out << "Name\tScore\tLeft\tRight\t" << endl;
34                 //output prefenence structure to .chimeras file
35                 for (int i = 0; i < pref.size(); i++) {
36                         out << pref[i].name << '\t' << setprecision(3) << pref[i].score[0] << '\t' << pref[i].leftParent[0] << '\t' << pref[i].rightParent[0] << endl;
37                         
38                         //calc # of seqs with preference above 1.0
39                         if (pref[i].score[0] > 1.0) { 
40                                 above1++; 
41                                 mothurOut(pref[i].name + " is a suspected chimera at breakpoint " + toString(pref[i].midpoint)); mothurOutEndLine();
42                                 mothurOut("It's score is " + toString(pref[i].score[0]) + " with suspected left parent " + pref[i].leftParent[0] + " and right parent " + pref[i].rightParent[0]); mothurOutEndLine();
43                         }
44                 }
45                 
46                 //output results to screen
47                 mothurOutEndLine();
48                 mothurOut("Sequence with preference score above 1.0: " + toString(above1)); mothurOutEndLine();
49                 int spot;
50                 spot = pref.size()-1;
51                 mothurOut("Minimum:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
52                 spot = pref.size() * 0.975;
53                 mothurOut("2.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
54                 spot = pref.size() * 0.75;
55                 mothurOut("25%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
56                 spot = pref.size() * 0.50;
57                 mothurOut("Median: \t" + toString(pref[spot].score[0])); mothurOutEndLine();
58                 spot = pref.size() * 0.25;
59                 mothurOut("75%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
60                 spot = pref.size() * 0.025;
61                 mothurOut("97.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
62                 spot = 0;
63                 mothurOut("Maximum:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
64
65         }
66         catch(exception& e) {
67                 errorOut(e, "Bellerophon", "print");
68                 exit(1);
69         }
70 }
71
72 //********************************************************************************************************************
73 //sorts highest score to lowest
74 inline bool comparePref(Preference left, Preference right){
75         return (left.score[0] > right.score[0]);        
76 }
77
78 //***************************************************************************************************************
79 int Bellerophon::getChimeras() {
80         try {
81                 
82                 //do soft filter
83                 if (filter)  {
84                         string optionString = "fasta=" + fastafile + ", soft=50";
85                         if (outputDir != "") { optionString += ", outputdir=" + outputDir; }
86                         
87                         filterSeqs = new FilterSeqsCommand(optionString);
88                         filterSeqs->execute();
89                         delete filterSeqs;
90                         
91                         //reset fastafile to filtered file
92                         if (outputDir == "") { fastafile = getRootName(fastafile) + "filter.fasta"; }
93                         else                             { fastafile = outputDir + getRootName(getSimpleName(fastafile)) + "filter.fasta"; }
94                         
95                 }
96                 
97                 distCalculator = new eachGapDist();
98                 
99                 //read in sequences
100                 seqs = readSeqs(fastafile);
101                 
102                 if (unaligned) { mothurOut("Your sequences need to be aligned when you use the bellerophon method."); mothurOutEndLine(); return 1;  }
103                 
104                 int numSeqs = seqs.size();
105                 
106                 if (numSeqs == 0) { mothurOut("Error in reading you sequences."); mothurOutEndLine(); exit(1); }
107                 
108                 //set default window to 25% of sequence length
109                 string seq0 = seqs[0]->getAligned();
110                 if (window == 0) { window = seq0.length() / 4;  }
111                 else if (window > (seq0.length() / 2)) {  
112                         mothurOut("Your sequence length is = " + toString(seq0.length()) + ". You have selected a window size greater than the length of half your aligned sequence. I will run it with a window size of " + toString((seq0.length() / 2))); mothurOutEndLine();
113                         window = (seq0.length() / 2);
114                 }
115                 
116                 if (increment > (seqs[0]->getAlignLength() - (2*window))) { 
117                         if (increment != 10) {
118                         
119                                 mothurOut("You have selected a increment that is too large. I will use the default."); mothurOutEndLine();
120                                 increment = 10;
121                                 if (increment > (seqs[0]->getAlignLength() - (2*window))) {  increment = 0;  }
122                                 
123                         }else{ increment = 0; }
124                 }
125                 
126                 if (increment == 0) { iters = 1; }
127                 else { iters = ((seqs[0]->getAlignLength() - (2*window)) / increment); }
128                 
129                 //initialize pref
130                 pref.resize(numSeqs);  
131                 
132                 for (int i = 0; i < numSeqs; i++ ) { 
133                         pref[i].leftParent.resize(2); pref[i].rightParent.resize(2); pref[i].score.resize(2);   pref[i].closestLeft.resize(2); pref[i].closestRight.resize(3);
134                         pref[i].name = seqs[i]->getName();
135                         pref[i].score[0] = 0.0;  pref[i].score[1] = 0.0; 
136                         pref[i].closestLeft[0] = 100000.0;  pref[i].closestLeft[1] = 100000.0;  
137                         pref[i].closestRight[0] = 100000.0;  pref[i].closestRight[1] = 100000.0;  
138                 }
139
140                 int midpoint = window;
141                 int count = 0;
142                 while (count < iters) {
143                                 
144                                 //create 2 vectors of sequences, 1 for left side and one for right side
145                                 vector<Sequence> left;  vector<Sequence> right;
146                                 
147                                 for (int i = 0; i < seqs.size(); i++) {
148 //cout << "midpoint = " << midpoint << "\twindow = " << window << endl;
149 //cout << "whole = " << seqs[i]->getAligned().length() << endl;
150                                         //save left side
151                                         string seqLeft = seqs[i]->getAligned().substr(midpoint-window, window);
152                                         Sequence tempLeft;
153                                         tempLeft.setName(seqs[i]->getName());
154                                         tempLeft.setAligned(seqLeft);
155                                         left.push_back(tempLeft);
156 //cout << "left = " << tempLeft.getAligned().length() << endl;                  
157                                         //save right side
158                                         string seqRight = seqs[i]->getAligned().substr(midpoint, window);
159                                         Sequence tempRight;
160                                         tempRight.setName(seqs[i]->getName());
161                                         tempRight.setAligned(seqRight);
162                                         right.push_back(tempRight);
163 //cout << "right = " << seqRight.length() << endl;      
164                                 }
165                                 
166                                 //adjust midpoint by increment
167                                 midpoint += increment;
168                                 
169                                 
170                                 //this should be parallelized
171                                 //perference = sum of (| distance of my left to sequence j's left - distance of my right to sequence j's right | )
172                                 //create a matrix containing the distance from left to left and right to right
173                                 //calculate distances
174                                 SparseMatrix* SparseLeft = new SparseMatrix();
175                                 SparseMatrix* SparseRight = new SparseMatrix();
176                                 
177                                 createSparseMatrix(0, left.size(), SparseLeft, left);
178                                 createSparseMatrix(0, right.size(), SparseRight, right);
179                                 
180                                 vector<SeqMap> distMapRight;
181                                 vector<SeqMap> distMapLeft;
182                                 
183                                 // Create a data structure to quickly access the distance information.
184                                 //this is from thallingers reimplementation on get.oturep
185                                 // It consists of a vector of distance maps, where each map contains
186                                 // all distances of a certain sequence. Vector and maps are accessed
187                                 // via the index of a sequence in the distance matrix
188                                 distMapRight = vector<SeqMap>(numSeqs); 
189                                 distMapLeft = vector<SeqMap>(numSeqs); 
190                                 //cout << "left" << endl << endl;
191                                 for (MatData currentCell = SparseLeft->begin(); currentCell != SparseLeft->end(); currentCell++) {
192                                         distMapLeft[currentCell->row][currentCell->column] = currentCell->dist;
193                                         //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
194                                 }
195                                 //cout << "right" << endl << endl;
196                                 for (MatData currentCell = SparseRight->begin(); currentCell != SparseRight->end(); currentCell++) {
197                                         distMapRight[currentCell->row][currentCell->column] = currentCell->dist;
198                                         //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
199                                 }
200                                 
201                                 delete SparseLeft;
202                                 delete SparseRight;
203                                 
204                                 //fill preference structure
205                                 generatePreferences(distMapLeft, distMapRight, midpoint);
206                                 
207                                 count++;
208                                 
209                 }
210                 
211                 delete distCalculator;
212                 
213                 //rank preference score to eachother
214                 float dme = 0.0;
215                 float expectedPercent = 1 / (float) (pref.size());
216                 
217                 for (int i = 0; i < pref.size(); i++) {  dme += pref[i].score[0];  }
218         
219                 for (int i = 0; i < pref.size(); i++) {
220
221                         //gives the actual percentage of the dme this seq adds
222                         pref[i].score[0] = pref[i].score[0] / dme;
223                         
224                         //how much higher or lower is this than expected
225                         pref[i].score[0] = pref[i].score[0] / expectedPercent;
226                 
227                 }
228                 
229                 //sort Preferences highest to lowest
230                 sort(pref.begin(), pref.end(), comparePref);
231                 
232                 return 0;
233                 
234         }
235         catch(exception& e) {
236                 errorOut(e, "Bellerophon", "getChimeras");
237                 exit(1);
238         }
239 }
240
241 /***************************************************************************************************************/
242 int Bellerophon::createSparseMatrix(int startSeq, int endSeq, SparseMatrix* sparse, vector<Sequence> s){
243         try {
244
245                 for(int i=startSeq; i<endSeq; i++){
246                         
247                         for(int j=0;j<i;j++){
248                         
249                                 distCalculator->calcDist(s[i], s[j]);
250                                 float dist = distCalculator->getDist();
251                         
252                                 PCell temp(i, j, dist);
253                                 sparse->addCell(temp);
254                                 
255                         }
256                 }
257                 
258                 return 1;
259         }
260         catch(exception& e) {
261                 errorOut(e, "Bellerophon", "createSparseMatrix");
262                 exit(1);
263         }
264 }
265 /***************************************************************************************************************/
266 void Bellerophon::generatePreferences(vector<SeqMap> left, vector<SeqMap> right, int mid){
267         try {
268                 
269                 float dme = 0.0;
270                 SeqMap::iterator itR;
271                 SeqMap::iterator itL;
272                 
273                 //initialize pref[i]
274                 for (int i = 0; i < pref.size(); i++) {
275                         pref[i].score[1] = 0.0;
276                         pref[i].closestLeft[1] = 100000.0; 
277                         pref[i].closestRight[1] = 100000.0; 
278                         pref[i].leftParent[1] = "";
279                         pref[i].rightParent[1] = "";
280                 }
281         
282                 for (int i = 0; i < left.size(); i++) {
283                         
284                         SeqMap currentLeft = left[i];    //example i = 3;   currentLeft is a map of 0 to the distance of sequence 3 to sequence 0,
285                                                                                                 //                                                                              1 to the distance of sequence 3 to sequence 1,
286                                                                                                 //                                                                              2 to the distance of sequence 3 to sequence 2.
287                         SeqMap currentRight = right[i];         // same as left but with distances on the right side.
288                         
289                         for (int j = 0; j < i; j++) {
290                                 
291                                 itL = currentLeft.find(j);
292                                 itR = currentRight.find(j);
293 //cout << " i = " << i << " j = " << j << " distLeft = " << itL->second << endl;
294 //cout << " i = " << i << " j = " << j << " distright = " << itR->second << endl;
295                                 
296                                 //if you can find this entry update the preferences
297                                 if ((itL != currentLeft.end()) && (itR != currentRight.end())) {
298                                 
299                                         if (!correction) {
300                                                 pref[i].score[1] += abs((itL->second - itR->second));
301                                                 pref[j].score[1] += abs((itL->second - itR->second));
302 //cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
303 //cout << "abs = " << abs((itL->second - itR->second)) << endl;
304 //cout << i << " score = " << pref[i].score[1] << endl;
305 //cout << j << " score = " << pref[j].score[1] << endl;
306                                         }else {
307                                                 pref[i].score[1] += abs((sqrt(itL->second) - sqrt(itR->second)));
308                                                 pref[j].score[1] += abs((sqrt(itL->second) - sqrt(itR->second)));
309 //cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
310 //cout << "abs = " << abs((sqrt(itL->second) - sqrt(itR->second))) << endl;
311 //cout << i << " score = " << pref[i].score[1] << endl;
312 //cout << j << " score = " << pref[j].score[1] << endl;
313                                         }
314 //cout << "pref[" << i << "].closestLeft[1] = " <<      pref[i].closestLeft[1] << " parent = " << pref[i].leftParent[1] << endl;                        
315                                         //are you the closest left sequence
316                                         if (itL->second < pref[i].closestLeft[1]) {  
317
318                                                 pref[i].closestLeft[1] = itL->second;
319                                                 pref[i].leftParent[1] = seqs[j]->getName();
320 //cout << "updating closest left to " << pref[i].leftParent[1] << endl;
321                                         }
322 //cout << "pref[" << j << "].closestLeft[1] = " <<      pref[j].closestLeft[1] << " parent = " << pref[j].leftParent[1] << endl;        
323                                         if (itL->second < pref[j].closestLeft[1]) { 
324                                                 pref[j].closestLeft[1] = itL->second;
325                                                 pref[j].leftParent[1] = seqs[i]->getName();
326 //cout << "updating closest left to " << pref[j].leftParent[1] << endl;
327                                         }
328                                         
329                                         //are you the closest right sequence
330                                         if (itR->second < pref[i].closestRight[1]) {   
331                                                 pref[i].closestRight[1] = itR->second;
332                                                 pref[i].rightParent[1] = seqs[j]->getName();
333                                         }
334                                         if (itR->second < pref[j].closestRight[1]) {   
335                                                 pref[j].closestRight[1] = itR->second;
336                                                 pref[j].rightParent[1] = seqs[i]->getName();
337                                         }
338                                         
339                                 }
340                         }
341                 
342                 }
343                 
344                 
345                   
346                 //calculate the dme
347                 int count0 = 0;
348                 for (int i = 0; i < pref.size(); i++) {  dme += pref[i].score[1];  if (pref[i].score[1] == 0.0) { count0++; }  }
349                 
350                 float expectedPercent = 1 / (float) (pref.size() - count0);
351 //cout << endl << "dme = " << dme << endl;
352                 //recalculate prefernences based on dme
353                 for (int i = 0; i < pref.size(); i++) {
354 //cout << "unadjusted pref " << i << " = " << pref[i].score[1] << endl; 
355                         // gives the actual percentage of the dme this seq adds
356                         pref[i].score[1] = pref[i].score[1] / dme;
357                         
358                         //how much higher or lower is this than expected
359                         pref[i].score[1] = pref[i].score[1] / expectedPercent;
360                         
361                         //pref[i].score[1] = dme / (dme - 2 * pref[i].score[1]);
362                         
363                         //so a non chimeric sequence would be around 1, and a chimeric would be signifigantly higher.
364 //cout << "adjusted pref " << i << " = " << pref[i].score[1] << endl;                                   
365                 }
366                 
367                 //is this score bigger then the last score
368                 for (int i = 0; i < pref.size(); i++) {  
369                         
370                         //update biggest score
371                         if (pref[i].score[1] > pref[i].score[0]) {
372                                 pref[i].score[0] = pref[i].score[1];
373                                 pref[i].leftParent[0] = pref[i].leftParent[1];
374                                 pref[i].rightParent[0] = pref[i].rightParent[1];
375                                 pref[i].closestLeft[0] = pref[i].closestLeft[1];
376                                 pref[i].closestRight[0] = pref[i].closestRight[1];
377                                 pref[i].midpoint = mid;
378                         }
379                         
380                 }
381
382         }
383         catch(exception& e) {
384                 errorOut(e, "Bellerophon", "generatePreferences");
385                 exit(1);
386         }
387 }
388 /**************************************************************************************************/
389