]> git.donarmstrong.com Git - mothur.git/blob - bellerophon.cpp
54dfb9b67ecf5e764e3c159dffcc47d6fd7004dc
[mothur.git] / bellerophon.cpp
1 /*
2  *  bellerophon.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 7/9/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "bellerophon.h"
11 #include "eachgapdist.h"
12 #include "ignoregaps.h"
13 #include "onegapdist.h"
14
15
16 //***************************************************************************************************************
17
18 Bellerophon::Bellerophon(string name, string o)  {
19         try {
20                 fastafile = name;
21                 outputDir = o;
22         }
23         catch(exception& e) {
24                 m->errorOut(e, "Bellerophon", "Bellerophon");
25                 exit(1);
26         }
27 }
28
29 //***************************************************************************************************************
30 int Bellerophon::print(ostream& out, ostream& outAcc) {
31         try {
32                 int above1 = 0;
33                 out << "Name\tScore\tLeft\tRight\t" << endl;
34                 //output prefenence structure to .chimeras file
35                 for (int i = 0; i < pref.size(); i++) {
36                         
37                         if (m->control_pressed) {  return 0; }
38                         
39                         out << pref[i].name << '\t' << setprecision(3) << pref[i].score[0] << '\t' << pref[i].leftParent[0] << '\t' << pref[i].rightParent[0] << endl;
40                         
41                         //calc # of seqs with preference above 1.0
42                         if (pref[i].score[0] > 1.0) { 
43                                 above1++; 
44                                 outAcc << pref[i].name << endl;
45                                 m->mothurOut(pref[i].name + " is a suspected chimera at breakpoint " + toString(pref[i].midpoint)); m->mothurOutEndLine();
46                                 m->mothurOut("It's score is " + toString(pref[i].score[0]) + " with suspected left parent " + pref[i].leftParent[0] + " and right parent " + pref[i].rightParent[0]); m->mothurOutEndLine();
47                         }
48                 }
49                 
50                 //output results to screen
51                 m->mothurOutEndLine();
52                 m->mothurOut("Sequence with preference score above 1.0: " + toString(above1)); m->mothurOutEndLine();
53                 int spot;
54                 spot = pref.size()-1;
55                 m->mothurOut("Minimum:\t" + toString(pref[spot].score[0])); m->mothurOutEndLine();
56                 spot = pref.size() * 0.975;
57                 m->mothurOut("2.5%-tile:\t" + toString(pref[spot].score[0])); m->mothurOutEndLine();
58                 spot = pref.size() * 0.75;
59                 m->mothurOut("25%-tile:\t" + toString(pref[spot].score[0])); m->mothurOutEndLine();
60                 spot = pref.size() * 0.50;
61                 m->mothurOut("Median: \t" + toString(pref[spot].score[0])); m->mothurOutEndLine();
62                 spot = pref.size() * 0.25;
63                 m->mothurOut("75%-tile:\t" + toString(pref[spot].score[0])); m->mothurOutEndLine();
64                 spot = pref.size() * 0.025;
65                 m->mothurOut("97.5%-tile:\t" + toString(pref[spot].score[0])); m->mothurOutEndLine();
66                 spot = 0;
67                 m->mothurOut("Maximum:\t" + toString(pref[spot].score[0])); m->mothurOutEndLine();
68                 
69                 return 1;
70
71         }
72         catch(exception& e) {
73                 m->errorOut(e, "Bellerophon", "print");
74                 exit(1);
75         }
76 }
77
78 //********************************************************************************************************************
79 //sorts highest score to lowest
80 inline bool comparePref(Preference left, Preference right){
81         return (left.score[0] > right.score[0]);        
82 }
83
84 //***************************************************************************************************************
85 int Bellerophon::getChimeras() {
86         try {
87                 
88                 //do soft filter
89                 if (filter)  {
90                         string optionString = "fasta=" + fastafile + ", soft=50";
91                         if (outputDir != "") { optionString += ", outputdir=" + outputDir; }
92                         
93                         filterSeqs = new FilterSeqsCommand(optionString);
94                         filterSeqs->execute();
95                         delete filterSeqs;
96                         
97                         if (m->control_pressed) { return 0; }
98                         
99                         //reset fastafile to filtered file
100                         if (outputDir == "") { fastafile = getRootName(fastafile) + "filter.fasta"; }
101                         else                             { fastafile = outputDir + getRootName(getSimpleName(fastafile)) + "filter.fasta"; }
102                         
103                 }
104                 
105                 distCalculator = new eachGapDist();
106                 
107                 //read in sequences
108                 seqs = readSeqs(fastafile);
109                 
110                 if (m->control_pressed) { return 0; }
111         
112                 if (unaligned) { m->mothurOut("Your sequences need to be aligned when you use the bellerophon method."); m->mothurOutEndLine(); return 1;  }
113                 
114                 int numSeqs = seqs.size();
115                 
116                 if (numSeqs == 0) { m->mothurOut("Error in reading you sequences."); m->mothurOutEndLine(); exit(1); }
117                 
118                 //set default window to 25% of sequence length
119                 string seq0 = seqs[0]->getAligned();
120                 if (window == 0) { window = seq0.length() / 4;  }
121                 else if (window > (seq0.length() / 2)) {  
122                         m->mothurOut("Your sequence length is = " + toString(seq0.length()) + ". You have selected a window size greater than the length of half your aligned sequence. I will run it with a window size of " + toString((seq0.length() / 2))); m->mothurOutEndLine();
123                         window = (seq0.length() / 2);
124                 }
125                 
126                 if (increment > (seqs[0]->getAlignLength() - (2*window))) { 
127                         if (increment != 10) {
128                         
129                                 m->mothurOut("You have selected a increment that is too large. I will use the default."); m->mothurOutEndLine();
130                                 increment = 10;
131                                 if (increment > (seqs[0]->getAlignLength() - (2*window))) {  increment = 0;  }
132                                 
133                         }else{ increment = 0; }
134                 }
135                 
136                 if (increment == 0) { iters = 1; }
137                 else { iters = ((seqs[0]->getAlignLength() - (2*window)) / increment); }
138                 
139                 //initialize pref
140                 pref.resize(numSeqs);  
141                 
142                 for (int i = 0; i < numSeqs; i++ ) { 
143                         pref[i].leftParent.resize(2); pref[i].rightParent.resize(2); pref[i].score.resize(2);   pref[i].closestLeft.resize(2); pref[i].closestRight.resize(3);
144                         pref[i].name = seqs[i]->getName();
145                         pref[i].score[0] = 0.0;  pref[i].score[1] = 0.0; 
146                         pref[i].closestLeft[0] = 100000.0;  pref[i].closestLeft[1] = 100000.0;  
147                         pref[i].closestRight[0] = 100000.0;  pref[i].closestRight[1] = 100000.0;  
148                 }
149
150                 int midpoint = window;
151                 int count = 0;
152                 while (count < iters) {
153                                 
154                                 if (m->control_pressed) { return 0; }
155                                 
156                                 //create 2 vectors of sequences, 1 for left side and one for right side
157                                 vector<Sequence> left;  vector<Sequence> right;
158                                 
159                                 for (int i = 0; i < seqs.size(); i++) {
160                                 
161                                         if (m->control_pressed) { return 0; }
162                                         
163 //cout << "midpoint = " << midpoint << "\twindow = " << window << endl;
164 //cout << "whole = " << seqs[i]->getAligned().length() << endl;
165                                         //save left side
166                                         string seqLeft = seqs[i]->getAligned().substr(midpoint-window, window);
167                                         Sequence tempLeft;
168                                         tempLeft.setName(seqs[i]->getName());
169                                         tempLeft.setAligned(seqLeft);
170                                         left.push_back(tempLeft);
171 //cout << "left = " << tempLeft.getAligned().length() << endl;                  
172                                         //save right side
173                                         string seqRight = seqs[i]->getAligned().substr(midpoint, window);
174                                         Sequence tempRight;
175                                         tempRight.setName(seqs[i]->getName());
176                                         tempRight.setAligned(seqRight);
177                                         right.push_back(tempRight);
178 //cout << "right = " << seqRight.length() << endl;      
179                                 }
180                                 
181                                 //adjust midpoint by increment
182                                 midpoint += increment;
183                                 
184                                 
185                                 //this should be parallelized
186                                 //perference = sum of (| distance of my left to sequence j's left - distance of my right to sequence j's right | )
187                                 //create a matrix containing the distance from left to left and right to right
188                                 //calculate distances
189                                 SparseMatrix* SparseLeft = new SparseMatrix();
190                                 SparseMatrix* SparseRight = new SparseMatrix();
191                                 
192                                 createSparseMatrix(0, left.size(), SparseLeft, left);
193                                 
194                                 if (m->control_pressed) { delete SparseLeft; delete SparseRight; return 0; }
195                                 
196                                 createSparseMatrix(0, right.size(), SparseRight, right);
197                                 
198                                 if (m->control_pressed) { delete SparseLeft; delete SparseRight; return 0; }
199                                 
200                                 left.clear(); right.clear();
201                                 vector<SeqMap> distMapRight;
202                                 vector<SeqMap> distMapLeft;
203                                 
204                                 // Create a data structure to quickly access the distance information.
205                                 //this is from thallingers reimplementation on get.oturep
206                                 // It consists of a vector of distance maps, where each map contains
207                                 // all distances of a certain sequence. Vector and maps are accessed
208                                 // via the index of a sequence in the distance matrix
209                                 distMapRight = vector<SeqMap>(numSeqs); 
210                                 distMapLeft = vector<SeqMap>(numSeqs); 
211                                 //cout << "left" << endl << endl;
212                                 for (MatData currentCell = SparseLeft->begin(); currentCell != SparseLeft->end(); currentCell++) {
213                                         distMapLeft[currentCell->row][currentCell->column] = currentCell->dist;
214                                         if (m->control_pressed) { delete SparseLeft; delete SparseRight; return 0; }
215                                         //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
216                                 }
217                                 //cout << "right" << endl << endl;
218                                 for (MatData currentCell = SparseRight->begin(); currentCell != SparseRight->end(); currentCell++) {
219                                         distMapRight[currentCell->row][currentCell->column] = currentCell->dist;
220                                         if (m->control_pressed) { delete SparseLeft; delete SparseRight; return 0; }
221                                         //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
222                                 }
223                                 
224                                 delete SparseLeft;
225                                 delete SparseRight;
226                                 
227                                 //fill preference structure
228                                 generatePreferences(distMapLeft, distMapRight, midpoint);
229                                 
230                                 count++;
231                                 
232                 }
233                 
234                 delete distCalculator;
235                 
236                 //rank preference score to eachother
237                 float dme = 0.0;
238                 float expectedPercent = 1 / (float) (pref.size());
239                 
240                 for (int i = 0; i < pref.size(); i++) {  dme += pref[i].score[0];  }
241         
242                 for (int i = 0; i < pref.size(); i++) {
243
244                         //gives the actual percentage of the dme this seq adds
245                         pref[i].score[0] = pref[i].score[0] / dme;
246                         
247                         //how much higher or lower is this than expected
248                         pref[i].score[0] = pref[i].score[0] / expectedPercent;
249                 
250                 }
251                 
252                 //sort Preferences highest to lowest
253                 sort(pref.begin(), pref.end(), comparePref);
254                 
255                 for (int i = 0; i < seqs.size(); i++) { delete seqs[i];  }  seqs.clear();
256                 
257                 return 0;
258                 
259         }
260         catch(exception& e) {
261                 m->errorOut(e, "Bellerophon", "getChimeras");
262                 exit(1);
263         }
264 }
265
266 /***************************************************************************************************************/
267 int Bellerophon::createSparseMatrix(int startSeq, int endSeq, SparseMatrix* sparse, vector<Sequence> s){
268         try {
269
270                 for(int i=startSeq; i<endSeq; i++){
271                         
272                         for(int j=0;j<i;j++){
273                                 
274                                 if (m->control_pressed) { return 0; }
275                         
276                                 distCalculator->calcDist(s[i], s[j]);
277                                 float dist = distCalculator->getDist();
278                         
279                                 PCell temp(i, j, dist);
280                                 sparse->addCell(temp);
281                                 
282                         }
283                 }
284                 
285                 return 1;
286         }
287         catch(exception& e) {
288                 m->errorOut(e, "Bellerophon", "createSparseMatrix");
289                 exit(1);
290         }
291 }
292 /***************************************************************************************************************/
293 int Bellerophon::generatePreferences(vector<SeqMap> left, vector<SeqMap> right, int mid){
294         try {
295                 
296                 float dme = 0.0;
297                 SeqMap::iterator itR;
298                 SeqMap::iterator itL;
299                 
300                 //initialize pref[i]
301                 for (int i = 0; i < pref.size(); i++) {
302                         pref[i].score[1] = 0.0;
303                         pref[i].closestLeft[1] = 100000.0; 
304                         pref[i].closestRight[1] = 100000.0; 
305                         pref[i].leftParent[1] = "";
306                         pref[i].rightParent[1] = "";
307                 }
308         
309                 for (int i = 0; i < left.size(); i++) {
310                         
311                         SeqMap currentLeft = left[i];    //example i = 3;   currentLeft is a map of 0 to the distance of sequence 3 to sequence 0,
312                                                                                                 //                                                                              1 to the distance of sequence 3 to sequence 1,
313                                                                                                 //                                                                              2 to the distance of sequence 3 to sequence 2.
314                         SeqMap currentRight = right[i];         // same as left but with distances on the right side.
315                         
316                         for (int j = 0; j < i; j++) {
317                         
318                                 if (m->control_pressed) {  return 0; }
319                                 
320                                 itL = currentLeft.find(j);
321                                 itR = currentRight.find(j);
322 //cout << " i = " << i << " j = " << j << " distLeft = " << itL->second << endl;
323 //cout << " i = " << i << " j = " << j << " distright = " << itR->second << endl;
324                                 
325                                 //if you can find this entry update the preferences
326                                 if ((itL != currentLeft.end()) && (itR != currentRight.end())) {
327                                 
328                                         if (!correction) {
329                                                 pref[i].score[1] += abs((itL->second - itR->second));
330                                                 pref[j].score[1] += abs((itL->second - itR->second));
331 //cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
332 //cout << "abs = " << abs((itL->second - itR->second)) << endl;
333 //cout << i << " score = " << pref[i].score[1] << endl;
334 //cout << j << " score = " << pref[j].score[1] << endl;
335                                         }else {
336                                                 pref[i].score[1] += abs((sqrt(itL->second) - sqrt(itR->second)));
337                                                 pref[j].score[1] += abs((sqrt(itL->second) - sqrt(itR->second)));
338 //cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
339 //cout << "abs = " << abs((sqrt(itL->second) - sqrt(itR->second))) << endl;
340 //cout << i << " score = " << pref[i].score[1] << endl;
341 //cout << j << " score = " << pref[j].score[1] << endl;
342                                         }
343 //cout << "pref[" << i << "].closestLeft[1] = " <<      pref[i].closestLeft[1] << " parent = " << pref[i].leftParent[1] << endl;                        
344                                         //are you the closest left sequence
345                                         if (itL->second < pref[i].closestLeft[1]) {  
346
347                                                 pref[i].closestLeft[1] = itL->second;
348                                                 pref[i].leftParent[1] = seqs[j]->getName();
349 //cout << "updating closest left to " << pref[i].leftParent[1] << endl;
350                                         }
351 //cout << "pref[" << j << "].closestLeft[1] = " <<      pref[j].closestLeft[1] << " parent = " << pref[j].leftParent[1] << endl;        
352                                         if (itL->second < pref[j].closestLeft[1]) { 
353                                                 pref[j].closestLeft[1] = itL->second;
354                                                 pref[j].leftParent[1] = seqs[i]->getName();
355 //cout << "updating closest left to " << pref[j].leftParent[1] << endl;
356                                         }
357                                         
358                                         //are you the closest right sequence
359                                         if (itR->second < pref[i].closestRight[1]) {   
360                                                 pref[i].closestRight[1] = itR->second;
361                                                 pref[i].rightParent[1] = seqs[j]->getName();
362                                         }
363                                         if (itR->second < pref[j].closestRight[1]) {   
364                                                 pref[j].closestRight[1] = itR->second;
365                                                 pref[j].rightParent[1] = seqs[i]->getName();
366                                         }
367                                         
368                                 }
369                         }
370                 
371                 }
372                 
373                 
374                   
375                 //calculate the dme
376                 int count0 = 0;
377                 for (int i = 0; i < pref.size(); i++) {  dme += pref[i].score[1];  if (pref[i].score[1] == 0.0) { count0++; }  }
378                 
379                 float expectedPercent = 1 / (float) (pref.size() - count0);
380 //cout << endl << "dme = " << dme << endl;
381                 //recalculate prefernences based on dme
382                 for (int i = 0; i < pref.size(); i++) {
383                 
384                         if (m->control_pressed) {  return 0; }
385 //cout << "unadjusted pref " << i << " = " << pref[i].score[1] << endl; 
386                         // gives the actual percentage of the dme this seq adds
387                         pref[i].score[1] = pref[i].score[1] / dme;
388                         
389                         //how much higher or lower is this than expected
390                         pref[i].score[1] = pref[i].score[1] / expectedPercent;
391                         
392                         //pref[i].score[1] = dme / (dme - 2 * pref[i].score[1]);
393                         
394                         //so a non chimeric sequence would be around 1, and a chimeric would be signifigantly higher.
395 //cout << "adjusted pref " << i << " = " << pref[i].score[1] << endl;                                   
396                 }
397                 
398                 //is this score bigger then the last score
399                 for (int i = 0; i < pref.size(); i++) {  
400                         
401                         if (m->control_pressed) {  return 0; }
402                         
403                         //update biggest score
404                         if (pref[i].score[1] > pref[i].score[0]) {
405                                 pref[i].score[0] = pref[i].score[1];
406                                 pref[i].leftParent[0] = pref[i].leftParent[1];
407                                 pref[i].rightParent[0] = pref[i].rightParent[1];
408                                 pref[i].closestLeft[0] = pref[i].closestLeft[1];
409                                 pref[i].closestRight[0] = pref[i].closestRight[1];
410                                 pref[i].midpoint = mid;
411                         }
412                         
413                 }
414                 
415                 return 1;
416
417         }
418         catch(exception& e) {
419                 m->errorOut(e, "Bellerophon", "generatePreferences");
420                 exit(1);
421         }
422 }
423 /**************************************************************************************************/
424