]> git.donarmstrong.com Git - mothur.git/blob - bellerophon.cpp
checking in chimera files in progress after move to michigan
[mothur.git] / bellerophon.cpp
1 /*
2  *  bellerophon.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 7/9/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "bellerophon.h"
11 #include "eachgapdist.h"
12 #include "ignoregaps.h"
13 #include "onegapdist.h"
14
15
16 //***************************************************************************************************************
17
18 Bellerophon::Bellerophon(string name) {
19         try {
20                 fastafile = name;
21         }
22         catch(exception& e) {
23                 errorOut(e, "Bellerophon", "Bellerophon");
24                 exit(1);
25         }
26 }
27
28 //***************************************************************************************************************
29 void Bellerophon::print(ostream& out) {
30         try {
31                 int above1 = 0;
32                 out << "Name\tScore\tLeft\tRight\t" << endl;
33                 //output prefenence structure to .chimeras file
34                 for (int i = 0; i < pref.size(); i++) {
35                         out << pref[i].name << '\t' << setprecision(3) << pref[i].score[0] << '\t' << pref[i].leftParent[0] << '\t' << pref[i].rightParent[0] << endl;
36                         
37                         //calc # of seqs with preference above 1.0
38                         if (pref[i].score[0] > 1.1) { 
39                                 above1++; 
40                                 mothurOut(pref[i].name + " is a suspected chimera at breakpoint " + toString(pref[i].midpoint)); mothurOutEndLine();
41                                 mothurOut("It's score is " + toString(pref[i].score[0]) + " with suspected left parent " + pref[i].leftParent[0] + " and right parent " + pref[i].rightParent[0]); mothurOutEndLine();
42                         }
43                 }
44                 
45                 //output results to screen
46                 mothurOutEndLine();
47                 mothurOut("Sequence with preference score above 1.1: " + toString(above1)); mothurOutEndLine();
48                 int spot;
49                 spot = pref.size()-1;
50                 mothurOut("Minimum:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
51                 spot = pref.size() * 0.975;
52                 mothurOut("2.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
53                 spot = pref.size() * 0.75;
54                 mothurOut("25%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
55                 spot = pref.size() * 0.50;
56                 mothurOut("Median: \t" + toString(pref[spot].score[0])); mothurOutEndLine();
57                 spot = pref.size() * 0.25;
58                 mothurOut("75%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
59                 spot = pref.size() * 0.025;
60                 mothurOut("97.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
61                 spot = 0;
62                 mothurOut("Maximum:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
63
64         }
65         catch(exception& e) {
66                 errorOut(e, "Bellerophon", "print");
67                 exit(1);
68         }
69 }
70
71 //********************************************************************************************************************
72 //sorts highest score to lowest
73 inline bool comparePref(Preference left, Preference right){
74         return (left.score[0] > right.score[0]);        
75 }
76
77 //***************************************************************************************************************
78 void Bellerophon::getChimeras() {
79         try {
80                 
81                 //do soft filter
82                 if (filter)  {
83                         string optionString = "fasta=" + fastafile + ", soft=50";
84                         filterSeqs = new FilterSeqsCommand(optionString);
85                         filterSeqs->execute();
86                         delete filterSeqs;
87                         
88                         //reset fastafile to filtered file
89                         fastafile = getRootName(fastafile) + "filter.fasta";
90                 }
91                 
92                 distCalculator = new eachGapDist();
93                 
94                 //read in sequences
95                 seqs = readSeqs(fastafile);
96                 
97                 int numSeqs = seqs.size();
98                 
99                 if (numSeqs == 0) { mothurOut("Error in reading you sequences."); mothurOutEndLine(); exit(1); }
100                 
101                 //set default window to 25% of sequence length
102                 string seq0 = seqs[0]->getAligned();
103                 if (window == 0) { window = seq0.length() / 4;  }
104                 else if (window > (seq0.length() / 2)) {  
105                         mothurOut("Your sequence length is = " + toString(seq0.length()) + ". You have selected a window size greater than the length of half your aligned sequence. I will run it with a window size of " + toString((seq0.length() / 2))); mothurOutEndLine();
106                         window = (seq0.length() / 2);
107                 }
108                 
109                 if (increment > (seqs[0]->getAlignLength() - (2*window))) { 
110                         if (increment != 10) {
111                         
112                                 mothurOut("You have selected a increment that is too large. I will use the default."); mothurOutEndLine();
113                                 increment = 10;
114                                 if (increment > (seqs[0]->getAlignLength() - (2*window))) {  increment = 0;  }
115                                 
116                         }else{ increment = 0; }
117                 }
118                 
119                 if (increment == 0) { iters = 1; }
120                 else { iters = ((seqs[0]->getAlignLength() - (2*window)) / increment); }
121                 
122                 //initialize pref
123                 pref.resize(numSeqs);  
124                 
125                 for (int i = 0; i < numSeqs; i++ ) { 
126                         pref[i].leftParent.resize(2); pref[i].rightParent.resize(2); pref[i].score.resize(2);   pref[i].closestLeft.resize(2); pref[i].closestRight.resize(3);
127                         pref[i].name = seqs[i]->getName();
128                         pref[i].score[0] = 0.0;  pref[i].score[1] = 0.0; 
129                         pref[i].closestLeft[0] = 100000.0;  pref[i].closestLeft[1] = 100000.0;  
130                         pref[i].closestRight[0] = 100000.0;  pref[i].closestRight[1] = 100000.0;  
131                 }
132
133                 int midpoint = window;
134                 int count = 0;
135                 while (count < iters) {
136                                 
137                                 //create 2 vectors of sequences, 1 for left side and one for right side
138                                 vector<Sequence> left;  vector<Sequence> right;
139                                 
140                                 for (int i = 0; i < seqs.size(); i++) {
141 //cout << "whole = " << seqs[i].getAligned() << endl;
142                                         //save left side
143                                         string seqLeft = seqs[i]->getAligned().substr(midpoint-window, window);
144                                         Sequence tempLeft;
145                                         tempLeft.setName(seqs[i]->getName());
146                                         tempLeft.setAligned(seqLeft);
147                                         left.push_back(tempLeft);
148 //cout << "left = " << tempLeft.getAligned() << endl;                   
149                                         //save right side
150                                         string seqRight = seqs[i]->getAligned().substr(midpoint, window);
151                                         Sequence tempRight;
152                                         tempRight.setName(seqs[i]->getName());
153                                         tempRight.setAligned(seqRight);
154                                         right.push_back(tempRight);
155 //cout << "right = " << seqRight << endl;       
156                                 }
157                                 
158                                 //adjust midpoint by increment
159                                 midpoint += increment;
160                                 
161                                 
162                                 //this should be parallelized
163                                 //perference = sum of (| distance of my left to sequence j's left - distance of my right to sequence j's right | )
164                                 //create a matrix containing the distance from left to left and right to right
165                                 //calculate distances
166                                 SparseMatrix* SparseLeft = new SparseMatrix();
167                                 SparseMatrix* SparseRight = new SparseMatrix();
168                                 
169                                 createSparseMatrix(0, left.size(), SparseLeft, left);
170                                 createSparseMatrix(0, right.size(), SparseRight, right);
171                                 
172                                 vector<SeqMap> distMapRight;
173                                 vector<SeqMap> distMapLeft;
174                                 
175                                 // Create a data structure to quickly access the distance information.
176                                 //this is from thallingers reimplementation on get.oturep
177                                 // It consists of a vector of distance maps, where each map contains
178                                 // all distances of a certain sequence. Vector and maps are accessed
179                                 // via the index of a sequence in the distance matrix
180                                 distMapRight = vector<SeqMap>(numSeqs); 
181                                 distMapLeft = vector<SeqMap>(numSeqs); 
182                                 //cout << "left" << endl << endl;
183                                 for (MatData currentCell = SparseLeft->begin(); currentCell != SparseLeft->end(); currentCell++) {
184                                         distMapLeft[currentCell->row][currentCell->column] = currentCell->dist;
185                                         //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
186                                 }
187                                 //cout << "right" << endl << endl;
188                                 for (MatData currentCell = SparseRight->begin(); currentCell != SparseRight->end(); currentCell++) {
189                                         distMapRight[currentCell->row][currentCell->column] = currentCell->dist;
190                                         //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
191                                 }
192                                 
193                                 delete SparseLeft;
194                                 delete SparseRight;
195                                 
196                                 //fill preference structure
197                                 generatePreferences(distMapLeft, distMapRight, midpoint);
198                                 
199                                 count++;
200                                 
201                 }
202                 
203                 delete distCalculator;
204                 
205                 //rank preference score to eachother
206                 /*float dme = 0.0;
207                 float expectedPercent = 1 / (float) (pref.size());
208                 
209                 for (int i = 0; i < pref.size(); i++) {  dme += pref[i].score[0];  }
210         
211                 for (int i = 0; i < pref.size(); i++) {
212
213                         //gives the actual percentage of the dme this seq adds
214                         pref[i].score[0] = pref[i].score[0] / dme;
215                         
216                         //how much higher or lower is this than expected
217                         pref[i].score[0] = pref[i].score[0] / expectedPercent;
218                 
219                 }*/
220                 
221                 //sort Preferences highest to lowest
222                 sort(pref.begin(), pref.end(), comparePref);
223
224         }
225         catch(exception& e) {
226                 errorOut(e, "Bellerophon", "getChimeras");
227                 exit(1);
228         }
229 }
230
231 /***************************************************************************************************************/
232 int Bellerophon::createSparseMatrix(int startSeq, int endSeq, SparseMatrix* sparse, vector<Sequence> s){
233         try {
234
235                 for(int i=startSeq; i<endSeq; i++){
236                         
237                         for(int j=0;j<i;j++){
238                         
239                                 distCalculator->calcDist(s[i], s[j]);
240                                 float dist = distCalculator->getDist();
241                         
242                                 PCell temp(i, j, dist);
243                                 sparse->addCell(temp);
244                                 
245                         }
246                 }
247                 
248                 return 1;
249         }
250         catch(exception& e) {
251                 errorOut(e, "Bellerophon", "createSparseMatrix");
252                 exit(1);
253         }
254 }
255 /***************************************************************************************************************/
256 void Bellerophon::generatePreferences(vector<SeqMap> left, vector<SeqMap> right, int mid){
257         try {
258                 
259                 float dme = 0.0;
260                 SeqMap::iterator itR;
261                 SeqMap::iterator itL;
262                 
263                 //initialize pref[i]
264                 for (int i = 0; i < pref.size(); i++) {
265                         pref[i].score[1] = 0.0;
266                         pref[i].closestLeft[1] = 100000.0; 
267                         pref[i].closestRight[1] = 100000.0; 
268                         pref[i].leftParent[1] = "";
269                         pref[i].rightParent[1] = "";
270                 }
271         
272                 for (int i = 0; i < left.size(); i++) {
273                         
274                         SeqMap currentLeft = left[i];    //example i = 3;   currentLeft is a map of 0 to the distance of sequence 3 to sequence 0,
275                                                                                                 //                                                                              1 to the distance of sequence 3 to sequence 1,
276                                                                                                 //                                                                              2 to the distance of sequence 3 to sequence 2.
277                         SeqMap currentRight = right[i];         // same as left but with distances on the right side.
278                         
279                         for (int j = 0; j < i; j++) {
280                                 
281                                 itL = currentLeft.find(j);
282                                 itR = currentRight.find(j);
283 //cout << " i = " << i << " j = " << j << " distLeft = " << itL->second << endl;
284 //cout << " i = " << i << " j = " << j << " distright = " << itR->second << endl;
285                                 
286                                 //if you can find this entry update the preferences
287                                 if ((itL != currentLeft.end()) && (itR != currentRight.end())) {
288                                 
289                                         if (!correction) {
290                                                 pref[i].score[1] += abs((itL->second - itR->second));
291                                                 pref[j].score[1] += abs((itL->second - itR->second));
292 //cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
293 //cout << "abs = " << abs((itL->second - itR->second)) << endl;
294 //cout << i << " score = " << pref[i].score[1] << endl;
295 //cout << j << " score = " << pref[j].score[1] << endl;
296                                         }else {
297                                                 pref[i].score[1] += abs((sqrt(itL->second) - sqrt(itR->second)));
298                                                 pref[j].score[1] += abs((sqrt(itL->second) - sqrt(itR->second)));
299 //cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
300 //cout << "abs = " << abs((sqrt(itL->second) - sqrt(itR->second))) << endl;
301 //cout << i << " score = " << pref[i].score[1] << endl;
302 //cout << j << " score = " << pref[j].score[1] << endl;
303                                         }
304 //cout << "pref[" << i << "].closestLeft[1] = " <<      pref[i].closestLeft[1] << " parent = " << pref[i].leftParent[1] << endl;                        
305                                         //are you the closest left sequence
306                                         if (itL->second < pref[i].closestLeft[1]) {  
307
308                                                 pref[i].closestLeft[1] = itL->second;
309                                                 pref[i].leftParent[1] = seqs[j]->getName();
310 //cout << "updating closest left to " << pref[i].leftParent[1] << endl;
311                                         }
312 //cout << "pref[" << j << "].closestLeft[1] = " <<      pref[j].closestLeft[1] << " parent = " << pref[j].leftParent[1] << endl;        
313                                         if (itL->second < pref[j].closestLeft[1]) { 
314                                                 pref[j].closestLeft[1] = itL->second;
315                                                 pref[j].leftParent[1] = seqs[i]->getName();
316 //cout << "updating closest left to " << pref[j].leftParent[1] << endl;
317                                         }
318                                         
319                                         //are you the closest right sequence
320                                         if (itR->second < pref[i].closestRight[1]) {   
321                                                 pref[i].closestRight[1] = itR->second;
322                                                 pref[i].rightParent[1] = seqs[j]->getName();
323                                         }
324                                         if (itR->second < pref[j].closestRight[1]) {   
325                                                 pref[j].closestRight[1] = itR->second;
326                                                 pref[j].rightParent[1] = seqs[i]->getName();
327                                         }
328                                         
329                                 }
330                         }
331                 
332                 }
333                 
334                 
335                   
336                 //calculate the dme
337                 int count0 = 0;
338                 for (int i = 0; i < pref.size(); i++) {  dme += pref[i].score[1];  if (pref[i].score[1] == 0.0) {  count0++; }  }
339                 
340                 //float expectedPercent = 1 / (float) (pref.size() - count0);
341 //cout << endl << "dme = " << dme << endl;
342                 //recalculate prefernences based on dme
343                 for (int i = 0; i < pref.size(); i++) {
344 //cout << "unadjusted col[i] " << i << " = " << pref[i].score[1] << endl;       
345 //cout << i << '\t' <<  (dme / (dme - 2 * pref[i].score[1])) << endl;
346                         
347                         // gives the actual percentage of the dme this seq adds
348                         //pref[i].score[1] = pref[i].score[1] / dme;
349                         
350                         //how much higher or lower is this than expected
351                         //pref[i].score[1] = pref[i].score[1] / expectedPercent;
352                         
353                         //not 2 * pref[i].score[1] because i only calulate the pref scores once.
354                         pref[i].score[1] = dme / (dme - pref[i].score[1]);
355 //cout << i << '\t' << pref[i].score[1] << endl;
356                         //so a non chimeric sequence would be around 1, and a chimeric would be signifigantly higher.
357 //cout << "adjusted pref " << i << " = " << pref[i].score[1] << endl;                                   
358                 }
359                 
360                 //is this score bigger then the last score
361                 for (int i = 0; i < pref.size(); i++) {  
362                         
363                         //update biggest score
364                         if (pref[i].score[1] > pref[i].score[0]) {
365                                 pref[i].score[0] = pref[i].score[1];
366                                 pref[i].leftParent[0] = pref[i].leftParent[1];
367                                 pref[i].rightParent[0] = pref[i].rightParent[1];
368                                 pref[i].closestLeft[0] = pref[i].closestLeft[1];
369                                 pref[i].closestRight[0] = pref[i].closestRight[1];
370                                 pref[i].midpoint = mid;
371                         }
372                         
373                 }
374
375         }
376         catch(exception& e) {
377                 errorOut(e, "Bellerophon", "generatePreferences");
378                 exit(1);
379         }
380 }
381 /**************************************************************************************************/
382