5 * Created by Sarah Westcott on 7/9/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "bellerophon.h"
11 #include "eachgapdist.h"
12 #include "ignoregaps.h"
13 #include "onegapdist.h"
16 //***************************************************************************************************************
18 Bellerophon::Bellerophon(string name, string o) {
24 m->errorOut(e, "Bellerophon", "Bellerophon");
29 //***************************************************************************************************************
30 int Bellerophon::print(ostream& out, ostream& outAcc) {
33 out << "Name\tScore\tLeft\tRight\t" << endl;
34 //output prefenence structure to .chimeras file
35 for (int i = 0; i < pref.size(); i++) {
37 if (m->control_pressed) { return 0; }
39 out << pref[i].name << '\t' << setprecision(3) << pref[i].score[0] << '\t' << pref[i].leftParent[0] << '\t' << pref[i].rightParent[0] << endl;
41 //calc # of seqs with preference above 1.0
42 if (pref[i].score[0] > 1.0) {
44 outAcc << pref[i].name << endl;
45 m->mothurOut(pref[i].name + " is a suspected chimera at breakpoint " + toString(pref[i].midpoint)); m->mothurOutEndLine();
46 m->mothurOut("It's score is " + toString(pref[i].score[0]) + " with suspected left parent " + pref[i].leftParent[0] + " and right parent " + pref[i].rightParent[0]); m->mothurOutEndLine();
50 //output results to screen
51 m->mothurOutEndLine();
52 m->mothurOut("Sequence with preference score above 1.0: " + toString(above1)); m->mothurOutEndLine();
55 m->mothurOut("Minimum:\t" + toString(pref[spot].score[0])); m->mothurOutEndLine();
56 spot = pref.size() * 0.975;
57 m->mothurOut("2.5%-tile:\t" + toString(pref[spot].score[0])); m->mothurOutEndLine();
58 spot = pref.size() * 0.75;
59 m->mothurOut("25%-tile:\t" + toString(pref[spot].score[0])); m->mothurOutEndLine();
60 spot = pref.size() * 0.50;
61 m->mothurOut("Median: \t" + toString(pref[spot].score[0])); m->mothurOutEndLine();
62 spot = pref.size() * 0.25;
63 m->mothurOut("75%-tile:\t" + toString(pref[spot].score[0])); m->mothurOutEndLine();
64 spot = pref.size() * 0.025;
65 m->mothurOut("97.5%-tile:\t" + toString(pref[spot].score[0])); m->mothurOutEndLine();
67 m->mothurOut("Maximum:\t" + toString(pref[spot].score[0])); m->mothurOutEndLine();
73 m->errorOut(e, "Bellerophon", "print");
78 //********************************************************************************************************************
79 //sorts highest score to lowest
80 inline bool comparePref(Preference left, Preference right){
81 return (left.score[0] > right.score[0]);
84 //***************************************************************************************************************
85 int Bellerophon::getChimeras() {
90 string optionString = "fasta=" + fastafile + ", soft=50";
91 if (outputDir != "") { optionString += ", outputdir=" + outputDir; }
93 filterSeqs = new FilterSeqsCommand(optionString);
94 filterSeqs->execute();
97 if (m->control_pressed) { return 0; }
99 //reset fastafile to filtered file
100 if (outputDir == "") { fastafile = getRootName(fastafile) + "filter.fasta"; }
101 else { fastafile = outputDir + getRootName(getSimpleName(fastafile)) + "filter.fasta"; }
105 distCalculator = new eachGapDist();
108 seqs = readSeqs(fastafile);
110 if (m->control_pressed) { return 0; }
112 if (unaligned) { m->mothurOut("Your sequences need to be aligned when you use the bellerophon method."); m->mothurOutEndLine(); return 1; }
114 int numSeqs = seqs.size();
116 if (numSeqs == 0) { m->mothurOut("Error in reading you sequences."); m->mothurOutEndLine(); exit(1); }
118 //set default window to 25% of sequence length
119 string seq0 = seqs[0]->getAligned();
120 if (window == 0) { window = seq0.length() / 4; }
121 else if (window > (seq0.length() / 2)) {
122 m->mothurOut("Your sequence length is = " + toString(seq0.length()) + ". You have selected a window size greater than the length of half your aligned sequence. I will run it with a window size of " + toString((seq0.length() / 2))); m->mothurOutEndLine();
123 window = (seq0.length() / 2);
126 if (increment > (seqs[0]->getAlignLength() - (2*window))) {
127 if (increment != 10) {
129 m->mothurOut("You have selected a increment that is too large. I will use the default."); m->mothurOutEndLine();
131 if (increment > (seqs[0]->getAlignLength() - (2*window))) { increment = 0; }
133 }else{ increment = 0; }
136 if (increment == 0) { iters = 1; }
137 else { iters = ((seqs[0]->getAlignLength() - (2*window)) / increment); }
140 pref.resize(numSeqs);
142 for (int i = 0; i < numSeqs; i++ ) {
143 pref[i].leftParent.resize(2); pref[i].rightParent.resize(2); pref[i].score.resize(2); pref[i].closestLeft.resize(2); pref[i].closestRight.resize(3);
144 pref[i].name = seqs[i]->getName();
145 pref[i].score[0] = 0.0; pref[i].score[1] = 0.0;
146 pref[i].closestLeft[0] = 100000.0; pref[i].closestLeft[1] = 100000.0;
147 pref[i].closestRight[0] = 100000.0; pref[i].closestRight[1] = 100000.0;
150 int midpoint = window;
152 while (count < iters) {
154 if (m->control_pressed) { return 0; }
156 //create 2 vectors of sequences, 1 for left side and one for right side
157 vector<Sequence> left; vector<Sequence> right;
159 for (int i = 0; i < seqs.size(); i++) {
161 if (m->control_pressed) { return 0; }
163 //cout << "midpoint = " << midpoint << "\twindow = " << window << endl;
164 //cout << "whole = " << seqs[i]->getAligned().length() << endl;
166 string seqLeft = seqs[i]->getAligned().substr(midpoint-window, window);
168 tempLeft.setName(seqs[i]->getName());
169 tempLeft.setAligned(seqLeft);
170 left.push_back(tempLeft);
171 //cout << "left = " << tempLeft.getAligned().length() << endl;
173 string seqRight = seqs[i]->getAligned().substr(midpoint, window);
175 tempRight.setName(seqs[i]->getName());
176 tempRight.setAligned(seqRight);
177 right.push_back(tempRight);
178 //cout << "right = " << seqRight.length() << endl;
181 //adjust midpoint by increment
182 midpoint += increment;
185 //this should be parallelized
186 //perference = sum of (| distance of my left to sequence j's left - distance of my right to sequence j's right | )
187 //create a matrix containing the distance from left to left and right to right
188 //calculate distances
189 SparseMatrix* SparseLeft = new SparseMatrix();
190 SparseMatrix* SparseRight = new SparseMatrix();
192 createSparseMatrix(0, left.size(), SparseLeft, left);
194 if (m->control_pressed) { delete SparseLeft; delete SparseRight; return 0; }
196 createSparseMatrix(0, right.size(), SparseRight, right);
198 if (m->control_pressed) { delete SparseLeft; delete SparseRight; return 0; }
200 left.clear(); right.clear();
201 vector<SeqMap> distMapRight;
202 vector<SeqMap> distMapLeft;
204 // Create a data structure to quickly access the distance information.
205 //this is from thallingers reimplementation on get.oturep
206 // It consists of a vector of distance maps, where each map contains
207 // all distances of a certain sequence. Vector and maps are accessed
208 // via the index of a sequence in the distance matrix
209 distMapRight = vector<SeqMap>(numSeqs);
210 distMapLeft = vector<SeqMap>(numSeqs);
211 //cout << "left" << endl << endl;
212 for (MatData currentCell = SparseLeft->begin(); currentCell != SparseLeft->end(); currentCell++) {
213 distMapLeft[currentCell->row][currentCell->column] = currentCell->dist;
214 if (m->control_pressed) { delete SparseLeft; delete SparseRight; return 0; }
215 //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
217 //cout << "right" << endl << endl;
218 for (MatData currentCell = SparseRight->begin(); currentCell != SparseRight->end(); currentCell++) {
219 distMapRight[currentCell->row][currentCell->column] = currentCell->dist;
220 if (m->control_pressed) { delete SparseLeft; delete SparseRight; return 0; }
221 //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
227 //fill preference structure
228 generatePreferences(distMapLeft, distMapRight, midpoint);
234 delete distCalculator;
236 //rank preference score to eachother
238 float expectedPercent = 1 / (float) (pref.size());
240 for (int i = 0; i < pref.size(); i++) { dme += pref[i].score[0]; }
242 for (int i = 0; i < pref.size(); i++) {
244 //gives the actual percentage of the dme this seq adds
245 pref[i].score[0] = pref[i].score[0] / dme;
247 //how much higher or lower is this than expected
248 pref[i].score[0] = pref[i].score[0] / expectedPercent;
252 //sort Preferences highest to lowest
253 sort(pref.begin(), pref.end(), comparePref);
255 for (int i = 0; i < seqs.size(); i++) { delete seqs[i]; } seqs.clear();
260 catch(exception& e) {
261 m->errorOut(e, "Bellerophon", "getChimeras");
266 /***************************************************************************************************************/
267 int Bellerophon::createSparseMatrix(int startSeq, int endSeq, SparseMatrix* sparse, vector<Sequence> s){
270 for(int i=startSeq; i<endSeq; i++){
272 for(int j=0;j<i;j++){
274 if (m->control_pressed) { return 0; }
276 distCalculator->calcDist(s[i], s[j]);
277 float dist = distCalculator->getDist();
279 PCell temp(i, j, dist);
280 sparse->addCell(temp);
287 catch(exception& e) {
288 m->errorOut(e, "Bellerophon", "createSparseMatrix");
292 /***************************************************************************************************************/
293 int Bellerophon::generatePreferences(vector<SeqMap> left, vector<SeqMap> right, int mid){
297 SeqMap::iterator itR;
298 SeqMap::iterator itL;
301 for (int i = 0; i < pref.size(); i++) {
302 pref[i].score[1] = 0.0;
303 pref[i].closestLeft[1] = 100000.0;
304 pref[i].closestRight[1] = 100000.0;
305 pref[i].leftParent[1] = "";
306 pref[i].rightParent[1] = "";
309 for (int i = 0; i < left.size(); i++) {
311 SeqMap currentLeft = left[i]; //example i = 3; currentLeft is a map of 0 to the distance of sequence 3 to sequence 0,
312 // 1 to the distance of sequence 3 to sequence 1,
313 // 2 to the distance of sequence 3 to sequence 2.
314 SeqMap currentRight = right[i]; // same as left but with distances on the right side.
316 for (int j = 0; j < i; j++) {
318 if (m->control_pressed) { return 0; }
320 itL = currentLeft.find(j);
321 itR = currentRight.find(j);
322 //cout << " i = " << i << " j = " << j << " distLeft = " << itL->second << endl;
323 //cout << " i = " << i << " j = " << j << " distright = " << itR->second << endl;
325 //if you can find this entry update the preferences
326 if ((itL != currentLeft.end()) && (itR != currentRight.end())) {
329 pref[i].score[1] += abs((itL->second - itR->second));
330 pref[j].score[1] += abs((itL->second - itR->second));
331 //cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
332 //cout << "abs = " << abs((itL->second - itR->second)) << endl;
333 //cout << i << " score = " << pref[i].score[1] << endl;
334 //cout << j << " score = " << pref[j].score[1] << endl;
336 pref[i].score[1] += abs((sqrt(itL->second) - sqrt(itR->second)));
337 pref[j].score[1] += abs((sqrt(itL->second) - sqrt(itR->second)));
338 //cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
339 //cout << "abs = " << abs((sqrt(itL->second) - sqrt(itR->second))) << endl;
340 //cout << i << " score = " << pref[i].score[1] << endl;
341 //cout << j << " score = " << pref[j].score[1] << endl;
343 //cout << "pref[" << i << "].closestLeft[1] = " << pref[i].closestLeft[1] << " parent = " << pref[i].leftParent[1] << endl;
344 //are you the closest left sequence
345 if (itL->second < pref[i].closestLeft[1]) {
347 pref[i].closestLeft[1] = itL->second;
348 pref[i].leftParent[1] = seqs[j]->getName();
349 //cout << "updating closest left to " << pref[i].leftParent[1] << endl;
351 //cout << "pref[" << j << "].closestLeft[1] = " << pref[j].closestLeft[1] << " parent = " << pref[j].leftParent[1] << endl;
352 if (itL->second < pref[j].closestLeft[1]) {
353 pref[j].closestLeft[1] = itL->second;
354 pref[j].leftParent[1] = seqs[i]->getName();
355 //cout << "updating closest left to " << pref[j].leftParent[1] << endl;
358 //are you the closest right sequence
359 if (itR->second < pref[i].closestRight[1]) {
360 pref[i].closestRight[1] = itR->second;
361 pref[i].rightParent[1] = seqs[j]->getName();
363 if (itR->second < pref[j].closestRight[1]) {
364 pref[j].closestRight[1] = itR->second;
365 pref[j].rightParent[1] = seqs[i]->getName();
377 for (int i = 0; i < pref.size(); i++) { dme += pref[i].score[1]; if (pref[i].score[1] == 0.0) { count0++; } }
379 float expectedPercent = 1 / (float) (pref.size() - count0);
380 //cout << endl << "dme = " << dme << endl;
381 //recalculate prefernences based on dme
382 for (int i = 0; i < pref.size(); i++) {
384 if (m->control_pressed) { return 0; }
385 //cout << "unadjusted pref " << i << " = " << pref[i].score[1] << endl;
386 // gives the actual percentage of the dme this seq adds
387 pref[i].score[1] = pref[i].score[1] / dme;
389 //how much higher or lower is this than expected
390 pref[i].score[1] = pref[i].score[1] / expectedPercent;
392 //pref[i].score[1] = dme / (dme - 2 * pref[i].score[1]);
394 //so a non chimeric sequence would be around 1, and a chimeric would be signifigantly higher.
395 //cout << "adjusted pref " << i << " = " << pref[i].score[1] << endl;
398 //is this score bigger then the last score
399 for (int i = 0; i < pref.size(); i++) {
401 if (m->control_pressed) { return 0; }
403 //update biggest score
404 if (pref[i].score[1] > pref[i].score[0]) {
405 pref[i].score[0] = pref[i].score[1];
406 pref[i].leftParent[0] = pref[i].leftParent[1];
407 pref[i].rightParent[0] = pref[i].rightParent[1];
408 pref[i].closestLeft[0] = pref[i].closestLeft[1];
409 pref[i].closestRight[0] = pref[i].closestRight[1];
410 pref[i].midpoint = mid;
418 catch(exception& e) {
419 m->errorOut(e, "Bellerophon", "generatePreferences");
423 /**************************************************************************************************/