5 * Created by Sarah Westcott on 7/9/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "bellerophon.h"
11 #include "eachgapdist.h"
12 #include "ignoregaps.h"
13 #include "onegapdist.h"
16 //***************************************************************************************************************
18 Bellerophon::Bellerophon(string name) {
23 errorOut(e, "Bellerophon", "Bellerophon");
28 //***************************************************************************************************************
29 void Bellerophon::print(ostream& out) {
32 out << "Name\tScore\tLeft\tRight\t" << endl;
33 //output prefenence structure to .chimeras file
34 for (int i = 0; i < pref.size(); i++) {
35 out << pref[i].name << '\t' << pref[i].score[0] << '\t' << pref[i].leftParent[0] << '\t' << pref[i].rightParent[0] << endl;
37 //calc # of seqs with preference above 1.0
38 if (pref[i].score[0] > 1.0) {
40 mothurOut(pref[i].name + " is a suspected chimera at breakpoint " + toString(pref[i].midpoint)); mothurOutEndLine();
41 mothurOut("It's score is " + toString(pref[i].score[0]) + " with suspected left parent " + pref[i].leftParent[0] + " and right parent " + pref[i].rightParent[0]); mothurOutEndLine();
45 //output results to screen
47 mothurOut("Sequence with preference score above 1.0: " + toString(above1)); mothurOutEndLine();
50 mothurOut("Minimum:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
51 spot = pref.size() * 0.975;
52 mothurOut("2.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
53 spot = pref.size() * 0.75;
54 mothurOut("25%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
55 spot = pref.size() * 0.50;
56 mothurOut("Median: \t" + toString(pref[spot].score[0])); mothurOutEndLine();
57 spot = pref.size() * 0.25;
58 mothurOut("75%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
59 spot = pref.size() * 0.025;
60 mothurOut("97.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
62 mothurOut("Maximum:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
66 errorOut(e, "Bellerophon", "print");
71 //********************************************************************************************************************
72 //sorts highest score to lowest
73 inline bool comparePref(Preference left, Preference right){
74 return (left.score[0] > right.score[0]);
77 //***************************************************************************************************************
78 void Bellerophon::getChimeras() {
83 string optionString = "fasta=" + fastafile + ", soft=50, vertical=F";
84 filterSeqs = new FilterSeqsCommand(optionString);
85 filterSeqs->execute();
88 //reset fastafile to filtered file
89 fastafile = getRootName(fastafile) + "filter.fasta";
92 distCalculator = new eachGapDist();
97 int numSeqs = seqs.size();
99 if (numSeqs == 0) { mothurOut("Error in reading you sequences."); mothurOutEndLine(); exit(1); }
101 //set default window to 25% of sequence length
102 string seq0 = seqs[0].getAligned();
103 if (window == 0) { window = seq0.length() / 4; }
104 else if (window > (seq0.length() / 2)) {
105 mothurOut("Your sequence length is = " + toString(seq0.length()) + ". You have selected a window size greater than the length of half your aligned sequence. I will run it with a window size of " + toString((seq0.length() / 2))); mothurOutEndLine();
106 window = (seq0.length() / 2);
109 if (increment > (seqs[0].getAlignLength() - (2*window))) {
110 if (increment != 10) {
112 mothurOut("You have selected a increment that is too large. I will use the default."); mothurOutEndLine();
114 if (increment > (seqs[0].getAlignLength() - (2*window))) { increment = 0; }
116 }else{ increment = 0; }
118 cout << "increment = " << increment << endl;
119 if (increment == 0) { iters = 1; }
120 else { iters = ((seqs[0].getAlignLength() - (2*window)) / increment); }
123 pref.resize(numSeqs);
125 for (int i = 0; i < numSeqs; i++ ) {
126 pref[i].leftParent.resize(2); pref[i].rightParent.resize(2); pref[i].score.resize(2); pref[i].closestLeft.resize(2); pref[i].closestRight.resize(3);
127 pref[i].name = seqs[i].getName();
128 pref[i].score[0] = 0.0; pref[i].score[1] = 0.0;
129 pref[i].closestLeft[0] = 100000.0; pref[i].closestLeft[1] = 100000.0;
130 pref[i].closestRight[0] = 100000.0; pref[i].closestRight[1] = 100000.0;
133 int midpoint = window;
135 while (count < iters) {
137 //create 2 vectors of sequences, 1 for left side and one for right side
138 vector<Sequence> left; vector<Sequence> right;
140 for (int i = 0; i < seqs.size(); i++) {
141 //cout << "whole = " << seqs[i].getAligned() << endl;
143 string seqLeft = seqs[i].getAligned().substr(midpoint-window, window);
145 tempLeft.setName(seqs[i].getName());
146 tempLeft.setAligned(seqLeft);
147 left.push_back(tempLeft);
148 //cout << "left = " << tempLeft.getAligned() << endl;
150 string seqRight = seqs[i].getAligned().substr(midpoint, window);
152 tempRight.setName(seqs[i].getName());
153 tempRight.setAligned(seqRight);
154 right.push_back(tempRight);
155 //cout << "right = " << seqRight << endl;
158 //adjust midpoint by increment
159 midpoint += increment;
162 //this should be parallelized
163 //perference = sum of (| distance of my left to sequence j's left - distance of my right to sequence j's right | )
164 //create a matrix containing the distance from left to left and right to right
165 //calculate distances
166 SparseMatrix* SparseLeft = new SparseMatrix();
167 SparseMatrix* SparseRight = new SparseMatrix();
169 createSparseMatrix(0, left.size(), SparseLeft, left);
170 createSparseMatrix(0, right.size(), SparseRight, right);
172 vector<SeqMap> distMapRight;
173 vector<SeqMap> distMapLeft;
175 // Create a data structure to quickly access the distance information.
176 // It consists of a vector of distance maps, where each map contains
177 // all distances of a certain sequence. Vector and maps are accessed
178 // via the index of a sequence in the distance matrix
179 distMapRight = vector<SeqMap>(numSeqs);
180 distMapLeft = vector<SeqMap>(numSeqs);
181 //cout << "left" << endl << endl;
182 for (MatData currentCell = SparseLeft->begin(); currentCell != SparseLeft->end(); currentCell++) {
183 distMapLeft[currentCell->row][currentCell->column] = currentCell->dist;
184 //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
186 //cout << "right" << endl << endl;
187 for (MatData currentCell = SparseRight->begin(); currentCell != SparseRight->end(); currentCell++) {
188 distMapRight[currentCell->row][currentCell->column] = currentCell->dist;
189 //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
196 //fill preference structure
197 generatePreferences(distMapLeft, distMapRight, midpoint);
203 delete distCalculator;
205 //rank preference score to eachother
207 float expectedPercent = 1 / (float) (pref.size());
209 for (int i = 0; i < pref.size(); i++) { dme += pref[i].score[0]; }
211 for (int i = 0; i < pref.size(); i++) {
213 //gives the actual percentage of the dme this seq adds
214 pref[i].score[0] = pref[i].score[0] / dme;
216 //how much higher or lower is this than expected
217 pref[i].score[0] = pref[i].score[0] / expectedPercent;
222 //sort Preferences highest to lowest
223 sort(pref.begin(), pref.end(), comparePref);
228 catch(exception& e) {
229 errorOut(e, "Bellerophon", "getChimeras");
234 //***************************************************************************************************************
235 void Bellerophon::readSeqs(){
238 openInputFile(fastafile, inFASTA);
240 //read in seqs and store in vector
241 while(!inFASTA.eof()){
242 Sequence current(inFASTA);
244 if (current.getAligned() == "") { current.setAligned(current.getUnaligned()); }
246 seqs.push_back(current);
253 catch(exception& e) {
254 errorOut(e, "Bellerophon", "readSeqs");
259 /***************************************************************************************************************/
260 int Bellerophon::createSparseMatrix(int startSeq, int endSeq, SparseMatrix* sparse, vector<Sequence> s){
263 for(int i=startSeq; i<endSeq; i++){
265 for(int j=0;j<i;j++){
267 distCalculator->calcDist(s[i], s[j]);
268 float dist = distCalculator->getDist();
270 PCell temp(i, j, dist);
271 sparse->addCell(temp);
279 catch(exception& e) {
280 errorOut(e, "Bellerophon", "createSparseMatrix");
284 /***************************************************************************************************************/
285 void Bellerophon::generatePreferences(vector<SeqMap> left, vector<SeqMap> right, int mid){
289 SeqMap::iterator itR;
290 SeqMap::iterator itL;
293 for (int i = 0; i < pref.size(); i++) {
294 pref[i].score[1] = 0.0;
295 pref[i].closestLeft[1] = 100000.0;
296 pref[i].closestRight[1] = 100000.0;
297 pref[i].leftParent[1] = "";
298 pref[i].rightParent[1] = "";
301 for (int i = 0; i < left.size(); i++) {
303 SeqMap currentLeft = left[i]; //example i = 3; currentLeft is a map of 0 to the distance of sequence 3 to sequence 0,
304 // 1 to the distance of sequence 3 to sequence 1,
305 // 2 to the distance of sequence 3 to sequence 2.
306 SeqMap currentRight = right[i]; // same as left but with distances on the right side.
308 for (int j = 0; j < i; j++) {
310 itL = currentLeft.find(j);
311 itR = currentRight.find(j);
312 //cout << " i = " << i << " j = " << j << " distLeft = " << itL->second << endl;
313 //cout << " i = " << i << " j = " << j << " distright = " << itR->second << endl;
315 //if you can find this entry update the preferences
316 if ((itL != currentLeft.end()) && (itR != currentRight.end())) {
319 pref[i].score[1] += abs((itL->second - itR->second));
320 pref[j].score[1] += abs((itL->second - itR->second));
321 //cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
322 //cout << "abs = " << abs((itL->second - itR->second)) << endl;
323 //cout << i << " score = " << pref[i].score[1] << endl;
324 //cout << j << " score = " << pref[j].score[1] << endl;
326 pref[i].score[1] += abs((sqrt(itL->second) - sqrt(itR->second)));
327 pref[j].score[1] += abs((sqrt(itL->second) - sqrt(itR->second)));
328 //cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
329 //cout << "abs = " << abs((sqrt(itL->second) - sqrt(itR->second))) << endl;
330 //cout << i << " score = " << pref[i].score[1] << endl;
331 //cout << j << " score = " << pref[j].score[1] << endl;
333 //cout << "pref[" << i << "].closestLeft[1] = " << pref[i].closestLeft[1] << " parent = " << pref[i].leftParent[1] << endl;
334 //are you the closest left sequence
335 if (itL->second < pref[i].closestLeft[1]) {
337 pref[i].closestLeft[1] = itL->second;
338 pref[i].leftParent[1] = seqs[j].getName();
339 //cout << "updating closest left to " << pref[i].leftParent[1] << endl;
341 //cout << "pref[" << j << "].closestLeft[1] = " << pref[j].closestLeft[1] << " parent = " << pref[j].leftParent[1] << endl;
342 if (itL->second < pref[j].closestLeft[1]) {
343 pref[j].closestLeft[1] = itL->second;
344 pref[j].leftParent[1] = seqs[i].getName();
345 //cout << "updating closest left to " << pref[j].leftParent[1] << endl;
348 //are you the closest right sequence
349 if (itR->second < pref[i].closestRight[1]) {
350 pref[i].closestRight[1] = itR->second;
351 pref[i].rightParent[1] = seqs[j].getName();
353 if (itR->second < pref[j].closestRight[1]) {
354 pref[j].closestRight[1] = itR->second;
355 pref[j].rightParent[1] = seqs[i].getName();
368 for (int i = 0; i < pref.size(); i++) { dme += pref[i].score[1]; if (pref[i].score[1] == 0.0) { count0++; } }
370 float expectedPercent = 1 / (float) (pref.size() - count0);
371 //cout << endl << "dme = " << dme << endl;
372 //recalculate prefernences based on dme
373 for (int i = 0; i < pref.size(); i++) {
374 //cout << "unadjusted pref " << i << " = " << pref[i].score[1] << endl;
375 // gives the actual percentage of the dme this seq adds
376 pref[i].score[1] = pref[i].score[1] / dme;
378 //how much higher or lower is this than expected
379 pref[i].score[1] = pref[i].score[1] / expectedPercent;
381 //pref[i].score[1] = dme / (dme - 2 * pref[i].score[1]);
383 //so a non chimeric sequence would be around 1, and a chimeric would be signifigantly higher.
384 //cout << "adjusted pref " << i << " = " << pref[i].score[1] << endl;
387 //is this score bigger then the last score
388 for (int i = 0; i < pref.size(); i++) {
390 //update biggest score
391 if (pref[i].score[1] > pref[i].score[0]) {
392 pref[i].score[0] = pref[i].score[1];
393 pref[i].leftParent[0] = pref[i].leftParent[1];
394 pref[i].rightParent[0] = pref[i].rightParent[1];
395 pref[i].closestLeft[0] = pref[i].closestLeft[1];
396 pref[i].closestRight[0] = pref[i].closestRight[1];
397 pref[i].midpoint = mid;
403 catch(exception& e) {
404 errorOut(e, "Bellerophon", "generatePreferences");
408 /**************************************************************************************************/