5 * Created by Sarah Westcott on 7/9/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "bellerophon.h"
11 #include "eachgapdist.h"
12 #include "ignoregaps.h"
13 #include "onegapdist.h"
16 /***************************************************************************************************************/
18 Bellerophon::Bellerophon(string name, bool filterSeqs, bool c, int win, int inc, int p, string o) : Chimera() {
28 seqs = readSeqs(fastafile);
29 numSeqs = seqs.size();
30 if (numSeqs == 0) { m->mothurOut("Error in reading you sequences."); m->mothurOutEndLine(); exit(1); }
34 createFilter(seqs, 0.5);
35 for (int i = 0; i < seqs.size(); i++) { runFilter(seqs[i]); }
38 distCalculator = new eachGapDist();
40 //set default window to 25% of sequence length
41 string seq0 = seqs[0]->getAligned();
42 if (window == 0) { window = seq0.length() / 4; }
43 else if (window > (seq0.length() / 2)) {
44 m->mothurOut("Your sequence length is = " + toString(seq0.length()) + ". You have selected a window size greater than the length of half your aligned sequence. I will run it with a window size of " + toString((seq0.length() / 2))); m->mothurOutEndLine();
45 window = (seq0.length() / 2);
48 if (increment > (seqs[0]->getAlignLength() - (2*window))) {
49 if (increment != 10) {
51 m->mothurOut("You have selected a increment that is too large. I will use the default."); m->mothurOutEndLine();
53 if (increment > (seqs[0]->getAlignLength() - (2*window))) { increment = 0; }
55 }else{ increment = 0; }
58 if (increment == 0) { iters = 1; }
59 else { iters = ((seqs[0]->getAlignLength() - (2*window)) / increment); }
63 for (int i = 0; i < iters; i++) {
65 for (int j = 0; j < numSeqs; j++) {
66 pref[i].push_back(temp);
72 m->errorOut(e, "Bellerophon", "Bellerophon");
77 //***************************************************************************************************************
78 int Bellerophon::print(ostream& out, ostream& outAcc, string s) {
82 //sorted "best" preference scores for all seqs
83 vector<Preference> best = getBestPref();
85 if (m->control_pressed) { return numSeqs; }
87 out << "Name\tScore\tLeft\tRight\t" << endl;
88 //output prefenence structure to .chimeras file
89 for (int i = 0; i < best.size(); i++) {
91 if (m->control_pressed) { return numSeqs; }
93 out << best[i].name << '\t' << setprecision(3) << best[i].score << '\t' << best[i].leftParent << '\t' << best[i].rightParent << endl;
95 //calc # of seqs with preference above 1.0
96 if (best[i].score > 1.0) {
98 outAcc << best[i].name << endl;
99 m->mothurOut(best[i].name + " is a suspected chimera at breakpoint " + toString(best[i].midpoint)); m->mothurOutEndLine();
100 m->mothurOut("It's score is " + toString(best[i].score) + " with suspected left parent " + best[i].leftParent + " and right parent " + best[i].rightParent); m->mothurOutEndLine();
104 //output results to screen
105 m->mothurOutEndLine();
106 m->mothurOut("Sequence with preference score above 1.0: " + toString(above1)); m->mothurOutEndLine();
108 spot = best.size()-1;
109 m->mothurOut("Minimum:\t" + toString(best[spot].score)); m->mothurOutEndLine();
110 spot = best.size() * 0.975;
111 m->mothurOut("2.5%-tile:\t" + toString(best[spot].score)); m->mothurOutEndLine();
112 spot = best.size() * 0.75;
113 m->mothurOut("25%-tile:\t" + toString(best[spot].score)); m->mothurOutEndLine();
114 spot = best.size() * 0.50;
115 m->mothurOut("Median: \t" + toString(best[spot].score)); m->mothurOutEndLine();
116 spot = best.size() * 0.25;
117 m->mothurOut("75%-tile:\t" + toString(best[spot].score)); m->mothurOutEndLine();
118 spot = best.size() * 0.025;
119 m->mothurOut("97.5%-tile:\t" + toString(best[spot].score)); m->mothurOutEndLine();
121 m->mothurOut("Maximum:\t" + toString(best[spot].score)); m->mothurOutEndLine();
126 catch(exception& e) {
127 m->errorOut(e, "Bellerophon", "print");
132 //***************************************************************************************************************
133 int Bellerophon::print(MPI_File& out, MPI_File& outAcc, string s) {
137 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
140 string outString = "";
142 //sorted "best" preference scores for all seqs
143 vector<Preference> best = getBestPref();
146 int ninetyfive = best.size() * 0.05;
147 float cutoffScore = best[ninetyfive].score;
149 if (m->control_pressed) { return numSeqs; }
151 outString = "Name\tScore\tLeft\tRight\n";
153 int olength = outString.length();
154 char* buf5 = new char[olength];
155 memcpy(buf5, outString.c_str(), olength);
157 MPI_File_write_shared(out, buf5, olength, MPI_CHAR, &status);
161 //output prefenence structure to .chimeras file
162 for (int i = 0; i < best.size(); i++) {
164 if (m->control_pressed) { return numSeqs; }
166 outString = best[i].name + "\t" + toString(best[i].score) + "\t" + best[i].leftParent + "\t" + best[i].rightParent + "\n";
169 int length = outString.length();
170 char* buf2 = new char[length];
171 memcpy(buf2, outString.c_str(), length);
173 MPI_File_write_shared(out, buf2, length, MPI_CHAR, &status);
177 //calc # of seqs with preference above 95%tile
178 if (best[i].score >= cutoffScore) {
180 string outAccString = "";
181 outAccString += best[i].name + "\n";
183 MPI_Status statusAcc;
184 length = outAccString.length();
185 char* buf = new char[length];
186 memcpy(buf, outAccString.c_str(), length);
188 MPI_File_write_shared(outAcc, buf, length, MPI_CHAR, &statusAcc);
192 cout << best[i].name << " is a suspected chimera at breakpoint " << toString(best[i].midpoint) << endl;
193 cout << "It's score is " << toString(best[i].score) << " with suspected left parent " << best[i].leftParent << " and right parent " << best[i].rightParent << endl;
197 //output results to screen
198 m->mothurOutEndLine();
199 m->mothurOut("Sequence with preference score above " + toString(cutoffScore) + ": " + toString(above1)); m->mothurOutEndLine();
201 spot = best.size()-1;
202 m->mothurOut("Minimum:\t" + toString(best[spot].score)); m->mothurOutEndLine();
203 spot = best.size() * 0.975;
204 m->mothurOut("2.5%-tile:\t" + toString(best[spot].score)); m->mothurOutEndLine();
205 spot = best.size() * 0.75;
206 m->mothurOut("25%-tile:\t" + toString(best[spot].score)); m->mothurOutEndLine();
207 spot = best.size() * 0.50;
208 m->mothurOut("Median: \t" + toString(best[spot].score)); m->mothurOutEndLine();
209 spot = best.size() * 0.25;
210 m->mothurOut("75%-tile:\t" + toString(best[spot].score)); m->mothurOutEndLine();
211 spot = best.size() * 0.025;
212 m->mothurOut("97.5%-tile:\t" + toString(best[spot].score)); m->mothurOutEndLine();
214 m->mothurOut("Maximum:\t" + toString(best[spot].score)); m->mothurOutEndLine();
221 catch(exception& e) {
222 m->errorOut(e, "Bellerophon", "print");
227 //********************************************************************************************************************
228 //sorts highest score to lowest
229 inline bool comparePref(Preference left, Preference right){
230 return (left.score > right.score);
232 //***************************************************************************************************************
233 int Bellerophon::getChimeras() {
236 //create breaking points
237 vector<int> midpoints; midpoints.resize(iters, window);
238 for (int i = 1; i < iters; i++) { midpoints[i] = midpoints[i-1] + increment; }
241 int pid, numSeqsPerProcessor;
243 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
244 MPI_Comm_size(MPI_COMM_WORLD, &processors);
246 numSeqsPerProcessor = iters / processors;
248 //each process hits this only once
249 unsigned long int startPos = pid * numSeqsPerProcessor;
250 if(pid == processors - 1){
251 numSeqsPerProcessor = iters - pid * numSeqsPerProcessor;
253 lines.push_back(linePair(startPos, numSeqsPerProcessor));
255 //fill pref with scores
256 driverChimeras(midpoints, lines[0]);
258 if (m->control_pressed) { return 0; }
260 //each process must send its parts back to pid 0
264 for (int j = 1; j < processors; j++) {
266 vector<string> MPIBestSend;
267 for (int i = 0; i < numSeqs; i++) {
269 if (m->control_pressed) { return 0; }
274 MPI_Recv(&length, 1, MPI_INT, j, 2001, MPI_COMM_WORLD, &status);
276 char* buf = new char[length];
277 MPI_Recv(&buf, length, MPI_CHAR, j, 2001, MPI_COMM_WORLD, &status);
280 if (temp.length() > length) { temp = temp.substr(0, length); }
283 MPIBestSend.push_back(temp);
286 fillPref(j, MPIBestSend);
288 if (m->control_pressed) { return 0; }
292 //takes best window for each sequence and turns Preference to string that can be parsed by pid 0.
293 //played with this a bit, but it may be better to try user-defined datatypes with set string lengths??
294 vector<string> MPIBestSend = getBestWindow(lines[0]);
297 //send your result to parent
298 for (int i = 0; i < numSeqs; i++) {
300 if (m->control_pressed) { return 0; }
302 int bestLength = MPIBestSend[i].length();
303 char* buf = new char[bestLength];
304 memcpy(buf, MPIBestSend[i].c_str(), bestLength);
306 MPI_Send(&bestLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD);
307 MPI_Send(buf, bestLength, MPI_CHAR, 0, 2001, MPI_COMM_WORLD);
313 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
316 //divide breakpoints between processors
317 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
319 lines.push_back(linePair(0, iters));
321 //fill pref with scores
322 driverChimeras(midpoints, lines[0]);
326 int numSeqsPerProcessor = iters / processors;
328 for (int i = 0; i < processors; i++) {
329 unsigned long int startPos = i * numSeqsPerProcessor;
330 if(i == processors - 1){
331 numSeqsPerProcessor = iters - i * numSeqsPerProcessor;
333 lines.push_back(linePair(startPos, numSeqsPerProcessor));
336 createProcesses(midpoints);
339 lines.push_back(linePair(0, iters));
341 ///fill pref with scores
342 driverChimeras(midpoints, lines[0]);
350 catch(exception& e) {
351 m->errorOut(e, "Bellerophon", "getChimeras");
355 /**************************************************************************************************/
357 int Bellerophon::createProcesses(vector<int> mid) {
359 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
362 vector<int> processIDS;
364 //loop through and create all the processes you want
365 while (process != processors) {
369 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
372 exitCommand = driverChimeras(mid, lines[process]);
373 string tempOut = outputDir + toString(getpid()) + ".temp";
374 writePrefs(tempOut, lines[process]);
377 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
378 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
383 //force parent to wait until all the processes are done
384 for (int i=0;i<processors;i++) {
385 int temp = processIDS[i];
389 //get info that other processes created
390 for (int i = 0; i < processIDS.size(); i++) {
391 string tempIn = outputDir + toString(processIDS[i]) + ".temp";
398 catch(exception& e) {
399 m->errorOut(e, "AlignCommand", "createProcesses");
403 //***************************************************************************************************************
404 int Bellerophon::driverChimeras(vector<int> midpoints, linePair line) {
407 for (int h = line.start; h < (line.start + line.num); h++) {
409 int midpoint = midpoints[h];
411 //initialize pref[count]
412 for (int i = 0; i < numSeqs; i++ ) {
413 pref[count][i].name = seqs[i]->getName();
414 pref[count][i].midpoint = midpoint;
417 if (m->control_pressed) { return 0; }
419 //create 2 vectors of sequences, 1 for left side and one for right side
420 vector<Sequence> left; vector<Sequence> right;
422 for (int i = 0; i < seqs.size(); i++) {
424 if (m->control_pressed) { return 0; }
426 //cout << "midpoint = " << midpoint << "\twindow = " << window << endl;
427 //cout << "whole = " << seqs[i]->getAligned().length() << endl;
429 string seqLeft = seqs[i]->getAligned().substr(midpoint-window, window);
431 tempLeft.setName(seqs[i]->getName());
432 tempLeft.setAligned(seqLeft);
433 left.push_back(tempLeft);
434 //cout << "left = " << tempLeft.getAligned().length() << endl;
436 string seqRight = seqs[i]->getAligned().substr(midpoint, window);
438 tempRight.setName(seqs[i]->getName());
439 tempRight.setAligned(seqRight);
440 right.push_back(tempRight);
441 //cout << "right = " << seqRight.length() << endl;
444 //this should be parallelized
445 //perference = sum of (| distance of my left to sequence j's left - distance of my right to sequence j's right | )
446 //create a matrix containing the distance from left to left and right to right
447 //calculate distances
448 SparseMatrix* SparseLeft = new SparseMatrix();
449 SparseMatrix* SparseRight = new SparseMatrix();
451 createSparseMatrix(0, left.size(), SparseLeft, left);
453 if (m->control_pressed) { delete SparseLeft; delete SparseRight; return 0; }
455 createSparseMatrix(0, right.size(), SparseRight, right);
457 if (m->control_pressed) { delete SparseLeft; delete SparseRight; return 0; }
459 left.clear(); right.clear();
460 vector<SeqMap> distMapRight;
461 vector<SeqMap> distMapLeft;
463 // Create a data structure to quickly access the distance information.
464 //this is from thallingers reimplementation on get.oturep
465 // It consists of a vector of distance maps, where each map contains
466 // all distances of a certain sequence. Vector and maps are accessed
467 // via the index of a sequence in the distance matrix
468 distMapRight = vector<SeqMap>(numSeqs);
469 distMapLeft = vector<SeqMap>(numSeqs);
470 //cout << "left" << endl << endl;
471 for (MatData currentCell = SparseLeft->begin(); currentCell != SparseLeft->end(); currentCell++) {
472 distMapLeft[currentCell->row][currentCell->column] = currentCell->dist;
473 if (m->control_pressed) { delete SparseLeft; delete SparseRight; return 0; }
474 //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
476 //cout << "right" << endl << endl;
477 for (MatData currentCell = SparseRight->begin(); currentCell != SparseRight->end(); currentCell++) {
478 distMapRight[currentCell->row][currentCell->column] = currentCell->dist;
479 if (m->control_pressed) { delete SparseLeft; delete SparseRight; return 0; }
480 //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
486 //fill preference structure
487 generatePreferences(distMapLeft, distMapRight, midpoint);
489 if (m->control_pressed) { return 0; }
492 if((h+1) % 10 == 0){ cout << "Processing sliding window: " << toString(h+1) << "\n"; m->mothurOutJustToLog("Processing sliding window: " + toString(h+1) + "\n") ; }
497 if((line.start + line.num) % 10 != 0){ cout << "Processing sliding window: " << toString(line.start + line.num) << "\n"; m->mothurOutJustToLog("Processing sliding window: " + toString(line.start + line.num) + "\n") ; }
502 catch(exception& e) {
503 m->errorOut(e, "Bellerophon", "driverChimeras");
508 /***************************************************************************************************************/
509 int Bellerophon::createSparseMatrix(int startSeq, int endSeq, SparseMatrix* sparse, vector<Sequence> s){
512 for(int i=startSeq; i<endSeq; i++){
514 for(int j=0;j<i;j++){
516 if (m->control_pressed) { return 0; }
518 distCalculator->calcDist(s[i], s[j]);
519 float dist = distCalculator->getDist();
521 PCell temp(i, j, dist);
522 sparse->addCell(temp);
529 catch(exception& e) {
530 m->errorOut(e, "Bellerophon", "createSparseMatrix");
534 /***************************************************************************************************************/
535 int Bellerophon::generatePreferences(vector<SeqMap> left, vector<SeqMap> right, int mid){
538 SeqMap::iterator itR;
539 SeqMap::iterator itL;
541 for (int i = 0; i < left.size(); i++) {
543 SeqMap currentLeft = left[i]; //example i = 3; currentLeft is a map of 0 to the distance of sequence 3 to sequence 0,
544 // 1 to the distance of sequence 3 to sequence 1,
545 // 2 to the distance of sequence 3 to sequence 2.
546 SeqMap currentRight = right[i]; // same as left but with distances on the right side.
548 for (int j = 0; j < i; j++) {
550 if (m->control_pressed) { return 0; }
552 itL = currentLeft.find(j);
553 itR = currentRight.find(j);
554 //cout << " i = " << i << " j = " << j << " distLeft = " << itL->second << endl;
555 //cout << " i = " << i << " j = " << j << " distright = " << itR->second << endl;
557 //if you can find this entry update the preferences
558 if ((itL != currentLeft.end()) && (itR != currentRight.end())) {
561 pref[count][i].score += abs((itL->second - itR->second));
562 pref[count][j].score += abs((itL->second - itR->second));
563 //cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
564 //cout << "abs = " << abs((itL->second - itR->second)) << endl;
565 //cout << i << " score = " << pref[i].score[1] << endl;
566 //cout << j << " score = " << pref[j].score[1] << endl;
568 pref[count][i].score += abs((sqrt(itL->second) - sqrt(itR->second)));
569 pref[count][j].score += abs((sqrt(itL->second) - sqrt(itR->second)));
570 //cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
571 //cout << "abs = " << abs((sqrt(itL->second) - sqrt(itR->second))) << endl;
572 //cout << i << " score = " << pref[i].score[1] << endl;
573 //cout << j << " score = " << pref[j].score[1] << endl;
575 //cout << "pref[" << i << "].closestLeft[1] = " << pref[i].closestLeft[1] << " parent = " << pref[i].leftParent[1] << endl;
576 //are you the closest left sequence
577 if (itL->second < pref[count][i].closestLeft) {
579 pref[count][i].closestLeft = itL->second;
580 pref[count][i].leftParent = seqs[j]->getName();
581 //cout << "updating closest left to " << pref[i].leftParent[1] << endl;
583 //cout << "pref[" << j << "].closestLeft[1] = " << pref[j].closestLeft[1] << " parent = " << pref[j].leftParent[1] << endl;
584 if (itL->second < pref[count][j].closestLeft) {
585 pref[count][j].closestLeft = itL->second;
586 pref[count][j].leftParent = seqs[i]->getName();
587 //cout << "updating closest left to " << pref[j].leftParent[1] << endl;
590 //are you the closest right sequence
591 if (itR->second < pref[count][i].closestRight) {
592 pref[count][i].closestRight = itR->second;
593 pref[count][i].rightParent = seqs[j]->getName();
595 if (itR->second < pref[count][j].closestRight) {
596 pref[count][j].closestRight = itR->second;
597 pref[count][j].rightParent = seqs[i]->getName();
609 catch(exception& e) {
610 m->errorOut(e, "Bellerophon", "generatePreferences");
614 /**************************************************************************************************/
615 vector<Preference> Bellerophon::getBestPref() {
618 vector<Preference> best;
621 for (int i = 0; i < numSeqs; i++) {
623 //set best pref score to first one
624 Preference temp = pref[0][i];
626 if (m->control_pressed) { return best; }
629 for (int j = 1; j < pref.size(); j++) {
631 //is this a better score
632 if (pref[j][i].score > temp.score) { temp = pref[j][i]; }
635 best.push_back(temp);
638 //rank preference score to eachother
640 float expectedPercent = 1 / (float) (best.size());
642 for (int i = 0; i < best.size(); i++) { dme += best[i].score; }
644 for (int i = 0; i < best.size(); i++) {
646 if (m->control_pressed) { return best; }
648 //gives the actual percentage of the dme this seq adds
649 best[i].score = best[i].score / dme;
651 //how much higher or lower is this than expected
652 best[i].score = best[i].score / expectedPercent;
656 //sort Preferences highest to lowest
657 sort(best.begin(), best.end(), comparePref);
661 catch(exception& e) {
662 m->errorOut(e, "Bellerophon", "getBestPref");
666 /**************************************************************************************************/
667 int Bellerophon::writePrefs(string file, linePair tempLine) {
671 m->openOutputFile(file, outTemp);
673 //lets you know what part of the pref matrix you are writing
674 outTemp << tempLine.start << '\t' << tempLine.num << endl;
676 for (int i = tempLine.start; i < (tempLine.start + tempLine.num); i++) {
678 for (int j = 0; j < numSeqs; j++) {
680 if (m->control_pressed) { outTemp.close(); remove(file.c_str()); return 0; }
682 outTemp << pref[i][j].name << '\t' << pref[i][j].leftParent << '\t' << pref[i][j].rightParent << '\t';
683 outTemp << pref[i][j].score << '\t' << pref[i][j].closestLeft << '\t' << pref[i][j].closestRight << '\t' << pref[i][j].midpoint << endl;
691 catch(exception& e) {
692 m->errorOut(e, "Bellerophon", "writePrefs");
696 /**************************************************************************************************/
697 int Bellerophon::readPrefs(string file) {
701 m->openInputFile(file, inTemp);
705 //lets you know what part of the pref matrix you are writing
706 inTemp >> start >> num; m->gobble(inTemp);
708 for (int i = start; i < num; i++) {
710 for (int j = 0; j < numSeqs; j++) {
712 if (m->control_pressed) { inTemp.close(); remove(file.c_str()); return 0; }
714 inTemp >> pref[i][j].name >> pref[i][j].leftParent >> pref[i][j].rightParent;
715 inTemp >> pref[i][j].score >> pref[i][j].closestLeft >> pref[i][j].closestRight >> pref[i][j].midpoint;
722 remove(file.c_str());
726 catch(exception& e) {
727 m->errorOut(e, "Bellerophon", "writePrefs");
731 /**************************************************************************************************/
732 vector<string> Bellerophon::getBestWindow(linePair line) {
738 for (int i = 0; i < numSeqs; i++) {
740 //set best pref score to first one
741 Preference temp = pref[line.start][i];
743 if (m->control_pressed) { return best; }
746 for (int j = (line.start+1); j < (line.start+line.num); j++) {
748 //is this a better score
749 if (pref[j][i].score > temp.score) { temp = pref[j][i]; }
752 string tempString = temp.name + '\t' + temp.leftParent + '\t' + temp.rightParent + '\t' + toString(temp.score);
753 best.push_back(tempString);
759 catch(exception& e) {
760 m->errorOut(e, "Bellerophon", "getBestWindow");
764 /**************************************************************************************************/
765 int Bellerophon::fillPref(int process, vector<string>& best) {
767 //figure out where you start so you can put the best scores there
768 int numSeqsPerProcessor = iters / processors;
769 int start = process * numSeqsPerProcessor;
771 for (int i = 0; i < best.size(); i++) {
773 if (m->control_pressed) { return 0; }
775 istringstream iss (best[i],istringstream::in);
778 iss >> pref[start][i].name >> pref[start][i].leftParent >> pref[start][i].rightParent >> tempScore;
779 convert(tempScore, pref[start][i].score);
784 catch(exception& e) {
785 m->errorOut(e, "Bellerophon", "fillPref");
790 /**************************************************************************************************/