+/*
+ * alignedsimilarity.cpp
+ * Mothur
+ *
+ * Created by westcott on 8/18/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "alignedsimilarity.h"
+#include "ignoregaps.h"
+
+
+//***************************************************************************************************************
+AlignSim::AlignSim(string filename, string temp) { fastafile = filename; templateFile = temp; }
+//***************************************************************************************************************
+
+AlignSim::~AlignSim() {
+ try {
+ for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; }
+ for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
+ delete distCalc;
+ }
+ catch(exception& e) {
+ errorOut(e, "AlignSim", "~AlignSim");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+void AlignSim::print(ostream& out) {
+ try {
+
+ mothurOutEndLine();
+
+ for (int i = 0; i < querySeqs.size(); i++) {
+
+ int j = 0; float largest = -10;
+ //find largest sim value
+ for (int k = 0; k < IS[i].size(); k++) {
+ //is this score larger
+ if (IS[i][k].score > largest) {
+ j = k;
+ largest = IS[i][k].score;
+ }
+ }
+
+ //find parental similarity
+ distCalc->calcDist(*(IS[i][j].leftParent), *(IS[i][j].rightParent));
+ float dist = distCalc->getDist();
+
+ //convert to similarity
+ dist = (1 - dist) * 100;
+
+ //warn about parental similarity - if its above 82% may not detect a chimera
+ if (dist >= 82) { mothurOut("When the chimeras parental similarity is above 82%, detection rates drop signifigantly."); mothurOutEndLine(); }
+
+ int index = ceil(dist);
+
+ if (index == 0) { index=1; }
+
+ //is your DE value higher than the 95%
+ string chimera;
+ if (IS[i][j].score > quantile[index-1][4]) { chimera = "Yes"; }
+ else { chimera = "No"; }
+
+ out << querySeqs[i]->getName() << "\tparental similarity: " << dist << "\tIS: " << IS[i][j].score << "\tbreakpoint: " << IS[i][j].midpoint << "\tchimera flag: " << chimera << endl;
+
+ if (chimera == "Yes") {
+ mothurOut(querySeqs[i]->getName() + "\tparental similarity: " + toString(dist) + "\tIS: " + toString(IS[i][j].score) + "\tbreakpoint: " + toString(IS[i][j].midpoint) + "\tchimera flag: " + chimera); mothurOutEndLine();
+ }
+ out << "Improvement Score\t";
+
+ for (int r = 0; r < IS[i].size(); r++) { out << IS[i][r].score << '\t'; }
+ out << endl;
+ }
+ }
+ catch(exception& e) {
+ errorOut(e, "AlignSim", "print");
+ exit(1);
+ }
+}
+
+//***************************************************************************************************************
+void AlignSim::getChimeras() {
+ try {
+
+ //read in query sequences and subject sequences
+ mothurOut("Reading sequences and template file... "); cout.flush();
+ querySeqs = readSeqs(fastafile);
+ templateSeqs = readSeqs(templateFile);
+ mothurOut("Done."); mothurOutEndLine();
+
+ int numSeqs = querySeqs.size();
+
+ IS.resize(numSeqs);
+
+ //break up file if needed
+ int linesPerProcess = numSeqs / processors ;
+
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ //find breakup of sequences for all times we will Parallelize
+ if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); }
+ else {
+ //fill line pairs
+ for (int i = 0; i < (processors-1); i++) {
+ lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
+ }
+ //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
+ int i = processors - 1;
+ lines.push_back(new linePair((i*linesPerProcess), numSeqs));
+ }
+
+ //find breakup of templatefile for quantiles
+ if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); }
+ else {
+ for (int i = 0; i < processors; i++) {
+ templateLines.push_back(new linePair());
+ templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
+ templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
+ }
+ }
+ #else
+ lines.push_back(new linePair(0, numSeqs));
+ templateLines.push_back(new linePair(0, templateSeqs.size()));
+ #endif
+
+
+ distCalc = new ignoreGaps();
+
+ //find window breaks
+ windowBreak = findWindows();
+//for (int i = 0; i < windowBreak.size(); i++) { cout << windowBreak[i] << '\t'; }
+//cout << endl;
+
+ mothurOut("Finding the IS values for your sequences..."); cout.flush();
+ if (processors == 1) {
+ IS = findIS(lines[0]->start, lines[0]->end, querySeqs);
+ }else { IS = createProcessesIS(querySeqs, lines); }
+ mothurOut("Done."); mothurOutEndLine();
+
+ //quantiles are used to determine whether the de values found indicate a chimera
+ if (quanfile != "") { quantile = readQuantiles(); }
+ else {
+
+ mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .alignedsimilarity.quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush();
+ //get IS quantiles as a reference to these IS scores
+ //you are assuming the template is free of chimeras and therefore will give you a baseline as to the scores you would like to see
+ if (processors == 1) {
+ templateIS = findIS(templateLines[0]->start, templateLines[0]->end, templateSeqs);
+ }else { templateIS = createProcessesIS(templateSeqs, templateLines); }
+//cout << "here" << endl;
+ ofstream out4;
+ string o;
+
+ o = getRootName(templateFile) + "alignedsimilarity.quan";
+
+ openOutputFile(o, out4);
+
+ //adjust quantiles
+ //construct table
+ quantile.resize(100);
+
+ for (int i = 0; i < templateIS.size(); i++) {
+
+ if (templateIS[i].size() != 0) {
+ int j = 0; float score = -1000;
+ //find highest IS score
+ //cout << templateIS[i].size() << endl;
+ for (int k = 0; k < templateIS[i].size(); k++) {
+ if (templateIS[i][k].score > score) {
+ score = templateIS[i][k].score;
+ j = k;
+ }
+ }
+ //cout << j << endl;
+ //find similarity of parents
+ distCalc->calcDist(*(templateIS[i][j].leftParent), *(templateIS[i][j].rightParent));
+ float dist = distCalc->getDist();
+
+ //convert to similarity
+ dist = (1 - dist) * 100;
+ // cout << dist << endl;
+ int index = ceil(dist);
+ // cout << "index = " << index << endl;
+ if (index == 0) { index = 1; }
+ quantile[index-1].push_back(templateIS[i][j].score);
+ }
+ }
+
+
+ for (int i = 0; i < quantile.size(); i++) {
+ vector<float> temp;
+
+ if (quantile[i].size() == 0) {
+ //in case this is not a distance found in your template files
+ for (int g = 0; g < 6; g++) {
+ temp.push_back(0.0);
+ }
+ }else{
+
+ sort(quantile[i].begin(), quantile[i].end());
+
+ //save 10%
+ temp.push_back(quantile[i][int(quantile[i].size() * 0.10)]);
+ //save 25%
+ temp.push_back(quantile[i][int(quantile[i].size() * 0.25)]);
+ //save 50%
+ temp.push_back(quantile[i][int(quantile[i].size() * 0.5)]);
+ //save 75%
+ temp.push_back(quantile[i][int(quantile[i].size() * 0.75)]);
+ //save 95%
+ temp.push_back(quantile[i][int(quantile[i].size() * 0.95)]);
+ //save 99%
+ temp.push_back(quantile[i][int(quantile[i].size() * 0.99)]);
+
+ }
+
+ //output quan value
+ out4 << i+1 << '\t';
+ for (int u = 0; u < temp.size(); u++) { out4 << temp[u] << '\t'; }
+ out4 << endl;
+
+ quantile[i] = temp;
+
+ }
+
+ out4.close();
+
+ mothurOut("Done."); mothurOutEndLine();
+
+ }
+
+ //free memory
+ for (int i = 0; i < lines.size(); i++) { delete lines[i]; }
+ for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; }
+
+ }
+ catch(exception& e) {
+ errorOut(e, "AlignSim", "getChimeras");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+vector<int> AlignSim::findWindows() {
+ try {
+
+ vector<int> win;
+
+ if (increment > querySeqs[0]->getAligned().length()) { mothurOut("You have selected an increment larger than the length of your sequences. I will use the default of 25."); increment = 25; }
+
+ for (int m = increment; m < (querySeqs[0]->getAligned().length() - increment); m+=increment) { win.push_back(m); }
+
+ return win;
+
+ }
+ catch(exception& e) {
+ errorOut(e, "AlignSim", "findWindows");
+ exit(1);
+ }
+}
+
+//***************************************************************************************************************
+vector< vector<sim> > AlignSim::findIS(int start, int end, vector<Sequence*> seqs) {
+ try {
+
+ vector< vector<sim> > isValues;
+ isValues.resize(seqs.size());
+
+ //for each sequence
+ for(int i = start; i < end; i++){
+
+ vector<sim> temp; temp.resize(windowBreak.size());
+
+ mothurOut("Finding IS value for sequence " + toString(i)); mothurOutEndLine();
+
+ //for each window
+ for (int m = 0; m < windowBreak.size(); m++) {
+
+ vector<Sequence*> closest; //left, right, overall
+ vector<int> similarity; //left+right and overall
+
+ //find closest left, closest right and closest overall
+ closest = findClosestSides(seqs[i], windowBreak[m], similarity, i);
+
+ int totalNumBases = seqs[i]->getUnaligned().length();
+
+ //IS = left+right-overall
+ float is = ((similarity[0]+similarity[1]) / (float) totalNumBases) - (similarity[2] / (float) totalNumBases);
+
+ ///save IS, leftparent, rightparent, breakpoint
+ temp[m].leftParent = closest[0];
+ temp[m].rightParent = closest[1];
+ temp[m].score = is;
+ temp[m].midpoint = windowBreak[m];
+//cout << is << '\t';
+ }
+//cout << endl;
+ isValues[i] = temp;
+
+ }
+
+ return isValues;
+
+ }
+ catch(exception& e) {
+ errorOut(e, "AlignSim", "findIS");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+vector<Sequence*> AlignSim::findClosestSides(Sequence* seq, int breakpoint, vector<int>& sim, int i) {
+ try{
+
+ vector<Sequence*> closest;
+
+ Sequence query, queryLeft, queryRight;
+ string frag = seq->getAligned();
+ string fragLeft = frag.substr(0, breakpoint);
+ string fragRight = frag.substr(breakpoint, frag.length());
+
+ //get pieces
+ query = *(seq);
+ queryLeft = *(seq);
+ queryRight = *(seq);
+ queryLeft.setAligned(fragLeft);
+ queryRight.setAligned(fragRight);
+
+ //initialize
+ Sequence* overall = templateSeqs[0];
+ Sequence* left = templateSeqs[0];
+ Sequence* right = templateSeqs[0];
+
+ float smallestOverall, smallestLeft, smallestRight;
+ smallestOverall = 1000; smallestLeft = 1000; smallestRight = 1000;
+
+ //go through the templateSeqs and search for the closest
+ for(int j = 0; j < templateSeqs.size(); j++){
+
+ //so you don't pick yourself
+ if (j != i) {
+ Sequence temp, tempLeft, tempRight;
+ string fragTemp = templateSeqs[j]->getAligned();
+ string fragTempLeft = fragTemp.substr(0, breakpoint);
+ string fragTempRight = fragTemp.substr(breakpoint, fragTemp.length());
+
+ //get pieces
+ temp = *(templateSeqs[j]);
+ tempLeft = *(templateSeqs[j]);
+ tempRight = *(templateSeqs[j]);
+ tempLeft.setAligned(fragTempLeft);
+ tempRight.setAligned(fragTempRight);
+
+ //find overall dist
+ distCalc->calcDist(query, temp);
+ float dist = distCalc->getDist();
+
+ if (dist < smallestOverall) {
+ overall = templateSeqs[j];
+ smallestOverall = dist;
+ }
+
+ //find left dist
+ distCalc->calcDist(queryLeft, tempLeft);
+ dist = distCalc->getDist();
+
+ if (dist < smallestLeft) {
+ left = templateSeqs[j];
+ smallestLeft = dist;
+ }
+
+ //find left dist
+ distCalc->calcDist(queryRight, tempRight);
+ dist = distCalc->getDist();
+
+ if (dist < smallestRight) {
+ right = templateSeqs[j];
+ smallestRight = dist;
+ }
+ }
+
+ }
+
+ closest.push_back(left);
+ closest.push_back(right);
+ closest.push_back(overall);
+
+ //fill sim with number of matched bases
+ Sequence tempL = *(left);
+ string tempLFrag = tempL.getAligned();
+ tempL.setAligned(tempLFrag.substr(0, breakpoint));
+
+ int bothMatches = findNumMatchedBases(queryLeft, tempL);
+ sim.push_back(bothMatches);
+
+ Sequence tempR = *(right);
+ string tempRFrag = tempR.getAligned();
+ tempR.setAligned(tempRFrag.substr(breakpoint, tempRFrag.length()));
+
+ bothMatches = findNumMatchedBases(queryRight, tempR);
+ sim.push_back(bothMatches);
+
+ bothMatches = findNumMatchedBases(query, *(overall));
+ sim.push_back(bothMatches);
+
+ return closest;
+
+
+ }
+ catch(exception& e) {
+ errorOut(e, "AlignSim", "findClosestSides");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+
+int AlignSim::findNumMatchedBases(Sequence seq, Sequence temp) {
+ try{
+
+ int num = 0;
+
+ string query = seq.getAligned();
+ string subject = temp.getAligned();
+//cout << seq.getName() << endl << endl;
+//cout << temp.getName() << endl << endl;
+
+ for (int i = 0; i < query.length(); i++) {
+ //count matches if query[i] is a base and subject is same bases or gap
+ if (isalpha(query[i])) {
+ if(!isalpha(subject[i])) { num++; }
+ else if (query[i] == subject[i]) { num++; }
+ else { }
+ }
+ }
+//cout << "num = " << num << endl;
+ return num;
+ }
+ catch(exception& e) {
+ errorOut(e, "AlignSim", "findNumMatchedBases");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+vector< vector<sim> > AlignSim::createProcessesIS(vector<Sequence*> seqs, vector<linePair*> linesToProcess) {
+ try {
+ vector< vector<sim> > localIs; localIs.resize(seqs.size());
+
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ int process = 0;
+ vector<int> processIDS;
+
+ //loop through and create all the processes you want
+ while (process != processors) {
+ int pid = fork();
+
+ if (pid > 0) {
+ processIDS.push_back(pid);
+ process++;
+ }else if (pid == 0){
+
+ mothurOut("Finding IS values for sequences " + toString(linesToProcess[process]->start) + " to " + toString(linesToProcess[process]->end)); mothurOutEndLine();
+ localIs = findIS(linesToProcess[process]->start, linesToProcess[process]->end, seqs);
+ mothurOut("Done finding IS values for sequences " + toString(linesToProcess[process]->start) + " to " + toString(linesToProcess[process]->end)); mothurOutEndLine();
+
+ //write out data to file so parent can read it
+ ofstream out;
+ string s = toString(getpid()) + ".temp";
+ openOutputFile(s, out);
+
+ //output pairs
+ for (int i = linesToProcess[process]->start; i < linesToProcess[process]->end; i++) {
+ out << localIs[i].size() << endl;
+ for (int j = 0; j < localIs[i].size(); j++) {
+ localIs[i][j].leftParent->printSequence(out);
+ localIs[i][j].rightParent->printSequence(out);
+ out << ">" << '\t' << localIs[i][j].score << '\t' << localIs[i][j].midpoint << endl;
+ }
+ }
+ out.close();
+
+ exit(0);
+ }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
+ }
+
+ //force parent to wait until all the processes are done
+ for (int i=0;i<processors;i++) {
+ int temp = processIDS[i];
+ wait(&temp);
+ }
+
+ //get data created by processes
+ for (int i=0;i<processors;i++) {
+ ifstream in;
+ string s = toString(processIDS[i]) + ".temp";
+ openInputFile(s, in);
+
+ //get pairs
+ for (int k = linesToProcess[i]->start; k < linesToProcess[i]->end; k++) {
+ int size;
+ in >> size;
+ gobble(in);
+
+ vector<sim> tempVector;
+
+ for (int j = 0; j < size; j++) {
+
+ sim temp;
+
+ temp.leftParent = new Sequence(in);
+ gobble(in);
+ //temp.leftParent->printSequence(cout);
+ temp.rightParent = new Sequence(in);
+ gobble(in);
+//temp.rightParent->printSequence(cout);
+ string throwaway;
+ in >> throwaway >> temp.score >> temp.midpoint;
+ //cout << temp.score << '\t' << temp.midpoint << endl;
+ gobble(in);
+
+ tempVector.push_back(temp);
+ }
+
+ localIs[k] = tempVector;
+ }
+
+ in.close();
+ remove(s.c_str());
+ }
+
+
+#else
+ localIs = findIS(linesToProcess[0]->start, linesToProcess[0]->end, seqs);
+#endif
+
+ return localIs;
+ }
+ catch(exception& e) {
+ errorOut(e, "AlignSim", "createProcessesIS");
+ exit(1);
+ }
+}
+
+//***************************************************************************************************************