]> git.donarmstrong.com Git - mothur.git/blobdiff - alignedsimilarity.cpp
continued work on chimeras and fixed bug in trim.seqs and reverse.seqs that was due...
[mothur.git] / alignedsimilarity.cpp
diff --git a/alignedsimilarity.cpp b/alignedsimilarity.cpp
deleted file mode 100644 (file)
index 0ed9ce3..0000000
+++ /dev/null
@@ -1,543 +0,0 @@
-/*
- *  alignedsimilarity.cpp
- *  Mothur
- *
- *  Created by westcott on 8/18/09.
- *  Copyright 2009 Schloss Lab. All rights reserved.
- *
- */
-
-#include "alignedsimilarity.h"
-#include "ignoregaps.h"
-
-
-//***************************************************************************************************************
-AlignSim::AlignSim(string filename, string temp) {  fastafile = filename;  templateFile = temp;  }
-//***************************************************************************************************************
-
-AlignSim::~AlignSim() {
-       try {
-               for (int i = 0; i < querySeqs.size(); i++)              {  delete querySeqs[i];         }
-               for (int i = 0; i < templateSeqs.size(); i++)   {  delete templateSeqs[i];      }
-               delete distCalc;
-       }
-       catch(exception& e) {
-               errorOut(e, "AlignSim", "~AlignSim");
-               exit(1);
-       }
-}      
-//***************************************************************************************************************
-void AlignSim::print(ostream& out) {
-       try {
-               
-               mothurOutEndLine();
-               
-               for (int i = 0; i < querySeqs.size(); i++) {
-                       
-                       int j = 0;  float largest = -10;
-                       //find largest sim value
-                       for (int k = 0; k < IS[i].size(); k++) {
-                               //is this score larger
-                               if (IS[i][k].score > largest) {
-                                       j = k;
-                                       largest = IS[i][k].score;
-                               }
-                       }
-                       
-                       //find parental similarity
-                       distCalc->calcDist(*(IS[i][j].leftParent), *(IS[i][j].rightParent));
-                       float dist = distCalc->getDist();
-                       
-                       //convert to similarity
-                       dist = (1 - dist) * 100;
-
-                       //warn about parental similarity - if its above 82% may not detect a chimera 
-                       if (dist >= 82) { mothurOut("When the chimeras parental similarity is above 82%, detection rates drop signifigantly.");  mothurOutEndLine(); }
-                       
-                       int index = ceil(dist);
-                       
-                       if (index == 0) { index=1;  }
-       
-                       //is your DE value higher than the 95%
-                       string chimera;
-                       if (IS[i][j].score > quantile[index-1][4])              {       chimera = "Yes";        }
-                       else                                                                            {       chimera = "No";         }                       
-                       
-                       out << querySeqs[i]->getName() <<  "\tparental similarity: " << dist << "\tIS: " << IS[i][j].score << "\tbreakpoint: " << IS[i][j].midpoint << "\tchimera flag: " << chimera << endl;
-                       
-                       if (chimera == "Yes") {
-                               mothurOut(querySeqs[i]->getName() + "\tparental similarity: " + toString(dist) + "\tIS: " + toString(IS[i][j].score) + "\tbreakpoint: " + toString(IS[i][j].midpoint) + "\tchimera flag: " + chimera); mothurOutEndLine();
-                       }
-                       out << "Improvement Score\t";
-                       
-                       for (int r = 0; r < IS[i].size(); r++) {  out << IS[i][r].score << '\t';  }
-                       out << endl;
-               }
-       }
-       catch(exception& e) {
-               errorOut(e, "AlignSim", "print");
-               exit(1);
-       }
-}
-
-//***************************************************************************************************************
-void AlignSim::getChimeras() {
-       try {
-               
-               //read in query sequences and subject sequences
-               mothurOut("Reading sequences and template file... "); cout.flush();
-               querySeqs = readSeqs(fastafile);
-               templateSeqs = readSeqs(templateFile);
-               mothurOut("Done."); mothurOutEndLine();
-               
-               int numSeqs = querySeqs.size();
-               
-               IS.resize(numSeqs);
-               
-               //break up file if needed
-               int linesPerProcess = numSeqs / processors ;
-               
-               #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-                       //find breakup of sequences for all times we will Parallelize
-                       if (processors == 1) {   lines.push_back(new linePair(0, numSeqs));  }
-                       else {
-                               //fill line pairs
-                               for (int i = 0; i < (processors-1); i++) {                      
-                                       lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
-                               }
-                               //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
-                               int i = processors - 1;
-                               lines.push_back(new linePair((i*linesPerProcess), numSeqs));
-                       }
-                       
-                       //find breakup of templatefile for quantiles
-                       if (processors == 1) {   templateLines.push_back(new linePair(0, templateSeqs.size()));  }
-                       else { 
-                               for (int i = 0; i < processors; i++) {
-                                       templateLines.push_back(new linePair());
-                                       templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
-                                       templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
-                               }
-                       }
-               #else
-                       lines.push_back(new linePair(0, numSeqs));
-                       templateLines.push_back(new linePair(0, templateSeqs.size()));
-               #endif
-       
-       
-               distCalc = new ignoreGaps();
-               
-               //find window breaks
-               windowBreak = findWindows();
-//for (int i = 0; i < windowBreak.size(); i++) { cout << windowBreak[i] << '\t';  }
-//cout << endl;
-
-               mothurOut("Finding the IS values for your sequences..."); cout.flush();
-               if (processors == 1) { 
-                       IS = findIS(lines[0]->start, lines[0]->end, querySeqs);
-               }else {         IS = createProcessesIS(querySeqs, lines);               }
-               mothurOut("Done."); mothurOutEndLine();
-               
-               //quantiles are used to determine whether the de values found indicate a chimera
-               if (quanfile != "") {  quantile = readQuantiles();  }
-               else {
-                       
-                       mothurOut("Calculating quantiles for your template.  This can take a while...  I will output the quantiles to a .alignedsimilarity.quan file that you can input them using the quantiles parameter next time you run this command.  Providing the .quan file will dramatically improve speed.    "); cout.flush();
-                       //get IS quantiles as a reference to these IS scores
-                       //you are assuming the template is free of chimeras and therefore will give you a baseline as to the scores you would like to see
-                       if (processors == 1) { 
-                               templateIS = findIS(templateLines[0]->start, templateLines[0]->end, templateSeqs);
-                       }else {         templateIS = createProcessesIS(templateSeqs, templateLines);            }
-//cout << "here" << endl;                      
-                       ofstream out4;
-                       string o;
-                       
-                       o = getRootName(templateFile) + "alignedsimilarity.quan";
-                       
-                       openOutputFile(o, out4);
-                       
-                       //adjust quantiles
-                       //construct table
-                       quantile.resize(100);
-               
-                       for (int i = 0; i < templateIS.size(); i++) {
-                               
-                               if (templateIS[i].size() != 0) {
-                                       int j = 0; float score = -1000;
-                                       //find highest IS score
-       //cout << templateIS[i].size() << endl;
-                                       for (int k = 0; k < templateIS[i].size(); k++) {
-                                               if (templateIS[i][k].score > score) {
-                                                       score = templateIS[i][k].score;
-                                                       j = k;
-                                               }
-                                       }
-       //cout << j << endl;            
-                                       //find similarity of parents
-                                       distCalc->calcDist(*(templateIS[i][j].leftParent), *(templateIS[i][j].rightParent));
-                                       float dist = distCalc->getDist();
-                       
-                                       //convert to similarity
-                                       dist = (1 - dist) * 100;
-       //      cout << dist << endl;   
-                                       int index = ceil(dist);
-       //      cout << "index = " << index << endl;    
-                                       if (index == 0) {       index = 1;      }
-                                       quantile[index-1].push_back(templateIS[i][j].score);
-                               }
-                       }
-               
-               
-                       for (int i = 0; i < quantile.size(); i++) {
-                               vector<float> temp;
-                               
-                               if (quantile[i].size() == 0) {
-                                       //in case this is not a distance found in your template files
-                                       for (int g = 0; g < 6; g++) {
-                                               temp.push_back(0.0);
-                                       }
-                               }else{
-                                       
-                                       sort(quantile[i].begin(), quantile[i].end());
-                                       
-                                       //save 10%
-                                       temp.push_back(quantile[i][int(quantile[i].size() * 0.10)]);
-                                       //save 25%
-                                       temp.push_back(quantile[i][int(quantile[i].size() * 0.25)]);
-                                       //save 50%
-                                       temp.push_back(quantile[i][int(quantile[i].size() * 0.5)]);
-                                       //save 75%
-                                       temp.push_back(quantile[i][int(quantile[i].size() * 0.75)]);
-                                       //save 95%
-                                       temp.push_back(quantile[i][int(quantile[i].size() * 0.95)]);
-                                       //save 99%
-                                       temp.push_back(quantile[i][int(quantile[i].size() * 0.99)]);
-                                       
-                               }
-                               
-                               //output quan value
-                               out4 << i+1 << '\t';                            
-                               for (int u = 0; u < temp.size(); u++) {   out4 << temp[u] << '\t'; }
-                               out4 << endl;
-                               
-                               quantile[i] = temp;
-                               
-                       }
-                       
-                       out4.close();
-                       
-                       mothurOut("Done."); mothurOutEndLine();
-
-               }
-               
-               //free memory
-               for (int i = 0; i < lines.size(); i++)                                  {       delete lines[i];                                }
-               for (int i = 0; i < templateLines.size(); i++)                  {       delete templateLines[i];                }
-                       
-       }
-       catch(exception& e) {
-               errorOut(e, "AlignSim", "getChimeras");
-               exit(1);
-       }
-}
-//***************************************************************************************************************
-vector<int> AlignSim::findWindows() {
-       try {
-               
-               vector<int> win; 
-               
-               if (increment > querySeqs[0]->getAligned().length()) {  mothurOut("You have selected an increment larger than the length of your sequences.  I will use the default of 25.");  increment = 25; }
-               
-               for (int m = increment;  m < (querySeqs[0]->getAligned().length() - increment); m+=increment) {  win.push_back(m);  }
-
-               return win;
-       
-       }
-       catch(exception& e) {
-               errorOut(e, "AlignSim", "findWindows");
-               exit(1);
-       }
-}
-
-//***************************************************************************************************************
-vector< vector<sim>  > AlignSim::findIS(int start, int end, vector<Sequence*> seqs) {
-       try {
-               
-               vector< vector<sim> >  isValues;
-               isValues.resize(seqs.size());
-               
-               //for each sequence
-               for(int i = start; i < end; i++){
-                       
-                       vector<sim> temp;  temp.resize(windowBreak.size());
-                       
-                       mothurOut("Finding IS value for sequence " + toString(i)); mothurOutEndLine();
-                       
-                       //for each window
-                       for (int m = 0; m < windowBreak.size(); m++) {
-                               
-                               vector<Sequence*> closest;  //left, right, overall
-                               vector<int> similarity; //left+right and overall
-                               
-                               //find closest left, closest right and closest overall
-                               closest = findClosestSides(seqs[i], windowBreak[m], similarity, i);
-                               
-                               int totalNumBases = seqs[i]->getUnaligned().length();
-                               
-                               //IS = left+right-overall
-                               float is = ((similarity[0]+similarity[1]) / (float) totalNumBases) - (similarity[2] / (float) totalNumBases);
-                               
-                               ///save IS, leftparent, rightparent, breakpoint
-                               temp[m].leftParent = closest[0];
-                               temp[m].rightParent = closest[1];
-                               temp[m].score = is;
-                               temp[m].midpoint = windowBreak[m];
-//cout << is << '\t';
-                       }
-//cout << endl;                        
-                       isValues[i] = temp;
-               
-               }
-               
-               return isValues;
-       
-       }
-       catch(exception& e) {
-               errorOut(e, "AlignSim", "findIS");
-               exit(1);
-       }
-}
-//***************************************************************************************************************
-vector<Sequence*> AlignSim::findClosestSides(Sequence* seq, int breakpoint, vector<int>& sim, int i) {
-       try{
-       
-               vector<Sequence*> closest;
-               
-               Sequence query, queryLeft, queryRight;
-               string frag = seq->getAligned();
-               string fragLeft = frag.substr(0, breakpoint);
-               string fragRight = frag.substr(breakpoint, frag.length());
-               
-               //get pieces
-               query = *(seq);  
-               queryLeft = *(seq);  
-               queryRight = *(seq);
-               queryLeft.setAligned(fragLeft);
-               queryRight.setAligned(fragRight);
-               
-               //initialize
-               Sequence* overall = templateSeqs[0];
-               Sequence* left = templateSeqs[0];
-               Sequence* right = templateSeqs[0];
-               
-               float smallestOverall, smallestLeft, smallestRight;
-               smallestOverall = 1000;  smallestLeft = 1000;  smallestRight = 1000;
-               
-               //go through the templateSeqs and search for the closest
-               for(int j = 0; j < templateSeqs.size(); j++){
-                       
-                       //so you don't pick yourself
-                       if (j != i) {
-                               Sequence temp, tempLeft, tempRight;
-                               string fragTemp = templateSeqs[j]->getAligned();
-                               string fragTempLeft = fragTemp.substr(0, breakpoint);
-                               string fragTempRight = fragTemp.substr(breakpoint, fragTemp.length());
-                               
-                               //get pieces
-                               temp = *(templateSeqs[j]); 
-                               tempLeft = *(templateSeqs[j]); 
-                               tempRight = *(templateSeqs[j]);
-                               tempLeft.setAligned(fragTempLeft);
-                               tempRight.setAligned(fragTempRight);
-                               
-                               //find overall dist
-                               distCalc->calcDist(query, temp);
-                               float dist = distCalc->getDist();       
-                               
-                               if (dist < smallestOverall) { 
-                                       overall = templateSeqs[j];
-                                       smallestOverall = dist;
-                               }
-                               
-                               //find left dist
-                               distCalc->calcDist(queryLeft, tempLeft);
-                               dist = distCalc->getDist();
-                               
-                               if (dist < smallestLeft) { 
-                                       left = templateSeqs[j];
-                                       smallestLeft = dist;
-                               }
-                               
-                               //find left dist
-                               distCalc->calcDist(queryRight, tempRight);
-                               dist = distCalc->getDist();
-                               
-                               if (dist < smallestRight) { 
-                                       right = templateSeqs[j];
-                                       smallestRight = dist;
-                               }
-                       }
-
-               }
-                       
-               closest.push_back(left);
-               closest.push_back(right);
-               closest.push_back(overall);
-               
-               //fill sim with number of matched bases
-               Sequence tempL = *(left);
-               string tempLFrag = tempL.getAligned();
-               tempL.setAligned(tempLFrag.substr(0, breakpoint));
-               
-               int bothMatches = findNumMatchedBases(queryLeft, tempL);
-               sim.push_back(bothMatches);
-
-               Sequence tempR = *(right);
-               string tempRFrag = tempR.getAligned();
-               tempR.setAligned(tempRFrag.substr(breakpoint, tempRFrag.length()));
-               
-               bothMatches = findNumMatchedBases(queryRight, tempR);
-               sim.push_back(bothMatches);
-
-               bothMatches = findNumMatchedBases(query, *(overall));
-               sim.push_back(bothMatches);
-                       
-               return closest;
-
-
-       }
-       catch(exception& e) {
-               errorOut(e, "AlignSim", "findClosestSides");
-               exit(1);
-       }
-}
-//***************************************************************************************************************
-
-int AlignSim::findNumMatchedBases(Sequence seq, Sequence temp) {
-       try{
-       
-               int num = 0;
-               
-               string query = seq.getAligned();
-               string subject = temp.getAligned();
-//cout << seq.getName() << endl << endl;
-//cout << temp.getName()  << endl << endl;
-               
-               for (int i = 0; i < query.length(); i++) {
-                       //count matches if query[i] is a base and subject is same bases or gap
-                       if (isalpha(query[i])) {
-                               if(!isalpha(subject[i])) {  num++;  } 
-                               else if (query[i] == subject[i])  { num++;  }
-                               else {  }
-                       }
-               }
-//cout << "num = " << num << endl;             
-               return num;
-       }
-       catch(exception& e) {
-               errorOut(e, "AlignSim", "findNumMatchedBases");
-               exit(1);
-       }
-}
-
-/**************************************************************************************************/
-vector< vector<sim> > AlignSim::createProcessesIS(vector<Sequence*> seqs, vector<linePair*> linesToProcess) {
-       try {
-       vector< vector<sim> > localIs; localIs.resize(seqs.size());
-       
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-               int process = 0;
-               vector<int> processIDS;
-               
-               //loop through and create all the processes you want
-               while (process != processors) {
-                       int pid = fork();
-                       
-                       if (pid > 0) {
-                               processIDS.push_back(pid);  
-                               process++;
-                       }else if (pid == 0){
-                               
-                               mothurOut("Finding IS values for sequences " + toString(linesToProcess[process]->start) + " to " + toString(linesToProcess[process]->end)); mothurOutEndLine();
-                               localIs = findIS(linesToProcess[process]->start, linesToProcess[process]->end, seqs);
-                               mothurOut("Done finding IS values for sequences " +  toString(linesToProcess[process]->start) + " to " + toString(linesToProcess[process]->end)); mothurOutEndLine();
-                               
-                               //write out data to file so parent can read it
-                               ofstream out;
-                               string s = toString(getpid()) + ".temp";
-                               openOutputFile(s, out);
-                               
-                               //output pairs
-                               for (int i = linesToProcess[process]->start; i < linesToProcess[process]->end; i++) {
-                                        out << localIs[i].size() << endl;
-                                        for (int j = 0; j < localIs[i].size(); j++) {
-                                               localIs[i][j].leftParent->printSequence(out);
-                                               localIs[i][j].rightParent->printSequence(out);
-                                               out << ">" << '\t' << localIs[i][j].score << '\t' << localIs[i][j].midpoint << endl;
-                                        }
-                               }
-                               out.close();
-                               
-                               exit(0);
-                       }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
-               }
-               
-               //force parent to wait until all the processes are done
-               for (int i=0;i<processors;i++) { 
-                       int temp = processIDS[i];
-                       wait(&temp);
-               }
-               
-               //get data created by processes
-               for (int i=0;i<processors;i++) { 
-                       ifstream in;
-                       string s = toString(processIDS[i]) + ".temp";
-                       openInputFile(s, in);
-                       
-                       //get pairs
-                       for (int k = linesToProcess[i]->start; k < linesToProcess[i]->end; k++) {
-                               int size;
-                               in >> size;
-                               gobble(in);
-                               
-                               vector<sim> tempVector;
-                               
-                               for (int j = 0; j < size; j++) {
-                               
-                                       sim temp;
-                                       
-                                       temp.leftParent = new Sequence(in);
-                                       gobble(in);
-       //temp.leftParent->printSequence(cout);                         
-                                       temp.rightParent = new Sequence(in);
-                                       gobble(in);
-//temp.rightParent->printSequence(cout);
-                                       string throwaway;
-                                       in >> throwaway >> temp.score >> temp.midpoint;
-       //cout << temp.score << '\t' << temp.midpoint << endl;                          
-                                       gobble(in);
-                                       
-                                       tempVector.push_back(temp);
-                               }
-                               
-                               localIs[k] = tempVector;
-                       }
-                       
-                       in.close();
-                       remove(s.c_str());
-               }
-                       
-       
-#else
-               localIs = findIS(linesToProcess[0]->start, linesToProcess[0]->end, seqs);
-#endif 
-
-               return localIs;
-       }
-       catch(exception& e) {
-               errorOut(e, "AlignSim", "createProcessesIS");
-               exit(1);
-       }
-}
-
-//***************************************************************************************************************