From 42b802c0006d8b13bd5b27ea48d032a85d3f2102 Mon Sep 17 00:00:00 2001 From: westcott Date: Tue, 10 Mar 2009 16:03:17 +0000 Subject: [PATCH] working on libshuff --- Mothur.xcodeproj/project.pbxproj | 6 + coverage.cpp | 40 +++++ coverage.h | 40 +++++ fullmatrix.cpp | 113 +++++++++---- fullmatrix.h | 7 +- libshuffcommand.cpp | 273 ++++++++++++++++++++++++++++++- libshuffcommand.h | 22 +++ 7 files changed, 466 insertions(+), 35 deletions(-) create mode 100644 coverage.cpp create mode 100644 coverage.h diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index e9446df..b5eaa8b 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -15,6 +15,7 @@ 374610830F40652400460C57 /* unweighted.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 374610820F40652400460C57 /* unweighted.cpp */; }; 3746109D0F40657600460C57 /* unifracunweightedcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3746109C0F40657600460C57 /* unifracunweightedcommand.cpp */; }; 374CD63F0F65832000D90B4A /* libshuffcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 374CD63E0F65832000D90B4A /* libshuffcommand.cpp */; }; + 374CD6F10F65A4C100D90B4A /* coverage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 374CD6F00F65A4C100D90B4A /* coverage.cpp */; }; 3782163D0F616079008E1F6D /* fullmatrix.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3782163C0F616079008E1F6D /* fullmatrix.cpp */; }; 379293C30F2DE73400B9034A /* treemap.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 379293C20F2DE73400B9034A /* treemap.cpp */; }; 379294700F2E191800B9034A /* parsimonycommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3792946F0F2E191800B9034A /* parsimonycommand.cpp */; }; @@ -129,6 +130,8 @@ 3746109C0F40657600460C57 /* unifracunweightedcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = unifracunweightedcommand.cpp; sourceTree = ""; }; 374CD63D0F65832000D90B4A /* libshuffcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = libshuffcommand.h; sourceTree = ""; }; 374CD63E0F65832000D90B4A /* libshuffcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = libshuffcommand.cpp; sourceTree = ""; }; + 374CD6EF0F65A4C100D90B4A /* coverage.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = coverage.h; sourceTree = ""; }; + 374CD6F00F65A4C100D90B4A /* coverage.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = coverage.cpp; sourceTree = ""; }; 3782163B0F616079008E1F6D /* fullmatrix.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fullmatrix.h; sourceTree = ""; }; 3782163C0F616079008E1F6D /* fullmatrix.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fullmatrix.cpp; sourceTree = ""; }; 379293C10F2DE73400B9034A /* treemap.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = treemap.h; sourceTree = ""; }; @@ -410,6 +413,8 @@ 37D927BB0F21331F001D4494 /* bootstrap.cpp */, 37D927C00F21331F001D4494 /* chao1.h */, 37D927BF0F21331F001D4494 /* chao1.cpp */, + 374CD6EF0F65A4C100D90B4A /* coverage.h */, + 374CD6F00F65A4C100D90B4A /* coverage.cpp */, 37D927E80F21331F001D4494 /* jackknife.h */, 37D927E70F21331F001D4494 /* jackknife.cpp */, 37D927F50F21331F001D4494 /* npshannon.h */, @@ -705,6 +710,7 @@ A70B53AC0F4CD7AD0064797E /* getlinecommand.cpp in Sources */, 3782163D0F616079008E1F6D /* fullmatrix.cpp in Sources */, 374CD63F0F65832000D90B4A /* libshuffcommand.cpp in Sources */, + 374CD6F10F65A4C100D90B4A /* coverage.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/coverage.cpp b/coverage.cpp new file mode 100644 index 0000000..e7baf00 --- /dev/null +++ b/coverage.cpp @@ -0,0 +1,40 @@ +/* + * coverage.cpp + * Mothur + * + * Created by Sarah Westcott on 3/9/09. + * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved. + * + */ + +#include "coverage.h" + +//********************************************************************************************************************** +vector< vector > Coverage::getValues(FullMatrix*, float) { + try { + globaldata = GlobalData::getInstance(); + numGroups = globaldata->Groups.size(); + + //initialize data + data.resize(numGroups); + for (int l = 0; l < data.size(); l++) { + data[l].push_back(0.0); + } + + /**************************************/ + //get the minumums for each comparision + /**************************************/ + return data; + } + catch(exception& e) { + cout << "Standard Error: " << e.what() << " has occurred in the Coverage class Function getValues. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } + catch(...) { + cout << "An unknown error has occurred in the Coverage class function getValues. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } +} +//********************************************************************************************************************** + + \ No newline at end of file diff --git a/coverage.h b/coverage.h new file mode 100644 index 0000000..c556bb2 --- /dev/null +++ b/coverage.h @@ -0,0 +1,40 @@ +#ifndef COVERAGE_H +#define COVERAGE_H + +/* + * coverage.h + * Mothur + * + * Created by Sarah Westcott on 3/9/09. + * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved. + * + */ + +#include "mothur.h" +#include "fullmatrix.h" +#include "globaldata.hpp" + +using namespace std; + +/***********************************************************************/ + +class Coverage { + + public: + Coverage(){}; + ~Coverage(){}; + vector< vector > getValues(FullMatrix*, float); + + private: + GlobalData* globaldata; + vector< vector > data; + int numGroups; + +}; + + +/***********************************************************************/ + + + +#endif \ No newline at end of file diff --git a/fullmatrix.cpp b/fullmatrix.cpp index 66c67b7..090e2fe 100644 --- a/fullmatrix.cpp +++ b/fullmatrix.cpp @@ -16,7 +16,6 @@ FullMatrix::FullMatrix(ifstream& filehandle) { globaldata = GlobalData::getInstance(); groupmap = globaldata->gGroupmap; - float minSoFar = 0.0; string name, group; filehandle >> numSeqs >> name; @@ -41,17 +40,9 @@ FullMatrix::FullMatrix(ifstream& filehandle) { square = true; filehandle.putback(d); - if (numSeqs >= 2) { - //save first distance that is not distance to itself as minimum - filehandle >> matrix[0][0] >> minSoFar; - matrix[0][1] = minSoFar; - } - - for(int i=2;i> matrix[0][i]; - if (matrix[0][i] < minSoFar) { minSoFar = matrix[0][i]; } } - index[0].minDist = minSoFar; break; } @@ -87,8 +78,8 @@ void FullMatrix::readSquareMatrix(ifstream& filehandle) { reading = new Progress("Reading matrix: ", numSeqs * numSeqs); int count = 0; - float distance, minSoFar; - minSoFar = 0.0; + float distance; + string group, name; for(int i=1;i> minSoFar; - matrix[i][0] = minSoFar; - if(group == "not found") { cout << "Error: Sequence '" << name << "' was not found in the group file, please correct." << endl; exit(1); } - for(int j=1;j> distance; - - if ((distance < minSoFar) && (i != j)) { minSoFar = distance; } matrix[i][j] = distance; count++; reading->update(count); } - - //save minimum value for each row - index[i].minDist = minSoFar; } reading->finish(); delete reading; @@ -136,8 +119,7 @@ void FullMatrix::readLTMatrix(ifstream& filehandle) { reading = new Progress("Reading matrix: ", numSeqs * (numSeqs - 1) / 2); int count = 0; - float distance, minSoFar; - minSoFar = 0.0; + float distance; string group, name; @@ -147,24 +129,17 @@ void FullMatrix::readLTMatrix(ifstream& filehandle) { group = groupmap->getGroup(name); index[i].groupname = group; index[i].seqName = name; - - filehandle >> minSoFar; - matrix[i][0] = minSoFar; if(group == "not found") { cout << "Error: Sequence '" << name << "' was not found in the group file, please correct." << endl; exit(1); } - for(int j=1;j> distance; - - if (distance < minSoFar) { minSoFar = distance; } matrix[i][j] = distance; matrix[j][i] = distance; count++; reading->update(count); } - //save minimum value for each row - index[i].minDist = minSoFar; } reading->finish(); delete reading; @@ -275,5 +250,81 @@ void FullMatrix::printMatrix(ostream& out) { } } + /**************************************************************************/ +void FullMatrix::getMinsForRowsVectors(){ + try{ + numGroups = globaldata->gGroupmap->namesOfGroups.size(); + numUserGroups = globaldata->Groups.size(); + + //sort globaldata->gGroupmap.namesOfGroups so that it will match the matrix + sort(globaldata->gGroupmap->namesOfGroups.begin(), globaldata->gGroupmap->namesOfGroups.end()); + + /*************************************************/ + //find where in matrix each group starts and stops + /*************************************************/ + vector bounds; //bounds[0] = 0, bounds[1] = starting row in matrix from group B, bounds[2] = starting row in matrix from group C, bounds[3] = no need to find C because its numSeqs. + bounds.resize(numGroups); + + bounds[numGroups] = numSeqs; + //for each group find bounds of subgroup/comparison + for (int i = 0; i < numGroups; i++) { + getBounds(bounds[i], globaldata->gGroupmap->namesOfGroups[i]); + } + + /************************************************************/ + //fill the minsForRows vectors for each group the user wants + /************************************************************/ + int countx = bounds[0]; //where second group starts + int county = bounds[0]; + + //go through the entire matrix + for (int x = 0; x < numSeqs; x++) { + for (int y = 0; y < numSeqs; y++) { + //if have not changed groups + if ((x < countx) && (y < county)) { + if (inUsersGroups(index[x].groupname, globaldata->Groups)) { + } + } + } + } + + + + + } + catch(exception& e) { + cout << "Standard Error: " << e.what() << " has occurred in the FullMatrix class Function getMinsForRowsVectors. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } + catch(...) { + cout << "An unknown error has occurred in the FullMatrix class function getMinsForRowsVectors. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } + +} + +/**************************************************************************/ +void FullMatrix::getBounds(int& higher, string group) { + try{ + bool gotLower = false; + + //for each group find bounds of subgroup/comparison + for (it = index.begin(); it != index.end(); it++) { + if (it->second.groupname == group) { + if (gotLower != true) { gotLower = true; } + }else if ((gotLower == true) && (it->second.groupname != group)) { higher = it->first; break; } + } + + } + catch(exception& e) { + cout << "Standard Error: " << e.what() << " has occurred in the FullMatrix class Function getBounds. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } + catch(...) { + cout << "An unknown error has occurred in the FullMatrix class function getBounds. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } + +} diff --git a/fullmatrix.h b/fullmatrix.h index 0bd5c87..43bde96 100644 --- a/fullmatrix.h +++ b/fullmatrix.h @@ -19,7 +19,6 @@ using namespace std; struct Names { string groupname; string seqName; - float minDist; }; @@ -32,16 +31,20 @@ class FullMatrix { int getNumSeqs(); void printMatrix(ostream&); + void getMinsForRowsVectors(); //requires globaldata->Groups to be filled private: void sortGroups(int, int); //this function sorts the sequences within the matrix. + void getBounds(int&, string); void readSquareMatrix(ifstream&); void readLTMatrix(ifstream&); vector< vector > matrix; //a 2D distance matrix of all the sequences and their distances to eachother. + vector< vector > minsForRows; //vector< minimum distance for that subrow> -one for each comparison. map index; // row in vector, sequence group. need to know this so when we sort it can be updated. + map::iterator it; GroupMap* groupmap; //maps sequences to groups they belong to. GlobalData* globaldata; - int numSeqs; + int numSeqs, numGroups, numUserGroups; bool square; }; diff --git a/libshuffcommand.cpp b/libshuffcommand.cpp index 1abbc05..a34fb36 100644 --- a/libshuffcommand.cpp +++ b/libshuffcommand.cpp @@ -14,7 +14,37 @@ LibShuffCommand::LibShuffCommand(){ try { - //globaldata = GlobalData::getInstance(); + globaldata = GlobalData::getInstance(); + convert(globaldata->getCutOff(), cutOff); + convert(globaldata->getIters(), iters); + matrix = globaldata->gMatrix; + coverageFile = getRootName(globaldata->getPhylipFile()) + "coverage"; + summaryFile = getRootName(globaldata->getPhylipFile()) + "slsummary"; + openOutputFile(coverageFile, out); + openOutputFile(summaryFile, outSum); + + //set the groups to be analyzed + setGroups(); + + //file headers for coverage file + out << "D" << '\t'; + for (int i = 0; i < groupComb.size(); i++) { + out << "C" + groupComb[i] << '\t'; + } + + for (int i = 0; i < numGroups; i++) { + for (int j = 0; j < numGroups; j++) { + //don't output AA to AA + if (i != j) { + out << "Delta" + globaldata->Groups[i] + "-" + globaldata->Groups[j] << '\t'; + } + } + } + out << endl; + + numComp = numGroups*numGroups; + + coverage = new Coverage(); } catch(exception& e) { @@ -31,13 +61,114 @@ LibShuffCommand::LibShuffCommand(){ //********************************************************************************************************************** LibShuffCommand::~LibShuffCommand(){ - + delete coverage; } //********************************************************************************************************************** int LibShuffCommand::execute(){ try { + //deltaValues[0] = scores for the difference between AA and AB. + //cValues[0][0] = AA, cValues[0][1] = AB, cValues[0][2] = AC, cValues[1][0] = BA, cValues[1][1] = BB... + + sumDelta.resize(numComp-numGroups, 0.0); + + float D = 0.0; + + /*****************************/ + //get values for users matrix + /*****************************/ + matrix->getMinsForRowsVectors(); + + //get values for users matrix + while (D <= cutOff) { + cValues = coverage->getValues(matrix, D); + + //find delta values + int count = 0; + for (int i = 0; i < numGroups; i++) { + for (int j = 0; j < numGroups; j++) { + //don't save AA to AA + if (i != j) { + //(Caa - Cab)^2 + deltaValues.push_back( (abs(cValues[i][i]-cValues[i][j]) * abs(cValues[i][i]-cValues[i][j])) ); + sumDelta[count] += deltaValues[count]; + count++; + } + } + } + + D += 0.01; + + printCoverageFile(D); + + //clear out old Values + cValues.clear(); + deltaValues.clear(); + } + + /*******************************************************************************/ + //create and score random matrixes finding the sumDelta values for summary file + /******************************************************************************/ + + //initialize rsumDelta + rsumDelta.resize(numComp-numGroups); + for (int l = 0; l < rsumDelta.size(); l++) { + for (int w = 0; w < iters; w++) { + rsumDelta[l].push_back(0.0); + } + } + + for (int m = 0; m < iters; m++) { + //generate random matrix in getValues + //values for random matrix + while (D <= cutOff) { + cValues = coverage->getValues(matrix, D); + + //find delta values + int count = 0; + for (int i = 0; i < numGroups; i++) { + for (int j = 0; j < numGroups; j++) { + //don't save AA to AA + if (i != j) { + //(Caa - Cab)^2 + rsumDelta[count][m] += (abs(cValues[i][i]-cValues[i][j]) * abs(cValues[i][i]-cValues[i][j])); + count++; + } + } + } + + D += 0.01; + + //clear out old Values + cValues.clear(); + } + } + + /**********************************************************/ + //find the signifigance of the user matrix' sumdelta values + /**********************************************************/ + + for (int t = 0; t < rsumDelta.size(); t++) { + //sort rsumDelta t + sort(rsumDelta[t].begin(), rsumDelta[t].end()); + + //the index of the score higher than yours is returned + //so if you have 1000 random matrices the index returned is 100 + //then there are 900 matrices with a score greater then you. + //giving you a signifigance of 0.900 + int index = findIndex(sumDelta[t], t); + + //the signifigance is the number of trees with the users score or higher + sumDeltaSig.push_back((iters-index)/(float)iters); + + } + + printSummaryFile(); + + //clear out users groups + globaldata->Groups.clear(); + return 0; } catch(exception& e) { @@ -49,5 +180,143 @@ int LibShuffCommand::execute(){ exit(1); } } +//********************************************************************************************************************** +void LibShuffCommand::printCoverageFile(float d) { + try { + //format output + out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint); + + out << setprecision(globaldata->getIters().length()) << d << '\t'; + + //print out coverage values + for (int i = 0; i < numGroups; i++) { + for (int j = 0; j < numGroups; j++) { + out << cValues[i][j] << '\t'; + } + } + + //print out delta values + for (int i = 0; i < deltaValues.size(); i++) { + out << deltaValues[i] << '\t'; + } + + out << endl; + + } + catch(exception& e) { + cout << "Standard Error: " << e.what() << " has occurred in the LibShuffCommand class Function printCoverageFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } + catch(...) { + cout << "An unknown error has occurred in the LibShuffCommand class function printCoverageFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } +} +//********************************************************************************************************************** +void LibShuffCommand::printSummaryFile() { + try { + //format output + outSum.setf(ios::fixed, ios::floatfield); outSum.setf(ios::showpoint); + + for (int i = 0; i < numGroups; i++) { + for (int j = 0; j < numGroups; j++) { + //don't output AA to AA + if (i != j) { + outSum << "Delta" + globaldata->Groups[i] + "-" + globaldata->Groups[j] << '\t'<< "DeltaSig" + globaldata->Groups[i] + "-" + globaldata->Groups[j] << '\t'; + } + } + } + outSum << endl; + + //print out delta values + for (int i = 0; i < sumDelta.size(); i++) { + outSum << setprecision(6) << sumDelta[i] << '\t' << setprecision(globaldata->getIters().length()) << sumDeltaSig[i] << '\t'; + } + + outSum << endl; + + } + catch(exception& e) { + cout << "Standard Error: " << e.what() << " has occurred in the LibShuffCommand class Function printSummaryFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } + catch(...) { + cout << "An unknown error has occurred in the LibShuffCommand class function printSummaryFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } +} //********************************************************************************************************************** +void LibShuffCommand::setGroups() { + try { + //if the user has not entered specific groups to analyze then do them all + if (globaldata->Groups.size() == 0) { + numGroups = globaldata->gGroupmap->getNumGroups(); + for (int i=0; i < numGroups; i++) { + globaldata->Groups.push_back(globaldata->gGroupmap->namesOfGroups[i]); + } + }else { + if (globaldata->getGroups() != "all") { + //check that groups are valid + for (int i = 0; i < globaldata->Groups.size(); i++) { + if (globaldata->gGroupmap->isValidGroup(globaldata->Groups[i]) != true) { + cout << globaldata->Groups[i] << " is not a valid group, and will be disregarded." << endl; + // erase the invalid group from globaldata->Groups + globaldata->Groups.erase (globaldata->Groups.begin()+i); + } + } + + //if the user only entered invalid groups + if ((globaldata->Groups.size() == 0) || (globaldata->Groups.size() == 1)) { + numGroups = globaldata->gGroupmap->getNumGroups(); + for (int i=0; i < numGroups; i++) { + globaldata->Groups.push_back(globaldata->gGroupmap->namesOfGroups[i]); + } + cout << "When using the groups parameter you must have at least 2 valid groups. I will run the command using all the groups in your groupfile." << endl; + }else { numGroups = globaldata->Groups.size(); } + }else { //users wants all groups + numGroups = globaldata->gGroupmap->getNumGroups(); + globaldata->Groups.clear(); + for (int i=0; i < numGroups; i++) { + globaldata->Groups.push_back(globaldata->gGroupmap->namesOfGroups[i]); + } + } + } + + // number of comparisons i.e. with groups A,B,C = AA, AB, AC, BA, BB, BC...; + for (int i=0; iGroups[i] + "-" + globaldata->Groups[l]); + } + } + } + catch(exception& e) { + cout << "Standard Error: " << e.what() << " has occurred in the LibShuffCommand class Function setGroups. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } + catch(...) { + cout << "An unknown error has occurred in the LibShuffCommand class function setGroups. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } +} +/***********************************************************/ +int LibShuffCommand::findIndex(float score, int index) { + try{ + for (int i = 0; i < rsumDelta[index].size(); i++) { + if (rsumDelta[index][i] >= score) { return i; } + } + return rsumDelta[index].size(); + } + catch(exception& e) { + cout << "Standard Error: " << e.what() << " has occurred in the LibShuffCommand class Function findIndex. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } + catch(...) { + cout << "An unknown error has occurred in the LibShuffCommand class function findIndex. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } +} + +/***********************************************************/ + diff --git a/libshuffcommand.h b/libshuffcommand.h index 490f23e..b2474c5 100644 --- a/libshuffcommand.h +++ b/libshuffcommand.h @@ -11,6 +11,8 @@ */ #include "command.hpp" +#include "coverage.h" +#include "fullmatrix.h" using namespace std; @@ -24,7 +26,27 @@ class LibShuffCommand : public Command { int execute(); private: + vector< vector > cValues; // vector of coverage scores, one for each comparison. + vector deltaValues; // vector of delta scores, one for each comparison. + vector sumDelta; //sum of delta scores, one for each comparison. + vector sumDeltaSig; //number of random matrixes with that delta value or ?? + vector< vector > rsumDelta; //vector< vector > + vector groupComb; + + + void setGroups(); + int findIndex(float, int); + void printCoverageFile(float); + void printSummaryFile(); + GlobalData* globaldata; + Coverage* coverage; + FullMatrix* matrix; + float cutOff; + int numGroups, numComp, iters; + string coverageFile, summaryFile; + ofstream out, outSum; + }; -- 2.39.2