From 8770435f2eedcbf4e69daba716144e83da1dd939 Mon Sep 17 00:00:00 2001 From: westcott Date: Thu, 19 Mar 2009 14:02:15 +0000 Subject: [PATCH] libshuff and updated help to include libshuff and modifications to read.dist. --- coverage.cpp | 126 +++++++++++--------------------------- coverage.h | 3 +- fullmatrix.cpp | 50 +++++---------- fullmatrix.h | 7 +-- helpcommand.cpp | 23 ++++++- libshuffcommand.cpp | 145 +++++++++++++++++++------------------------- libshuffcommand.h | 7 ++- 7 files changed, 141 insertions(+), 220 deletions(-) diff --git a/coverage.cpp b/coverage.cpp index ba2167a..cd8138d 100644 --- a/coverage.cpp +++ b/coverage.cpp @@ -15,81 +15,20 @@ Coverage::Coverage() { numUserGroups = globaldata->Groups.size(); numGroups = globaldata->gGroupmap->getNumGroups(); } -//********************************************************************************************************************** -void Coverage::getValues(FullMatrix* matrix, float d, vector< vector >& data) { - try { - vector min; - vector groups; - - //initialize data - data.resize(numUserGroups); - for (int l = 0; l < data.size(); l++) { - data[l].push_back(0.0); - } - /**************************************/ - //get the minimums for each comparision - /**************************************/ - int count = 0; - int a = 0; - int b = 0; - for (int i = 0; i < numGroups; i++) { - for (int j = 0; j < numGroups; j++) { - - //is this "box" one hte user wants analyzed? - if ((inUsersGroups(globaldata->gGroupmap->namesOfGroups[i], globaldata->Groups) == true) && (inUsersGroups(globaldata->gGroupmap->namesOfGroups[j], globaldata->Groups) == true)) { - - min = matrix->getMins(count); //returns vector of mins for "box" requested ie. groups A, B, 0 = AA, 1 = AB, 2 = BA, 3 = BB; - - //find the coverage at this distance - sort(min.begin(), min.end()); -//cout << "minvector for : " << globaldata->gGroupmap->namesOfGroups[i] + globaldata->gGroupmap->namesOfGroups[j] << endl; -//for(int h = 0; h d) { index = m; break; } - } - - // if you don't find one than all the mins are less than d - if (index == -1) { index = min.size(); } - - //save value in data - data[a][b] = 1.0 - ((min.size()-index)/(float)min.size()); -//cout << "D = " << d << "data " << a << b << " = " << data[a][b] << endl; - if (b < numUserGroups-1) { b++; } - else{ //you are moving to a new row of "boxes" - b = 0; - a++; - } - } - count++; - } - } - } - catch(exception& e) { - cout << "Standard Error: " << e.what() << " has occurred in the Coverage class Function getValues. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; - exit(1); - } - catch(...) { - cout << "An unknown error has occurred in the Coverage class function getValues. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; - exit(1); - } -} //********************************************************************************************************************** -//For the random matrices -void Coverage::getValues(FullMatrix* matrix, float d, vector< vector >& data, string r) { +void Coverage::getValues(FullMatrix* matrix, vector< vector< vector > >& data, vector dist, string mode) { try { vector min; vector groups; //initialize data - data.resize(numUserGroups); + data.resize(dist.size()); for (int l = 0; l < data.size(); l++) { - data[l].push_back(0.0); + data[l].resize(numGroups); + for (int k = 0; k < data[l].size(); k++) { + data[l][k].push_back(0.0); + } } /**************************************/ @@ -103,46 +42,51 @@ void Coverage::getValues(FullMatrix* matrix, float d, vector< vector >& d //is this "box" one hte user wants analyzed? if ((inUsersGroups(globaldata->gGroupmap->namesOfGroups[i], globaldata->Groups) == true) && (inUsersGroups(globaldata->gGroupmap->namesOfGroups[j], globaldata->Groups) == true)) { -cout << "combo " << a << b << endl; -cout << "original matrix mins4rows " << endl; -matrix->printMinsForRows(cout); - //create random matrix for this comparison - matrix->shuffle(count); + + if (mode == "random") { + //create random matrix for this comparison + matrix->shuffle(globaldata->gGroupmap->namesOfGroups[i], globaldata->gGroupmap->namesOfGroups[j]); + } min = matrix->getMins(count); //returns vector of mins for "box" requested ie. groups A, B, 0 = AA, 1 = AB, 2 = BA, 3 = BB; -cout << "shuffled matrix mins4rows " << endl; -matrix->printMinsForRows(cout); //find the coverage at this distance sort(min.begin(), min.end()); - int index = -1; - //find index in min where value is higher than d - for (int m = 0; m < min.size(); m++) { - if (min[m] > d) { index = m; break; } - } + //loop through each distance and fill data + for (int k = 0; k < data.size(); k++) { + + int index = -1; + //find index in min where value is higher than d + for (int m = 0; m < min.size(); m++) { + if (min[m] > dist[k]) { index = m; break; } + } - // if you don't find one than all the mins are less than d - if (index == -1) { index = min.size(); } + // if you don't find one than all the mins are less than d + if (index == -1) { index = min.size(); } - //save value in data - data[a][b] = 1.0 - ((min.size()-index)/(float)min.size()); -cout << "D = " << d << "data " << a << b << " = " << data[a][b] << endl; + //save value in data + data[k][a][b] = 1.0 - ((min.size()-index)/(float)min.size()); + + } + + //move to next box if (b < numUserGroups-1) { b++; } else{ //you are moving to a new row of "boxes" b = 0; a++; } + + count++; + + if (mode == "random") { + //restore matrix to original form for next shuffle + matrix->restore(); + } } - count++; - - //restore matrix to original form for next shuffle - matrix->restore(); -min = matrix->getMins(count-1); -cout << "restored matrix mins4rows " << endl; -matrix->printMinsForRows(cout); } } + } catch(exception& e) { cout << "Standard Error: " << e.what() << " has occurred in the Coverage class Function getValues. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; diff --git a/coverage.h b/coverage.h index 7e64300..2f9fcfb 100644 --- a/coverage.h +++ b/coverage.h @@ -23,8 +23,7 @@ class Coverage { public: Coverage(); ~Coverage(){}; - void getValues(FullMatrix*, float, vector< vector >&); - void getValues(FullMatrix*, float, vector< vector >&, string); //for random matrices + void getValues(FullMatrix*, vector< vector< vector > >&, vector, string); //matrix, container for results, vector of distances, mode private: GlobalData* globaldata; diff --git a/fullmatrix.cpp b/fullmatrix.cpp index 6c09639..8eb4813 100644 --- a/fullmatrix.cpp +++ b/fullmatrix.cpp @@ -233,7 +233,7 @@ void FullMatrix::printMatrix(ostream& out) { for (int i = 0; i < numSeqs; i++) { out << "row " << i << " group = " << index[i].groupname << " name = " << index[i].seqName << endl; for (int j = 0; j < numSeqs; j++) { - //out << matrix[i][j] << " "; + out << matrix[i][j] << " "; } out << endl; } @@ -393,49 +393,27 @@ void FullMatrix::printMinsForRows(ostream& out) { } } + /**************************************************************************/ //shuffles the sequences in the 2 groups passed in. -void FullMatrix::shuffle(int box){ +void FullMatrix::shuffle(string groupA, string groupB){ try{ vector rows2Swap; vector shuffled; float y = 0; string name = ""; - /****************************/ - //find the box the user wants - /****************************/ - int count = 0; - int lowBoundy = bounds[0]; //where first group starts - int highBoundy = bounds[1]; //where second group starts - int county = 1; //index in bound - - //find the bounds for the box the user wants - for (int i = 0; i < (numGroups * numGroups); i++) { - - //are you at the box? - if (count == box) { break; } - else { count++; } - //move to next box - if (county < numGroups) { - county++; - highBoundy = bounds[county]; - lowBoundy = bounds[county-1]; - }else{ //you are moving to a new row of "boxes" - county = 1; - highBoundy = bounds[county]; - lowBoundy = bounds[county-1]; - } - } - - /************************/ - //save its rows locations - /************************/ + /********************************/ + //save rows you want to randomize + /********************************/ //go through the matrix map to find the rows from groups you want to randomize - for (int y = lowBoundy; y < highBoundy; y++) { - rows2Swap.push_back(y); - shuffled.push_back(y); + for (it = index.begin(); it != index.end(); it++) { + //is this row from group A or B? + if ((it->second.groupname == groupA) || (it->second.groupname == groupB)) { + rows2Swap.push_back(it->first); + shuffled.push_back(it->first); + } } //randomize rows to shuffle in shuffled @@ -445,6 +423,7 @@ void FullMatrix::shuffle(int box){ //swap rows and columns to randomize box /***************************************/ for (int i = 0; i < shuffled.size(); i++) { + //record the swaps you are making so you can undo them in restore function restoreIndex[i].a = shuffled[i]; restoreIndex[i].b = rows2Swap[i]; @@ -467,6 +446,7 @@ void FullMatrix::shuffle(int box){ name = index[shuffled[i]].seqName; index[shuffled[i]].seqName = index[rows2Swap[i]].seqName; index[rows2Swap[i]].seqName = name; + } } catch(exception& e) { @@ -488,6 +468,7 @@ void FullMatrix::restore(){ //reverse iterate through swaps and undo them to restore original matrix and index map. for(it2 = restoreIndex.rbegin(); it2 != restoreIndex.rend(); it2++) { /* swap rows */ + for (int h = 0; h < numSeqs; h++) { y = matrix[it2->second.a][h]; matrix[it2->second.a][h] = matrix[it2->second.b][h]; @@ -506,6 +487,7 @@ void FullMatrix::restore(){ name = index[it2->second.a].seqName; index[it2->second.a].seqName = index[it2->second.b].seqName; index[it2->second.b].seqName = name; + } //clear restore for next shuffle diff --git a/fullmatrix.h b/fullmatrix.h index b744490..078cdb7 100644 --- a/fullmatrix.h +++ b/fullmatrix.h @@ -39,16 +39,15 @@ class FullMatrix { void setBounds(); //requires globaldata->Groups to be filled vector getMins(int); //returns vector of mins for "box" requested ie. groups A, B, 0 = AA, 1 = AB, 2 = BA, 3 = BB; void getDist(vector&); //fills a vector with the valid distances for the integral form. - void shuffle(int); //shuffles the sequences in the box passed in. + void shuffle(string, string); //shuffles the sequences in the groups passed in. void restore(); //unshuffles the matrix. - - - void printMinsForRows(ostream&); + private: void sortGroups(int, int); //this function sorts the sequences within the matrix. void getBounds(int&, string); void readSquareMatrix(ifstream&); void readLTMatrix(ifstream&); + void printMinsForRows(ostream&); map index; // row in vector, sequence group. need to know this so when we sort it can be updated. map restoreIndex; //a map of the swaps made so you can undo them in restore. diff --git a/helpcommand.cpp b/helpcommand.cpp index 05592fc..e04f636 100644 --- a/helpcommand.cpp +++ b/helpcommand.cpp @@ -25,12 +25,17 @@ HelpCommand::~HelpCommand(){} int HelpCommand::execute(){ if (globaldata->helpRequest == "read.dist") { - cout << "The read.dist command parameter options are phylip or column, name, cutoff and precision" << "\n"; - cout << "The read.dist command should be in the following format: " << "\n"; + cout << "The read.dist command parameter options are phylip or column, group, name, cutoff and precision" << "\n"; + cout << "The read.dist command must be run before using the cluster or libshuff commands" << "\n"; + cout << "The read.dist command can be used in two ways. The first is to read a phylip or column and run the cluster command" << "\n"; + cout << "For this use the read.dist command should be in the following format: " << "\n"; cout << "read.dist(phylip=yourDistFile, name=yourNameFile, cutoff=yourCutoff, precision=yourPrecision) " << "\n"; cout << "The phylip or column parameter is required, but only one may be used. If you use a column file the name filename is required. " << "\n"; cout << "If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed." << "\n"; - cout << "Note: No spaces between parameter labels (i.e. dist), '=' and parameters (i.e.yourDistfile)." << "\n" << "\n"; + cout << "The second way to use the read.dist command is to read a phylip or column and a group, so you can use the libshuff command." << "\n"; + cout << "For this use the read.dist command should be in the following format: " << "\n"; + cout << "read.dist(phylip=yourPhylipfile, group=yourGroupFile). The cutoff and precision parameters are not valid with this use. " << "\n"; + cout << "Note: No spaces between parameter labels (i.e. phylip), '=' and parameters (i.e.yourPhylipfile)." << "\n" << "\n"; }else if (globaldata->helpRequest == "read.otu") { cout << "The read.otu command must be run before you execute a collect.single, rarefaction.single, summary.single, " << "\n"; cout << "collect.shared, rarefaction.shared or summary.shared command. Mothur will generate a .list, .rabund and .sabund upon completion of the cluster command " << "\n"; @@ -170,6 +175,18 @@ int HelpCommand::execute(){ cout << "The default value for groups is all the groups in your groupfile, and iters is 1000." << "\n"; cout << "The unifrac.unweighted command output two files: .unweighted and .uwsummary their descriptions are in the manual." << "\n"; cout << "Note: No spaces between parameter labels (i.e. list), '=' and parameters (i.e.yourListfile)." << "\n" << "\n"; + }else if (globaldata->helpRequest == "libshuff") { + cout << "The libshuff command can only be executed after a successful read.dist command." << "\n"; + cout << "The libshuff command parameters are groups, iters, step, form and cutoff. No parameters are required." << "\n"; + cout << "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 2 valid groups." << "\n"; + cout << "The group names are separated by dashes. The iters parameter allows you to specify how many random matrices you would like compared to your matrix." << "\n"; + cout << "The step parameter allows you to specify change in distance you would like between each output if you are using the discrete form." << "\n"; + cout << "The form parameter allows you to specify if you would like to analyze your matrix using the discrete or integral form. Your options are integral or discrete." << "\n"; + cout << "The libshuff command should be in the following format: libshuff(groups=yourGroups, iters=yourIters, cutOff=yourCutOff, form=yourForm, step=yourStep)." << "\n"; + cout << "Example libshuff(groups=A-B-C, iters=500, form=discrete, step=0.01, cutOff=2.0)." << "\n"; + cout << "The default value for groups is all the groups in your groupfile, iters is 10000, cutoff is 1.0, form is integral and step is 0.01." << "\n"; + cout << "The libshuff command output two files: .coverage and .slsummary their descriptions are in the manual." << "\n"; + cout << "Note: No spaces between parameter labels (i.e. iters), '=' and parameters (i.e.yourIters)." << "\n" << "\n"; }else if (globaldata->helpRequest == "quit") { cout << "The quit command will terminate Dotur and should be in the following format: " << "\n"; cout << "quit()" << "\n" << "\n"; diff --git a/libshuffcommand.cpp b/libshuffcommand.cpp index bd01662..2ed7ee5 100644 --- a/libshuffcommand.cpp +++ b/libshuffcommand.cpp @@ -71,30 +71,34 @@ LibShuffCommand::~LibShuffCommand(){ int LibShuffCommand::execute(){ try { //deltaValues[0] = scores for the difference between AA and AB. - //cValues[0][0] = AA, cValues[0][1] = AB, cValues[0][2] = AC, cValues[1][0] = BA, cValues[1][1] = BB... - vector dist; - int next; + //cValues[0][0][0] = AA at distance 0.0, cValues[0][0][1] = AB at distance 0.0, cValues[0][0][2] = AC at distance 0.0, cValues[0][1][0] = BA at distance 0.0, cValues[0][1][1] = BB... sumDelta.resize(numComp-numGroups, 0.0); - float D = 0.0; - - /*****************************/ - //get values for users matrix - /*****************************/ matrix->setBounds(); - if (form != "discrete") { matrix->getDist(dist); next = 1; } -//cout << "Distances" << endl; -//for (int i = 0; i < dist.size(); i++) { cout << dist[i] << " "; } -//cout << endl; + //load distances + if (form != "discrete") { matrix->getDist(dist); } + else { + float f = 0.0; + while (f <= cutOff) { + dist.push_back(f); + f += step; + } + } + /*****************************/ //get values for users matrix - while (D <= cutOff) { - //clear out old Values - deltaValues.clear(); - coverage->getValues(matrix, D, cValues); + /*****************************/ + //clear out old Values + deltaValues.clear(); + deltaValues.resize(dist.size()); + + coverage->getValues(matrix, cValues, dist, "user"); + + //loop through each distance and load rsumdelta + for (int p = 0; p < cValues.size(); p++) { //find delta values int count = 0; for (int i = 0; i < numGroups; i++) { @@ -102,40 +106,16 @@ int LibShuffCommand::execute(){ //don't save AA to AA if (i != j) { //(Caa - Cab)^2 - deltaValues.push_back( (abs(cValues[i][i]-cValues[i][j]) * abs(cValues[i][i]-cValues[i][j])) ); - sumDelta[count] += deltaValues[count]; + deltaValues[p].push_back( (abs(cValues[p][i][i]-cValues[p][i][j]) * abs(cValues[p][i][i]-cValues[p][i][j])) ); + sumDelta[count] += deltaValues[p][count]; count++; } } } + } - printCoverageFile(D); - - //check form - if (form != "discrete") { - if (next == dist.size()) { break; } - else { D = dist[next]; next++; } - }else { D += step; } + printCoverageFile(); - - } - - //output sum Deltas - for (int i = 0; i < numGroups; i++) { - for (int j = 0; j < numGroups; j++) { - //don't output AA to AA - if (i != j) { - cout << "Delta " + globaldata->Groups[i] + "-" + globaldata->Groups[j] << '\t'; - } - } - } - cout << endl; - - for (int i = 0; i < sumDelta.size(); i++) { - cout << setprecision(6) << sumDelta[i] << '\t'; - } - cout << endl; - /*******************************************************************************/ //create and score random matrixes finding the sumDelta values for summary file /******************************************************************************/ @@ -152,13 +132,11 @@ int LibShuffCommand::execute(){ for (int m = 0; m < iters; m++) { //generate random matrix in getValues //values for random matrix - cout << "Iteration " << m+1 << endl; - D = 0.0; - next = 1; - - while (D <= cutOff) { - coverage->getValues(matrix, D, cValues, "random"); + + coverage->getValues(matrix, cValues, dist, "random"); + //loop through each distance and load rsumdelta + for (int p = 0; p < cValues.size(); p++) { //find delta values int count = 0; for (int i = 0; i < numGroups; i++) { @@ -166,31 +144,26 @@ int LibShuffCommand::execute(){ //don't save AA to AA if (i != j) { //(Caa - Cab)^2 - rsumDelta[count][m] += ((abs(cValues[i][i]-cValues[i][j]) * abs(cValues[i][i]-cValues[i][j]))); -//cout << "iter " << m << " box " << i << j << " delta = " << ((abs(cValues[i][i]-cValues[i][j]) * abs(cValues[i][i]-cValues[i][j]))) << endl; + rsumDelta[count][m] += ((abs(cValues[p][i][i]-cValues[p][i][j]) * abs(cValues[p][i][i]-cValues[p][i][j]))); count++; } } } - - //check form - if (form != "discrete") { - if (next == dist.size()) { break; } - else { D = dist[next]; next++; } - }else { D += step; } - - - //clear out old Values - cValues.clear(); + } -cout << "random sum delta for iter " << m << endl; -for (int i = 0; i < rsumDelta.size(); i++) { - cout << setprecision(6) << rsumDelta[i][m] << '\t'; -} -cout << endl; +//cout << "iter " << m << endl; + //clear out old Values + cValues.clear(); + +//cout << "random sum delta for iter " << m << endl; +//for (int i = 0; i < rsumDelta.size(); i++) { +// cout << setprecision(6) << rsumDelta[i][m] << '\t'; +//} +//cout << endl; } + /**********************************************************/ //find the signifigance of the user matrix' sumdelta values /**********************************************************/ @@ -227,27 +200,28 @@ cout << endl; } } //********************************************************************************************************************** -void LibShuffCommand::printCoverageFile(float d) { +void LibShuffCommand::printCoverageFile() { try { //format output out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint); - out << setprecision(6) << d << '\t'; - - //print out coverage values - for (int i = 0; i < numGroups; i++) { - for (int j = 0; j < numGroups; j++) { - out << cValues[i][j] << '\t'; + //loop through each distance + for (int p = 0; p < cValues.size(); p++) { + out << setprecision(6) << dist[p] << '\t'; + //print out coverage values + for (int i = 0; i < numGroups; i++) { + for (int j = 0; j < numGroups; j++) { + out << cValues[p][i][j] << '\t'; + } } + + for (int h = 0; h < deltaValues[p].size(); h++) { + out << deltaValues[p][h] << '\t'; + } + + out << endl; } - //print out delta values - for (int i = 0; i < deltaValues.size(); i++) { - out << deltaValues[i] << '\t'; - } - - out << endl; - } catch(exception& e) { cout << "Standard Error: " << e.what() << " has occurred in the LibShuffCommand class Function printCoverageFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; @@ -278,8 +252,13 @@ void LibShuffCommand::printSummaryFile() { //print out delta values for (int i = 0; i < sumDelta.size(); i++) { - outSum << setprecision(6) << sumDelta[i] << '\t' << setprecision(globaldata->getIters().length()) << sumDeltaSig[i] << '\t'; - cout << setprecision(6) << sumDelta[i] << '\t' << setprecision(globaldata->getIters().length()) << sumDeltaSig[i] << '\t'; + if (sumDeltaSig[i] > (1/(float)iters)) { + outSum << setprecision(6) << sumDelta[i] << '\t' << setprecision(globaldata->getIters().length()) << sumDeltaSig[i] << '\t'; + cout << setprecision(6) << sumDelta[i] << '\t' << setprecision(globaldata->getIters().length()) << sumDeltaSig[i] << '\t'; + }else { + outSum << setprecision(6) << sumDelta[i] << '\t' << setprecision(globaldata->getIters().length()) << "<" << (1/float(iters)) << '\t'; + cout << setprecision(6) << sumDelta[i] << '\t' << setprecision(globaldata->getIters().length()) << "<" << (1/float(iters)) << '\t'; + } } outSum << endl; diff --git a/libshuffcommand.h b/libshuffcommand.h index fdaa7ee..950bb00 100644 --- a/libshuffcommand.h +++ b/libshuffcommand.h @@ -26,17 +26,18 @@ class LibShuffCommand : public Command { int execute(); private: - vector< vector > cValues; // vector of coverage scores, one for each comparison. - vector deltaValues; // vector of delta scores, one for each comparison. + vector< vector< vector > > cValues; // vector -one for each distance level. + vector< vector > deltaValues; // vector< vector of delta scores, one for each comparison.> -one at each distance vector sumDelta; //sum of delta scores, one for each comparison. vector sumDeltaSig; //number of random matrixes with that delta value or ?? vector< vector > rsumDelta; //vector< vector > vector groupComb; + vector dist; void setGroups(); int findIndex(float, int); - void printCoverageFile(float); + void printCoverageFile(); void printSummaryFile(); GlobalData* globaldata; -- 2.39.2