From: westcott Date: Thu, 26 Feb 2009 14:44:51 +0000 (+0000) Subject: weightedcommand X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=5a3592c6478d5d786ec20e4bee71854ad92fdb8c weightedcommand --- diff --git a/mothur.cpp b/mothur.cpp index 3a91aa6..5142ca2 100644 --- a/mothur.cpp +++ b/mothur.cpp @@ -17,7 +17,7 @@ GlobalData* GlobalData::_uniqueInstance = 0; int main(int argc, char *argv[]){ try { - // srand(54321); + //srand(54321); srand( (unsigned)time( NULL ) ); Engine* dotur; diff --git a/parsimony.h b/parsimony.h index ebe3d8d..11d4cc8 100644 --- a/parsimony.h +++ b/parsimony.h @@ -23,6 +23,7 @@ class Parsimony : public TreeCalculator { Parsimony(TreeMap* t) : tmap(t) {}; ~Parsimony() {}; EstOutput getValues(Tree*); + EstOutput getValues(Tree*, string, string) { return data; }; private: GlobalData* globaldata; diff --git a/tree.cpp b/tree.cpp index eea3af4..2689b03 100644 --- a/tree.cpp +++ b/tree.cpp @@ -309,43 +309,32 @@ void Tree::randomLabels() { //set up the groups the user wants to include setGroups(); - for(int i=numLeaves-1;i>=0;i--){ - if(tree[i].pGroups.size() == 0){ - continue; - } - - int escape = 1; + for(int i = 0; i < numLeaves; i++){ int z; - - while(escape == 1){ - //get random index to switch with - z = int((float)(i+1) * (float)(rand()) / ((float)RAND_MAX+1.0)); - - if(tree[z].pGroups.size() != 0){ - escape = 0; - } - } + //get random index to switch with + z = int((float)(i+1) * (float)(rand()) / ((float)RAND_MAX+1.0)); //you only want to randomize the nodes that are from a group the user wants analyzed, so //if either of the leaf nodes you are about to switch are not in the users groups then you don't want to switch them. bool treez, treei; - - //leaves have only one group so you can just set it to begin() - it = tree[z].pGroups.begin(); - treez = inUsersGroups(it->first, globaldata->Groups); - - it = tree[i].pGroups.begin(); - treei = inUsersGroups(it->first, globaldata->Groups); + + treez = inUsersGroups(tree[z].getGroup(), globaldata->Groups); + treei = inUsersGroups(tree[i].getGroup(), globaldata->Groups); if ((treez == true) && (treei == true)) { //switches node i and node z's info. map lib_hold = tree[z].pGroups; tree[z].pGroups = (tree[i].pGroups); tree[i].pGroups = (lib_hold); - - tree[z].setGroup(tree[z].pGroups.begin()->first); - tree[i].setGroup(tree[i].pGroups.begin()->first); - + + string zgroup = tree[z].getGroup(); + tree[z].setGroup(tree[i].getGroup()); + tree[i].setGroup(zgroup); + + string zname = tree[z].getName(); + tree[z].setName(tree[i].getName()); + tree[i].setName(zname); + map gcount_hold = tree[z].pcount; tree[z].pcount = (tree[i].pcount); tree[i].pcount = (gcount_hold); @@ -363,6 +352,45 @@ void Tree::randomLabels() { } /**************************************************************************************************/ +void Tree::randomLabels(string groupA, string groupB) { + try { + for(int i = 0; i < numLeaves; i++) { + int z; + //get random index to switch with + z = int((float)(i+1) * (float)(rand()) / ((float)RAND_MAX+1.0)); + + //you only want to randomize the nodes that are from a group the user wants analyzed, so + //if either of the leaf nodes you are about to switch are not in the users groups then you don't want to switch them. + if (((tree[z].getGroup() == groupA) || (tree[z].getGroup() == groupB)) && ((tree[i].getGroup() == groupA) || (tree[i].getGroup() == groupB))) { + //switches node i and node z's info. + map lib_hold = tree[z].pGroups; + tree[z].pGroups = (tree[i].pGroups); + tree[i].pGroups = (lib_hold); + + string zgroup = tree[z].getGroup(); + tree[z].setGroup(tree[i].getGroup()); + tree[i].setGroup(zgroup); + + string zname = tree[z].getName(); + tree[z].setName(tree[i].getName()); + tree[i].setName(zname); + + map gcount_hold = tree[z].pcount; + tree[z].pcount = (tree[i].pcount); + tree[i].pcount = (gcount_hold); + } + } + } + catch(exception& e) { + cout << "Standard Error: " << e.what() << " has occurred in the Tree class Function randomLabels. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } + catch(...) { + cout << "An unknown error has occurred in the Tree class function randomLabels. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } +} +/**************************************************************************************************/ void Tree::randomBlengths() { try { for(int i=numNodes-1;i>=0;i--){ @@ -387,6 +415,11 @@ void Tree::assembleRandomUnifracTree() { randomLabels(); assembleTree(); } +/*************************************************************************************************/ +void Tree::assembleRandomUnifracTree(string groupA, string groupB) { + randomLabels(groupA, groupB); + assembleTree(); +} /*************************************************************************************************/ //for now it's just random topology but may become random labels as well later that why this is such a simple function now... @@ -439,16 +472,18 @@ void Tree::randomTopology() { /*****************************************************************/ // This prints out the tree in Newick form. -void Tree::createNewickFile() { +void Tree::createNewickFile(string f) { try { int root = findRoot(); - filename = getRootName(globaldata->getTreeFile()) + "newick"; + //filename = getRootName(globaldata->getTreeFile()) + "newick"; + filename = f; openOutputFile(filename, out); printBranch(root); // you are at the end of the tree out << ";" << endl; + out.close(); } catch(exception& e) { cout << "Standard Error: " << e.what() << " has occurred in the Tree class Function createNewickFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; @@ -493,7 +528,7 @@ void Tree::printBranch(int node) { printBranch(tree[node].getRChild()); out << ")"; }else { //you are a leaf - tree[node].printNode(); //prints out name and branch length + out << tree[node].getName() << ":" << tree[node].getBranchLength(); } } diff --git a/tree.h b/tree.h index 6404db7..b0ae873 100644 --- a/tree.h +++ b/tree.h @@ -26,7 +26,8 @@ public: void getCopy(Tree*); //makes tree a copy of the one passed in. void assembleRandomTree(); void assembleRandomUnifracTree(); - void createNewickFile(); + void assembleRandomUnifracTree(string, string); + void createNewickFile(string); int getIndex(string); void setIndex(string, int); int getNumNodes() { return numNodes; } @@ -51,6 +52,7 @@ private: void randomTopology(); void randomBlengths(); void randomLabels(); + void randomLabels(string, string); int findRoot(); //return index of root node void printBranch(int); //recursively print out tree void setGroups(); diff --git a/treecalculator.h b/treecalculator.h index 133330c..9ee4774 100644 --- a/treecalculator.h +++ b/treecalculator.h @@ -29,6 +29,7 @@ public: TreeCalculator(string n) : name(n) {}; ~TreeCalculator(){}; virtual EstOutput getValues(Tree*) = 0; + virtual EstOutput getValues(Tree*, string, string) = 0; virtual string getName() { return name; } diff --git a/treenode.cpp b/treenode.cpp index 7fff2c1..cf7192e 100644 --- a/treenode.cpp +++ b/treenode.cpp @@ -64,7 +64,8 @@ void Node::printNode() { for(it=pcount.begin();it!=pcount.end();it++){ cout << ' ' << it->first << ':' << it->second; } - cout << endl; + cout << endl; + } catch(exception& e) { diff --git a/unifracunweightedcommand.cpp b/unifracunweightedcommand.cpp index e2b085f..9a1ad0c 100644 --- a/unifracunweightedcommand.cpp +++ b/unifracunweightedcommand.cpp @@ -109,7 +109,6 @@ int UnifracUnweightedCommand::execute() { //get percentage of random trees with that info rscoreFreq[it->first] /= iters; rcumul-= it->second; - } //save the signifigance of the users score for printing later diff --git a/unifracweightedcommand.cpp b/unifracweightedcommand.cpp index 4ec2773..8fc5b3b 100644 --- a/unifracweightedcommand.cpp +++ b/unifracweightedcommand.cpp @@ -18,6 +18,9 @@ UnifracWeightedCommand::UnifracWeightedCommand() { tmap = globaldata->gTreemap; weightedFile = globaldata->getTreeFile() + ".weighted"; openOutputFile(weightedFile, out); + //column headers + out << "Group" << '\t' << "Score" << '\t' << "UserFreq" << '\t' << "UserCumul" << '\t' << "RandFreq" << '\t' << "RandCumul" << endl; + sumFile = globaldata->getTreeFile() + ".wsummary"; openOutputFile(sumFile, outSum); @@ -42,111 +45,87 @@ int UnifracWeightedCommand::execute() { //get weighted for users tree userData.resize(numComp,0); //data[0] = weightedscore AB, data[1] = weightedscore AC... randomData.resize(numComp,0); //data[0] = weightedscore AB, data[1] = weightedscore AC... - uscoreFreq.resize(numComp); - validScores.resize(numComp); - totalrscoreFreq.resize(numComp); - uCumul.resize(numComp); - + //create new tree with same num nodes and leaves as users randT = new Tree(); //get pscores for users trees for (int i = 0; i < T.size(); i++) { - rscoreFreq.resize(numComp); //data[0] = weightedscore AB, data[1] = weightedscore AC... - rCumul.resize(numComp); //data[0] = weightedscore AB, data[1] = weightedscore AC... - + rScores.resize(numComp); //data[0] = weightedscore AB, data[1] = weightedscore AC... + uScores.resize(numComp); //data[0] = weightedscore AB, data[1] = weightedscore AC... + validScores.resize(numComp); + cout << "Processing tree " << i+1 << endl; userData = weighted->getValues(T[i]); //userData[0] = weightedscore //save users score for (int s=0; sgetCopy(T[i]); - //get scores for random trees for (int j = 0; j < iters; j++) { - //create a random tree with same topology as T[i], but different labels - randT->assembleRandomUnifracTree(); - //get pscore of random tree - randomData = weighted->getValues(randT); - - //save ramdoms score - for (int p=0; pgetCopy(T[i]); + + if (globaldata->Groups.size() != 0) { + //create a random tree with same topology as T[i], but different labels + randT->assembleRandomUnifracTree(globaldata->Groups[r], globaldata->Groups[l]); + //get wscore of random tree + randomData = weighted->getValues(randT, globaldata->Groups[r], globaldata->Groups[l]); + }else { + //create a random tree with same topology as T[i], but different labels + randT->assembleRandomUnifracTree(tmap->namesOfGroups[r], tmap->namesOfGroups[l]); + //get wscore of random tree + randomData = weighted->getValues(randT, tmap->namesOfGroups[r], tmap->namesOfGroups[l]); + } + + //save scores + rScores[count].push_back(randomData[0]); + validScores[count][randomData[0]] = randomData[0]; + count++; } - - //add random score to valid scores - validScores[p][randomData[p]] = randomData[p]; + n++; } } - - saveRandomScores(); //save all random scores for weighted file + + removeValidScoresDuplicates(); //find the signifigance of the score for summary file - for (int t = 0; t < numComp; t++) { - float rcumul = 1.0000; - for (it = validScores[t].begin(); it != validScores[t].end(); it++) { - //make rscoreFreq map and rCumul - it2 = rscoreFreq[t].find(it->first); - rCumul[t][it->first] = rcumul; - //get percentage of random trees with that info - if (it2 != rscoreFreq[t].end()) { rscoreFreq[t][it->first] /= iters; rcumul-= it2->second; } - else { rscoreFreq[t][it->first] = 0.0000; } //no random trees with that score - } - } - - //save the signifigance of the users score for printing later for (int f = 0; f < numComp; f++) { - WScoreSig.push_back(rCumul[f][userData[f]]); - } + //sort random scores + sort(rScores[f].begin(), rScores[f].end()); + + //the index of the score higher than yours is returned + //so if you have 1000 random trees the index returned is 100 + //then there are 900 trees with a score greater then you. + //giving you a signifigance of 0.900 + int index = findIndex(userData[f]); if (index == -1) { cout << "error in UnifracWeightedCommand" << endl; exit(1); } //error code + //the signifigance is the number of trees with the users score or higher + WScoreSig.push_back((iters-index)/(float)iters); + } - //clear random data - rscoreFreq.clear(); - rCumul.clear(); - } - - rCumul.resize(numComp); - for (int b = 0; b < numComp; b++) { - float ucumul = 1.0000; - float rcumul = 1.0000; - //this loop fills the cumulative maps and put 0.0000 in the score freq map to make it easier to print. - for (it = validScores[b].begin(); it != validScores[b].end(); it++) { - it2 = uscoreFreq[b].find(it->first); - //make uCumul map - uCumul[b][it->first] = ucumul; - //user data has that score - if (it2 != uscoreFreq[b].end()) { uscoreFreq[b][it->first] /= T.size(); ucumul-= it2->second; } - else { uscoreFreq[b][it->first] = 0.0000; } //no user trees with that score + out << "Tree# " << i << endl; + //printWeightedFile(); - //make rscoreFreq map and rCumul - it2 = totalrscoreFreq[b].find(it->first); - rCumul[b][it->first] = rcumul; - //get percentage of random trees with that info - if (it2 != totalrscoreFreq[b].end()) { totalrscoreFreq[b][it->first] /= (iters * T.size()); rcumul-= it2->second; } - else { totalrscoreFreq[b][it->first] = 0.0000; } //no random trees with that score - } + //clear data + rScores.clear(); + uScores.clear(); + validScores.clear(); } - printWeightedFile(); printWSummaryFile(); //clear out users groups @@ -166,21 +145,18 @@ int UnifracWeightedCommand::execute() { exit(1); } } -/***********************************************************/ +/*********************************************************** void UnifracWeightedCommand::printWeightedFile() { try { - //column headers - - out << "Group" << '\t' << "Score" << '\t' << "UserFreq" << '\t' << "UserCumul" << '\t' << "RandFreq" << '\t' << "RandCumul" << endl; - + //format output out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint); //for each group for (int e = 0; e < numComp; e++) { //print each line in that group - for (it = validScores[e].begin(); it != validScores[e].end(); it++) { - out << setprecision(6) << groupComb[e] << '\t' << it->first << '\t' << '\t' << uscoreFreq[e][it->first] << '\t' << uCumul[e][it->first] << '\t' << totalrscoreFreq[e][it->first] << '\t' << rCumul[e][it->first] << endl; + for (i = 0; i < validScores[e].size(); i++) { + out << setprecision(6) << groupComb[e] << '\t' << validScores[e][i] << '\t' << '\t' << uscoreFreq[e][it->first] << '\t' << uCumul[e][it->first] << '\t' << rscoreFreq[e][it->first] << '\t' << rCumul[e][it->first] << endl; } } @@ -230,28 +206,44 @@ void UnifracWeightedCommand::printWSummaryFile() { } /***********************************************************/ -void UnifracWeightedCommand::saveRandomScores() { +void UnifracWeightedCommand::removeValidScoresDuplicates() { try { for (int e = 0; e < numComp; e++) { - //update total map with new random scores - for (it = rscoreFreq[e].begin(); it != rscoreFreq[e].end(); it++) { - //does this score already exist in the total map - it2 = totalrscoreFreq[e].find(it->first); - //if yes then add them - if (it2 != totalrscoreFreq[e].end()) { - totalrscoreFreq[e][it->first] += rscoreFreq[e][it->first]; - }else{ //its a new score - totalrscoreFreq[e][it->first] = rscoreFreq[e][it->first]; - } + //sort valid scores + sort(validScores[e].begin(), validScores[e].end()); + + for (int i = 0; i< validScores[e].size()-1; i++) { + if (validScores[e][i] == validScores[e][i+1]) { validScores[e].erase(validScores[e].begin()+i); } + } + } + } + catch(exception& e) { + cout << "Standard Error: " << e.what() << " has occurred in the UnifracWeightedCommand class Function removeValidScoresDuplicates. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } + catch(...) { + cout << "An unknown error has occurred in the UnifracWeightedCommand class function removeValidScoresDuplicates. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } +} + +/***********************************************************/ +int UnifracWeightedCommand::findIndex(float score) { + try{ + for (int e = 0; e < numComp; e++) { + for (int i = 0; i < rScores[e].size(); i++) { +//cout << rScores[e][i] << " number " << i << endl; + if (rScores[e][i] >= score) { return i; } } } + return -1; } catch(exception& e) { - cout << "Standard Error: " << e.what() << " has occurred in the UnifracWeightedCommand class Function saveRandomScores. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + cout << "Standard Error: " << e.what() << " has occurred in the UnifracWeightedCommand class Function findIndex. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; exit(1); } catch(...) { - cout << "An unknown error has occurred in the UnifracWeightedCommand class function saveRandomScores. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + cout << "An unknown error has occurred in the UnifracWeightedCommand class function findIndex. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; exit(1); } } diff --git a/unifracweightedcommand.h b/unifracweightedcommand.h index 9deef73..d309566 100644 --- a/unifracweightedcommand.h +++ b/unifracweightedcommand.h @@ -38,20 +38,16 @@ class UnifracWeightedCommand : public Command { int iters, numGroups, numComp; EstOutput userData; //weighted score info for user tree EstOutput randomData; //weighted score info for random trees - vector< map > validScores; //vector each group comb has an entry - vector< map > rscoreFreq; //vector each group comb has an entry - vector< map > uscoreFreq; //vector each group comb has an entry - vector< map > totalrscoreFreq; //vector each group comb has an entry - vector< map > rCumul; //vector each group comb has an entry - vector< map > uCumul; //vector each group comb has an entry - map::iterator it; - map::iterator it2; - + vector< vector > validScores; //vector each group comb has an entry + vector< vector > rScores; //vector each group comb has an entry + vector< vector > uScores; //vector each group comb has an entry + ofstream outSum, out; void printWSummaryFile(); - void printWeightedFile(); - void saveRandomScores(); + // void printWeightedFile(); + void removeValidScoresDuplicates(); + int findIndex(float); void setGroups(); }; diff --git a/unweighted.h b/unweighted.h index b6f225f..b24ad22 100644 --- a/unweighted.h +++ b/unweighted.h @@ -22,6 +22,7 @@ class Unweighted : public TreeCalculator { Unweighted(TreeMap* t) : tmap(t) {}; ~Unweighted() {}; EstOutput getValues(Tree*); + EstOutput getValues(Tree*, string, string) { return data; }; private: GlobalData* globaldata; diff --git a/weighted.cpp b/weighted.cpp index c43180d..915cf36 100644 --- a/weighted.cpp +++ b/weighted.cpp @@ -173,4 +173,97 @@ EstOutput Weighted::getValues(Tree* t) { } -/**************************************************************************************************/ \ No newline at end of file +/**************************************************************************************************/ +EstOutput Weighted::getValues(Tree* t, string groupA, string groupB) { + try { + globaldata = GlobalData::getInstance(); + + data.clear(); //clear out old values + + //initialize weighted score + WScore[(groupA+groupB)] = 0.0; + float D = 0.0; + + + /********************************************************/ + //calculate a D value for the group combo + for(int v=0;vgetNumLeaves();v++){ + int index = v; + double sum = 0.0000; + + //while you aren't at root + while(t->tree[index].getParent() != -1){ + + //if you have a BL + if(t->tree[index].getBranchLength() != -1){ + sum += t->tree[index].getBranchLength(); + } + index = t->tree[index].getParent(); + } + + //get last breanch length added + if(t->tree[index].getBranchLength() != -1){ + sum += t->tree[index].getBranchLength(); + } + + if ((t->tree[v].getGroup() == groupA) || (t->tree[v].getGroup() == groupB)) { + sum /= (double)tmap->seqsPerGroup[t->tree[v].getGroup()]; + D += sum; + } + } + /********************************************************/ + + //calculate u for the group comb + for(int i=0;igetNumNodes();i++){ + double u; + //does this node have descendants from groupA + it = t->tree[i].pcount.find(groupA); + //if it does u = # of its descendants with a certain group / total number in tree with a certain group + if (it != t->tree[i].pcount.end()) { + u = (double) t->tree[i].pcount[groupA] / (double) tmap->seqsPerGroup[groupA]; + }else { u = 0.00; } + + + //does this node have descendants from group l + it = t->tree[i].pcount.find(groupB); + //if it does subtract their percentage from u + if (it != t->tree[i].pcount.end()) { + u -= (double) t->tree[i].pcount[groupB] / (double) tmap->seqsPerGroup[groupB]; + } + + u = abs(u) * t->tree[i].getBranchLength(); + + //save groupcombs u value + WScore[(groupA+groupB)] += u; + } + + /********************************************************/ + + //calculate weighted score for the group combination + double UN; + UN = (WScore[(groupA+groupB)] / D); + + if (isnan(UN) || isinf(UN)) { UN = 0; } + data.push_back(UN); + + return data; + } + catch(exception& e) { + cout << "Standard Error: " << e.what() << " has occurred in the Weighted class Function getValues. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } + catch(...) { + cout << "An unknown error has occurred in the Weighted class function getValues. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } + +} + + + + + + + + + diff --git a/weighted.h b/weighted.h index 7e4be7c..41b738b 100644 --- a/weighted.h +++ b/weighted.h @@ -22,6 +22,7 @@ class Weighted : public TreeCalculator { Weighted(TreeMap* t) : tmap(t) {}; ~Weighted() {}; EstOutput getValues(Tree*); + EstOutput getValues(Tree*, string, string); private: GlobalData* globaldata;