From: westcott Date: Mon, 2 Aug 2010 18:05:57 +0000 (+0000) Subject: fixed bug with phylo.diversity. not calculating as intended. X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=515c3398ea27e2105f616fc5662b2a7ceb486aa0 fixed bug with phylo.diversity. not calculating as intended. --- diff --git a/makefile b/makefile index a8a0170..46ccb9d 100644 --- a/makefile +++ b/makefile @@ -13,7 +13,7 @@ CXXFLAGS += -O3 -MOTHUR_FILES = "\"Enter_your_default_path_here\"" +MOTHUR_FILES = "\"../Release\"" ifeq ($(strip $(MOTHUR_FILES)),"\"Enter_your_default_path_here\"") else CXXFLAGS += -DMOTHUR_FILES=${MOTHUR_FILES} diff --git a/nast.cpp b/nast.cpp index 90c8383..9cdf1b2 100644 --- a/nast.cpp +++ b/nast.cpp @@ -187,7 +187,6 @@ void Nast::removeExtraGaps(string& candAln, string tempAln, string newTemplateAl } } i -= insertLength; - //if (i < 0) { cout << " we have a negative i = " << i << endl; } } else{ @@ -208,6 +207,9 @@ void Nast::removeExtraGaps(string& candAln, string tempAln, string newTemplateAl } // i -= insertLength; + + //if i is negative, we want to remove the extra gaps to the right + if (i < 0) { cout << "i is negative" << endl; } } } } diff --git a/phylodiversity.cpp b/phylodiversity.cpp index 37884c4..86dc539 100644 --- a/phylodiversity.cpp +++ b/phylodiversity.cpp @@ -9,8 +9,8 @@ #include "phylodiversity.h" -/**************************************************************************************************/ -EstOutput PhyloDiversity::getValues(Tree* t, vector treeNodes) { +/************************************************************************************************** +EstOutput PhyloDiversity::getValues(Tree* t, vector treeNodes, vector< vector >& data) { try { map DScore; @@ -20,57 +20,52 @@ EstOutput PhyloDiversity::getValues(Tree* t, vector treeNodes) { //initialize Dscore for (int i=0; iGroups.size(); i++) { DScore[globaldata->Groups[i]] = 0.0; } - /********************************************************/ + /******************************************************** //calculate a D value for each group for(int v=0;vcontrol_pressed) { return data; } - //is this node from a sequence which is in one of the users groups - if (inUsersGroups(t->tree[treeNodes[v]].getGroup(), globaldata->Groups) == true) { - - //calc the branch length - //while you aren't at root - float sum = 0.0; - int index = treeNodes[v]; + //calc the branch length + //while you aren't at root + float sum = 0.0; + int index = treeNodes[v]; - while(t->tree[index].getParent() != -1){ - - //if you have a BL - if(t->tree[index].getBranchLength() != -1){ - sum += abs(t->tree[index].getBranchLength()); - } - index = t->tree[index].getParent(); - } - - //get last breanch length added + while(t->tree[index].getParent() != -1){ + + //if you have a BL if(t->tree[index].getBranchLength() != -1){ sum += abs(t->tree[index].getBranchLength()); } - - //for each group in the groups update the total branch length accounting for the names file - vector groups = t->tree[treeNodes[v]].getGroup(); - for (int j = 0; j < groups.size(); j++) { - int numSeqsInGroupJ = 0; - map::iterator it; - it = t->tree[treeNodes[v]].pcount.find(groups[j]); - if (it != t->tree[treeNodes[v]].pcount.end()) { //this leaf node contains seqs from group j - numSeqsInGroupJ = it->second; - } - - //add branch length to total for group - DScore[groups[j]] += (numSeqsInGroupJ * sum); + index = t->tree[index].getParent(); + } + + //get last breanch length added + if(t->tree[index].getBranchLength() != -1){ + sum += abs(t->tree[index].getBranchLength()); + } + + //for each group in the groups update the total branch length accounting for the names file + vector groups = t->tree[treeNodes[v]].getGroup(); + for (int j = 0; j < groups.size(); j++) { + int numSeqsInGroupJ = 0; + map::iterator it; + it = t->tree[treeNodes[v]].pcount.find(groups[j]); + if (it != t->tree[treeNodes[v]].pcount.end()) { //this leaf node contains seqs from group j + numSeqsInGroupJ = it->second; } + + //add branch length to total for group + DScore[groups[j]] += (numSeqsInGroupJ * sum); } + } for (int i=0; iGroups.size(); i++) { - //if (groupTotals[globaldata->Groups[i]] != 0.0) { //avoid divide by zero error - //float percent = DScore[globaldata->Groups[i]] / groupTotals[globaldata->Groups[i]]; float percent = DScore[globaldata->Groups[i]]; data.push_back(percent); - //}else { data.push_back(0.0); } + } return data; @@ -81,62 +76,6 @@ EstOutput PhyloDiversity::getValues(Tree* t, vector treeNodes) { } } /**************************************************************************************************/ -void PhyloDiversity::setTotalGroupBranchLengths(Tree* t) { - try { - - groupTotals.clear(); - - //initialize group totals - for (int i=0; iGroups.size(); i++) { groupTotals[globaldata->Groups[i]] = 0.0; } - - - /********************************************************/ - //calculate a D value for each group - for(int v=0;vgetNumLeaves();v++){ - - //is this node from a sequence which is in one of the users groups - if (inUsersGroups(t->tree[v].getGroup(), globaldata->Groups) == true) { - - //calc the branch length - int index = v; - float sum = 0.0; - - while(t->tree[index].getParent() != -1){ //while you aren't at root - - //if you have a BL - if(t->tree[index].getBranchLength() != -1){ - sum += abs(t->tree[index].getBranchLength()); - } - index = t->tree[index].getParent(); - } - - //get last breanch length added - if(t->tree[index].getBranchLength() != -1){ - sum += abs(t->tree[index].getBranchLength()); - } - - //account for the names file - vector groups = t->tree[v].getGroup(); - for (int j = 0; j < groups.size(); j++) { - int numSeqsInGroupJ = 0; - map::iterator it; - it = t->tree[v].pcount.find(groups[j]); - if (it != t->tree[v].pcount.end()) { //this leaf node contains seqs from group j - numSeqsInGroupJ = it->second; - } - //add branch length to total for group - groupTotals[groups[j]] += (numSeqsInGroupJ * sum); - }//end for - }//end if - }//end for - - } - catch(exception& e) { - m->errorOut(e, "PhyloDiversity", "setTotalGroupBranchLengths"); - exit(1); - } -} -/**************************************************************************************************/ diff --git a/phylodiversity.h b/phylodiversity.h index 131a63c..5e02595 100644 --- a/phylodiversity.h +++ b/phylodiversity.h @@ -15,7 +15,6 @@ #include "globaldata.hpp" #include "mothurout.h" -typedef vector EstOutput; /***********************************************************************/ @@ -25,15 +24,13 @@ class PhyloDiversity { PhyloDiversity(TreeMap* t) : tmap(t) { globaldata = GlobalData::getInstance(); m = MothurOut::getInstance(); } ~PhyloDiversity() {}; - EstOutput getValues(Tree*, vector); - void setTotalGroupBranchLengths(Tree*); + //int getValues(Tree*, vector, vector< vector< float> >&); + private: GlobalData* globaldata; MothurOut* m; - EstOutput data; TreeMap* tmap; - map groupTotals; }; /***********************************************************************/ diff --git a/phylodiversitycommand.cpp b/phylodiversitycommand.cpp index 0c2f72d..3904ca0 100644 --- a/phylodiversitycommand.cpp +++ b/phylodiversitycommand.cpp @@ -8,7 +8,6 @@ */ #include "phylodiversitycommand.h" -#include "phylodiversity.h" //********************************************************************************************************************** PhyloDiversityCommand::PhyloDiversityCommand(string option) { @@ -57,6 +56,7 @@ PhyloDiversityCommand::PhyloDiversityCommand(string option) { splitAtDash(groups, Groups); globaldata->Groups = Groups; } + } } @@ -102,10 +102,7 @@ int PhyloDiversityCommand::execute(){ for (int i = 0; i < globaldata->Groups.size(); i++) { if (globaldata->Groups[i] == "xxx") { globaldata->Groups.erase(globaldata->Groups.begin()+i); break; } } vector outputNames; - - //diversity calculator - PhyloDiversity phylo(globaldata->gTreemap); - + vector trees = globaldata->gTree; //for each of the users trees @@ -113,8 +110,6 @@ int PhyloDiversityCommand::execute(){ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } - //phylo.setTotalGroupBranchLengths(trees[i]); - string outFile = outputDir + getRootName(getSimpleName(globaldata->getTreeFile())) + toString(i+1) + ".phylo.diversity"; if (rarefy) { outFile += ".rarefaction"; } outputNames.push_back(outFile); @@ -131,57 +126,86 @@ int PhyloDiversityCommand::execute(){ numLeafNodes = randomLeaf.size(); //reset the number of leaf nodes you are using - //convert freq percentage to number - int increment = 100; - if (freq < 1.0) { increment = numLeafNodes * freq; } - else { increment = freq; } + //each group, each sampling, if no rarefy iters = 1; + map > diversity; //each group, each sampling, if no rarefy iters = 1; - vector< vector > diversity; - diversity.resize(globaldata->Groups.size()); + map > sumDiversity; + + //find largest group total + int largestGroup = 0; + for (int j = 0; j < globaldata->Groups.size(); j++) { + if (globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]] > largestGroup) { largestGroup = globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]; } + + //initialize diversity + diversity[globaldata->Groups[j]].resize(globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]+1, 0.0); //numSampled + //groupA 0.0 0.0 + + //initialize sumDiversity + sumDiversity[globaldata->Groups[j]].resize(globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]+1, 0.0); + } + + //convert freq percentage to number + int increment = 100; + if (freq < 1.0) { increment = largestGroup * freq; + }else { increment = freq; } //initialize sampling spots - vector numSampledList; - for(int k = 0; k < numLeafNodes; k++){ if((k == 0) || (k+1) % increment == 0){ numSampledList.push_back(k); } } - if(numLeafNodes % increment != 0){ numSampledList.push_back(numLeafNodes); } - - //initialize diversity - for (int j = 0; j < diversity.size(); j++) { diversity[j].resize(numSampledList.size(), 0.0); } // 10sampled 20 sampled ... - //groupA 0.0 0.0 - //then for each iter you add to score and then when printing divide by iters to get average + set numSampledList; + for(int k = 1; k <= largestGroup; k++){ if((k == 1) || (k % increment == 0)){ numSampledList.insert(k); } } + if(largestGroup % increment != 0){ numSampledList.insert(largestGroup); } + + //add other groups ending points + for (int j = 0; j < globaldata->Groups.size(); j++) { + if (numSampledList.count(diversity[globaldata->Groups[j]].size()-1) == 0) { numSampledList.insert(diversity[globaldata->Groups[j]].size()-1); } + } + for (int l = 0; l < iters; l++) { random_shuffle(randomLeaf.begin(), randomLeaf.end()); - vector leavesSampled; - EstOutput data; - int count = 0; + //initialize counts + map counts; + for (int j = 0; j < globaldata->Groups.size(); j++) { counts[globaldata->Groups[j]] = 0; } + for(int k = 0; k < numLeafNodes; k++){ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } - leavesSampled.push_back(randomLeaf[k]); - - if((k == 0) || (k+1) % increment == 0){ //ready to calc? - - data = phylo.getValues(trees[i], leavesSampled); - - //datas results are in the same order as globaldatas groups - for (int h = 0; h < data.size(); h++) { diversity[h][count] += data[h]; } - - count++; + //calc branch length of randomLeaf k + float br = calcBranchLength(trees[i], randomLeaf[k]); + + //for each group in the groups update the total branch length accounting for the names file + vector groups = trees[i]->tree[randomLeaf[k]].getGroup(); + for (int j = 0; j < groups.size(); j++) { + int numSeqsInGroupJ = 0; + map::iterator it; + it = trees[i]->tree[randomLeaf[k]].pcount.find(groups[j]); + if (it != trees[i]->tree[randomLeaf[k]].pcount.end()) { //this leaf node contains seqs from group j + numSeqsInGroupJ = it->second; + } + + for (int s = (counts[groups[j]]+1); s <= (counts[groups[j]]+numSeqsInGroupJ); s++) { + diversity[groups[j]][s] = diversity[groups[j]][s-1] + (numSeqsInGroupJ * br); + } + counts[groups[j]] += numSeqsInGroupJ; } } - - if(numLeafNodes % increment != 0){ - - data = phylo.getValues(trees[i], leavesSampled); - - //datas results are in the same order as globaldatas groups - for (int h = 0; h < data.size(); h++) { diversity[h][count] += data[h]; } + + if (rarefy) { + //add this diversity to the sum + for (int j = 0; j < globaldata->Groups.size(); j++) { + for (int g = 0; g < diversity[globaldata->Groups[j]].size(); g++) { + sumDiversity[globaldata->Groups[j]][g] += diversity[globaldata->Groups[j]][g]; + } + } } } - printData(numSampledList, diversity, outFile); + if (rarefy) { + printData(numSampledList, sumDiversity, outFile); + }else{ + printData(numSampledList, diversity, outFile); + } } @@ -203,7 +227,7 @@ int PhyloDiversityCommand::execute(){ } //********************************************************************************************************************** -void PhyloDiversityCommand::printData(vector& num, vector< vector >& div, string file){ +void PhyloDiversityCommand::printData(set& num, map< string, vector >& div, string file){ try { ofstream out; openOutputFile(file, out); @@ -214,13 +238,16 @@ void PhyloDiversityCommand::printData(vector& num, vector< vector >& out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint); - for (int i = 0; i < num.size(); i++) { - if (i == (num.size()-1)) { out << num[i] << '\t'; } - else { out << (num[i]+1) << '\t'; } + for (set::iterator it = num.begin(); it != num.end(); it++) { + int numSampled = *it; - for (int j = 0; j < div.size(); j++) { - float score = div[j][i] / (float)iters; - out << setprecision(6) << score << '\t'; + out << numSampled << '\t'; + + for (int j = 0; j < globaldata->Groups.size(); j++) { + if (numSampled < div[globaldata->Groups[j]].size()) { + float score = div[globaldata->Groups[j]][numSampled] / (float)iters; + out << setprecision(6) << score << '\t'; + }else { out << "NA" << '\t'; } } out << endl; } @@ -235,3 +262,33 @@ void PhyloDiversityCommand::printData(vector& num, vector< vector >& } //********************************************************************************************************************** +float PhyloDiversityCommand::calcBranchLength(Tree* t, int leaf){ + try { + + //calc the branch length + //while you aren't at root + float sum = 0.0; + int index = leaf; + + while(t->tree[index].getParent() != -1){ + + //if you have a BL + if(t->tree[index].getBranchLength() != -1){ + sum += abs(t->tree[index].getBranchLength()); + } + index = t->tree[index].getParent(); + } + + //get last breanch length added + if(t->tree[index].getBranchLength() != -1){ + sum += abs(t->tree[index].getBranchLength()); + } + + return sum; + } + catch(exception& e) { + m->errorOut(e, "PhyloDiversityCommand", "calcBranchLength"); + exit(1); + } +} +//********************************************************************************************************************** \ No newline at end of file diff --git a/phylodiversitycommand.h b/phylodiversitycommand.h index 8dd569b..0aafb8d 100644 --- a/phylodiversitycommand.h +++ b/phylodiversitycommand.h @@ -31,7 +31,8 @@ class PhyloDiversityCommand : public Command { string groups, outputDir; vector Groups, outputNames; //holds groups to be used, and outputFile names - void printData(vector&, vector< vector >&, string); + void printData(set&, map< string, vector >&, string); + float calcBranchLength(Tree*, int); }; #endif diff --git a/phylosummary.cpp b/phylosummary.cpp index 004c131..3339697 100644 --- a/phylosummary.cpp +++ b/phylosummary.cpp @@ -173,7 +173,7 @@ void PhyloSummary::print(ofstream& out){ out << "taxlevel\t rankID\t taxon\t daughterlevels\t total\t"; if (groupmap != NULL) { //so the labels match the counts below, since the map sorts them automatically... - sort(groupmap->namesOfGroups.begin(), groupmap->namesOfGroups.end()); + //sort(groupmap->namesOfGroups.begin(), groupmap->namesOfGroups.end()); for (int i = 0; i < groupmap->namesOfGroups.size(); i++) { out << groupmap->namesOfGroups[i] << '\t'; @@ -194,9 +194,10 @@ void PhyloSummary::print(ofstream& out){ map::iterator itGroup; if (groupmap != NULL) { - for (itGroup = tree[0].groupCount.begin(); itGroup != tree[0].groupCount.end(); itGroup++) { - out << itGroup->second << '\t'; - } + //for (itGroup = tree[0].groupCount.begin(); itGroup != tree[0].groupCount.end(); itGroup++) { + // out << itGroup->second << '\t'; + //} + for (int i = 0; i < groupmap->namesOfGroups.size(); i++) { out << tree[0].groupCount[groupmap->namesOfGroups[i]] << '\t'; } } out << endl; @@ -230,9 +231,10 @@ void PhyloSummary::print(int i, ofstream& out){ map::iterator itGroup; if (groupmap != NULL) { - for (itGroup = tree[it->second].groupCount.begin(); itGroup != tree[it->second].groupCount.end(); itGroup++) { - out << itGroup->second << '\t'; - } + //for (itGroup = tree[it->second].groupCount.begin(); itGroup != tree[it->second].groupCount.end(); itGroup++) { + // out << itGroup->second << '\t'; + //} + for (int i = 0; i < groupmap->namesOfGroups.size(); i++) { out << tree[it->second].groupCount[groupmap->namesOfGroups[i]] << '\t'; } } out << endl; }