]> git.donarmstrong.com Git - mothur.git/commitdiff
fixed bug with phylo.diversity. not calculating as intended.
authorwestcott <westcott>
Mon, 2 Aug 2010 18:05:57 +0000 (18:05 +0000)
committerwestcott <westcott>
Mon, 2 Aug 2010 18:05:57 +0000 (18:05 +0000)
makefile
nast.cpp
phylodiversity.cpp
phylodiversity.h
phylodiversitycommand.cpp
phylodiversitycommand.h
phylosummary.cpp

index a8a0170db2642bf436dbace2f002a2f370c76d23..46ccb9d106edee81bdfdea884576c39d37d09c56 100644 (file)
--- a/makefile
+++ b/makefile
@@ -13,7 +13,7 @@
 
 CXXFLAGS += -O3
 
-MOTHUR_FILES = "\"Enter_your_default_path_here\""
+MOTHUR_FILES = "\"../Release\""
 ifeq  ($(strip $(MOTHUR_FILES)),"\"Enter_your_default_path_here\"")
 else
        CXXFLAGS += -DMOTHUR_FILES=${MOTHUR_FILES}
index 90c8383f2a8a6fb23467d6b662879dfde5fd8f49..9cdf1b26619a170b7f7d67cc9edc60d89d66559a 100644 (file)
--- a/nast.cpp
+++ b/nast.cpp
@@ -187,7 +187,6 @@ void Nast::removeExtraGaps(string& candAln, string tempAln, string newTemplateAl
                                                }
                                        }
                                        i -= insertLength;
-       //if (i < 0) { cout << " we have a negative i = " << i << endl; }
 
                                }
                                else{
@@ -208,6 +207,9 @@ void Nast::removeExtraGaps(string& candAln, string tempAln, string newTemplateAl
                                }
                        
 //                             i -= insertLength;
+                               
+                               //if i is negative, we want to remove the extra gaps to the right
+                               if (i < 0) { cout << "i is negative" << endl; }
                        } 
                }
        }
index 37884c483619887bcc478b6fdf7e77be08424cd4..86dc539b8e1a6e1385d178ab0473bcc966a71705 100644 (file)
@@ -9,8 +9,8 @@
 
 #include "phylodiversity.h"
 
-/**************************************************************************************************/
-EstOutput PhyloDiversity::getValues(Tree* t, vector<int> treeNodes) {
+/**************************************************************************************************
+EstOutput PhyloDiversity::getValues(Tree* t, vector<int> treeNodes, vector< vector<float> >& data) {
     try {
                
                map<string, float> DScore;
@@ -20,57 +20,52 @@ EstOutput PhyloDiversity::getValues(Tree* t, vector<int> treeNodes) {
                //initialize Dscore
                for (int i=0; i<globaldata->Groups.size(); i++) {               DScore[globaldata->Groups[i]] = 0.0;    }
        
-               /********************************************************/
+               /********************************************************
                //calculate a D value for each group 
                for(int v=0;v<treeNodes.size();v++){
                                
                        if (m->control_pressed) { return data; }
                        
-                       //is this node from a sequence which is in one of the users groups
-                       if (inUsersGroups(t->tree[treeNodes[v]].getGroup(), globaldata->Groups) == true) {
-                                       
-                               //calc the branch length
-                               //while you aren't at root
-                               float sum = 0.0;
-                               int index = treeNodes[v];
+                       //calc the branch length
+                       //while you aren't at root
+                       float sum = 0.0;
+                       int index = treeNodes[v];
 
-                               while(t->tree[index].getParent() != -1){
-                                       
-                                       //if you have a BL
-                                       if(t->tree[index].getBranchLength() != -1){
-                                               sum += abs(t->tree[index].getBranchLength());
-                                       }
-                                       index = t->tree[index].getParent();
-                               }
-                                       
-                               //get last breanch length added
+                       while(t->tree[index].getParent() != -1){
+                               
+                               //if you have a BL
                                if(t->tree[index].getBranchLength() != -1){
                                        sum += abs(t->tree[index].getBranchLength());
                                }
-                                       
-                               //for each group in the groups update the total branch length accounting for the names file
-                               vector<string> groups = t->tree[treeNodes[v]].getGroup();
-                               for (int j = 0; j < groups.size(); j++) {
-                                       int numSeqsInGroupJ = 0;
-                                       map<string, int>::iterator it;
-                                       it = t->tree[treeNodes[v]].pcount.find(groups[j]);
-                                       if (it != t->tree[treeNodes[v]].pcount.end()) { //this leaf node contains seqs from group j
-                                               numSeqsInGroupJ = it->second;
-                                       }
-
-                                       //add branch length to total for group
-                                       DScore[groups[j]] += (numSeqsInGroupJ * sum);
+                               index = t->tree[index].getParent();
+                       }
+                               
+                       //get last breanch length added
+                       if(t->tree[index].getBranchLength() != -1){
+                               sum += abs(t->tree[index].getBranchLength());
+                       }
+                               
+                       //for each group in the groups update the total branch length accounting for the names file
+                       vector<string> groups = t->tree[treeNodes[v]].getGroup();
+                       for (int j = 0; j < groups.size(); j++) {
+                               int numSeqsInGroupJ = 0;
+                               map<string, int>::iterator it;
+                               it = t->tree[treeNodes[v]].pcount.find(groups[j]);
+                               if (it != t->tree[treeNodes[v]].pcount.end()) { //this leaf node contains seqs from group j
+                                       numSeqsInGroupJ = it->second;
                                }
+
+                               //add branch length to total for group
+                               DScore[groups[j]] += (numSeqsInGroupJ * sum);
                        }
+                       
                }
                
        
                for (int i=0; i<globaldata->Groups.size(); i++) {   
-                       //if (groupTotals[globaldata->Groups[i]] != 0.0) { //avoid divide by zero error
-                               //float percent = DScore[globaldata->Groups[i]] / groupTotals[globaldata->Groups[i]];
                        float percent = DScore[globaldata->Groups[i]]; 
                        data.push_back(percent);  
-                       //}else {  data.push_back(0.0);  }
+                       
                }
                
                return data;
@@ -81,62 +76,6 @@ EstOutput PhyloDiversity::getValues(Tree* t, vector<int> treeNodes) {
        }
 }
 /**************************************************************************************************/
-void PhyloDiversity::setTotalGroupBranchLengths(Tree* t) {
-    try {
-               
-               groupTotals.clear();
-               
-               //initialize group totals
-               for (int i=0; i<globaldata->Groups.size(); i++) {               groupTotals[globaldata->Groups[i]] = 0.0;       }
-               
-               
-               /********************************************************/
-               //calculate a D value for each group 
-               for(int v=0;v<t->getNumLeaves();v++){
-                       
-                       //is this node from a sequence which is in one of the users groups
-                       if (inUsersGroups(t->tree[v].getGroup(), globaldata->Groups) == true) {
-                       
-                               //calc the branch length
-                               int index = v;
-                               float sum = 0.0;
-                               
-                               while(t->tree[index].getParent() != -1){  //while you aren't at root
-                                               
-                                       //if you have a BL
-                                       if(t->tree[index].getBranchLength() != -1){
-                                               sum += abs(t->tree[index].getBranchLength());
-                                       }
-                                       index = t->tree[index].getParent();
-                               }
-                                       
-                               //get last breanch length added
-                               if(t->tree[index].getBranchLength() != -1){
-                                       sum += abs(t->tree[index].getBranchLength());
-                               }
-                                                       
-                               //account for the names file
-                               vector<string> groups = t->tree[v].getGroup();
-                               for (int j = 0; j < groups.size(); j++) {
-                                       int numSeqsInGroupJ = 0;
-                                       map<string, int>::iterator it;
-                                       it = t->tree[v].pcount.find(groups[j]);
-                                       if (it != t->tree[v].pcount.end()) { //this leaf node contains seqs from group j
-                                               numSeqsInGroupJ = it->second;
-                                       }
 
-                                       //add branch length to total for group
-                                       groupTotals[groups[j]] += (numSeqsInGroupJ * sum);
-                               }//end for
-                       }//end if
-               }//end for
-               
-       }
-       catch(exception& e) {
-               m->errorOut(e, "PhyloDiversity", "setTotalGroupBranchLengths");
-               exit(1);
-       }
-}
-/**************************************************************************************************/
 
 
index 131a63cd31b799f9e60eae28fcfcf9beed732d9f..5e0259541d691fc0b5b51cbf7f46feb6034f8cf8 100644 (file)
@@ -15,7 +15,6 @@
 #include "globaldata.hpp"
 #include "mothurout.h"
 
-typedef vector<double> EstOutput; 
 
 /***********************************************************************/
 
@@ -25,15 +24,13 @@ class PhyloDiversity  {
                PhyloDiversity(TreeMap* t) : tmap(t) { globaldata = GlobalData::getInstance();  m = MothurOut::getInstance(); }
                ~PhyloDiversity() {};
                
-               EstOutput getValues(Tree*, vector<int>);
-               void setTotalGroupBranchLengths(Tree*);
+               //int getValues(Tree*, vector<int>, vector< vector< float> >&);
+               
                
        private:
                GlobalData* globaldata;
                MothurOut* m;
-               EstOutput data;
                TreeMap* tmap;
-               map<string, float> groupTotals;
 };
 
 /***********************************************************************/
index 0c2f72d0ad6fd57f628ec01d16ba6c8c31a00d8c..3904ca089f01650da13912d88f308a69aae75192 100644 (file)
@@ -8,7 +8,6 @@
  */
 
 #include "phylodiversitycommand.h"
-#include "phylodiversity.h"
 
 //**********************************************************************************************************************
 PhyloDiversityCommand::PhyloDiversityCommand(string option)  {
@@ -57,6 +56,7 @@ PhyloDiversityCommand::PhyloDiversityCommand(string option)  {
                                splitAtDash(groups, Groups);
                                globaldata->Groups = Groups;
                        }
+                       
                }
                
        }
@@ -102,10 +102,7 @@ int PhyloDiversityCommand::execute(){
                for (int i = 0; i < globaldata->Groups.size(); i++) { if (globaldata->Groups[i] == "xxx") { globaldata->Groups.erase(globaldata->Groups.begin()+i);  break; }  }
                 
                vector<string> outputNames;
-               
-               //diversity calculator
-               PhyloDiversity phylo(globaldata->gTreemap);
-               
+                       
                vector<Tree*> trees = globaldata->gTree;
                
                //for each of the users trees
@@ -113,8 +110,6 @@ int PhyloDiversityCommand::execute(){
                
                        if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        remove(outputNames[i].c_str());         } return 0; }
                        
-                       //phylo.setTotalGroupBranchLengths(trees[i]);
-                       
                        string outFile = outputDir + getRootName(getSimpleName(globaldata->getTreeFile()))  + toString(i+1) + ".phylo.diversity";
                        if (rarefy) { outFile += ".rarefaction"; }
                        outputNames.push_back(outFile);
@@ -131,57 +126,86 @@ int PhyloDiversityCommand::execute(){
                        
                        numLeafNodes = randomLeaf.size();  //reset the number of leaf nodes you are using 
                        
-                       //convert freq percentage to number
-                       int increment = 100;
-                       if (freq < 1.0) {  increment = numLeafNodes * freq;  }
-                       else { increment = freq;  }
+                       //each group, each sampling, if no rarefy iters = 1;
+                       map<string, vector<float> > diversity;
                        
                        //each group, each sampling, if no rarefy iters = 1;
-                       vector< vector<float> > diversity;
-                       diversity.resize(globaldata->Groups.size());
+                       map<string, vector<float> > sumDiversity;
+                       
+                       //find largest group total 
+                       int largestGroup = 0;
+                       for (int j = 0; j < globaldata->Groups.size(); j++) {  
+                               if (globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]] > largestGroup) { largestGroup = globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]; }
+                               
+                               //initialize diversity
+                               diversity[globaldata->Groups[j]].resize(globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]+1, 0.0);              //numSampled
+                                                                                                                                                                                                                       //groupA                0.0                     0.0
+                                                                                                                                                                                                                       
+                               //initialize sumDiversity
+                               sumDiversity[globaldata->Groups[j]].resize(globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]+1, 0.0);
+                       }       
+
+                       //convert freq percentage to number
+                       int increment = 100;
+                       if (freq < 1.0) {  increment = largestGroup * freq;  
+                       }else { increment = freq;  }
                        
                        //initialize sampling spots
-                       vector<int> numSampledList;
-                       for(int k = 0; k < numLeafNodes; k++){  if((k == 0) || (k+1) % increment == 0){  numSampledList.push_back(k); }   }
-                       if(numLeafNodes % increment != 0){      numSampledList.push_back(numLeafNodes);   }
-                       
-                       //initialize diversity
-                       for (int j = 0; j < diversity.size(); j++) {   diversity[j].resize(numSampledList.size(), 0.0);  }  //                  10sampled       20 sampled ...
-                                                                                                                                                                                                                               //groupA                0.0                     0.0
-                                                                                                                                                                                                                       //then for each iter you add to score and then when printing divide by iters to get average
+                       set<int> numSampledList;
+                       for(int k = 1; k <= largestGroup; k++){  if((k == 1) || (k % increment == 0)){  numSampledList.insert(k); }   }
+                       if(largestGroup % increment != 0){      numSampledList.insert(largestGroup);   }
+                       
+                       //add other groups ending points
+                       for (int j = 0; j < globaldata->Groups.size(); j++) {  
+                               if (numSampledList.count(diversity[globaldata->Groups[j]].size()-1) == 0) {  numSampledList.insert(diversity[globaldata->Groups[j]].size()-1); }
+                       }
+
                        for (int l = 0; l < iters; l++) {
                                random_shuffle(randomLeaf.begin(), randomLeaf.end());
                
-                               vector<int> leavesSampled;
-                               EstOutput data;
-                               int count = 0;
+                               //initialize counts
+                               map<string, int> counts;
+                               for (int j = 0; j < globaldata->Groups.size(); j++) {  counts[globaldata->Groups[j]] = 0; }
+                               
                                for(int k = 0; k < numLeafNodes; k++){
                                                
                                        if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        remove(outputNames[i].c_str());         } return 0; }
                                        
-                                       leavesSampled.push_back(randomLeaf[k]);
-                                               
-                                       if((k == 0) || (k+1) % increment == 0){ //ready to calc?
-                                               
-                                               data = phylo.getValues(trees[i], leavesSampled);
-                                               
-                                               //datas results are in the same order as globaldatas groups
-                                               for (int h = 0; h < data.size(); h++) {  diversity[h][count] += data[h];  }
-                                               
-                                               count++;
+                                       //calc branch length of randomLeaf k
+                                       float br = calcBranchLength(trees[i], randomLeaf[k]);
+                       
+                                       //for each group in the groups update the total branch length accounting for the names file
+                                       vector<string> groups = trees[i]->tree[randomLeaf[k]].getGroup();
+                                       for (int j = 0; j < groups.size(); j++) {
+                                               int numSeqsInGroupJ = 0;
+                                               map<string, int>::iterator it;
+                                               it = trees[i]->tree[randomLeaf[k]].pcount.find(groups[j]);
+                                               if (it != trees[i]->tree[randomLeaf[k]].pcount.end()) { //this leaf node contains seqs from group j
+                                                       numSeqsInGroupJ = it->second;
+                                               }
+                                       
+                                               for (int s = (counts[groups[j]]+1); s <= (counts[groups[j]]+numSeqsInGroupJ); s++) {
+                                                       diversity[groups[j]][s] = diversity[groups[j]][s-1] + (numSeqsInGroupJ * br);
+                                               }
+                                               counts[groups[j]] += numSeqsInGroupJ;
                                        }
                                }
-               
-                               if(numLeafNodes % increment != 0){      
-                                       
-                                       data = phylo.getValues(trees[i], leavesSampled);
-                                       
-                                       //datas results are in the same order as globaldatas groups
-                                       for (int h = 0; h < data.size(); h++) {  diversity[h][count] += data[h];  }
+                               
+                               if (rarefy) {
+                                       //add this diversity to the sum
+                                       for (int j = 0; j < globaldata->Groups.size(); j++) {  
+                                               for (int g = 0; g < diversity[globaldata->Groups[j]].size(); g++) {
+                                                       sumDiversity[globaldata->Groups[j]][g] += diversity[globaldata->Groups[j]][g];
+                                               }
+                                       }
                                }
                        }
                        
-                       printData(numSampledList, diversity, outFile);
+                       if (rarefy) { 
+                               printData(numSampledList, sumDiversity, outFile);
+                       }else{
+                               printData(numSampledList, diversity, outFile);
+                       }
 
                }
                
@@ -203,7 +227,7 @@ int PhyloDiversityCommand::execute(){
 }
 //**********************************************************************************************************************
 
-void PhyloDiversityCommand::printData(vector<int>& num, vector< vector<float> >& div, string file){
+void PhyloDiversityCommand::printData(set<int>& num, map< string, vector<float> >& div, string file){
        try {
                ofstream out;
                openOutputFile(file, out);
@@ -214,13 +238,16 @@ void PhyloDiversityCommand::printData(vector<int>& num, vector< vector<float> >&
                
                out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
                
-               for (int i = 0; i < num.size(); i++) {  
-                       if (i == (num.size()-1)) {  out << num[i] << '\t';  }
-                       else {  out << (num[i]+1) << '\t';  }
+               for (set<int>::iterator it = num.begin(); it != num.end(); it++) {  
+                       int numSampled = *it;
                        
-                       for (int j = 0; j < div.size(); j++) {
-                               float score = div[j][i] / (float)iters;
-                               out << setprecision(6) << score << '\t';
+                       out << numSampled << '\t';  
+                       
+                       for (int j = 0; j < globaldata->Groups.size(); j++) {
+                               if (numSampled < div[globaldata->Groups[j]].size()) { 
+                                       float score = div[globaldata->Groups[j]][numSampled] / (float)iters;
+                                       out << setprecision(6) << score << '\t';
+                               }else { out << "NA" << '\t'; }
                        }
                        out << endl;
                }
@@ -235,3 +262,33 @@ void PhyloDiversityCommand::printData(vector<int>& num, vector< vector<float> >&
 }
 
 //**********************************************************************************************************************
+float PhyloDiversityCommand::calcBranchLength(Tree* t, int leaf){
+       try {
+
+               //calc the branch length
+               //while you aren't at root
+               float sum = 0.0;
+               int index = leaf;
+
+               while(t->tree[index].getParent() != -1){
+                       
+                       //if you have a BL
+                       if(t->tree[index].getBranchLength() != -1){
+                               sum += abs(t->tree[index].getBranchLength());
+                       }
+                       index = t->tree[index].getParent();
+               }
+                       
+               //get last breanch length added
+               if(t->tree[index].getBranchLength() != -1){
+                       sum += abs(t->tree[index].getBranchLength());
+               }
+               
+               return sum;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "PhyloDiversityCommand", "calcBranchLength");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
\ No newline at end of file
index 8dd569bbcf567d1990465a2c17bf62ed4ad55831..0aafb8dbc53305a327f7a2b825a2b9c74df8bdb9 100644 (file)
@@ -31,7 +31,8 @@ class PhyloDiversityCommand : public Command {
                string groups, outputDir;
                vector<string> Groups, outputNames; //holds groups to be used, and outputFile names
                
-               void printData(vector<int>&, vector< vector<float> >&, string);
+               void printData(set<int>&, map< string, vector<float> >&, string);
+               float calcBranchLength(Tree*, int);
 };
 
 #endif
index 004c131d6779de0ab5d47df75308ad8b835fde2f..33396975f506626ff275a8c37fa6c0f83a3ead18 100644 (file)
@@ -173,7 +173,7 @@ void PhyloSummary::print(ofstream& out){
                out << "taxlevel\t rankID\t taxon\t daughterlevels\t total\t";
                if (groupmap != NULL) {
                        //so the labels match the counts below, since the map sorts them automatically...
-                       sort(groupmap->namesOfGroups.begin(), groupmap->namesOfGroups.end());
+                       //sort(groupmap->namesOfGroups.begin(), groupmap->namesOfGroups.end());
                        
                        for (int i = 0; i < groupmap->namesOfGroups.size(); i++) {
                                out << groupmap->namesOfGroups[i] << '\t';
@@ -194,9 +194,10 @@ void PhyloSummary::print(ofstream& out){
                
                map<string, int>::iterator itGroup;
                if (groupmap != NULL) {
-                       for (itGroup = tree[0].groupCount.begin(); itGroup != tree[0].groupCount.end(); itGroup++) {
-                               out << itGroup->second << '\t';
-                       }
+                       //for (itGroup = tree[0].groupCount.begin(); itGroup != tree[0].groupCount.end(); itGroup++) {
+                       //      out << itGroup->second << '\t';
+                       //}
+                       for (int i = 0; i < groupmap->namesOfGroups.size(); i++) {  out << tree[0].groupCount[groupmap->namesOfGroups[i]] << '\t'; } 
                }
                out << endl;
                
@@ -230,9 +231,10 @@ void PhyloSummary::print(int i, ofstream& out){
                                
                                map<string, int>::iterator itGroup;
                                if (groupmap != NULL) {
-                                       for (itGroup = tree[it->second].groupCount.begin(); itGroup != tree[it->second].groupCount.end(); itGroup++) {
-                                               out << itGroup->second << '\t';
-                                       }
+                                       //for (itGroup = tree[it->second].groupCount.begin(); itGroup != tree[it->second].groupCount.end(); itGroup++) {
+                                       //      out << itGroup->second << '\t';
+                                       //}
+                                       for (int i = 0; i < groupmap->namesOfGroups.size(); i++) {  out << tree[it->second].groupCount[groupmap->namesOfGroups[i]] << '\t'; } 
                                }
                                out << endl;
                        }