]> git.donarmstrong.com Git - mothur.git/blob - phylodiversitycommand.cpp
fixed phylo.diversity
[mothur.git] / phylodiversitycommand.cpp
1 /*
2  *  phylodiversitycommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 4/30/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "phylodiversitycommand.h"
11
12 //**********************************************************************************************************************
13 PhyloDiversityCommand::PhyloDiversityCommand(string option)  {
14         try {
15                 globaldata = GlobalData::getInstance();
16                 abort = false;
17                 
18                 //allow user to run help
19                 if(option == "help") { help(); abort = true; }
20                 
21                 else {
22                         //valid paramters for this command
23                         string Array[] =  {"freq","rarefy","iters","groups","summary","collect","scale","outputdir","inputdir"};
24                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
25                         
26                         OptionParser parser(option);
27                         map<string,string> parameters = parser.getParameters();
28                         
29                         ValidParameters validParameter;
30                 
31                         //check to make sure all parameters are valid for command
32                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
33                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
34                         }
35                         
36                         //if the user changes the output directory command factory will send this info to us in the output parameter 
37                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(globaldata->getTreeFile());              }
38                         
39                         if (globaldata->gTree.size() == 0) {//no trees were read
40                                 m->mothurOut("You must execute the read.tree command, before you may execute the phylo.diversity command."); m->mothurOutEndLine(); abort = true;  }
41
42                         string temp;
43                         temp = validParameter.validFile(parameters, "freq", false);                     if (temp == "not found") { temp = "100"; }
44                         convert(temp, freq); 
45                         
46                         temp = validParameter.validFile(parameters, "iters", false);                    if (temp == "not found") { temp = "1000"; }
47                         convert(temp, iters); 
48                         
49                         temp = validParameter.validFile(parameters, "rarefy", false);                   if (temp == "not found") { temp = "F"; }
50                         rarefy = m->isTrue(temp);
51                         if (!rarefy) { iters = 1;  }
52                         
53                         temp = validParameter.validFile(parameters, "summary", false);                  if (temp == "not found") { temp = "T"; }
54                         summary = m->isTrue(temp);
55                         
56                         temp = validParameter.validFile(parameters, "scale", false);                    if (temp == "not found") { temp = "F"; }
57                         scale = m->isTrue(temp);
58                         
59                         temp = validParameter.validFile(parameters, "collect", false);                  if (temp == "not found") { temp = "F"; }
60                         collect = m->isTrue(temp);
61                         
62                         groups = validParameter.validFile(parameters, "groups", false);                 
63                         if (groups == "not found") { groups = ""; Groups = globaldata->gTreemap->namesOfGroups;  globaldata->Groups = Groups;  }
64                         else { 
65                                 m->splitAtDash(groups, Groups);
66                                 globaldata->Groups = Groups;
67                         }
68                         
69                         if ((!collect) && (!rarefy) && (!summary)) { m->mothurOut("No outputs selected. You must set either collect, rarefy or summary to true, summary=T by default."); m->mothurOutEndLine(); abort=true; }
70                 }
71                 
72         }
73         catch(exception& e) {
74                 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
75                 exit(1);
76         }                       
77 }
78 //**********************************************************************************************************************
79
80 void PhyloDiversityCommand::help(){
81         try {
82                 m->mothurOut("The phylo.diversity command can only be executed after a successful read.tree command.\n");
83                 m->mothurOut("The phylo.diversity command parameters are groups, iters, freq, scale, rarefy, collect and summary.  No parameters are required.\n");
84                 m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. The group names are separated by dashes. By default all groups are used.\n");
85                 m->mothurOut("The iters parameter allows you to specify the number of randomizations to preform, by default iters=1000, if you set rarefy to true.\n");
86                 m->mothurOut("The freq parameter is used indicate when to output your data, by default it is set to 100. But you can set it to a percentage of the number of sequence. For example freq=0.10, means 10%. \n");
87                 m->mothurOut("The scale parameter is used indicate that you want your ouptut scaled to the number of sequences sampled, default = false. \n");
88                 m->mothurOut("The rarefy parameter allows you to create a rarefaction curve. The default is false.\n");
89                 m->mothurOut("The collect parameter allows you to create a collectors curve. The default is false.\n");
90                 m->mothurOut("The summary parameter allows you to create a .summary file. The default is true.\n");
91                 m->mothurOut("The phylo.diversity command should be in the following format: phylo.diversity(groups=yourGroups, rarefy=yourRarefy, iters=yourIters).\n");
92                 m->mothurOut("Example phylo.diversity(groups=A-B-C, rarefy=T, iters=500).\n");
93                 m->mothurOut("The phylo.diversity command output two files: .phylo.diversity and if rarefy=T, .rarefaction.\n");
94                 m->mothurOut("Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n\n");
95
96         }
97         catch(exception& e) {
98                 m->errorOut(e, "PhyloDiversityCommand", "help");
99                 exit(1);
100         }
101 }
102
103 //**********************************************************************************************************************
104
105 PhyloDiversityCommand::~PhyloDiversityCommand(){}
106
107 //**********************************************************************************************************************
108
109 int PhyloDiversityCommand::execute(){
110         try {
111                 
112                 if (abort == true) { return 0; }
113                 
114                 //incase the user had some mismatches between the tree and group files we don't want group xxx to be analyzed
115                 for (int i = 0; i < globaldata->Groups.size(); i++) { if (globaldata->Groups[i] == "xxx") { globaldata->Groups.erase(globaldata->Groups.begin()+i);  break; }  }
116                  
117                 vector<string> outputNames;
118                         
119                 vector<Tree*> trees = globaldata->gTree;
120                 
121                 //for each of the users trees
122                 for(int i = 0; i < trees.size(); i++) {
123                 
124                         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        remove(outputNames[i].c_str());         } return 0; }
125                         
126                         ofstream outSum, outRare, outCollect;
127                         string outSumFile = outputDir + m->getRootName(m->getSimpleName(globaldata->getTreeFile()))  + toString(i+1) + ".phylodiv.summary";
128                         string outRareFile = outputDir + m->getRootName(m->getSimpleName(globaldata->getTreeFile()))  + toString(i+1) + ".phylodiv.rarefaction";
129                         string outCollectFile = outputDir + m->getRootName(m->getSimpleName(globaldata->getTreeFile()))  + toString(i+1) + ".phylodiv";
130                         
131                         if (summary)    { m->openOutputFile(outSumFile, outSum); outputNames.push_back(outSumFile);                             }
132                         if (rarefy)             { m->openOutputFile(outRareFile, outRare); outputNames.push_back(outRareFile);                          }
133                         if (collect)    { m->openOutputFile(outCollectFile, outCollect); outputNames.push_back(outCollectFile); }
134                         
135                         int numLeafNodes = trees[i]->getNumLeaves();
136                         
137                         //create a vector containing indexes of leaf nodes, randomize it, select nodes to send to calculator
138                         vector<int> randomLeaf;
139                         for (int j = 0; j < numLeafNodes; j++) {  
140                                 if (m->inUsersGroups(trees[i]->tree[j].getGroup(), globaldata->Groups) == true) { //is this a node from the group the user selected.
141                                         randomLeaf.push_back(j); 
142                                 }
143                         }
144                         
145                         numLeafNodes = randomLeaf.size();  //reset the number of leaf nodes you are using 
146                         
147                         //each group, each sampling, if no rarefy iters = 1;
148                         map<string, vector<float> > diversity;
149                         
150                         //each group, each sampling, if no rarefy iters = 1;
151                         map<string, vector<float> > sumDiversity;
152                         
153                         //find largest group total 
154                         int largestGroup = 0;
155                         for (int j = 0; j < globaldata->Groups.size(); j++) {  
156                                 if (globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]] > largestGroup) { largestGroup = globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]; }
157                                 
158                                 //initialize diversity
159                                 diversity[globaldata->Groups[j]].resize(globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]+1, 0.0);              //numSampled
160                                                                                                                                                                                                                         //groupA                0.0                     0.0
161                                                                                                                                                                                                                         
162                                 //initialize sumDiversity
163                                 sumDiversity[globaldata->Groups[j]].resize(globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]+1, 0.0);
164                         }       
165
166                         //convert freq percentage to number
167                         int increment = 100;
168                         if (freq < 1.0) {  increment = largestGroup * freq;  
169                         }else { increment = freq;  }
170                         
171                         //initialize sampling spots
172                         set<int> numSampledList;
173                         for(int k = 1; k <= largestGroup; k++){  if((k == 1) || (k % increment == 0)){  numSampledList.insert(k); }   }
174                         if(largestGroup % increment != 0){      numSampledList.insert(largestGroup);   }
175                         
176                         //add other groups ending points
177                         for (int j = 0; j < globaldata->Groups.size(); j++) {  
178                                 if (numSampledList.count(diversity[globaldata->Groups[j]].size()-1) == 0) {  numSampledList.insert(diversity[globaldata->Groups[j]].size()-1); }
179                         }
180
181                         for (int l = 0; l < iters; l++) {
182                                 random_shuffle(randomLeaf.begin(), randomLeaf.end());
183                 
184                                 //initialize counts
185                                 map<string, int> counts;
186                                 map< string, set<int> > countedBranch;  
187                                 for (int j = 0; j < globaldata->Groups.size(); j++) {  counts[globaldata->Groups[j]] = 0; countedBranch[globaldata->Groups[j]].insert(-2);  }  //add dummy index to initialize countedBranch sets
188                                 
189                                 for(int k = 0; k < numLeafNodes; k++){
190                                                 
191                                         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        remove(outputNames[i].c_str());         } return 0; }
192                                         
193                                         //calc branch length of randomLeaf k
194                                         float br = calcBranchLength(trees[i], randomLeaf[k], countedBranch);
195                         
196                                         //for each group in the groups update the total branch length accounting for the names file
197                                         vector<string> groups = trees[i]->tree[randomLeaf[k]].getGroup();
198                                         
199                                         for (int j = 0; j < groups.size(); j++) {
200                                                 int numSeqsInGroupJ = 0;
201                                                 map<string, int>::iterator it;
202                                                 it = trees[i]->tree[randomLeaf[k]].pcount.find(groups[j]);
203                                                 if (it != trees[i]->tree[randomLeaf[k]].pcount.end()) { //this leaf node contains seqs from group j
204                                                         numSeqsInGroupJ = it->second;
205                                                 }
206                                                 
207                                                 if (numSeqsInGroupJ != 0) {     diversity[groups[j]][(counts[groups[j]]+1)] = diversity[groups[j]][counts[groups[j]]] + br;  }
208                                                 
209                                                 for (int s = (counts[groups[j]]+2); s <= (counts[groups[j]]+numSeqsInGroupJ); s++) {
210                                                         diversity[groups[j]][s] = diversity[groups[j]][s-1];  //update counts, but don't add in redundant branch lengths
211                                                 }
212                                                 counts[groups[j]] += numSeqsInGroupJ;
213                                         }
214                                 }
215                                 
216                                 if (rarefy) {
217                                         //add this diversity to the sum
218                                         for (int j = 0; j < globaldata->Groups.size(); j++) {  
219                                                 for (int g = 0; g < diversity[globaldata->Groups[j]].size(); g++) {
220                                                         sumDiversity[globaldata->Groups[j]][g] += diversity[globaldata->Groups[j]][g];
221                                                 }
222                                         }
223                                 }
224                                 
225                                 if ((collect) && (l == 0)) {  printData(numSampledList, diversity, outCollect, 1);  }
226                                 if ((summary) && (l == 0)) {  printSumData(diversity, outSum, 1);  }
227                         }
228                         
229                         if (rarefy) {   printData(numSampledList, sumDiversity, outRare, iters);        }
230                 }
231                 
232         
233                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        remove(outputNames[i].c_str());         } return 0; }
234
235                 m->mothurOutEndLine();
236                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
237                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
238                 m->mothurOutEndLine();
239
240                 
241                 return 0;
242         }
243         catch(exception& e) {
244                 m->errorOut(e, "PhyloDiversityCommand", "execute");
245                 exit(1);
246         }
247 }
248 //**********************************************************************************************************************
249
250 void PhyloDiversityCommand::printSumData(map< string, vector<float> >& div, ofstream& out, int numIters){
251         try {
252                 
253                 out << "Groups\tnumSampled\tphyloDiversity" << endl;
254                 
255                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
256                         
257                 for (int j = 0; j < globaldata->Groups.size(); j++) {
258                         int numSampled = (div[globaldata->Groups[j]].size()-1);
259                         out << globaldata->Groups[j] << '\t' << numSampled << '\t';
260                 
261                          
262                         float score;
263                         if (scale)      {  score = (div[globaldata->Groups[j]][numSampled] / (float)numIters) / (float)numSampled;      }
264                         else            {       score = div[globaldata->Groups[j]][numSampled] / (float)numIters;       }
265                                 
266                         out << setprecision(4) << score << endl;
267                 }
268                                         
269                 out.close();
270                 
271         }
272         catch(exception& e) {
273                 m->errorOut(e, "PhyloDiversityCommand", "printSumData");
274                 exit(1);
275         }
276 }
277 //**********************************************************************************************************************
278
279 void PhyloDiversityCommand::printData(set<int>& num, map< string, vector<float> >& div, ofstream& out, int numIters){
280         try {
281                 
282                 out << "numSampled\t";
283                 for (int i = 0; i < globaldata->Groups.size(); i++) { out << globaldata->Groups[i] << '\t';  }
284                 out << endl;
285                 
286                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
287                 
288                 for (set<int>::iterator it = num.begin(); it != num.end(); it++) {  
289                         int numSampled = *it;
290                         
291                         out << numSampled << '\t';  
292                         
293                         for (int j = 0; j < globaldata->Groups.size(); j++) {
294                                 if (numSampled < div[globaldata->Groups[j]].size()) { 
295                                         float score;
296                                         if (scale)      {  score = (div[globaldata->Groups[j]][numSampled] / (float)numIters) / (float)numSampled;      }
297                                         else            {       score = div[globaldata->Groups[j]][numSampled] / (float)numIters;       }
298
299                                         out << setprecision(4) << score << '\t';
300                                 }else { out << "NA" << '\t'; }
301                         }
302                         out << endl;
303                 }
304                 
305                 out.close();
306                 
307         }
308         catch(exception& e) {
309                 m->errorOut(e, "PhyloDiversityCommand", "printData");
310                 exit(1);
311         }
312 }
313 //**********************************************************************************************************************
314 float PhyloDiversityCommand::calcBranchLength(Tree* t, int leaf, map< string, set<int> >& counted){
315         try {
316
317                 //calc the branch length
318                 //while you aren't at root
319                 float sum = 0.0;
320                 int index = leaf;
321                 
322                 vector<string> groups = t->tree[leaf].getGroup();
323                                         
324                 while(t->tree[index].getParent() != -1){
325                         
326                         //if you have a BL
327                         if(t->tree[index].getBranchLength() != -1){
328                                 if (counted[groups[0]].count(index) == 0) { //you have not already counted this branch
329                                         sum += abs(t->tree[index].getBranchLength());
330                                         for (int j = 0; j < groups.size(); j++) {  counted[groups[j]].insert(index);  }
331                                 }
332                         }
333                         index = t->tree[index].getParent();
334                 }
335                         
336                 //get last breanch length added
337                 if(t->tree[index].getBranchLength() != -1){
338                         if (counted[groups[0]].count(index) == 0) { //you have not already counted this branch
339                                 sum += abs(t->tree[index].getBranchLength());
340                                 for (int j = 0; j < groups.size(); j++) {  counted[groups[j]].insert(index);  }
341                         }
342                 }
343                 
344                 return sum;
345         }
346         catch(exception& e) {
347                 m->errorOut(e, "PhyloDiversityCommand", "calcBranchLength");
348                 exit(1);
349         }
350 }
351 //**********************************************************************************************************************