]> git.donarmstrong.com Git - mothur.git/blob - phylodiversitycommand.cpp
added summary, collect, and scale parameters to phylo.diversity command.
[mothur.git] / phylodiversitycommand.cpp
1 /*
2  *  phylodiversitycommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 4/30/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "phylodiversitycommand.h"
11
12 //**********************************************************************************************************************
13 PhyloDiversityCommand::PhyloDiversityCommand(string option)  {
14         try {
15                 globaldata = GlobalData::getInstance();
16                 abort = false;
17                 
18                 //allow user to run help
19                 if(option == "help") { help(); abort = true; }
20                 
21                 else {
22                         //valid paramters for this command
23                         string Array[] =  {"freq","rarefy","iters","groups","summary","collect","scale","outputdir","inputdir"};
24                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
25                         
26                         OptionParser parser(option);
27                         map<string,string> parameters = parser.getParameters();
28                         
29                         ValidParameters validParameter;
30                 
31                         //check to make sure all parameters are valid for command
32                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
33                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
34                         }
35                         
36                         //if the user changes the output directory command factory will send this info to us in the output parameter 
37                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = hasPath(globaldata->getTreeFile());         }
38                         
39                         if (globaldata->gTree.size() == 0) {//no trees were read
40                                 m->mothurOut("You must execute the read.tree command, before you may execute the phylo.diversity command."); m->mothurOutEndLine(); abort = true;  }
41
42                         string temp;
43                         temp = validParameter.validFile(parameters, "freq", false);                     if (temp == "not found") { temp = "100"; }
44                         convert(temp, freq); 
45                         
46                         temp = validParameter.validFile(parameters, "iters", false);                    if (temp == "not found") { temp = "1000"; }
47                         convert(temp, iters); 
48                         
49                         temp = validParameter.validFile(parameters, "rarefy", false);                   if (temp == "not found") { temp = "F"; }
50                         rarefy = isTrue(temp);
51                         if (!rarefy) { iters = 1;  }
52                         
53                         temp = validParameter.validFile(parameters, "summary", false);                  if (temp == "not found") { temp = "T"; }
54                         summary = isTrue(temp);
55                         
56                         temp = validParameter.validFile(parameters, "scale", false);                    if (temp == "not found") { temp = "F"; }
57                         scale = isTrue(temp);
58                         
59                         temp = validParameter.validFile(parameters, "collect", false);                  if (temp == "not found") { temp = "F"; }
60                         collect = isTrue(temp);
61                         
62                         groups = validParameter.validFile(parameters, "groups", false);                 
63                         if (groups == "not found") { groups = ""; Groups = globaldata->gTreemap->namesOfGroups;  globaldata->Groups = Groups;  }
64                         else { 
65                                 splitAtDash(groups, Groups);
66                                 globaldata->Groups = Groups;
67                         }
68                         
69                         if ((!collect) && (!rarefy) && (!summary)) { m->mothurOut("No outputs selected. You must set either collect, rarefy or summary to true, summary=T by default."); m->mothurOutEndLine(); abort=true; }
70                 }
71                 
72         }
73         catch(exception& e) {
74                 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
75                 exit(1);
76         }                       
77 }
78 //**********************************************************************************************************************
79
80 void PhyloDiversityCommand::help(){
81         try {
82                 m->mothurOut("The phylo.diversity command can only be executed after a successful read.tree command.\n");
83                 m->mothurOut("The phylo.diversity command parameters are groups, iters, freq, scale, rarefy, collect and summary.  No parameters are required.\n");
84                 m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. The group names are separated by dashes. By default all groups are used.\n");
85                 m->mothurOut("The iters parameter allows you to specify the number of randomizations to preform, by default iters=1000, if you set rarefy to true.\n");
86                 m->mothurOut("The freq parameter is used indicate when to output your data, by default it is set to 100. But you can set it to a percentage of the number of sequence. For example freq=0.10, means 10%. \n");
87                 m->mothurOut("The scale parameter is used indicate that you want your ouptut scaled to the number of sequences sampled, default = false. \n");
88                 m->mothurOut("The rarefy parameter allows you to create a rarefaction curve. The default is false.\n");
89                 m->mothurOut("The collect parameter allows you to create a collectors curve. The default is false.\n");
90                 m->mothurOut("The summary parameter allows you to create a .summary file. The default is true.\n");
91                 m->mothurOut("The phylo.diversity command should be in the following format: phylo.diversity(groups=yourGroups, rarefy=yourRarefy, iters=yourIters).\n");
92                 m->mothurOut("Example phylo.diversity(groups=A-B-C, rarefy=T, iters=500).\n");
93                 m->mothurOut("The phylo.diversity command output two files: .phylo.diversity and if rarefy=T, .rarefaction.\n");
94                 m->mothurOut("Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n\n");
95
96         }
97         catch(exception& e) {
98                 m->errorOut(e, "PhyloDiversityCommand", "help");
99                 exit(1);
100         }
101 }
102
103 //**********************************************************************************************************************
104
105 PhyloDiversityCommand::~PhyloDiversityCommand(){}
106
107 //**********************************************************************************************************************
108
109 int PhyloDiversityCommand::execute(){
110         try {
111                 
112                 if (abort == true) { return 0; }
113                 
114                 //incase the user had some mismatches between the tree and group files we don't want group xxx to be analyzed
115                 for (int i = 0; i < globaldata->Groups.size(); i++) { if (globaldata->Groups[i] == "xxx") { globaldata->Groups.erase(globaldata->Groups.begin()+i);  break; }  }
116                  
117                 vector<string> outputNames;
118                         
119                 vector<Tree*> trees = globaldata->gTree;
120                 
121                 //for each of the users trees
122                 for(int i = 0; i < trees.size(); i++) {
123                 
124                         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        remove(outputNames[i].c_str());         } return 0; }
125                         
126                         ofstream outSum, outRare, outCollect;
127                         string outSumFile = outputDir + getRootName(getSimpleName(globaldata->getTreeFile()))  + toString(i+1) + ".phylodiv.summary";
128                         string outRareFile = outputDir + getRootName(getSimpleName(globaldata->getTreeFile()))  + toString(i+1) + ".phylodiv.rarefaction";
129                         string outCollectFile = outputDir + getRootName(getSimpleName(globaldata->getTreeFile()))  + toString(i+1) + ".phylodiv";
130                         
131                         if (summary)    { openOutputFile(outSumFile, outSum); outputNames.push_back(outSumFile);                                }
132                         if (rarefy)             { openOutputFile(outRareFile, outRare); outputNames.push_back(outRareFile);                             }
133                         if (collect)    { openOutputFile(outCollectFile, outCollect); outputNames.push_back(outCollectFile);    }
134                         
135                         int numLeafNodes = trees[i]->getNumLeaves();
136                         
137                         //create a vector containing indexes of leaf nodes, randomize it, select nodes to send to calculator
138                         vector<int> randomLeaf;
139                         for (int j = 0; j < numLeafNodes; j++) {  
140                                 if (inUsersGroups(trees[i]->tree[j].getGroup(), globaldata->Groups) == true) { //is this a node from the group the user selected.
141                                         randomLeaf.push_back(j); 
142                                 }
143                         }
144                         
145                         numLeafNodes = randomLeaf.size();  //reset the number of leaf nodes you are using 
146                         
147                         //each group, each sampling, if no rarefy iters = 1;
148                         map<string, vector<float> > diversity;
149                         
150                         //each group, each sampling, if no rarefy iters = 1;
151                         map<string, vector<float> > sumDiversity;
152                         
153                         //find largest group total 
154                         int largestGroup = 0;
155                         for (int j = 0; j < globaldata->Groups.size(); j++) {  
156                                 if (globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]] > largestGroup) { largestGroup = globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]; }
157                                 
158                                 //initialize diversity
159                                 diversity[globaldata->Groups[j]].resize(globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]+1, 0.0);              //numSampled
160                                                                                                                                                                                                                         //groupA                0.0                     0.0
161                                                                                                                                                                                                                         
162                                 //initialize sumDiversity
163                                 sumDiversity[globaldata->Groups[j]].resize(globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]+1, 0.0);
164                         }       
165
166                         //convert freq percentage to number
167                         int increment = 100;
168                         if (freq < 1.0) {  increment = largestGroup * freq;  
169                         }else { increment = freq;  }
170                         
171                         //initialize sampling spots
172                         set<int> numSampledList;
173                         for(int k = 1; k <= largestGroup; k++){  if((k == 1) || (k % increment == 0)){  numSampledList.insert(k); }   }
174                         if(largestGroup % increment != 0){      numSampledList.insert(largestGroup);   }
175                         
176                         //add other groups ending points
177                         for (int j = 0; j < globaldata->Groups.size(); j++) {  
178                                 if (numSampledList.count(diversity[globaldata->Groups[j]].size()-1) == 0) {  numSampledList.insert(diversity[globaldata->Groups[j]].size()-1); }
179                         }
180
181                         for (int l = 0; l < iters; l++) {
182                                 random_shuffle(randomLeaf.begin(), randomLeaf.end());
183                 
184                                 //initialize counts
185                                 map<string, int> counts;
186                                 for (int j = 0; j < globaldata->Groups.size(); j++) {  counts[globaldata->Groups[j]] = 0; }
187                                 
188                                 for(int k = 0; k < numLeafNodes; k++){
189                                                 
190                                         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        remove(outputNames[i].c_str());         } return 0; }
191                                         
192                                         //calc branch length of randomLeaf k
193                                         float br = calcBranchLength(trees[i], randomLeaf[k]);
194                         
195                                         //for each group in the groups update the total branch length accounting for the names file
196                                         vector<string> groups = trees[i]->tree[randomLeaf[k]].getGroup();
197                                         for (int j = 0; j < groups.size(); j++) {
198                                                 int numSeqsInGroupJ = 0;
199                                                 map<string, int>::iterator it;
200                                                 it = trees[i]->tree[randomLeaf[k]].pcount.find(groups[j]);
201                                                 if (it != trees[i]->tree[randomLeaf[k]].pcount.end()) { //this leaf node contains seqs from group j
202                                                         numSeqsInGroupJ = it->second;
203                                                 }
204                                         
205                                                 for (int s = (counts[groups[j]]+1); s <= (counts[groups[j]]+numSeqsInGroupJ); s++) {
206                                                         diversity[groups[j]][s] = diversity[groups[j]][s-1] + ((float) numSeqsInGroupJ * br);
207                                                 }
208                                                 counts[groups[j]] += numSeqsInGroupJ;
209                                         }
210                                 }
211                                 
212                                 if (rarefy) {
213                                         //add this diversity to the sum
214                                         for (int j = 0; j < globaldata->Groups.size(); j++) {  
215                                                 for (int g = 0; g < diversity[globaldata->Groups[j]].size(); g++) {
216                                                         sumDiversity[globaldata->Groups[j]][g] += diversity[globaldata->Groups[j]][g];
217                                                 }
218                                         }
219                                 }
220                                 
221                                 if ((collect) && (l == 0)) {  printData(numSampledList, diversity, outCollect, 1);  }
222                                 if ((summary) && (l == 0)) {  printSumData(diversity, outSum, 1);  }
223                         }
224                         
225                         if (rarefy) {   printData(numSampledList, sumDiversity, outRare, iters);        }
226                 }
227                 
228         
229                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        remove(outputNames[i].c_str());         } return 0; }
230
231                 m->mothurOutEndLine();
232                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
233                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
234                 m->mothurOutEndLine();
235
236                 
237                 return 0;
238         }
239         catch(exception& e) {
240                 m->errorOut(e, "PhyloDiversityCommand", "execute");
241                 exit(1);
242         }
243 }
244 //**********************************************************************************************************************
245
246 void PhyloDiversityCommand::printSumData(map< string, vector<float> >& div, ofstream& out, int numIters){
247         try {
248                 
249                 out << "numSampled\t";
250                 for (int i = 0; i < globaldata->Groups.size(); i++) { out << globaldata->Groups[i] << '\t';  }
251                 out << endl;
252                 
253                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
254                 
255                 set<int> num;
256                 //find end points to output
257                 for (map<string, vector<float> >::iterator itEnds = div.begin(); itEnds != div.end(); itEnds++) {       num.insert(itEnds->second.size()-1);  }
258                 
259                 for (set<int>::iterator it = num.begin(); it != num.end(); it++) {  
260                         int numSampled = *it;
261                         
262                         out << numSampled << '\t';  
263                         
264                         for (int j = 0; j < globaldata->Groups.size(); j++) {
265                                 if (numSampled < div[globaldata->Groups[j]].size()) { 
266                                         float score;
267                                         if (scale)      {  score = (div[globaldata->Groups[j]][numSampled] / (float)numIters) / (float)numSampled;      }
268                                         else            {       score = div[globaldata->Groups[j]][numSampled] / (float)numIters;       }
269                                         
270                                         out << setprecision(4) << score << '\t';
271                                 }else { out << "NA" << '\t'; }
272                         }
273                         out << endl;
274                 }
275                 
276                 out.close();
277                 
278         }
279         catch(exception& e) {
280                 m->errorOut(e, "PhyloDiversityCommand", "printSumData");
281                 exit(1);
282         }
283 }
284 //**********************************************************************************************************************
285
286 void PhyloDiversityCommand::printData(set<int>& num, map< string, vector<float> >& div, ofstream& out, int numIters){
287         try {
288                 
289                 out << "numSampled\t";
290                 for (int i = 0; i < globaldata->Groups.size(); i++) { out << globaldata->Groups[i] << '\t';  }
291                 out << endl;
292                 
293                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
294                 
295                 for (set<int>::iterator it = num.begin(); it != num.end(); it++) {  
296                         int numSampled = *it;
297                         
298                         out << numSampled << '\t';  
299                         
300                         for (int j = 0; j < globaldata->Groups.size(); j++) {
301                                 if (numSampled < div[globaldata->Groups[j]].size()) { 
302                                         float score;
303                                         if (scale)      {  score = (div[globaldata->Groups[j]][numSampled] / (float)numIters) / (float)numSampled;      }
304                                         else            {       score = div[globaldata->Groups[j]][numSampled] / (float)numIters;       }
305
306                                         out << setprecision(4) << score << '\t';
307                                 }else { out << "NA" << '\t'; }
308                         }
309                         out << endl;
310                 }
311                 
312                 out.close();
313                 
314         }
315         catch(exception& e) {
316                 m->errorOut(e, "PhyloDiversityCommand", "printData");
317                 exit(1);
318         }
319 }
320 //**********************************************************************************************************************
321 float PhyloDiversityCommand::calcBranchLength(Tree* t, int leaf){
322         try {
323
324                 //calc the branch length
325                 //while you aren't at root
326                 float sum = 0.0;
327                 int index = leaf;
328
329                 while(t->tree[index].getParent() != -1){
330                         
331                         //if you have a BL
332                         if(t->tree[index].getBranchLength() != -1){
333                                 sum += abs(t->tree[index].getBranchLength());
334                         }
335                         index = t->tree[index].getParent();
336                 }
337                         
338                 //get last breanch length added
339                 if(t->tree[index].getBranchLength() != -1){
340                         sum += abs(t->tree[index].getBranchLength());
341                 }
342                 
343                 return sum;
344         }
345         catch(exception& e) {
346                 m->errorOut(e, "PhyloDiversityCommand", "calcBranchLength");
347                 exit(1);
348         }
349 }
350 //**********************************************************************************************************************