]> git.donarmstrong.com Git - mothur.git/blob - phylodiversitycommand.cpp
fixed phylo.diversity
[mothur.git] / phylodiversitycommand.cpp
1 /*
2  *  phylodiversitycommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 4/30/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "phylodiversitycommand.h"
11
12 //**********************************************************************************************************************
13 PhyloDiversityCommand::PhyloDiversityCommand(string option)  {
14         try {
15                 globaldata = GlobalData::getInstance();
16                 abort = false;
17                 
18                 //allow user to run help
19                 if(option == "help") { help(); abort = true; }
20                 
21                 else {
22                         //valid paramters for this command
23                         string Array[] =  {"freq","rarefy","iters","groups","summary","collect","scale","outputdir","inputdir"};
24                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
25                         
26                         OptionParser parser(option);
27                         map<string,string> parameters = parser.getParameters();
28                         
29                         ValidParameters validParameter;
30                 
31                         //check to make sure all parameters are valid for command
32                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
33                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
34                         }
35                         
36                         //if the user changes the output directory command factory will send this info to us in the output parameter 
37                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(globaldata->getTreeFile());              }
38                         
39                         if (globaldata->gTree.size() == 0) {//no trees were read
40                                 m->mothurOut("You must execute the read.tree command, before you may execute the phylo.diversity command."); m->mothurOutEndLine(); abort = true;  }
41
42                         string temp;
43                         temp = validParameter.validFile(parameters, "freq", false);                     if (temp == "not found") { temp = "100"; }
44                         convert(temp, freq); 
45                         
46                         temp = validParameter.validFile(parameters, "iters", false);                    if (temp == "not found") { temp = "1000"; }
47                         convert(temp, iters); 
48                         
49                         temp = validParameter.validFile(parameters, "rarefy", false);                   if (temp == "not found") { temp = "F"; }
50                         rarefy = m->isTrue(temp);
51                         if (!rarefy) { iters = 1;  }
52                         
53                         temp = validParameter.validFile(parameters, "summary", false);                  if (temp == "not found") { temp = "T"; }
54                         summary = m->isTrue(temp);
55                         
56                         temp = validParameter.validFile(parameters, "scale", false);                    if (temp == "not found") { temp = "F"; }
57                         scale = m->isTrue(temp);
58                         
59                         temp = validParameter.validFile(parameters, "collect", false);                  if (temp == "not found") { temp = "F"; }
60                         collect = m->isTrue(temp);
61                         
62                         groups = validParameter.validFile(parameters, "groups", false);                 
63                         if (groups == "not found") { groups = ""; Groups = globaldata->gTreemap->namesOfGroups;  globaldata->Groups = Groups;  }
64                         else { 
65                                 m->splitAtDash(groups, Groups);
66                                 globaldata->Groups = Groups;
67                         }
68                         
69                         if ((!collect) && (!rarefy) && (!summary)) { m->mothurOut("No outputs selected. You must set either collect, rarefy or summary to true, summary=T by default."); m->mothurOutEndLine(); abort=true; }
70                 }
71                 
72         }
73         catch(exception& e) {
74                 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
75                 exit(1);
76         }                       
77 }
78 //**********************************************************************************************************************
79
80 void PhyloDiversityCommand::help(){
81         try {
82                 m->mothurOut("The phylo.diversity command can only be executed after a successful read.tree command.\n");
83                 m->mothurOut("The phylo.diversity command parameters are groups, iters, freq, scale, rarefy, collect and summary.  No parameters are required.\n");
84                 m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. The group names are separated by dashes. By default all groups are used.\n");
85                 m->mothurOut("The iters parameter allows you to specify the number of randomizations to preform, by default iters=1000, if you set rarefy to true.\n");
86                 m->mothurOut("The freq parameter is used indicate when to output your data, by default it is set to 100. But you can set it to a percentage of the number of sequence. For example freq=0.10, means 10%. \n");
87                 m->mothurOut("The scale parameter is used indicate that you want your ouptut scaled to the number of sequences sampled, default = false. \n");
88                 m->mothurOut("The rarefy parameter allows you to create a rarefaction curve. The default is false.\n");
89                 m->mothurOut("The collect parameter allows you to create a collectors curve. The default is false.\n");
90                 m->mothurOut("The summary parameter allows you to create a .summary file. The default is true.\n");
91                 m->mothurOut("The phylo.diversity command should be in the following format: phylo.diversity(groups=yourGroups, rarefy=yourRarefy, iters=yourIters).\n");
92                 m->mothurOut("Example phylo.diversity(groups=A-B-C, rarefy=T, iters=500).\n");
93                 m->mothurOut("The phylo.diversity command output two files: .phylo.diversity and if rarefy=T, .rarefaction.\n");
94                 m->mothurOut("Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n\n");
95
96         }
97         catch(exception& e) {
98                 m->errorOut(e, "PhyloDiversityCommand", "help");
99                 exit(1);
100         }
101 }
102
103 //**********************************************************************************************************************
104
105 PhyloDiversityCommand::~PhyloDiversityCommand(){}
106
107 //**********************************************************************************************************************
108
109 int PhyloDiversityCommand::execute(){
110         try {
111                 
112                 if (abort == true) { return 0; }
113                 
114                 //incase the user had some mismatches between the tree and group files we don't want group xxx to be analyzed
115                 for (int i = 0; i < globaldata->Groups.size(); i++) { if (globaldata->Groups[i] == "xxx") { globaldata->Groups.erase(globaldata->Groups.begin()+i);  break; }  }
116                  
117                 vector<string> outputNames;
118                         
119                 vector<Tree*> trees = globaldata->gTree;
120                 
121                 //for each of the users trees
122                 for(int i = 0; i < trees.size(); i++) {
123                 
124                         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        remove(outputNames[i].c_str());         } return 0; }
125                         
126                         ofstream outSum, outRare, outCollect;
127                         string outSumFile = outputDir + m->getRootName(m->getSimpleName(globaldata->getTreeFile()))  + toString(i+1) + ".phylodiv.summary";
128                         string outRareFile = outputDir + m->getRootName(m->getSimpleName(globaldata->getTreeFile()))  + toString(i+1) + ".phylodiv.rarefaction";
129                         string outCollectFile = outputDir + m->getRootName(m->getSimpleName(globaldata->getTreeFile()))  + toString(i+1) + ".phylodiv";
130                         
131                         if (summary)    { m->openOutputFile(outSumFile, outSum); outputNames.push_back(outSumFile);                             }
132                         if (rarefy)             { m->openOutputFile(outRareFile, outRare); outputNames.push_back(outRareFile);                          }
133                         if (collect)    { m->openOutputFile(outCollectFile, outCollect); outputNames.push_back(outCollectFile); }
134                         
135                         int numLeafNodes = trees[i]->getNumLeaves();
136                         
137                         //create a vector containing indexes of leaf nodes, randomize it, select nodes to send to calculator
138                         vector<int> randomLeaf;
139                         for (int j = 0; j < numLeafNodes; j++) {  
140                                 if (m->inUsersGroups(trees[i]->tree[j].getGroup(), globaldata->Groups) == true) { //is this a node from the group the user selected.
141                                         randomLeaf.push_back(j); 
142                                 }
143                         }
144                         
145                         numLeafNodes = randomLeaf.size();  //reset the number of leaf nodes you are using 
146                         
147                         //each group, each sampling, if no rarefy iters = 1;
148                         map<string, vector<float> > diversity;
149                         
150                         //each group, each sampling, if no rarefy iters = 1;
151                         map<string, vector<float> > sumDiversity;
152                         
153                         //find largest group total 
154                         int largestGroup = 0;
155                         for (int j = 0; j < globaldata->Groups.size(); j++) {  
156                                 if (globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]] > largestGroup) { largestGroup = globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]; }
157                                 
158                                 //initialize diversity
159                                 diversity[globaldata->Groups[j]].resize(globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]+1, 0.0);              //numSampled
160                                                                                                                                                                                                                         //groupA                0.0                     0.0
161                                                                                                                                                                                                                         
162                                 //initialize sumDiversity
163                                 sumDiversity[globaldata->Groups[j]].resize(globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]+1, 0.0);
164                         }       
165
166                         //convert freq percentage to number
167                         int increment = 100;
168                         if (freq < 1.0) {  increment = largestGroup * freq;  
169                         }else { increment = freq;  }
170                         
171                         //initialize sampling spots
172                         set<int> numSampledList;
173                         for(int k = 1; k <= largestGroup; k++){  if((k == 1) || (k % increment == 0)){  numSampledList.insert(k); }   }
174                         if(largestGroup % increment != 0){      numSampledList.insert(largestGroup);   }
175                         
176                         //add other groups ending points
177                         for (int j = 0; j < globaldata->Groups.size(); j++) {  
178                                 if (numSampledList.count(diversity[globaldata->Groups[j]].size()-1) == 0) {  numSampledList.insert(diversity[globaldata->Groups[j]].size()-1); }
179                         }
180
181                         for (int l = 0; l < iters; l++) {
182                                 random_shuffle(randomLeaf.begin(), randomLeaf.end());
183                 
184                                 //initialize counts
185                                 map<string, int> counts;
186                                 map< string, set<int> > countedBranch;  
187                                 for (int j = 0; j < globaldata->Groups.size(); j++) {  counts[globaldata->Groups[j]] = 0; countedBranch[globaldata->Groups[j]].insert(-2);  }  //add dummy index to initialize countedBranch sets
188                                 
189                                 for(int k = 0; k < numLeafNodes; k++){
190                                                 
191                                         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        remove(outputNames[i].c_str());         } return 0; }
192                                         
193                                         //calc branch length of randomLeaf k
194                                         float br = calcBranchLength(trees[i], randomLeaf[k], countedBranch);
195                         
196                                         //for each group in the groups update the total branch length accounting for the names file
197                                         vector<string> groups = trees[i]->tree[randomLeaf[k]].getGroup();
198                                         
199                                         for (int j = 0; j < groups.size(); j++) {
200                                                 int numSeqsInGroupJ = 0;
201                                                 map<string, int>::iterator it;
202                                                 it = trees[i]->tree[randomLeaf[k]].pcount.find(groups[j]);
203                                                 if (it != trees[i]->tree[randomLeaf[k]].pcount.end()) { //this leaf node contains seqs from group j
204                                                         numSeqsInGroupJ = it->second;
205                                                 }
206                                                 
207                                                 if (numSeqsInGroupJ != 0) {     diversity[groups[j]][(counts[groups[j]]+1)] = diversity[groups[j]][counts[groups[j]]] + br;  }
208                                                 
209                                                 for (int s = (counts[groups[j]]+2); s <= (counts[groups[j]]+numSeqsInGroupJ); s++) {
210                                                         diversity[groups[j]][s] = diversity[groups[j]][s-1];  //update counts, but don't add in redundant branch lengths
211                                                 }
212                                                 counts[groups[j]] += numSeqsInGroupJ;
213                                         }
214                                 }
215                                 
216                                 if (rarefy) {
217                                         //add this diversity to the sum
218                                         for (int j = 0; j < globaldata->Groups.size(); j++) {  
219                                                 for (int g = 0; g < diversity[globaldata->Groups[j]].size(); g++) {
220                                                         sumDiversity[globaldata->Groups[j]][g] += diversity[globaldata->Groups[j]][g];
221                                                 }
222                                         }
223                                 }
224                                 
225                                 if ((collect) && (l == 0)) {  printData(numSampledList, diversity, outCollect, 1);  }
226                                 if ((summary) && (l == 0)) {  printSumData(diversity, outSum, 1);  }
227                         }
228                         
229                         if (rarefy) {   printData(numSampledList, sumDiversity, outRare, iters);        }
230                 }
231                 
232         
233                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        remove(outputNames[i].c_str());         } return 0; }
234
235                 m->mothurOutEndLine();
236                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
237                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
238                 m->mothurOutEndLine();
239
240                 
241                 return 0;
242         }
243         catch(exception& e) {
244                 m->errorOut(e, "PhyloDiversityCommand", "execute");
245                 exit(1);
246         }
247 }
248 //**********************************************************************************************************************
249
250 void PhyloDiversityCommand::printSumData(map< string, vector<float> >& div, ofstream& out, int numIters){
251         try {
252                 
253                 out << "numSampled\t";
254                 for (int i = 0; i < globaldata->Groups.size(); i++) { out << globaldata->Groups[i] << '\t';  }
255                 out << endl;
256                 
257                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
258                 
259                 set<int> num;
260                 //find end points to output
261                 for (map<string, vector<float> >::iterator itEnds = div.begin(); itEnds != div.end(); itEnds++) {       num.insert(itEnds->second.size()-1);  }
262                 
263                 for (set<int>::iterator it = num.begin(); it != num.end(); it++) {  
264                         int numSampled = *it;
265                         
266                         out << numSampled << '\t';  
267                         
268                         for (int j = 0; j < globaldata->Groups.size(); j++) {
269                                 if (numSampled < div[globaldata->Groups[j]].size()) { 
270                                         float score;
271                                         if (scale)      {  score = (div[globaldata->Groups[j]][numSampled] / (float)numIters) / (float)numSampled;      }
272                                         else            {       score = div[globaldata->Groups[j]][numSampled] / (float)numIters;       }
273                                         
274                                         out << setprecision(4) << score << '\t';
275         
276                                 }else { out << "NA" << '\t'; }
277                         }
278                         out << endl;
279                 }
280                 
281                 out.close();
282                 
283         }
284         catch(exception& e) {
285                 m->errorOut(e, "PhyloDiversityCommand", "printSumData");
286                 exit(1);
287         }
288 }
289 //**********************************************************************************************************************
290
291 void PhyloDiversityCommand::printData(set<int>& num, map< string, vector<float> >& div, ofstream& out, int numIters){
292         try {
293                 
294                 out << "numSampled\t";
295                 for (int i = 0; i < globaldata->Groups.size(); i++) { out << globaldata->Groups[i] << '\t';  }
296                 out << endl;
297                 
298                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
299                 
300                 for (set<int>::iterator it = num.begin(); it != num.end(); it++) {  
301                         int numSampled = *it;
302                         
303                         out << numSampled << '\t';  
304                         
305                         for (int j = 0; j < globaldata->Groups.size(); j++) {
306                                 if (numSampled < div[globaldata->Groups[j]].size()) { 
307                                         float score;
308                                         if (scale)      {  score = (div[globaldata->Groups[j]][numSampled] / (float)numIters) / (float)numSampled;      }
309                                         else            {       score = div[globaldata->Groups[j]][numSampled] / (float)numIters;       }
310
311                                         out << setprecision(4) << score << '\t';
312                                 }else { out << "NA" << '\t'; }
313                         }
314                         out << endl;
315                 }
316                 
317                 out.close();
318                 
319         }
320         catch(exception& e) {
321                 m->errorOut(e, "PhyloDiversityCommand", "printData");
322                 exit(1);
323         }
324 }
325 //**********************************************************************************************************************
326 float PhyloDiversityCommand::calcBranchLength(Tree* t, int leaf, map< string, set<int> >& counted){
327         try {
328
329                 //calc the branch length
330                 //while you aren't at root
331                 float sum = 0.0;
332                 int index = leaf;
333                 
334                 vector<string> groups = t->tree[leaf].getGroup();
335                                         
336                 while(t->tree[index].getParent() != -1){
337                         
338                         //if you have a BL
339                         if(t->tree[index].getBranchLength() != -1){
340                                 if (counted[groups[0]].count(index) == 0) { //you have not already counted this branch
341                                         sum += abs(t->tree[index].getBranchLength());
342                                         for (int j = 0; j < groups.size(); j++) {  counted[groups[j]].insert(index);  }
343                                 }
344                         }
345                         index = t->tree[index].getParent();
346                 }
347                         
348                 //get last breanch length added
349                 if(t->tree[index].getBranchLength() != -1){
350                         if (counted[groups[0]].count(index) == 0) { //you have not already counted this branch
351                                 sum += abs(t->tree[index].getBranchLength());
352                                 for (int j = 0; j < groups.size(); j++) {  counted[groups[j]].insert(index);  }
353                         }
354                 }
355                 
356                 return sum;
357         }
358         catch(exception& e) {
359                 m->errorOut(e, "PhyloDiversityCommand", "calcBranchLength");
360                 exit(1);
361         }
362 }
363 //**********************************************************************************************************************