]> git.donarmstrong.com Git - mothur.git/blob - phylodiversitycommand.cpp
fixed bug with phylo.diversity. not calculating as intended.
[mothur.git] / phylodiversitycommand.cpp
1 /*
2  *  phylodiversitycommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 4/30/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "phylodiversitycommand.h"
11
12 //**********************************************************************************************************************
13 PhyloDiversityCommand::PhyloDiversityCommand(string option)  {
14         try {
15                 globaldata = GlobalData::getInstance();
16                 abort = false;
17                 
18                 //allow user to run help
19                 if(option == "help") { help(); abort = true; }
20                 
21                 else {
22                         //valid paramters for this command
23                         string Array[] =  {"freq","rarefy","iters","groups","outputdir","inputdir"};
24                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
25                         
26                         OptionParser parser(option);
27                         map<string,string> parameters = parser.getParameters();
28                         
29                         ValidParameters validParameter;
30                 
31                         //check to make sure all parameters are valid for command
32                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
33                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
34                         }
35                         
36                         //if the user changes the output directory command factory will send this info to us in the output parameter 
37                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
38                         
39                         if (globaldata->gTree.size() == 0) {//no trees were read
40                                 m->mothurOut("You must execute the read.tree command, before you may execute the phylo.diversity command."); m->mothurOutEndLine(); abort = true;  }
41
42                         string temp;
43                         temp = validParameter.validFile(parameters, "freq", false);                     if (temp == "not found") { temp = "100"; }
44                         convert(temp, freq); 
45                         
46                         temp = validParameter.validFile(parameters, "iters", false);                    if (temp == "not found") { temp = "1000"; }
47                         convert(temp, iters); 
48                         
49                         temp = validParameter.validFile(parameters, "rarefy", false);                   if (temp == "not found") { temp = "F"; }
50                         rarefy = isTrue(temp);
51                         if (!rarefy) { iters = 1;  }
52                         
53                         groups = validParameter.validFile(parameters, "groups", false);                 
54                         if (groups == "not found") { groups = ""; Groups = globaldata->gTreemap->namesOfGroups;  globaldata->Groups = Groups;  }
55                         else { 
56                                 splitAtDash(groups, Groups);
57                                 globaldata->Groups = Groups;
58                         }
59                         
60                 }
61                 
62         }
63         catch(exception& e) {
64                 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
65                 exit(1);
66         }                       
67 }
68 //**********************************************************************************************************************
69
70 void PhyloDiversityCommand::help(){
71         try {
72                 m->mothurOut("The phylo.diversity command can only be executed after a successful read.tree command.\n");
73                 m->mothurOut("The phylo.diversity command parameters are groups, iters, freq and rarefy.  No parameters are required.\n");
74                 m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. The group names are separated by dashes. By default all groups are used.\n");
75                 m->mothurOut("The iters parameter allows you to specify the number of randomizations to preform, by default iters=1000, if you set rarefy to true.\n");
76                 m->mothurOut("The freq parameter is used indicate when to output your data, by default it is set to 100. But you can set it to a percentage of the number of sequence. For example freq=0.10, means 10%. \n");
77                 m->mothurOut("The rarefy parameter allows you to create a rarefaction curve. The default is false.\n");
78                 m->mothurOut("The phylo.diversity command should be in the following format: phylo.diversity(groups=yourGroups, rarefy=yourRarefy, iters=yourIters).\n");
79                 m->mothurOut("Example phylo.diversity(groups=A-B-C, rarefy=T, iters=500).\n");
80                 m->mothurOut("The phylo.diversity command output two files: .phylo.diversity and if rarefy=T, .rarefaction.\n");
81                 m->mothurOut("Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n\n");
82
83         }
84         catch(exception& e) {
85                 m->errorOut(e, "PhyloDiversityCommand", "help");
86                 exit(1);
87         }
88 }
89
90 //**********************************************************************************************************************
91
92 PhyloDiversityCommand::~PhyloDiversityCommand(){}
93
94 //**********************************************************************************************************************
95
96 int PhyloDiversityCommand::execute(){
97         try {
98                 
99                 if (abort == true) { return 0; }
100                 
101                 //incase the user had some mismatches between the tree and group files we don't want group xxx to be analyzed
102                 for (int i = 0; i < globaldata->Groups.size(); i++) { if (globaldata->Groups[i] == "xxx") { globaldata->Groups.erase(globaldata->Groups.begin()+i);  break; }  }
103                  
104                 vector<string> outputNames;
105                         
106                 vector<Tree*> trees = globaldata->gTree;
107                 
108                 //for each of the users trees
109                 for(int i = 0; i < trees.size(); i++) {
110                 
111                         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        remove(outputNames[i].c_str());         } return 0; }
112                         
113                         string outFile = outputDir + getRootName(getSimpleName(globaldata->getTreeFile()))  + toString(i+1) + ".phylo.diversity";
114                         if (rarefy) { outFile += ".rarefaction"; }
115                         outputNames.push_back(outFile);
116                         
117                         int numLeafNodes = trees[i]->getNumLeaves();
118                         
119                         //create a vector containing indexes of leaf nodes, randomize it, select nodes to send to calculator
120                         vector<int> randomLeaf;
121                         for (int j = 0; j < numLeafNodes; j++) {  
122                                 if (inUsersGroups(trees[i]->tree[j].getGroup(), globaldata->Groups) == true) { //is this a node from the group the user selected.
123                                         randomLeaf.push_back(j); 
124                                 }
125                         }
126                         
127                         numLeafNodes = randomLeaf.size();  //reset the number of leaf nodes you are using 
128                         
129                         //each group, each sampling, if no rarefy iters = 1;
130                         map<string, vector<float> > diversity;
131                         
132                         //each group, each sampling, if no rarefy iters = 1;
133                         map<string, vector<float> > sumDiversity;
134                         
135                         //find largest group total 
136                         int largestGroup = 0;
137                         for (int j = 0; j < globaldata->Groups.size(); j++) {  
138                                 if (globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]] > largestGroup) { largestGroup = globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]; }
139                                 
140                                 //initialize diversity
141                                 diversity[globaldata->Groups[j]].resize(globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]+1, 0.0);              //numSampled
142                                                                                                                                                                                                                         //groupA                0.0                     0.0
143                                                                                                                                                                                                                         
144                                 //initialize sumDiversity
145                                 sumDiversity[globaldata->Groups[j]].resize(globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]+1, 0.0);
146                         }       
147
148                         //convert freq percentage to number
149                         int increment = 100;
150                         if (freq < 1.0) {  increment = largestGroup * freq;  
151                         }else { increment = freq;  }
152                         
153                         //initialize sampling spots
154                         set<int> numSampledList;
155                         for(int k = 1; k <= largestGroup; k++){  if((k == 1) || (k % increment == 0)){  numSampledList.insert(k); }   }
156                         if(largestGroup % increment != 0){      numSampledList.insert(largestGroup);   }
157                         
158                         //add other groups ending points
159                         for (int j = 0; j < globaldata->Groups.size(); j++) {  
160                                 if (numSampledList.count(diversity[globaldata->Groups[j]].size()-1) == 0) {  numSampledList.insert(diversity[globaldata->Groups[j]].size()-1); }
161                         }
162
163                         for (int l = 0; l < iters; l++) {
164                                 random_shuffle(randomLeaf.begin(), randomLeaf.end());
165                 
166                                 //initialize counts
167                                 map<string, int> counts;
168                                 for (int j = 0; j < globaldata->Groups.size(); j++) {  counts[globaldata->Groups[j]] = 0; }
169                                 
170                                 for(int k = 0; k < numLeafNodes; k++){
171                                                 
172                                         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        remove(outputNames[i].c_str());         } return 0; }
173                                         
174                                         //calc branch length of randomLeaf k
175                                         float br = calcBranchLength(trees[i], randomLeaf[k]);
176                         
177                                         //for each group in the groups update the total branch length accounting for the names file
178                                         vector<string> groups = trees[i]->tree[randomLeaf[k]].getGroup();
179                                         for (int j = 0; j < groups.size(); j++) {
180                                                 int numSeqsInGroupJ = 0;
181                                                 map<string, int>::iterator it;
182                                                 it = trees[i]->tree[randomLeaf[k]].pcount.find(groups[j]);
183                                                 if (it != trees[i]->tree[randomLeaf[k]].pcount.end()) { //this leaf node contains seqs from group j
184                                                         numSeqsInGroupJ = it->second;
185                                                 }
186                                         
187                                                 for (int s = (counts[groups[j]]+1); s <= (counts[groups[j]]+numSeqsInGroupJ); s++) {
188                                                         diversity[groups[j]][s] = diversity[groups[j]][s-1] + (numSeqsInGroupJ * br);
189                                                 }
190                                                 counts[groups[j]] += numSeqsInGroupJ;
191                                         }
192                                 }
193                                 
194                                 if (rarefy) {
195                                         //add this diversity to the sum
196                                         for (int j = 0; j < globaldata->Groups.size(); j++) {  
197                                                 for (int g = 0; g < diversity[globaldata->Groups[j]].size(); g++) {
198                                                         sumDiversity[globaldata->Groups[j]][g] += diversity[globaldata->Groups[j]][g];
199                                                 }
200                                         }
201                                 }
202                         }
203                         
204                         if (rarefy) { 
205                                 printData(numSampledList, sumDiversity, outFile);
206                         }else{
207                                 printData(numSampledList, diversity, outFile);
208                         }
209
210                 }
211                 
212         
213                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        remove(outputNames[i].c_str());         } return 0; }
214
215                 m->mothurOutEndLine();
216                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
217                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
218                 m->mothurOutEndLine();
219
220                 
221                 return 0;
222         }
223         catch(exception& e) {
224                 m->errorOut(e, "PhyloDiversityCommand", "execute");
225                 exit(1);
226         }
227 }
228 //**********************************************************************************************************************
229
230 void PhyloDiversityCommand::printData(set<int>& num, map< string, vector<float> >& div, string file){
231         try {
232                 ofstream out;
233                 openOutputFile(file, out);
234                 
235                 out << "numSampled\t";
236                 for (int i = 0; i < globaldata->Groups.size(); i++) { out << globaldata->Groups[i] << '\t';  }
237                 out << endl;
238                 
239                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
240                 
241                 for (set<int>::iterator it = num.begin(); it != num.end(); it++) {  
242                         int numSampled = *it;
243                         
244                         out << numSampled << '\t';  
245                         
246                         for (int j = 0; j < globaldata->Groups.size(); j++) {
247                                 if (numSampled < div[globaldata->Groups[j]].size()) { 
248                                         float score = div[globaldata->Groups[j]][numSampled] / (float)iters;
249                                         out << setprecision(6) << score << '\t';
250                                 }else { out << "NA" << '\t'; }
251                         }
252                         out << endl;
253                 }
254                 
255                 out.close();
256                 
257         }
258         catch(exception& e) {
259                 m->errorOut(e, "PhyloDiversityCommand", "printData");
260                 exit(1);
261         }
262 }
263
264 //**********************************************************************************************************************
265 float PhyloDiversityCommand::calcBranchLength(Tree* t, int leaf){
266         try {
267
268                 //calc the branch length
269                 //while you aren't at root
270                 float sum = 0.0;
271                 int index = leaf;
272
273                 while(t->tree[index].getParent() != -1){
274                         
275                         //if you have a BL
276                         if(t->tree[index].getBranchLength() != -1){
277                                 sum += abs(t->tree[index].getBranchLength());
278                         }
279                         index = t->tree[index].getParent();
280                 }
281                         
282                 //get last breanch length added
283                 if(t->tree[index].getBranchLength() != -1){
284                         sum += abs(t->tree[index].getBranchLength());
285                 }
286                 
287                 return sum;
288         }
289         catch(exception& e) {
290                 m->errorOut(e, "PhyloDiversityCommand", "calcBranchLength");
291                 exit(1);
292         }
293 }
294 //**********************************************************************************************************************