2 * phylodiversitycommand.cpp
5 * Created by westcott on 4/30/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "phylodiversitycommand.h"
12 //**********************************************************************************************************************
13 PhyloDiversityCommand::PhyloDiversityCommand(string option) {
15 globaldata = GlobalData::getInstance();
18 //allow user to run help
19 if(option == "help") { help(); abort = true; }
22 //valid paramters for this command
23 string Array[] = {"freq","rarefy","iters","groups","summary","collect","scale","outputdir","inputdir"};
24 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
26 OptionParser parser(option);
27 map<string,string> parameters = parser.getParameters();
29 ValidParameters validParameter;
31 //check to make sure all parameters are valid for command
32 for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
33 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
36 //if the user changes the output directory command factory will send this info to us in the output parameter
37 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(globaldata->getTreeFile()); }
39 if (globaldata->gTree.size() == 0) {//no trees were read
40 m->mothurOut("You must execute the read.tree command, before you may execute the phylo.diversity command."); m->mothurOutEndLine(); abort = true; }
43 temp = validParameter.validFile(parameters, "freq", false); if (temp == "not found") { temp = "100"; }
46 temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "1000"; }
49 temp = validParameter.validFile(parameters, "rarefy", false); if (temp == "not found") { temp = "F"; }
50 rarefy = m->isTrue(temp);
51 if (!rarefy) { iters = 1; }
53 temp = validParameter.validFile(parameters, "summary", false); if (temp == "not found") { temp = "T"; }
54 summary = m->isTrue(temp);
56 temp = validParameter.validFile(parameters, "scale", false); if (temp == "not found") { temp = "F"; }
57 scale = m->isTrue(temp);
59 temp = validParameter.validFile(parameters, "collect", false); if (temp == "not found") { temp = "F"; }
60 collect = m->isTrue(temp);
62 groups = validParameter.validFile(parameters, "groups", false);
63 if (groups == "not found") { groups = ""; Groups = globaldata->gTreemap->namesOfGroups; globaldata->Groups = Groups; }
65 m->splitAtDash(groups, Groups);
66 globaldata->Groups = Groups;
69 if ((!collect) && (!rarefy) && (!summary)) { m->mothurOut("No outputs selected. You must set either collect, rarefy or summary to true, summary=T by default."); m->mothurOutEndLine(); abort=true; }
74 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
78 //**********************************************************************************************************************
80 void PhyloDiversityCommand::help(){
82 m->mothurOut("The phylo.diversity command can only be executed after a successful read.tree command.\n");
83 m->mothurOut("The phylo.diversity command parameters are groups, iters, freq, scale, rarefy, collect and summary. No parameters are required.\n");
84 m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. The group names are separated by dashes. By default all groups are used.\n");
85 m->mothurOut("The iters parameter allows you to specify the number of randomizations to preform, by default iters=1000, if you set rarefy to true.\n");
86 m->mothurOut("The freq parameter is used indicate when to output your data, by default it is set to 100. But you can set it to a percentage of the number of sequence. For example freq=0.10, means 10%. \n");
87 m->mothurOut("The scale parameter is used indicate that you want your ouptut scaled to the number of sequences sampled, default = false. \n");
88 m->mothurOut("The rarefy parameter allows you to create a rarefaction curve. The default is false.\n");
89 m->mothurOut("The collect parameter allows you to create a collectors curve. The default is false.\n");
90 m->mothurOut("The summary parameter allows you to create a .summary file. The default is true.\n");
91 m->mothurOut("The phylo.diversity command should be in the following format: phylo.diversity(groups=yourGroups, rarefy=yourRarefy, iters=yourIters).\n");
92 m->mothurOut("Example phylo.diversity(groups=A-B-C, rarefy=T, iters=500).\n");
93 m->mothurOut("The phylo.diversity command output two files: .phylo.diversity and if rarefy=T, .rarefaction.\n");
94 m->mothurOut("Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n\n");
98 m->errorOut(e, "PhyloDiversityCommand", "help");
103 //**********************************************************************************************************************
105 PhyloDiversityCommand::~PhyloDiversityCommand(){}
107 //**********************************************************************************************************************
109 int PhyloDiversityCommand::execute(){
112 if (abort == true) { return 0; }
114 //incase the user had some mismatches between the tree and group files we don't want group xxx to be analyzed
115 for (int i = 0; i < globaldata->Groups.size(); i++) { if (globaldata->Groups[i] == "xxx") { globaldata->Groups.erase(globaldata->Groups.begin()+i); break; } }
117 vector<string> outputNames;
119 vector<Tree*> trees = globaldata->gTree;
121 //for each of the users trees
122 for(int i = 0; i < trees.size(); i++) {
124 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
126 ofstream outSum, outRare, outCollect;
127 string outSumFile = outputDir + m->getRootName(m->getSimpleName(globaldata->getTreeFile())) + toString(i+1) + ".phylodiv.summary";
128 string outRareFile = outputDir + m->getRootName(m->getSimpleName(globaldata->getTreeFile())) + toString(i+1) + ".phylodiv.rarefaction";
129 string outCollectFile = outputDir + m->getRootName(m->getSimpleName(globaldata->getTreeFile())) + toString(i+1) + ".phylodiv";
131 if (summary) { m->openOutputFile(outSumFile, outSum); outputNames.push_back(outSumFile); }
132 if (rarefy) { m->openOutputFile(outRareFile, outRare); outputNames.push_back(outRareFile); }
133 if (collect) { m->openOutputFile(outCollectFile, outCollect); outputNames.push_back(outCollectFile); }
135 int numLeafNodes = trees[i]->getNumLeaves();
137 //create a vector containing indexes of leaf nodes, randomize it, select nodes to send to calculator
138 vector<int> randomLeaf;
139 for (int j = 0; j < numLeafNodes; j++) {
140 if (m->inUsersGroups(trees[i]->tree[j].getGroup(), globaldata->Groups) == true) { //is this a node from the group the user selected.
141 randomLeaf.push_back(j);
145 numLeafNodes = randomLeaf.size(); //reset the number of leaf nodes you are using
147 //each group, each sampling, if no rarefy iters = 1;
148 map<string, vector<float> > diversity;
150 //each group, each sampling, if no rarefy iters = 1;
151 map<string, vector<float> > sumDiversity;
153 //find largest group total
154 int largestGroup = 0;
155 for (int j = 0; j < globaldata->Groups.size(); j++) {
156 if (globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]] > largestGroup) { largestGroup = globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]; }
158 //initialize diversity
159 diversity[globaldata->Groups[j]].resize(globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]+1, 0.0); //numSampled
162 //initialize sumDiversity
163 sumDiversity[globaldata->Groups[j]].resize(globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]+1, 0.0);
166 //convert freq percentage to number
168 if (freq < 1.0) { increment = largestGroup * freq;
169 }else { increment = freq; }
171 //initialize sampling spots
172 set<int> numSampledList;
173 for(int k = 1; k <= largestGroup; k++){ if((k == 1) || (k % increment == 0)){ numSampledList.insert(k); } }
174 if(largestGroup % increment != 0){ numSampledList.insert(largestGroup); }
176 //add other groups ending points
177 for (int j = 0; j < globaldata->Groups.size(); j++) {
178 if (numSampledList.count(diversity[globaldata->Groups[j]].size()-1) == 0) { numSampledList.insert(diversity[globaldata->Groups[j]].size()-1); }
181 for (int l = 0; l < iters; l++) {
182 random_shuffle(randomLeaf.begin(), randomLeaf.end());
185 map<string, int> counts;
186 map< string, set<int> > countedBranch;
187 for (int j = 0; j < globaldata->Groups.size(); j++) { counts[globaldata->Groups[j]] = 0; countedBranch[globaldata->Groups[j]].insert(-2); } //add dummy index to initialize countedBranch sets
189 for(int k = 0; k < numLeafNodes; k++){
191 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
193 //calc branch length of randomLeaf k
194 float br = calcBranchLength(trees[i], randomLeaf[k], countedBranch);
196 //for each group in the groups update the total branch length accounting for the names file
197 vector<string> groups = trees[i]->tree[randomLeaf[k]].getGroup();
199 for (int j = 0; j < groups.size(); j++) {
200 int numSeqsInGroupJ = 0;
201 map<string, int>::iterator it;
202 it = trees[i]->tree[randomLeaf[k]].pcount.find(groups[j]);
203 if (it != trees[i]->tree[randomLeaf[k]].pcount.end()) { //this leaf node contains seqs from group j
204 numSeqsInGroupJ = it->second;
207 if (numSeqsInGroupJ != 0) { diversity[groups[j]][(counts[groups[j]]+1)] = diversity[groups[j]][counts[groups[j]]] + br; }
209 for (int s = (counts[groups[j]]+2); s <= (counts[groups[j]]+numSeqsInGroupJ); s++) {
210 diversity[groups[j]][s] = diversity[groups[j]][s-1]; //update counts, but don't add in redundant branch lengths
212 counts[groups[j]] += numSeqsInGroupJ;
217 //add this diversity to the sum
218 for (int j = 0; j < globaldata->Groups.size(); j++) {
219 for (int g = 0; g < diversity[globaldata->Groups[j]].size(); g++) {
220 sumDiversity[globaldata->Groups[j]][g] += diversity[globaldata->Groups[j]][g];
225 if ((collect) && (l == 0)) { printData(numSampledList, diversity, outCollect, 1); }
226 if ((summary) && (l == 0)) { printSumData(diversity, outSum, 1); }
229 if (rarefy) { printData(numSampledList, sumDiversity, outRare, iters); }
233 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
235 m->mothurOutEndLine();
236 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
237 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
238 m->mothurOutEndLine();
243 catch(exception& e) {
244 m->errorOut(e, "PhyloDiversityCommand", "execute");
248 //**********************************************************************************************************************
250 void PhyloDiversityCommand::printSumData(map< string, vector<float> >& div, ofstream& out, int numIters){
253 out << "Groups\tnumSampled\tphyloDiversity" << endl;
255 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
257 for (int j = 0; j < globaldata->Groups.size(); j++) {
258 int numSampled = (div[globaldata->Groups[j]].size()-1);
259 out << globaldata->Groups[j] << '\t' << numSampled << '\t';
263 if (scale) { score = (div[globaldata->Groups[j]][numSampled] / (float)numIters) / (float)numSampled; }
264 else { score = div[globaldata->Groups[j]][numSampled] / (float)numIters; }
266 out << setprecision(4) << score << endl;
272 catch(exception& e) {
273 m->errorOut(e, "PhyloDiversityCommand", "printSumData");
277 //**********************************************************************************************************************
279 void PhyloDiversityCommand::printData(set<int>& num, map< string, vector<float> >& div, ofstream& out, int numIters){
282 out << "numSampled\t";
283 for (int i = 0; i < globaldata->Groups.size(); i++) { out << globaldata->Groups[i] << '\t'; }
286 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
288 for (set<int>::iterator it = num.begin(); it != num.end(); it++) {
289 int numSampled = *it;
291 out << numSampled << '\t';
293 for (int j = 0; j < globaldata->Groups.size(); j++) {
294 if (numSampled < div[globaldata->Groups[j]].size()) {
296 if (scale) { score = (div[globaldata->Groups[j]][numSampled] / (float)numIters) / (float)numSampled; }
297 else { score = div[globaldata->Groups[j]][numSampled] / (float)numIters; }
299 out << setprecision(4) << score << '\t';
300 }else { out << "NA" << '\t'; }
308 catch(exception& e) {
309 m->errorOut(e, "PhyloDiversityCommand", "printData");
313 //**********************************************************************************************************************
314 float PhyloDiversityCommand::calcBranchLength(Tree* t, int leaf, map< string, set<int> >& counted){
317 //calc the branch length
318 //while you aren't at root
322 vector<string> groups = t->tree[leaf].getGroup();
324 while(t->tree[index].getParent() != -1){
327 if(t->tree[index].getBranchLength() != -1){
328 if (counted[groups[0]].count(index) == 0) { //you have not already counted this branch
329 sum += abs(t->tree[index].getBranchLength());
330 for (int j = 0; j < groups.size(); j++) { counted[groups[j]].insert(index); }
333 index = t->tree[index].getParent();
336 //get last breanch length added
337 if(t->tree[index].getBranchLength() != -1){
338 if (counted[groups[0]].count(index) == 0) { //you have not already counted this branch
339 sum += abs(t->tree[index].getBranchLength());
340 for (int j = 0; j < groups.size(); j++) { counted[groups[j]].insert(index); }
346 catch(exception& e) {
347 m->errorOut(e, "PhyloDiversityCommand", "calcBranchLength");
351 //**********************************************************************************************************************