2 * phylodiversitycommand.cpp
5 * Created by westcott on 4/30/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "phylodiversitycommand.h"
12 //**********************************************************************************************************************
13 PhyloDiversityCommand::PhyloDiversityCommand(string option) {
15 globaldata = GlobalData::getInstance();
18 //allow user to run help
19 if(option == "help") { help(); abort = true; }
22 //valid paramters for this command
23 string Array[] = {"freq","rarefy","iters","groups","processors","summary","collect","scale","outputdir","inputdir"};
24 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
26 OptionParser parser(option);
27 map<string,string> parameters = parser.getParameters();
29 ValidParameters validParameter;
31 //check to make sure all parameters are valid for command
32 for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
33 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
36 //if the user changes the output directory command factory will send this info to us in the output parameter
37 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(globaldata->getTreeFile()); }
39 if (globaldata->gTree.size() == 0) {//no trees were read
40 m->mothurOut("You must execute the read.tree command, before you may execute the phylo.diversity command."); m->mothurOutEndLine(); abort = true; }
43 temp = validParameter.validFile(parameters, "freq", false); if (temp == "not found") { temp = "100"; }
46 temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "1000"; }
49 temp = validParameter.validFile(parameters, "rarefy", false); if (temp == "not found") { temp = "F"; }
50 rarefy = m->isTrue(temp);
51 if (!rarefy) { iters = 1; }
53 temp = validParameter.validFile(parameters, "summary", false); if (temp == "not found") { temp = "T"; }
54 summary = m->isTrue(temp);
56 temp = validParameter.validFile(parameters, "scale", false); if (temp == "not found") { temp = "F"; }
57 scale = m->isTrue(temp);
59 temp = validParameter.validFile(parameters, "collect", false); if (temp == "not found") { temp = "F"; }
60 collect = m->isTrue(temp);
62 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; }
63 convert(temp, processors);
65 groups = validParameter.validFile(parameters, "groups", false);
66 if (groups == "not found") { groups = ""; Groups = globaldata->gTreemap->namesOfGroups; globaldata->Groups = Groups; }
68 m->splitAtDash(groups, Groups);
69 globaldata->Groups = Groups;
72 if ((!collect) && (!rarefy) && (!summary)) { m->mothurOut("No outputs selected. You must set either collect, rarefy or summary to true, summary=T by default."); m->mothurOutEndLine(); abort=true; }
77 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
81 //**********************************************************************************************************************
83 void PhyloDiversityCommand::help(){
85 m->mothurOut("The phylo.diversity command can only be executed after a successful read.tree command.\n");
86 m->mothurOut("The phylo.diversity command parameters are groups, iters, freq, processors, scale, rarefy, collect and summary. No parameters are required.\n");
87 m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. The group names are separated by dashes. By default all groups are used.\n");
88 m->mothurOut("The iters parameter allows you to specify the number of randomizations to preform, by default iters=1000, if you set rarefy to true.\n");
89 m->mothurOut("The freq parameter is used indicate when to output your data, by default it is set to 100. But you can set it to a percentage of the number of sequence. For example freq=0.10, means 10%. \n");
90 m->mothurOut("The scale parameter is used indicate that you want your ouptut scaled to the number of sequences sampled, default = false. \n");
91 m->mothurOut("The rarefy parameter allows you to create a rarefaction curve. The default is false.\n");
92 m->mothurOut("The collect parameter allows you to create a collectors curve. The default is false.\n");
93 m->mothurOut("The summary parameter allows you to create a .summary file. The default is true.\n");
94 m->mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n");
95 m->mothurOut("The phylo.diversity command should be in the following format: phylo.diversity(groups=yourGroups, rarefy=yourRarefy, iters=yourIters).\n");
96 m->mothurOut("Example phylo.diversity(groups=A-B-C, rarefy=T, iters=500).\n");
97 m->mothurOut("The phylo.diversity command output two files: .phylo.diversity and if rarefy=T, .rarefaction.\n");
98 m->mothurOut("Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n\n");
101 catch(exception& e) {
102 m->errorOut(e, "PhyloDiversityCommand", "help");
107 //**********************************************************************************************************************
109 PhyloDiversityCommand::~PhyloDiversityCommand(){}
111 //**********************************************************************************************************************
113 int PhyloDiversityCommand::execute(){
116 if (abort == true) { return 0; }
118 //incase the user had some mismatches between the tree and group files we don't want group xxx to be analyzed
119 for (int i = 0; i < globaldata->Groups.size(); i++) { if (globaldata->Groups[i] == "xxx") { globaldata->Groups.erase(globaldata->Groups.begin()+i); break; } }
121 vector<string> outputNames;
123 vector<Tree*> trees = globaldata->gTree;
125 //for each of the users trees
126 for(int i = 0; i < trees.size(); i++) {
128 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
130 ofstream outSum, outRare, outCollect;
131 string outSumFile = outputDir + m->getRootName(m->getSimpleName(globaldata->getTreeFile())) + toString(i+1) + ".phylodiv.summary";
132 string outRareFile = outputDir + m->getRootName(m->getSimpleName(globaldata->getTreeFile())) + toString(i+1) + ".phylodiv.rarefaction";
133 string outCollectFile = outputDir + m->getRootName(m->getSimpleName(globaldata->getTreeFile())) + toString(i+1) + ".phylodiv";
135 if (summary) { m->openOutputFile(outSumFile, outSum); outputNames.push_back(outSumFile); }
136 if (rarefy) { m->openOutputFile(outRareFile, outRare); outputNames.push_back(outRareFile); }
137 if (collect) { m->openOutputFile(outCollectFile, outCollect); outputNames.push_back(outCollectFile); }
139 int numLeafNodes = trees[i]->getNumLeaves();
141 //create a vector containing indexes of leaf nodes, randomize it, select nodes to send to calculator
142 vector<int> randomLeaf;
143 for (int j = 0; j < numLeafNodes; j++) {
144 if (m->inUsersGroups(trees[i]->tree[j].getGroup(), globaldata->Groups) == true) { //is this a node from the group the user selected.
145 randomLeaf.push_back(j);
149 numLeafNodes = randomLeaf.size(); //reset the number of leaf nodes you are using
151 //each group, each sampling, if no rarefy iters = 1;
152 map<string, vector<float> > diversity;
154 //each group, each sampling, if no rarefy iters = 1;
155 map<string, vector<float> > sumDiversity;
157 //find largest group total
158 int largestGroup = 0;
159 for (int j = 0; j < globaldata->Groups.size(); j++) {
160 if (globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]] > largestGroup) { largestGroup = globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]; }
162 //initialize diversity
163 diversity[globaldata->Groups[j]].resize(globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]+1, 0.0); //numSampled
166 //initialize sumDiversity
167 sumDiversity[globaldata->Groups[j]].resize(globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]+1, 0.0);
170 //convert freq percentage to number
172 if (freq < 1.0) { increment = largestGroup * freq;
173 }else { increment = freq; }
175 //initialize sampling spots
176 set<int> numSampledList;
177 for(int k = 1; k <= largestGroup; k++){ if((k == 1) || (k % increment == 0)){ numSampledList.insert(k); } }
178 if(largestGroup % increment != 0){ numSampledList.insert(largestGroup); }
180 //add other groups ending points
181 for (int j = 0; j < globaldata->Groups.size(); j++) {
182 if (numSampledList.count(diversity[globaldata->Groups[j]].size()-1) == 0) { numSampledList.insert(diversity[globaldata->Groups[j]].size()-1); }
185 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
187 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);
190 vector<int> procIters;
192 int numItersPerProcessor = iters / processors;
194 //divide iters between processes
195 for (int h = 0; h < processors; h++) {
196 if(h == processors - 1){
197 numItersPerProcessor = iters - h * numItersPerProcessor;
199 procIters.push_back(numItersPerProcessor);
202 createProcesses(procIters, trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum);
204 }else{ //no need to paralellize if you dont want to rarefy
205 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);
210 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);
213 if (rarefy) { printData(numSampledList, sumDiversity, outRare, iters); }
217 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
219 m->mothurOutEndLine();
220 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
221 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
222 m->mothurOutEndLine();
227 catch(exception& e) {
228 m->errorOut(e, "PhyloDiversityCommand", "execute");
232 //**********************************************************************************************************************
233 int PhyloDiversityCommand::createProcesses(vector<int>& procIters, Tree* t, map< string, vector<float> >& div, map<string, vector<float> >& sumDiv, int numIters, int increment, vector<int>& randomLeaf, set<int>& numSampledList, ofstream& outCollect, ofstream& outSum){
235 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
238 vector<int> processIDS;
239 map< string, vector<float> >::iterator itSum;
243 //loop through and create all the processes you want
244 while (process != processors) {
248 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
251 driver(t, div, sumDiv, procIters[process], increment, randomLeaf, numSampledList, outCollect, outSum, false);
253 string outTemp = outputDir + toString(getpid()) + ".sumDiv.temp";
255 m->openOutputFile(outTemp, out);
257 //output the sumDIversity
258 for (itSum = sumDiv.begin(); itSum != sumDiv.end(); itSum++) {
259 out << itSum->first << '\t' << (itSum->second).size() << '\t';
260 for (int k = 0; k < (itSum->second).size(); k++) {
261 out << (itSum->second)[k] << '\t';
269 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
272 driver(t, div, sumDiv, procIters[0], increment, randomLeaf, numSampledList, outCollect, outSum, true);
274 //force parent to wait until all the processes are done
275 for (int i=0;i<(processors-1);i++) {
276 int temp = processIDS[i];
280 //get data created by processes
281 for (int i=0;i<(processors-1);i++) {
283 //input the sumDIversity
284 string inTemp = outputDir + toString(processIDS[i]) + ".sumDiv.temp";
286 m->openInputFile(inTemp, in);
288 //output the sumDIversity
289 for (int j = 0; j < sumDiv.size(); j++) {
293 in >> group >> size; m->gobble(in);
295 for (int k = 0; k < size; k++) {
299 sumDiv[group][k] += tempVal;
305 remove(inTemp.c_str());
313 catch(exception& e) {
314 m->errorOut(e, "PhyloDiversityCommand", "createProcesses");
318 //**********************************************************************************************************************
319 int PhyloDiversityCommand::driver(Tree* t, map< string, vector<float> >& div, map<string, vector<float> >& sumDiv, int numIters, int increment, vector<int>& randomLeaf, set<int>& numSampledList, ofstream& outCollect, ofstream& outSum, bool doSumCollect){
321 int numLeafNodes = randomLeaf.size();
323 for (int l = 0; l < numIters; l++) {
324 random_shuffle(randomLeaf.begin(), randomLeaf.end());
327 map<string, int> counts;
328 map< string, set<int> > countedBranch;
329 for (int j = 0; j < globaldata->Groups.size(); j++) { counts[globaldata->Groups[j]] = 0; countedBranch[globaldata->Groups[j]].insert(-2); } //add dummy index to initialize countedBranch sets
331 for(int k = 0; k < numLeafNodes; k++){
333 if (m->control_pressed) { return 0; }
335 //calc branch length of randomLeaf k
336 float br = calcBranchLength(t, randomLeaf[k], countedBranch);
338 //for each group in the groups update the total branch length accounting for the names file
339 vector<string> groups = t->tree[randomLeaf[k]].getGroup();
341 for (int j = 0; j < groups.size(); j++) {
342 int numSeqsInGroupJ = 0;
343 map<string, int>::iterator it;
344 it = t->tree[randomLeaf[k]].pcount.find(groups[j]);
345 if (it != t->tree[randomLeaf[k]].pcount.end()) { //this leaf node contains seqs from group j
346 numSeqsInGroupJ = it->second;
349 if (numSeqsInGroupJ != 0) { div[groups[j]][(counts[groups[j]]+1)] = div[groups[j]][counts[groups[j]]] + br; }
351 for (int s = (counts[groups[j]]+2); s <= (counts[groups[j]]+numSeqsInGroupJ); s++) {
352 div[groups[j]][s] = div[groups[j]][s-1]; //update counts, but don't add in redundant branch lengths
354 counts[groups[j]] += numSeqsInGroupJ;
359 //add this diversity to the sum
360 for (int j = 0; j < globaldata->Groups.size(); j++) {
361 for (int g = 0; g < div[globaldata->Groups[j]].size(); g++) {
362 sumDiv[globaldata->Groups[j]][g] += div[globaldata->Groups[j]][g];
367 if ((collect) && (l == 0) && doSumCollect) { printData(numSampledList, div, outCollect, 1); }
368 if ((summary) && (l == 0) && doSumCollect) { printSumData(div, outSum, 1); }
374 catch(exception& e) {
375 m->errorOut(e, "PhyloDiversityCommand", "driver");
380 //**********************************************************************************************************************
382 void PhyloDiversityCommand::printSumData(map< string, vector<float> >& div, ofstream& out, int numIters){
385 out << "Groups\tnumSampled\tphyloDiversity" << endl;
387 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
389 for (int j = 0; j < globaldata->Groups.size(); j++) {
390 int numSampled = (div[globaldata->Groups[j]].size()-1);
391 out << globaldata->Groups[j] << '\t' << numSampled << '\t';
395 if (scale) { score = (div[globaldata->Groups[j]][numSampled] / (float)numIters) / (float)numSampled; }
396 else { score = div[globaldata->Groups[j]][numSampled] / (float)numIters; }
398 out << setprecision(4) << score << endl;
404 catch(exception& e) {
405 m->errorOut(e, "PhyloDiversityCommand", "printSumData");
409 //**********************************************************************************************************************
411 void PhyloDiversityCommand::printData(set<int>& num, map< string, vector<float> >& div, ofstream& out, int numIters){
414 out << "numSampled\t";
415 for (int i = 0; i < globaldata->Groups.size(); i++) { out << globaldata->Groups[i] << '\t'; }
418 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
420 for (set<int>::iterator it = num.begin(); it != num.end(); it++) {
421 int numSampled = *it;
423 out << numSampled << '\t';
425 for (int j = 0; j < globaldata->Groups.size(); j++) {
426 if (numSampled < div[globaldata->Groups[j]].size()) {
428 if (scale) { score = (div[globaldata->Groups[j]][numSampled] / (float)numIters) / (float)numSampled; }
429 else { score = div[globaldata->Groups[j]][numSampled] / (float)numIters; }
431 out << setprecision(4) << score << '\t';
432 }else { out << "NA" << '\t'; }
440 catch(exception& e) {
441 m->errorOut(e, "PhyloDiversityCommand", "printData");
445 //**********************************************************************************************************************
446 float PhyloDiversityCommand::calcBranchLength(Tree* t, int leaf, map< string, set<int> >& counted){
449 //calc the branch length
450 //while you aren't at root
454 vector<string> groups = t->tree[leaf].getGroup();
456 while(t->tree[index].getParent() != -1){
459 if(t->tree[index].getBranchLength() != -1){
460 if (counted[groups[0]].count(index) == 0) { //you have not already counted this branch
461 sum += abs(t->tree[index].getBranchLength());
462 for (int j = 0; j < groups.size(); j++) { counted[groups[j]].insert(index); }
465 index = t->tree[index].getParent();
468 //get last breanch length added
469 if(t->tree[index].getBranchLength() != -1){
470 if (counted[groups[0]].count(index) == 0) { //you have not already counted this branch
471 sum += abs(t->tree[index].getBranchLength());
472 for (int j = 0; j < groups.size(); j++) { counted[groups[j]].insert(index); }
478 catch(exception& e) {
479 m->errorOut(e, "PhyloDiversityCommand", "calcBranchLength");
483 //**********************************************************************************************************************