2 * phylodiversitycommand.cpp
5 * Created by westcott on 4/30/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "phylodiversitycommand.h"
12 //**********************************************************************************************************************
13 vector<string> PhyloDiversityCommand::getValidParameters(){
15 string Array[] = {"freq","rarefy","iters","groups","processors","summary","collect","scale","outputdir","inputdir"};
16 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
20 m->errorOut(e, "PhyloDiversityCommand", "getValidParameters");
24 //**********************************************************************************************************************
25 PhyloDiversityCommand::PhyloDiversityCommand(){
27 abort = true; calledHelp = true;
28 vector<string> tempOutNames;
29 outputTypes["phylodiv"] = tempOutNames;
30 outputTypes["rarefy"] = tempOutNames;
31 outputTypes["summary"] = tempOutNames;
34 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
38 //**********************************************************************************************************************
39 vector<string> PhyloDiversityCommand::getRequiredParameters(){
41 vector<string> myArray;
45 m->errorOut(e, "PhyloDiversityCommand", "getRequiredParameters");
49 //**********************************************************************************************************************
50 vector<string> PhyloDiversityCommand::getRequiredFiles(){
52 string Array[] = {"tree","group"};
53 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
57 m->errorOut(e, "PhyloDiversityCommand", "getRequiredFiles");
61 //**********************************************************************************************************************
62 PhyloDiversityCommand::PhyloDiversityCommand(string option) {
64 globaldata = GlobalData::getInstance();
65 abort = false; calledHelp = false;
67 //allow user to run help
68 if(option == "help") { help(); abort = true; calledHelp = true; }
71 //valid paramters for this command
72 string Array[] = {"freq","rarefy","iters","groups","processors","summary","collect","scale","outputdir","inputdir"};
73 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
75 OptionParser parser(option);
76 map<string,string> parameters = parser.getParameters();
78 ValidParameters validParameter;
80 //check to make sure all parameters are valid for command
81 for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
82 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
85 //initialize outputTypes
86 vector<string> tempOutNames;
87 outputTypes["phylodiv"] = tempOutNames;
88 outputTypes["rarefy"] = tempOutNames;
89 outputTypes["summary"] = tempOutNames;
91 //if the user changes the output directory command factory will send this info to us in the output parameter
92 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(globaldata->getTreeFile()); }
94 if (globaldata->gTree.size() == 0) {//no trees were read
95 m->mothurOut("You must execute the read.tree command, before you may execute the phylo.diversity command."); m->mothurOutEndLine(); abort = true; }
98 temp = validParameter.validFile(parameters, "freq", false); if (temp == "not found") { temp = "100"; }
101 temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "1000"; }
102 convert(temp, iters);
104 temp = validParameter.validFile(parameters, "rarefy", false); if (temp == "not found") { temp = "F"; }
105 rarefy = m->isTrue(temp);
106 if (!rarefy) { iters = 1; }
108 temp = validParameter.validFile(parameters, "summary", false); if (temp == "not found") { temp = "T"; }
109 summary = m->isTrue(temp);
111 temp = validParameter.validFile(parameters, "scale", false); if (temp == "not found") { temp = "F"; }
112 scale = m->isTrue(temp);
114 temp = validParameter.validFile(parameters, "collect", false); if (temp == "not found") { temp = "F"; }
115 collect = m->isTrue(temp);
117 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; }
118 convert(temp, processors);
120 groups = validParameter.validFile(parameters, "groups", false);
121 if (groups == "not found") { groups = ""; Groups = globaldata->gTreemap->namesOfGroups; globaldata->Groups = Groups; }
123 m->splitAtDash(groups, Groups);
124 globaldata->Groups = Groups;
127 if ((!collect) && (!rarefy) && (!summary)) { m->mothurOut("No outputs selected. You must set either collect, rarefy or summary to true, summary=T by default."); m->mothurOutEndLine(); abort=true; }
131 catch(exception& e) {
132 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
136 //**********************************************************************************************************************
138 void PhyloDiversityCommand::help(){
140 m->mothurOut("The phylo.diversity command can only be executed after a successful read.tree command.\n");
141 m->mothurOut("The phylo.diversity command parameters are groups, iters, freq, processors, scale, rarefy, collect and summary. No parameters are required.\n");
142 m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. The group names are separated by dashes. By default all groups are used.\n");
143 m->mothurOut("The iters parameter allows you to specify the number of randomizations to preform, by default iters=1000, if you set rarefy to true.\n");
144 m->mothurOut("The freq parameter is used indicate when to output your data, by default it is set to 100. But you can set it to a percentage of the number of sequence. For example freq=0.10, means 10%. \n");
145 m->mothurOut("The scale parameter is used indicate that you want your ouptut scaled to the number of sequences sampled, default = false. \n");
146 m->mothurOut("The rarefy parameter allows you to create a rarefaction curve. The default is false.\n");
147 m->mothurOut("The collect parameter allows you to create a collectors curve. The default is false.\n");
148 m->mothurOut("The summary parameter allows you to create a .summary file. The default is true.\n");
149 m->mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n");
150 m->mothurOut("The phylo.diversity command should be in the following format: phylo.diversity(groups=yourGroups, rarefy=yourRarefy, iters=yourIters).\n");
151 m->mothurOut("Example phylo.diversity(groups=A-B-C, rarefy=T, iters=500).\n");
152 m->mothurOut("The phylo.diversity command output two files: .phylo.diversity and if rarefy=T, .rarefaction.\n");
153 m->mothurOut("Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n\n");
156 catch(exception& e) {
157 m->errorOut(e, "PhyloDiversityCommand", "help");
162 //**********************************************************************************************************************
164 PhyloDiversityCommand::~PhyloDiversityCommand(){}
166 //**********************************************************************************************************************
168 int PhyloDiversityCommand::execute(){
171 if (abort == true) { if (calledHelp) { return 0; } return 2; }
173 //incase the user had some mismatches between the tree and group files we don't want group xxx to be analyzed
174 for (int i = 0; i < globaldata->Groups.size(); i++) { if (globaldata->Groups[i] == "xxx") { globaldata->Groups.erase(globaldata->Groups.begin()+i); break; } }
176 vector<string> outputNames;
178 vector<Tree*> trees = globaldata->gTree;
180 //for each of the users trees
181 for(int i = 0; i < trees.size(); i++) {
183 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
185 ofstream outSum, outRare, outCollect;
186 string outSumFile = outputDir + m->getRootName(m->getSimpleName(globaldata->getTreeFile())) + toString(i+1) + ".phylodiv.summary";
187 string outRareFile = outputDir + m->getRootName(m->getSimpleName(globaldata->getTreeFile())) + toString(i+1) + ".phylodiv.rarefaction";
188 string outCollectFile = outputDir + m->getRootName(m->getSimpleName(globaldata->getTreeFile())) + toString(i+1) + ".phylodiv";
190 if (summary) { m->openOutputFile(outSumFile, outSum); outputNames.push_back(outSumFile); outputTypes["summary"].push_back(outSumFile); }
191 if (rarefy) { m->openOutputFile(outRareFile, outRare); outputNames.push_back(outRareFile); outputTypes["rarefy"].push_back(outRareFile); }
192 if (collect) { m->openOutputFile(outCollectFile, outCollect); outputNames.push_back(outCollectFile); outputTypes["phylodiv"].push_back(outCollectFile); }
194 int numLeafNodes = trees[i]->getNumLeaves();
196 //create a vector containing indexes of leaf nodes, randomize it, select nodes to send to calculator
197 vector<int> randomLeaf;
198 for (int j = 0; j < numLeafNodes; j++) {
199 if (m->inUsersGroups(trees[i]->tree[j].getGroup(), globaldata->Groups) == true) { //is this a node from the group the user selected.
200 randomLeaf.push_back(j);
204 numLeafNodes = randomLeaf.size(); //reset the number of leaf nodes you are using
206 //each group, each sampling, if no rarefy iters = 1;
207 map<string, vector<float> > diversity;
209 //each group, each sampling, if no rarefy iters = 1;
210 map<string, vector<float> > sumDiversity;
212 //find largest group total
213 int largestGroup = 0;
214 for (int j = 0; j < globaldata->Groups.size(); j++) {
215 if (globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]] > largestGroup) { largestGroup = globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]; }
217 //initialize diversity
218 diversity[globaldata->Groups[j]].resize(globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]+1, 0.0); //numSampled
221 //initialize sumDiversity
222 sumDiversity[globaldata->Groups[j]].resize(globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]+1, 0.0);
225 //convert freq percentage to number
227 if (freq < 1.0) { increment = largestGroup * freq;
228 }else { increment = freq; }
230 //initialize sampling spots
231 set<int> numSampledList;
232 for(int k = 1; k <= largestGroup; k++){ if((k == 1) || (k % increment == 0)){ numSampledList.insert(k); } }
233 if(largestGroup % increment != 0){ numSampledList.insert(largestGroup); }
235 //add other groups ending points
236 for (int j = 0; j < globaldata->Groups.size(); j++) {
237 if (numSampledList.count(diversity[globaldata->Groups[j]].size()-1) == 0) { numSampledList.insert(diversity[globaldata->Groups[j]].size()-1); }
240 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
242 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);
245 vector<int> procIters;
247 int numItersPerProcessor = iters / processors;
249 //divide iters between processes
250 for (int h = 0; h < processors; h++) {
251 if(h == processors - 1){
252 numItersPerProcessor = iters - h * numItersPerProcessor;
254 procIters.push_back(numItersPerProcessor);
257 createProcesses(procIters, trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum);
259 }else{ //no need to paralellize if you dont want to rarefy
260 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);
265 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);
268 if (rarefy) { printData(numSampledList, sumDiversity, outRare, iters); }
272 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
274 m->mothurOutEndLine();
275 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
276 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
277 m->mothurOutEndLine();
282 catch(exception& e) {
283 m->errorOut(e, "PhyloDiversityCommand", "execute");
287 //**********************************************************************************************************************
288 int PhyloDiversityCommand::createProcesses(vector<int>& procIters, Tree* t, map< string, vector<float> >& div, map<string, vector<float> >& sumDiv, int numIters, int increment, vector<int>& randomLeaf, set<int>& numSampledList, ofstream& outCollect, ofstream& outSum){
290 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
293 vector<int> processIDS;
294 map< string, vector<float> >::iterator itSum;
298 //loop through and create all the processes you want
299 while (process != processors) {
303 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
306 driver(t, div, sumDiv, procIters[process], increment, randomLeaf, numSampledList, outCollect, outSum, false);
308 string outTemp = outputDir + toString(getpid()) + ".sumDiv.temp";
310 m->openOutputFile(outTemp, out);
312 //output the sumDIversity
313 for (itSum = sumDiv.begin(); itSum != sumDiv.end(); itSum++) {
314 out << itSum->first << '\t' << (itSum->second).size() << '\t';
315 for (int k = 0; k < (itSum->second).size(); k++) {
316 out << (itSum->second)[k] << '\t';
325 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
326 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
331 driver(t, div, sumDiv, procIters[0], increment, randomLeaf, numSampledList, outCollect, outSum, true);
333 //force parent to wait until all the processes are done
334 for (int i=0;i<(processors-1);i++) {
335 int temp = processIDS[i];
339 //get data created by processes
340 for (int i=0;i<(processors-1);i++) {
342 //input the sumDIversity
343 string inTemp = outputDir + toString(processIDS[i]) + ".sumDiv.temp";
345 m->openInputFile(inTemp, in);
347 //output the sumDIversity
348 for (int j = 0; j < sumDiv.size(); j++) {
352 in >> group >> size; m->gobble(in);
354 for (int k = 0; k < size; k++) {
358 sumDiv[group][k] += tempVal;
364 remove(inTemp.c_str());
372 catch(exception& e) {
373 m->errorOut(e, "PhyloDiversityCommand", "createProcesses");
377 //**********************************************************************************************************************
378 int PhyloDiversityCommand::driver(Tree* t, map< string, vector<float> >& div, map<string, vector<float> >& sumDiv, int numIters, int increment, vector<int>& randomLeaf, set<int>& numSampledList, ofstream& outCollect, ofstream& outSum, bool doSumCollect){
380 int numLeafNodes = randomLeaf.size();
382 for (int l = 0; l < numIters; l++) {
383 random_shuffle(randomLeaf.begin(), randomLeaf.end());
386 map<string, int> counts;
387 map< string, set<int> > countedBranch;
388 for (int j = 0; j < globaldata->Groups.size(); j++) { counts[globaldata->Groups[j]] = 0; countedBranch[globaldata->Groups[j]].insert(-2); } //add dummy index to initialize countedBranch sets
390 for(int k = 0; k < numLeafNodes; k++){
392 if (m->control_pressed) { return 0; }
394 //calc branch length of randomLeaf k
395 vector<float> br = calcBranchLength(t, randomLeaf[k], countedBranch);
397 //for each group in the groups update the total branch length accounting for the names file
398 vector<string> groups = t->tree[randomLeaf[k]].getGroup();
400 for (int j = 0; j < groups.size(); j++) {
401 int numSeqsInGroupJ = 0;
402 map<string, int>::iterator it;
403 it = t->tree[randomLeaf[k]].pcount.find(groups[j]);
404 if (it != t->tree[randomLeaf[k]].pcount.end()) { //this leaf node contains seqs from group j
405 numSeqsInGroupJ = it->second;
408 if (numSeqsInGroupJ != 0) { div[groups[j]][(counts[groups[j]]+1)] = div[groups[j]][counts[groups[j]]] + br[j]; }
410 for (int s = (counts[groups[j]]+2); s <= (counts[groups[j]]+numSeqsInGroupJ); s++) {
411 div[groups[j]][s] = div[groups[j]][s-1]; //update counts, but don't add in redundant branch lengths
413 counts[groups[j]] += numSeqsInGroupJ;
418 //add this diversity to the sum
419 for (int j = 0; j < globaldata->Groups.size(); j++) {
420 for (int g = 0; g < div[globaldata->Groups[j]].size(); g++) {
421 sumDiv[globaldata->Groups[j]][g] += div[globaldata->Groups[j]][g];
426 if ((collect) && (l == 0) && doSumCollect) { printData(numSampledList, div, outCollect, 1); }
427 if ((summary) && (l == 0) && doSumCollect) { printSumData(div, outSum, 1); }
433 catch(exception& e) {
434 m->errorOut(e, "PhyloDiversityCommand", "driver");
439 //**********************************************************************************************************************
441 void PhyloDiversityCommand::printSumData(map< string, vector<float> >& div, ofstream& out, int numIters){
444 out << "Groups\tnumSampled\tphyloDiversity" << endl;
446 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
448 for (int j = 0; j < globaldata->Groups.size(); j++) {
449 int numSampled = (div[globaldata->Groups[j]].size()-1);
450 out << globaldata->Groups[j] << '\t' << numSampled << '\t';
454 if (scale) { score = (div[globaldata->Groups[j]][numSampled] / (float)numIters) / (float)numSampled; }
455 else { score = div[globaldata->Groups[j]][numSampled] / (float)numIters; }
457 out << setprecision(4) << score << endl;
463 catch(exception& e) {
464 m->errorOut(e, "PhyloDiversityCommand", "printSumData");
468 //**********************************************************************************************************************
470 void PhyloDiversityCommand::printData(set<int>& num, map< string, vector<float> >& div, ofstream& out, int numIters){
473 out << "numSampled\t";
474 for (int i = 0; i < globaldata->Groups.size(); i++) { out << globaldata->Groups[i] << '\t'; }
477 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
479 for (set<int>::iterator it = num.begin(); it != num.end(); it++) {
480 int numSampled = *it;
482 out << numSampled << '\t';
484 for (int j = 0; j < globaldata->Groups.size(); j++) {
485 if (numSampled < div[globaldata->Groups[j]].size()) {
487 if (scale) { score = (div[globaldata->Groups[j]][numSampled] / (float)numIters) / (float)numSampled; }
488 else { score = div[globaldata->Groups[j]][numSampled] / (float)numIters; }
490 out << setprecision(4) << score << '\t';
491 }else { out << "NA" << '\t'; }
499 catch(exception& e) {
500 m->errorOut(e, "PhyloDiversityCommand", "printData");
504 //**********************************************************************************************************************
505 //need a vector of floats one branch length for every group the node represents.
506 vector<float> PhyloDiversityCommand::calcBranchLength(Tree* t, int leaf, map< string, set<int> >& counted){
509 //calc the branch length
510 //while you aren't at root
514 vector<string> groups = t->tree[leaf].getGroup();
515 sums.resize(groups.size(), 0.0);
517 map<string, map<int, double> > tempTotals; //maps node to total Branch Length
518 map< string, set<int> > tempCounted;
519 set<int>::iterator it;
522 if(t->tree[index].getBranchLength() != -1){
523 for (int k = 0; k < groups.size(); k++) {
524 sums[k] += abs(t->tree[index].getBranchLength());
525 counted[groups[k]].insert(index);
529 for (int k = 0; k < groups.size(); k++) {
530 tempTotals[groups[k]][index] = 0.0;
533 index = t->tree[index].getParent();
535 //while you aren't at root
536 while(t->tree[index].getParent() != -1){
538 if (m->control_pressed) { return sums; }
541 for (int k = 0; k < groups.size(); k++) {
542 map<string, int>::iterator itGroup = t->tree[index].pcount.find(groups[k]);
543 if (itGroup != t->tree[index].pcount.end()) { pcountSize++; }
545 //do both your chidren have have descendants from the users groups?
546 int lc = t->tree[index].getLChild();
547 int rc = t->tree[index].getRChild();
550 itGroup = t->tree[lc].pcount.find(groups[k]);
551 if (itGroup != t->tree[lc].pcount.end()) { LpcountSize++; }
554 itGroup = t->tree[rc].pcount.find(groups[k]);
555 if (itGroup != t->tree[rc].pcount.end()) { RpcountSize++; }
557 //if yes, add your childrens tempTotals
558 if ((LpcountSize != 0) && (RpcountSize != 0)) {
559 sums[k] += tempTotals[groups[k]][lc] + tempTotals[groups[k]][rc];
561 for (it = tempCounted[groups[k]].begin(); it != tempCounted[groups[k]].end(); it++) { counted[groups[k]].insert(*it); }
563 //cout << "added to total " << tempTotals[lc] << '\t' << tempTotals[rc] << endl;
564 if (t->tree[index].getBranchLength() != -1) {
565 if (counted[groups[k]].count(index) == 0) {
566 tempTotals[groups[k]][index] = abs(t->tree[index].getBranchLength());
567 tempCounted[groups[k]].insert(index);
569 tempTotals[groups[k]][index] = 0.0;
572 tempTotals[groups[k]][index] = 0.0;
574 }else { //if no, your tempTotal is your childrens temp totals + your branch length
575 tempTotals[groups[k]][index] = tempTotals[groups[k]][lc] + tempTotals[groups[k]][rc];
577 if (counted[groups[k]].count(index) == 0) {
578 tempTotals[groups[k]][index] += abs(t->tree[index].getBranchLength());
579 tempCounted[groups[k]].insert(index);
583 //cout << "temptotal = "<< tempTotals[i] << endl;
586 index = t->tree[index].getParent();
592 catch(exception& e) {
593 m->errorOut(e, "PhyloDiversityCommand", "calcBranchLength");
597 //**********************************************************************************************************************