2 * phylodiversitycommand.cpp
5 * Created by westcott on 4/30/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "phylodiversitycommand.h"
12 //**********************************************************************************************************************
13 vector<string> PhyloDiversityCommand::getValidParameters(){
15 string Array[] = {"freq","rarefy","iters","groups","processors","summary","collect","scale","outputdir","inputdir"};
16 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
20 m->errorOut(e, "PhyloDiversityCommand", "getValidParameters");
24 //**********************************************************************************************************************
25 PhyloDiversityCommand::PhyloDiversityCommand(){
27 //initialize outputTypes
28 vector<string> tempOutNames;
29 outputTypes["phylodiv"] = tempOutNames;
30 outputTypes["rarefy"] = tempOutNames;
31 outputTypes["summary"] = tempOutNames;
34 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
38 //**********************************************************************************************************************
39 vector<string> PhyloDiversityCommand::getRequiredParameters(){
41 vector<string> myArray;
45 m->errorOut(e, "PhyloDiversityCommand", "getRequiredParameters");
49 //**********************************************************************************************************************
50 vector<string> PhyloDiversityCommand::getRequiredFiles(){
52 string Array[] = {"tree","group"};
53 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
57 m->errorOut(e, "PhyloDiversityCommand", "getRequiredFiles");
61 //**********************************************************************************************************************
62 PhyloDiversityCommand::PhyloDiversityCommand(string option) {
64 globaldata = GlobalData::getInstance();
67 //allow user to run help
68 if(option == "help") { help(); abort = true; }
71 //valid paramters for this command
72 string Array[] = {"freq","rarefy","iters","groups","processors","summary","collect","scale","outputdir","inputdir"};
73 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
75 OptionParser parser(option);
76 map<string,string> parameters = parser.getParameters();
78 ValidParameters validParameter;
80 //check to make sure all parameters are valid for command
81 for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
82 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
85 //initialize outputTypes
86 vector<string> tempOutNames;
87 outputTypes["phylodiv"] = tempOutNames;
88 outputTypes["rarefy"] = tempOutNames;
89 outputTypes["summary"] = tempOutNames;
91 //if the user changes the output directory command factory will send this info to us in the output parameter
92 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(globaldata->getTreeFile()); }
94 if (globaldata->gTree.size() == 0) {//no trees were read
95 m->mothurOut("You must execute the read.tree command, before you may execute the phylo.diversity command."); m->mothurOutEndLine(); abort = true; }
98 temp = validParameter.validFile(parameters, "freq", false); if (temp == "not found") { temp = "100"; }
101 temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "1000"; }
102 convert(temp, iters);
104 temp = validParameter.validFile(parameters, "rarefy", false); if (temp == "not found") { temp = "F"; }
105 rarefy = m->isTrue(temp);
106 if (!rarefy) { iters = 1; }
108 temp = validParameter.validFile(parameters, "summary", false); if (temp == "not found") { temp = "T"; }
109 summary = m->isTrue(temp);
111 temp = validParameter.validFile(parameters, "scale", false); if (temp == "not found") { temp = "F"; }
112 scale = m->isTrue(temp);
114 temp = validParameter.validFile(parameters, "collect", false); if (temp == "not found") { temp = "F"; }
115 collect = m->isTrue(temp);
117 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; }
118 convert(temp, processors);
120 groups = validParameter.validFile(parameters, "groups", false);
121 if (groups == "not found") { groups = ""; Groups = globaldata->gTreemap->namesOfGroups; globaldata->Groups = Groups; }
123 m->splitAtDash(groups, Groups);
124 globaldata->Groups = Groups;
127 if ((!collect) && (!rarefy) && (!summary)) { m->mothurOut("No outputs selected. You must set either collect, rarefy or summary to true, summary=T by default."); m->mothurOutEndLine(); abort=true; }
131 catch(exception& e) {
132 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
136 //**********************************************************************************************************************
138 void PhyloDiversityCommand::help(){
140 m->mothurOut("The phylo.diversity command can only be executed after a successful read.tree command.\n");
141 m->mothurOut("The phylo.diversity command parameters are groups, iters, freq, processors, scale, rarefy, collect and summary. No parameters are required.\n");
142 m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. The group names are separated by dashes. By default all groups are used.\n");
143 m->mothurOut("The iters parameter allows you to specify the number of randomizations to preform, by default iters=1000, if you set rarefy to true.\n");
144 m->mothurOut("The freq parameter is used indicate when to output your data, by default it is set to 100. But you can set it to a percentage of the number of sequence. For example freq=0.10, means 10%. \n");
145 m->mothurOut("The scale parameter is used indicate that you want your ouptut scaled to the number of sequences sampled, default = false. \n");
146 m->mothurOut("The rarefy parameter allows you to create a rarefaction curve. The default is false.\n");
147 m->mothurOut("The collect parameter allows you to create a collectors curve. The default is false.\n");
148 m->mothurOut("The summary parameter allows you to create a .summary file. The default is true.\n");
149 m->mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n");
150 m->mothurOut("The phylo.diversity command should be in the following format: phylo.diversity(groups=yourGroups, rarefy=yourRarefy, iters=yourIters).\n");
151 m->mothurOut("Example phylo.diversity(groups=A-B-C, rarefy=T, iters=500).\n");
152 m->mothurOut("The phylo.diversity command output two files: .phylo.diversity and if rarefy=T, .rarefaction.\n");
153 m->mothurOut("Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n\n");
156 catch(exception& e) {
157 m->errorOut(e, "PhyloDiversityCommand", "help");
162 //**********************************************************************************************************************
164 PhyloDiversityCommand::~PhyloDiversityCommand(){}
166 //**********************************************************************************************************************
168 int PhyloDiversityCommand::execute(){
171 if (abort == true) { return 0; }
173 //incase the user had some mismatches between the tree and group files we don't want group xxx to be analyzed
174 for (int i = 0; i < globaldata->Groups.size(); i++) { if (globaldata->Groups[i] == "xxx") { globaldata->Groups.erase(globaldata->Groups.begin()+i); break; } }
176 vector<string> outputNames;
178 vector<Tree*> trees = globaldata->gTree;
180 //for each of the users trees
181 for(int i = 0; i < trees.size(); i++) {
183 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
185 ofstream outSum, outRare, outCollect;
186 string outSumFile = outputDir + m->getRootName(m->getSimpleName(globaldata->getTreeFile())) + toString(i+1) + ".phylodiv.summary";
187 string outRareFile = outputDir + m->getRootName(m->getSimpleName(globaldata->getTreeFile())) + toString(i+1) + ".phylodiv.rarefaction";
188 string outCollectFile = outputDir + m->getRootName(m->getSimpleName(globaldata->getTreeFile())) + toString(i+1) + ".phylodiv";
190 if (summary) { m->openOutputFile(outSumFile, outSum); outputNames.push_back(outSumFile); outputTypes["summary"].push_back(outSumFile); }
191 if (rarefy) { m->openOutputFile(outRareFile, outRare); outputNames.push_back(outRareFile); outputTypes["rarefy"].push_back(outRareFile); }
192 if (collect) { m->openOutputFile(outCollectFile, outCollect); outputNames.push_back(outCollectFile); outputTypes["phylodiv"].push_back(outCollectFile); }
194 int numLeafNodes = trees[i]->getNumLeaves();
196 //create a vector containing indexes of leaf nodes, randomize it, select nodes to send to calculator
197 vector<int> randomLeaf;
198 for (int j = 0; j < numLeafNodes; j++) {
199 if (m->inUsersGroups(trees[i]->tree[j].getGroup(), globaldata->Groups) == true) { //is this a node from the group the user selected.
200 randomLeaf.push_back(j);
204 numLeafNodes = randomLeaf.size(); //reset the number of leaf nodes you are using
206 //each group, each sampling, if no rarefy iters = 1;
207 map<string, vector<float> > diversity;
209 //each group, each sampling, if no rarefy iters = 1;
210 map<string, vector<float> > sumDiversity;
212 //find largest group total
213 int largestGroup = 0;
214 for (int j = 0; j < globaldata->Groups.size(); j++) {
215 if (globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]] > largestGroup) { largestGroup = globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]; }
217 //initialize diversity
218 diversity[globaldata->Groups[j]].resize(globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]+1, 0.0); //numSampled
221 //initialize sumDiversity
222 sumDiversity[globaldata->Groups[j]].resize(globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]+1, 0.0);
225 //convert freq percentage to number
227 if (freq < 1.0) { increment = largestGroup * freq;
228 }else { increment = freq; }
230 //initialize sampling spots
231 set<int> numSampledList;
232 for(int k = 1; k <= largestGroup; k++){ if((k == 1) || (k % increment == 0)){ numSampledList.insert(k); } }
233 if(largestGroup % increment != 0){ numSampledList.insert(largestGroup); }
235 //add other groups ending points
236 for (int j = 0; j < globaldata->Groups.size(); j++) {
237 if (numSampledList.count(diversity[globaldata->Groups[j]].size()-1) == 0) { numSampledList.insert(diversity[globaldata->Groups[j]].size()-1); }
240 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
242 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);
245 vector<int> procIters;
247 int numItersPerProcessor = iters / processors;
249 //divide iters between processes
250 for (int h = 0; h < processors; h++) {
251 if(h == processors - 1){
252 numItersPerProcessor = iters - h * numItersPerProcessor;
254 procIters.push_back(numItersPerProcessor);
257 createProcesses(procIters, trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum);
259 }else{ //no need to paralellize if you dont want to rarefy
260 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);
265 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);
268 if (rarefy) { printData(numSampledList, sumDiversity, outRare, iters); }
272 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
274 m->mothurOutEndLine();
275 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
276 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
277 m->mothurOutEndLine();
282 catch(exception& e) {
283 m->errorOut(e, "PhyloDiversityCommand", "execute");
287 //**********************************************************************************************************************
288 int PhyloDiversityCommand::createProcesses(vector<int>& procIters, Tree* t, map< string, vector<float> >& div, map<string, vector<float> >& sumDiv, int numIters, int increment, vector<int>& randomLeaf, set<int>& numSampledList, ofstream& outCollect, ofstream& outSum){
290 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
293 vector<int> processIDS;
294 map< string, vector<float> >::iterator itSum;
298 //loop through and create all the processes you want
299 while (process != processors) {
303 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
306 driver(t, div, sumDiv, procIters[process], increment, randomLeaf, numSampledList, outCollect, outSum, false);
308 string outTemp = outputDir + toString(getpid()) + ".sumDiv.temp";
310 m->openOutputFile(outTemp, out);
312 //output the sumDIversity
313 for (itSum = sumDiv.begin(); itSum != sumDiv.end(); itSum++) {
314 out << itSum->first << '\t' << (itSum->second).size() << '\t';
315 for (int k = 0; k < (itSum->second).size(); k++) {
316 out << (itSum->second)[k] << '\t';
324 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
327 driver(t, div, sumDiv, procIters[0], increment, randomLeaf, numSampledList, outCollect, outSum, true);
329 //force parent to wait until all the processes are done
330 for (int i=0;i<(processors-1);i++) {
331 int temp = processIDS[i];
335 //get data created by processes
336 for (int i=0;i<(processors-1);i++) {
338 //input the sumDIversity
339 string inTemp = outputDir + toString(processIDS[i]) + ".sumDiv.temp";
341 m->openInputFile(inTemp, in);
343 //output the sumDIversity
344 for (int j = 0; j < sumDiv.size(); j++) {
348 in >> group >> size; m->gobble(in);
350 for (int k = 0; k < size; k++) {
354 sumDiv[group][k] += tempVal;
360 remove(inTemp.c_str());
368 catch(exception& e) {
369 m->errorOut(e, "PhyloDiversityCommand", "createProcesses");
373 //**********************************************************************************************************************
374 int PhyloDiversityCommand::driver(Tree* t, map< string, vector<float> >& div, map<string, vector<float> >& sumDiv, int numIters, int increment, vector<int>& randomLeaf, set<int>& numSampledList, ofstream& outCollect, ofstream& outSum, bool doSumCollect){
376 int numLeafNodes = randomLeaf.size();
378 for (int l = 0; l < numIters; l++) {
379 random_shuffle(randomLeaf.begin(), randomLeaf.end());
382 map<string, int> counts;
383 map< string, set<int> > countedBranch;
384 for (int j = 0; j < globaldata->Groups.size(); j++) { counts[globaldata->Groups[j]] = 0; countedBranch[globaldata->Groups[j]].insert(-2); } //add dummy index to initialize countedBranch sets
386 for(int k = 0; k < numLeafNodes; k++){
388 if (m->control_pressed) { return 0; }
390 //calc branch length of randomLeaf k
391 vector<float> br = calcBranchLength(t, randomLeaf[k], countedBranch);
393 //for each group in the groups update the total branch length accounting for the names file
394 vector<string> groups = t->tree[randomLeaf[k]].getGroup();
396 for (int j = 0; j < groups.size(); j++) {
397 int numSeqsInGroupJ = 0;
398 map<string, int>::iterator it;
399 it = t->tree[randomLeaf[k]].pcount.find(groups[j]);
400 if (it != t->tree[randomLeaf[k]].pcount.end()) { //this leaf node contains seqs from group j
401 numSeqsInGroupJ = it->second;
404 if (numSeqsInGroupJ != 0) { div[groups[j]][(counts[groups[j]]+1)] = div[groups[j]][counts[groups[j]]] + br[j]; }
406 for (int s = (counts[groups[j]]+2); s <= (counts[groups[j]]+numSeqsInGroupJ); s++) {
407 div[groups[j]][s] = div[groups[j]][s-1]; //update counts, but don't add in redundant branch lengths
409 counts[groups[j]] += numSeqsInGroupJ;
414 //add this diversity to the sum
415 for (int j = 0; j < globaldata->Groups.size(); j++) {
416 for (int g = 0; g < div[globaldata->Groups[j]].size(); g++) {
417 sumDiv[globaldata->Groups[j]][g] += div[globaldata->Groups[j]][g];
422 if ((collect) && (l == 0) && doSumCollect) { printData(numSampledList, div, outCollect, 1); }
423 if ((summary) && (l == 0) && doSumCollect) { printSumData(div, outSum, 1); }
429 catch(exception& e) {
430 m->errorOut(e, "PhyloDiversityCommand", "driver");
435 //**********************************************************************************************************************
437 void PhyloDiversityCommand::printSumData(map< string, vector<float> >& div, ofstream& out, int numIters){
440 out << "Groups\tnumSampled\tphyloDiversity" << endl;
442 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
444 for (int j = 0; j < globaldata->Groups.size(); j++) {
445 int numSampled = (div[globaldata->Groups[j]].size()-1);
446 out << globaldata->Groups[j] << '\t' << numSampled << '\t';
450 if (scale) { score = (div[globaldata->Groups[j]][numSampled] / (float)numIters) / (float)numSampled; }
451 else { score = div[globaldata->Groups[j]][numSampled] / (float)numIters; }
453 out << setprecision(4) << score << endl;
459 catch(exception& e) {
460 m->errorOut(e, "PhyloDiversityCommand", "printSumData");
464 //**********************************************************************************************************************
466 void PhyloDiversityCommand::printData(set<int>& num, map< string, vector<float> >& div, ofstream& out, int numIters){
469 out << "numSampled\t";
470 for (int i = 0; i < globaldata->Groups.size(); i++) { out << globaldata->Groups[i] << '\t'; }
473 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
475 for (set<int>::iterator it = num.begin(); it != num.end(); it++) {
476 int numSampled = *it;
478 out << numSampled << '\t';
480 for (int j = 0; j < globaldata->Groups.size(); j++) {
481 if (numSampled < div[globaldata->Groups[j]].size()) {
483 if (scale) { score = (div[globaldata->Groups[j]][numSampled] / (float)numIters) / (float)numSampled; }
484 else { score = div[globaldata->Groups[j]][numSampled] / (float)numIters; }
486 out << setprecision(4) << score << '\t';
487 }else { out << "NA" << '\t'; }
495 catch(exception& e) {
496 m->errorOut(e, "PhyloDiversityCommand", "printData");
500 //**********************************************************************************************************************
501 //need a vector of floats one branch length for every group the node represents.
502 vector<float> PhyloDiversityCommand::calcBranchLength(Tree* t, int leaf, map< string, set<int> >& counted){
505 //calc the branch length
506 //while you aren't at root
510 vector<string> groups = t->tree[leaf].getGroup();
511 sums.resize(groups.size(), 0.0);
513 map<string, map<int, double> > tempTotals; //maps node to total Branch Length
514 map< string, set<int> > tempCounted;
515 set<int>::iterator it;
518 if(t->tree[index].getBranchLength() != -1){
519 for (int k = 0; k < groups.size(); k++) {
520 sums[k] += abs(t->tree[index].getBranchLength());
521 counted[groups[k]].insert(index);
525 for (int k = 0; k < groups.size(); k++) {
526 tempTotals[groups[k]][index] = 0.0;
529 index = t->tree[index].getParent();
531 //while you aren't at root
532 while(t->tree[index].getParent() != -1){
534 if (m->control_pressed) { return sums; }
537 for (int k = 0; k < groups.size(); k++) {
538 map<string, int>::iterator itGroup = t->tree[index].pcount.find(groups[k]);
539 if (itGroup != t->tree[index].pcount.end()) { pcountSize++; }
541 //do both your chidren have have descendants from the users groups?
542 int lc = t->tree[index].getLChild();
543 int rc = t->tree[index].getRChild();
546 itGroup = t->tree[lc].pcount.find(groups[k]);
547 if (itGroup != t->tree[lc].pcount.end()) { LpcountSize++; }
550 itGroup = t->tree[rc].pcount.find(groups[k]);
551 if (itGroup != t->tree[rc].pcount.end()) { RpcountSize++; }
553 //if yes, add your childrens tempTotals
554 if ((LpcountSize != 0) && (RpcountSize != 0)) {
555 sums[k] += tempTotals[groups[k]][lc] + tempTotals[groups[k]][rc];
557 for (it = tempCounted[groups[k]].begin(); it != tempCounted[groups[k]].end(); it++) { counted[groups[k]].insert(*it); }
559 //cout << "added to total " << tempTotals[lc] << '\t' << tempTotals[rc] << endl;
560 if (t->tree[index].getBranchLength() != -1) {
561 if (counted[groups[k]].count(index) == 0) {
562 tempTotals[groups[k]][index] = abs(t->tree[index].getBranchLength());
563 tempCounted[groups[k]].insert(index);
565 tempTotals[groups[k]][index] = 0.0;
568 tempTotals[groups[k]][index] = 0.0;
570 }else { //if no, your tempTotal is your childrens temp totals + your branch length
571 tempTotals[groups[k]][index] = tempTotals[groups[k]][lc] + tempTotals[groups[k]][rc];
573 if (counted[groups[k]].count(index) == 0) {
574 tempTotals[groups[k]][index] += abs(t->tree[index].getBranchLength());
575 tempCounted[groups[k]].insert(index);
579 //cout << "temptotal = "<< tempTotals[i] << endl;
582 index = t->tree[index].getParent();
588 catch(exception& e) {
589 m->errorOut(e, "PhyloDiversityCommand", "calcBranchLength");
593 //**********************************************************************************************************************