2 * phylodiversitycommand.cpp
5 * Created by westcott on 4/30/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "phylodiversitycommand.h"
12 //**********************************************************************************************************************
13 vector<string> PhyloDiversityCommand::getValidParameters(){
15 string Array[] = {"freq","rarefy","iters","groups","processors","summary","collect","scale","outputdir","inputdir"};
16 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
20 m->errorOut(e, "PhyloDiversityCommand", "getValidParameters");
24 //**********************************************************************************************************************
25 PhyloDiversityCommand::PhyloDiversityCommand(){
28 //initialize outputTypes
29 vector<string> tempOutNames;
30 outputTypes["phylodiv"] = tempOutNames;
31 outputTypes["rarefy"] = tempOutNames;
32 outputTypes["summary"] = tempOutNames;
35 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
39 //**********************************************************************************************************************
40 vector<string> PhyloDiversityCommand::getRequiredParameters(){
42 vector<string> myArray;
46 m->errorOut(e, "PhyloDiversityCommand", "getRequiredParameters");
50 //**********************************************************************************************************************
51 vector<string> PhyloDiversityCommand::getRequiredFiles(){
53 string Array[] = {"tree","group"};
54 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
58 m->errorOut(e, "PhyloDiversityCommand", "getRequiredFiles");
62 //**********************************************************************************************************************
63 PhyloDiversityCommand::PhyloDiversityCommand(string option) {
65 globaldata = GlobalData::getInstance();
68 //allow user to run help
69 if(option == "help") { help(); abort = true; }
72 //valid paramters for this command
73 string Array[] = {"freq","rarefy","iters","groups","processors","summary","collect","scale","outputdir","inputdir"};
74 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
76 OptionParser parser(option);
77 map<string,string> parameters = parser.getParameters();
79 ValidParameters validParameter;
81 //check to make sure all parameters are valid for command
82 for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
83 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
86 //initialize outputTypes
87 vector<string> tempOutNames;
88 outputTypes["phylodiv"] = tempOutNames;
89 outputTypes["rarefy"] = tempOutNames;
90 outputTypes["summary"] = tempOutNames;
92 //if the user changes the output directory command factory will send this info to us in the output parameter
93 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(globaldata->getTreeFile()); }
95 if (globaldata->gTree.size() == 0) {//no trees were read
96 m->mothurOut("You must execute the read.tree command, before you may execute the phylo.diversity command."); m->mothurOutEndLine(); abort = true; }
99 temp = validParameter.validFile(parameters, "freq", false); if (temp == "not found") { temp = "100"; }
102 temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "1000"; }
103 convert(temp, iters);
105 temp = validParameter.validFile(parameters, "rarefy", false); if (temp == "not found") { temp = "F"; }
106 rarefy = m->isTrue(temp);
107 if (!rarefy) { iters = 1; }
109 temp = validParameter.validFile(parameters, "summary", false); if (temp == "not found") { temp = "T"; }
110 summary = m->isTrue(temp);
112 temp = validParameter.validFile(parameters, "scale", false); if (temp == "not found") { temp = "F"; }
113 scale = m->isTrue(temp);
115 temp = validParameter.validFile(parameters, "collect", false); if (temp == "not found") { temp = "F"; }
116 collect = m->isTrue(temp);
118 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; }
119 convert(temp, processors);
121 groups = validParameter.validFile(parameters, "groups", false);
122 if (groups == "not found") { groups = ""; Groups = globaldata->gTreemap->namesOfGroups; globaldata->Groups = Groups; }
124 m->splitAtDash(groups, Groups);
125 globaldata->Groups = Groups;
128 if ((!collect) && (!rarefy) && (!summary)) { m->mothurOut("No outputs selected. You must set either collect, rarefy or summary to true, summary=T by default."); m->mothurOutEndLine(); abort=true; }
132 catch(exception& e) {
133 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
137 //**********************************************************************************************************************
139 void PhyloDiversityCommand::help(){
141 m->mothurOut("The phylo.diversity command can only be executed after a successful read.tree command.\n");
142 m->mothurOut("The phylo.diversity command parameters are groups, iters, freq, processors, scale, rarefy, collect and summary. No parameters are required.\n");
143 m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. The group names are separated by dashes. By default all groups are used.\n");
144 m->mothurOut("The iters parameter allows you to specify the number of randomizations to preform, by default iters=1000, if you set rarefy to true.\n");
145 m->mothurOut("The freq parameter is used indicate when to output your data, by default it is set to 100. But you can set it to a percentage of the number of sequence. For example freq=0.10, means 10%. \n");
146 m->mothurOut("The scale parameter is used indicate that you want your ouptut scaled to the number of sequences sampled, default = false. \n");
147 m->mothurOut("The rarefy parameter allows you to create a rarefaction curve. The default is false.\n");
148 m->mothurOut("The collect parameter allows you to create a collectors curve. The default is false.\n");
149 m->mothurOut("The summary parameter allows you to create a .summary file. The default is true.\n");
150 m->mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n");
151 m->mothurOut("The phylo.diversity command should be in the following format: phylo.diversity(groups=yourGroups, rarefy=yourRarefy, iters=yourIters).\n");
152 m->mothurOut("Example phylo.diversity(groups=A-B-C, rarefy=T, iters=500).\n");
153 m->mothurOut("The phylo.diversity command output two files: .phylo.diversity and if rarefy=T, .rarefaction.\n");
154 m->mothurOut("Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n\n");
157 catch(exception& e) {
158 m->errorOut(e, "PhyloDiversityCommand", "help");
163 //**********************************************************************************************************************
165 PhyloDiversityCommand::~PhyloDiversityCommand(){}
167 //**********************************************************************************************************************
169 int PhyloDiversityCommand::execute(){
172 if (abort == true) { return 0; }
174 //incase the user had some mismatches between the tree and group files we don't want group xxx to be analyzed
175 for (int i = 0; i < globaldata->Groups.size(); i++) { if (globaldata->Groups[i] == "xxx") { globaldata->Groups.erase(globaldata->Groups.begin()+i); break; } }
177 vector<string> outputNames;
179 vector<Tree*> trees = globaldata->gTree;
181 //for each of the users trees
182 for(int i = 0; i < trees.size(); i++) {
184 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
186 ofstream outSum, outRare, outCollect;
187 string outSumFile = outputDir + m->getRootName(m->getSimpleName(globaldata->getTreeFile())) + toString(i+1) + ".phylodiv.summary";
188 string outRareFile = outputDir + m->getRootName(m->getSimpleName(globaldata->getTreeFile())) + toString(i+1) + ".phylodiv.rarefaction";
189 string outCollectFile = outputDir + m->getRootName(m->getSimpleName(globaldata->getTreeFile())) + toString(i+1) + ".phylodiv";
191 if (summary) { m->openOutputFile(outSumFile, outSum); outputNames.push_back(outSumFile); outputTypes["summary"].push_back(outSumFile); }
192 if (rarefy) { m->openOutputFile(outRareFile, outRare); outputNames.push_back(outRareFile); outputTypes["rarefy"].push_back(outRareFile); }
193 if (collect) { m->openOutputFile(outCollectFile, outCollect); outputNames.push_back(outCollectFile); outputTypes["phylodiv"].push_back(outCollectFile); }
195 int numLeafNodes = trees[i]->getNumLeaves();
197 //create a vector containing indexes of leaf nodes, randomize it, select nodes to send to calculator
198 vector<int> randomLeaf;
199 for (int j = 0; j < numLeafNodes; j++) {
200 if (m->inUsersGroups(trees[i]->tree[j].getGroup(), globaldata->Groups) == true) { //is this a node from the group the user selected.
201 randomLeaf.push_back(j);
205 numLeafNodes = randomLeaf.size(); //reset the number of leaf nodes you are using
207 //each group, each sampling, if no rarefy iters = 1;
208 map<string, vector<float> > diversity;
210 //each group, each sampling, if no rarefy iters = 1;
211 map<string, vector<float> > sumDiversity;
213 //find largest group total
214 int largestGroup = 0;
215 for (int j = 0; j < globaldata->Groups.size(); j++) {
216 if (globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]] > largestGroup) { largestGroup = globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]; }
218 //initialize diversity
219 diversity[globaldata->Groups[j]].resize(globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]+1, 0.0); //numSampled
222 //initialize sumDiversity
223 sumDiversity[globaldata->Groups[j]].resize(globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]+1, 0.0);
226 //convert freq percentage to number
228 if (freq < 1.0) { increment = largestGroup * freq;
229 }else { increment = freq; }
231 //initialize sampling spots
232 set<int> numSampledList;
233 for(int k = 1; k <= largestGroup; k++){ if((k == 1) || (k % increment == 0)){ numSampledList.insert(k); } }
234 if(largestGroup % increment != 0){ numSampledList.insert(largestGroup); }
236 //add other groups ending points
237 for (int j = 0; j < globaldata->Groups.size(); j++) {
238 if (numSampledList.count(diversity[globaldata->Groups[j]].size()-1) == 0) { numSampledList.insert(diversity[globaldata->Groups[j]].size()-1); }
241 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
243 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);
246 vector<int> procIters;
248 int numItersPerProcessor = iters / processors;
250 //divide iters between processes
251 for (int h = 0; h < processors; h++) {
252 if(h == processors - 1){
253 numItersPerProcessor = iters - h * numItersPerProcessor;
255 procIters.push_back(numItersPerProcessor);
258 createProcesses(procIters, trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum);
260 }else{ //no need to paralellize if you dont want to rarefy
261 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);
266 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);
269 if (rarefy) { printData(numSampledList, sumDiversity, outRare, iters); }
273 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
275 m->mothurOutEndLine();
276 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
277 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
278 m->mothurOutEndLine();
283 catch(exception& e) {
284 m->errorOut(e, "PhyloDiversityCommand", "execute");
288 //**********************************************************************************************************************
289 int PhyloDiversityCommand::createProcesses(vector<int>& procIters, Tree* t, map< string, vector<float> >& div, map<string, vector<float> >& sumDiv, int numIters, int increment, vector<int>& randomLeaf, set<int>& numSampledList, ofstream& outCollect, ofstream& outSum){
291 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
294 vector<int> processIDS;
295 map< string, vector<float> >::iterator itSum;
299 //loop through and create all the processes you want
300 while (process != processors) {
304 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
307 driver(t, div, sumDiv, procIters[process], increment, randomLeaf, numSampledList, outCollect, outSum, false);
309 string outTemp = outputDir + toString(getpid()) + ".sumDiv.temp";
311 m->openOutputFile(outTemp, out);
313 //output the sumDIversity
314 for (itSum = sumDiv.begin(); itSum != sumDiv.end(); itSum++) {
315 out << itSum->first << '\t' << (itSum->second).size() << '\t';
316 for (int k = 0; k < (itSum->second).size(); k++) {
317 out << (itSum->second)[k] << '\t';
326 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
327 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
332 driver(t, div, sumDiv, procIters[0], increment, randomLeaf, numSampledList, outCollect, outSum, true);
334 //force parent to wait until all the processes are done
335 for (int i=0;i<(processors-1);i++) {
336 int temp = processIDS[i];
340 //get data created by processes
341 for (int i=0;i<(processors-1);i++) {
343 //input the sumDIversity
344 string inTemp = outputDir + toString(processIDS[i]) + ".sumDiv.temp";
346 m->openInputFile(inTemp, in);
348 //output the sumDIversity
349 for (int j = 0; j < sumDiv.size(); j++) {
353 in >> group >> size; m->gobble(in);
355 for (int k = 0; k < size; k++) {
359 sumDiv[group][k] += tempVal;
365 remove(inTemp.c_str());
373 catch(exception& e) {
374 m->errorOut(e, "PhyloDiversityCommand", "createProcesses");
378 //**********************************************************************************************************************
379 int PhyloDiversityCommand::driver(Tree* t, map< string, vector<float> >& div, map<string, vector<float> >& sumDiv, int numIters, int increment, vector<int>& randomLeaf, set<int>& numSampledList, ofstream& outCollect, ofstream& outSum, bool doSumCollect){
381 int numLeafNodes = randomLeaf.size();
383 for (int l = 0; l < numIters; l++) {
384 random_shuffle(randomLeaf.begin(), randomLeaf.end());
387 map<string, int> counts;
388 map< string, set<int> > countedBranch;
389 for (int j = 0; j < globaldata->Groups.size(); j++) { counts[globaldata->Groups[j]] = 0; countedBranch[globaldata->Groups[j]].insert(-2); } //add dummy index to initialize countedBranch sets
391 for(int k = 0; k < numLeafNodes; k++){
393 if (m->control_pressed) { return 0; }
395 //calc branch length of randomLeaf k
396 vector<float> br = calcBranchLength(t, randomLeaf[k], countedBranch);
398 //for each group in the groups update the total branch length accounting for the names file
399 vector<string> groups = t->tree[randomLeaf[k]].getGroup();
401 for (int j = 0; j < groups.size(); j++) {
402 int numSeqsInGroupJ = 0;
403 map<string, int>::iterator it;
404 it = t->tree[randomLeaf[k]].pcount.find(groups[j]);
405 if (it != t->tree[randomLeaf[k]].pcount.end()) { //this leaf node contains seqs from group j
406 numSeqsInGroupJ = it->second;
409 if (numSeqsInGroupJ != 0) { div[groups[j]][(counts[groups[j]]+1)] = div[groups[j]][counts[groups[j]]] + br[j]; }
411 for (int s = (counts[groups[j]]+2); s <= (counts[groups[j]]+numSeqsInGroupJ); s++) {
412 div[groups[j]][s] = div[groups[j]][s-1]; //update counts, but don't add in redundant branch lengths
414 counts[groups[j]] += numSeqsInGroupJ;
419 //add this diversity to the sum
420 for (int j = 0; j < globaldata->Groups.size(); j++) {
421 for (int g = 0; g < div[globaldata->Groups[j]].size(); g++) {
422 sumDiv[globaldata->Groups[j]][g] += div[globaldata->Groups[j]][g];
427 if ((collect) && (l == 0) && doSumCollect) { printData(numSampledList, div, outCollect, 1); }
428 if ((summary) && (l == 0) && doSumCollect) { printSumData(div, outSum, 1); }
434 catch(exception& e) {
435 m->errorOut(e, "PhyloDiversityCommand", "driver");
440 //**********************************************************************************************************************
442 void PhyloDiversityCommand::printSumData(map< string, vector<float> >& div, ofstream& out, int numIters){
445 out << "Groups\tnumSampled\tphyloDiversity" << endl;
447 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
449 for (int j = 0; j < globaldata->Groups.size(); j++) {
450 int numSampled = (div[globaldata->Groups[j]].size()-1);
451 out << globaldata->Groups[j] << '\t' << numSampled << '\t';
455 if (scale) { score = (div[globaldata->Groups[j]][numSampled] / (float)numIters) / (float)numSampled; }
456 else { score = div[globaldata->Groups[j]][numSampled] / (float)numIters; }
458 out << setprecision(4) << score << endl;
464 catch(exception& e) {
465 m->errorOut(e, "PhyloDiversityCommand", "printSumData");
469 //**********************************************************************************************************************
471 void PhyloDiversityCommand::printData(set<int>& num, map< string, vector<float> >& div, ofstream& out, int numIters){
474 out << "numSampled\t";
475 for (int i = 0; i < globaldata->Groups.size(); i++) { out << globaldata->Groups[i] << '\t'; }
478 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
480 for (set<int>::iterator it = num.begin(); it != num.end(); it++) {
481 int numSampled = *it;
483 out << numSampled << '\t';
485 for (int j = 0; j < globaldata->Groups.size(); j++) {
486 if (numSampled < div[globaldata->Groups[j]].size()) {
488 if (scale) { score = (div[globaldata->Groups[j]][numSampled] / (float)numIters) / (float)numSampled; }
489 else { score = div[globaldata->Groups[j]][numSampled] / (float)numIters; }
491 out << setprecision(4) << score << '\t';
492 }else { out << "NA" << '\t'; }
500 catch(exception& e) {
501 m->errorOut(e, "PhyloDiversityCommand", "printData");
505 //**********************************************************************************************************************
506 //need a vector of floats one branch length for every group the node represents.
507 vector<float> PhyloDiversityCommand::calcBranchLength(Tree* t, int leaf, map< string, set<int> >& counted){
510 //calc the branch length
511 //while you aren't at root
515 vector<string> groups = t->tree[leaf].getGroup();
516 sums.resize(groups.size(), 0.0);
518 map<string, map<int, double> > tempTotals; //maps node to total Branch Length
519 map< string, set<int> > tempCounted;
520 set<int>::iterator it;
523 if(t->tree[index].getBranchLength() != -1){
524 for (int k = 0; k < groups.size(); k++) {
525 sums[k] += abs(t->tree[index].getBranchLength());
526 counted[groups[k]].insert(index);
530 for (int k = 0; k < groups.size(); k++) {
531 tempTotals[groups[k]][index] = 0.0;
534 index = t->tree[index].getParent();
536 //while you aren't at root
537 while(t->tree[index].getParent() != -1){
539 if (m->control_pressed) { return sums; }
542 for (int k = 0; k < groups.size(); k++) {
543 map<string, int>::iterator itGroup = t->tree[index].pcount.find(groups[k]);
544 if (itGroup != t->tree[index].pcount.end()) { pcountSize++; }
546 //do both your chidren have have descendants from the users groups?
547 int lc = t->tree[index].getLChild();
548 int rc = t->tree[index].getRChild();
551 itGroup = t->tree[lc].pcount.find(groups[k]);
552 if (itGroup != t->tree[lc].pcount.end()) { LpcountSize++; }
555 itGroup = t->tree[rc].pcount.find(groups[k]);
556 if (itGroup != t->tree[rc].pcount.end()) { RpcountSize++; }
558 //if yes, add your childrens tempTotals
559 if ((LpcountSize != 0) && (RpcountSize != 0)) {
560 sums[k] += tempTotals[groups[k]][lc] + tempTotals[groups[k]][rc];
562 for (it = tempCounted[groups[k]].begin(); it != tempCounted[groups[k]].end(); it++) { counted[groups[k]].insert(*it); }
564 //cout << "added to total " << tempTotals[lc] << '\t' << tempTotals[rc] << endl;
565 if (t->tree[index].getBranchLength() != -1) {
566 if (counted[groups[k]].count(index) == 0) {
567 tempTotals[groups[k]][index] = abs(t->tree[index].getBranchLength());
568 tempCounted[groups[k]].insert(index);
570 tempTotals[groups[k]][index] = 0.0;
573 tempTotals[groups[k]][index] = 0.0;
575 }else { //if no, your tempTotal is your childrens temp totals + your branch length
576 tempTotals[groups[k]][index] = tempTotals[groups[k]][lc] + tempTotals[groups[k]][rc];
578 if (counted[groups[k]].count(index) == 0) {
579 tempTotals[groups[k]][index] += abs(t->tree[index].getBranchLength());
580 tempCounted[groups[k]].insert(index);
584 //cout << "temptotal = "<< tempTotals[i] << endl;
587 index = t->tree[index].getParent();
593 catch(exception& e) {
594 m->errorOut(e, "PhyloDiversityCommand", "calcBranchLength");
598 //**********************************************************************************************************************