]> git.donarmstrong.com Git - mothur.git/blob - phylodiversitycommand.cpp
sffinfo bug with flow grams right index when clipQualRight=0
[mothur.git] / phylodiversitycommand.cpp
1 /*
2  *  phylodiversitycommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 4/30/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "phylodiversitycommand.h"
11 #include "treereader.h"
12
13 //**********************************************************************************************************************
14 vector<string> PhyloDiversityCommand::setParameters(){  
15         try {
16
17                 CommandParameter ptree("tree", "InputTypes", "", "", "none", "none", "none","phylodiv",false,true,true); parameters.push_back(ptree);
18         CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","",false,false,true); parameters.push_back(pname);
19         CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","",false,false,true); parameters.push_back(pcount);
20                 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false,true); parameters.push_back(pgroup);
21                 CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups);
22                 CommandParameter piters("iters", "Number", "", "1000", "", "", "","",false,false); parameters.push_back(piters);
23                 CommandParameter pfreq("freq", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pfreq);
24                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
25                 CommandParameter prarefy("rarefy", "Boolean", "", "F", "", "", "","rarefy",false,false); parameters.push_back(prarefy);
26                 CommandParameter psummary("summary", "Boolean", "", "T", "", "", "","summary",false,false); parameters.push_back(psummary);
27                 CommandParameter pcollect("collect", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pcollect);
28                 CommandParameter pscale("scale", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pscale);
29                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
30                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
31                 
32                 vector<string> myArray;
33                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
34                 return myArray;
35         }
36         catch(exception& e) {
37                 m->errorOut(e, "PhyloDiversityCommand", "setParameters");
38                 exit(1);
39         }
40 }
41 //**********************************************************************************************************************
42 string PhyloDiversityCommand::getHelpString(){  
43         try {
44                 string helpString = "";
45                 helpString += "The phylo.diversity command parameters are tree, group, name, count, groups, iters, freq, processors, scale, rarefy, collect and summary.  tree and group are required, unless you have valid current files.\n";
46                 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. The group names are separated by dashes. By default all groups are used.\n";
47                 helpString += "The iters parameter allows you to specify the number of randomizations to preform, by default iters=1000, if you set rarefy to true.\n";
48                 helpString += "The freq parameter is used indicate when to output your data, by default it is set to 100. But you can set it to a percentage of the number of sequence. For example freq=0.10, means 10%. \n";
49                 helpString += "The scale parameter is used indicate that you want your output scaled to the number of sequences sampled, default = false. \n";
50                 helpString += "The rarefy parameter allows you to create a rarefaction curve. The default is false.\n";
51                 helpString += "The collect parameter allows you to create a collectors curve. The default is false.\n";
52                 helpString += "The summary parameter allows you to create a .summary file. The default is true.\n";
53                 helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
54                 helpString += "The phylo.diversity command should be in the following format: phylo.diversity(groups=yourGroups, rarefy=yourRarefy, iters=yourIters).\n";
55                 helpString += "Example phylo.diversity(groups=A-B-C, rarefy=T, iters=500).\n";
56                 helpString += "The phylo.diversity command output two files: .phylo.diversity and if rarefy=T, .rarefaction.\n";
57                 helpString += "Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n";
58                 return helpString;
59         }
60         catch(exception& e) {
61                 m->errorOut(e, "PhyloDiversityCommand", "getHelpString");
62                 exit(1);
63         }
64 }
65 //**********************************************************************************************************************
66 string PhyloDiversityCommand::getOutputPattern(string type) {
67     try {
68         string pattern = "";
69         
70         if (type == "phylodiv") {  pattern = "[filename],[tag],phylodiv"; } 
71         else if (type == "rarefy") {  pattern = "[filename],[tag],phylodiv.rarefaction"; } 
72         else if (type == "summary") {  pattern = "[filename],[tag],phylodiv.summary"; }
73         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
74         
75         return pattern;
76     }
77     catch(exception& e) {
78         m->errorOut(e, "PhyloDiversityCommand", "getOutputPattern");
79         exit(1);
80     }
81 }
82
83 //**********************************************************************************************************************
84 PhyloDiversityCommand::PhyloDiversityCommand(){ 
85         try {
86                 abort = true; calledHelp = true; 
87                 setParameters();
88                 vector<string> tempOutNames;
89                 outputTypes["phylodiv"] = tempOutNames;
90                 outputTypes["rarefy"] = tempOutNames;
91                 outputTypes["summary"] = tempOutNames;
92         }
93         catch(exception& e) {
94                 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
95                 exit(1);
96         }
97 }
98 //**********************************************************************************************************************
99 PhyloDiversityCommand::PhyloDiversityCommand(string option)  {
100         try {
101                 abort = false; calledHelp = false;   
102                 
103                 //allow user to run help
104                 if(option == "help") { help(); abort = true; calledHelp = true; }
105                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
106                 
107                 else {
108                         vector<string> myArray = setParameters();;
109                         
110                         OptionParser parser(option);
111                         map<string,string> parameters = parser.getParameters();
112                         map<string,string>::iterator it;
113                         
114                         ValidParameters validParameter;
115                 
116                         //check to make sure all parameters are valid for command
117                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
118                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
119                         }
120                         
121                         //initialize outputTypes
122                         vector<string> tempOutNames;
123                         outputTypes["phylodiv"] = tempOutNames;
124                         outputTypes["rarefy"] = tempOutNames;
125                         outputTypes["summary"] = tempOutNames;
126                         
127                         //if the user changes the input directory command factory will send this info to us in the output parameter 
128                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
129                         if (inputDir == "not found"){   inputDir = "";          }
130                         else {
131                                 string path;
132                                 it = parameters.find("tree");
133                                 //user has given a template file
134                                 if(it != parameters.end()){ 
135                                         path = m->hasPath(it->second);
136                                         //if the user has not given a path then, add inputdir. else leave path alone.
137                                         if (path == "") {       parameters["tree"] = inputDir + it->second;             }
138                                 }
139                                 
140                                 it = parameters.find("group");
141                                 //user has given a template file
142                                 if(it != parameters.end()){ 
143                                         path = m->hasPath(it->second);
144                                         //if the user has not given a path then, add inputdir. else leave path alone.
145                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
146                                 }
147                                 
148                                 it = parameters.find("name");
149                                 //user has given a template file
150                                 if(it != parameters.end()){ 
151                                         path = m->hasPath(it->second);
152                                         //if the user has not given a path then, add inputdir. else leave path alone.
153                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
154                                 }
155                 
156                 it = parameters.find("count");
157                                 //user has given a template file
158                                 if(it != parameters.end()){ 
159                                         path = m->hasPath(it->second);
160                                         //if the user has not given a path then, add inputdir. else leave path alone.
161                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
162                                 }
163                         }
164                         
165                         //check for required parameters
166                         treefile = validParameter.validFile(parameters, "tree", true);
167                         if (treefile == "not open") { treefile = ""; abort = true; }
168                         else if (treefile == "not found") {                             
169                                 //if there is a current design file, use it
170                                 treefile = m->getTreeFile(); 
171                                 if (treefile != "") { m->mothurOut("Using " + treefile + " as input file for the tree parameter."); m->mothurOutEndLine(); }
172                                 else {  m->mothurOut("You have no current tree file and the tree parameter is required."); m->mothurOutEndLine(); abort = true; }                                                               
173                         }else { m->setTreeFile(treefile); }     
174                         
175                         //check for required parameters
176                         groupfile = validParameter.validFile(parameters, "group", true);
177                         if (groupfile == "not open") { groupfile = ""; abort = true; }
178                         else if (groupfile == "not found") { groupfile = ""; }
179                         else { m->setGroupFile(groupfile); }
180                         
181                         namefile = validParameter.validFile(parameters, "name", true);
182                         if (namefile == "not open") { namefile = ""; abort = true; }
183                         else if (namefile == "not found") { namefile = ""; }
184                         else { m->setNameFile(namefile); }
185                         
186             countfile = validParameter.validFile(parameters, "count", true);
187                         if (countfile == "not open") { countfile = ""; abort = true; }
188                         else if (countfile == "not found") { countfile = "";  } 
189                         else { m->setCountTableFile(countfile); }
190             
191             if ((namefile != "") && (countfile != "")) {
192                 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
193             }
194                         
195             if ((groupfile != "") && (countfile != "")) {
196                 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
197             }
198
199                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(treefile);       }
200                         
201                         string temp;
202                         temp = validParameter.validFile(parameters, "freq", false);                     if (temp == "not found") { temp = "100"; }
203                         m->mothurConvert(temp, freq); 
204                         
205                         temp = validParameter.validFile(parameters, "iters", false);                    if (temp == "not found") { temp = "1000"; }
206                         m->mothurConvert(temp, iters); 
207                         
208                         temp = validParameter.validFile(parameters, "rarefy", false);                   if (temp == "not found") { temp = "F"; }
209                         rarefy = m->isTrue(temp);
210                         if (!rarefy) { iters = 1;  }
211                         
212                         temp = validParameter.validFile(parameters, "summary", false);                  if (temp == "not found") { temp = "T"; }
213                         summary = m->isTrue(temp);
214                         
215                         temp = validParameter.validFile(parameters, "scale", false);                    if (temp == "not found") { temp = "F"; }
216                         scale = m->isTrue(temp);
217                         
218                         temp = validParameter.validFile(parameters, "collect", false);                  if (temp == "not found") { temp = "F"; }
219                         collect = m->isTrue(temp);
220                         
221                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
222                         m->setProcessors(temp);
223                         m->mothurConvert(temp, processors); 
224                         
225                         groups = validParameter.validFile(parameters, "groups", false);                 
226                         if (groups == "not found") { groups = "";  }
227                         else { 
228                                 m->splitAtDash(groups, Groups);
229                                 m->setGroups(Groups);
230                         }
231                         
232                         if ((!collect) && (!rarefy) && (!summary)) { m->mothurOut("No outputs selected. You must set either collect, rarefy or summary to true, summary=T by default."); m->mothurOutEndLine(); abort=true; }
233                         
234                         if (countfile=="") {
235                 if (namefile == "") {
236                     vector<string> files; files.push_back(treefile);
237                     parser.getNameFile(files);
238                 } 
239             }
240                 }
241                 
242         }
243         catch(exception& e) {
244                 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
245                 exit(1);
246         }                       
247 }
248 //**********************************************************************************************************************
249
250 int PhyloDiversityCommand::execute(){
251         try {
252                 
253                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
254                 
255         int start = time(NULL);
256         
257                 m->setTreeFile(treefile);
258         TreeReader* reader;
259         if (countfile == "") { reader = new TreeReader(treefile, groupfile, namefile); }
260         else { reader = new TreeReader(treefile, countfile); }
261         vector<Tree*> trees = reader->getTrees();
262         ct = trees[0]->getCountTable();
263         delete reader;
264
265                 SharedUtil util;
266                 vector<string> mGroups = m->getGroups();
267                 vector<string> tGroups = ct->getNamesOfGroups();
268                 util.setGroups(mGroups, tGroups, "phylo.diversity");    //sets the groups the user wants to analyze
269                 
270                 //incase the user had some mismatches between the tree and group files we don't want group xxx to be analyzed
271                 for (int i = 0; i < mGroups.size(); i++) { if (mGroups[i] == "xxx") { mGroups.erase(mGroups.begin()+i);  break; }  }
272                 m->setGroups(mGroups);
273                  
274                 vector<string> outputNames;
275                 
276                 //for each of the users trees
277                 for(int i = 0; i < trees.size(); i++) {
278                 
279                         if (m->control_pressed) { delete ct; for (int j = 0; j < trees.size(); j++) { delete trees[j]; } for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]);        } return 0; }
280                         
281                         ofstream outSum, outRare, outCollect;
282             map<string, string> variables; 
283             variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(treefile));
284             variables["[tag]"] = toString(i+1);
285                         string outSumFile = getOutputFileName("summary",variables);
286                         string outRareFile = getOutputFileName("rarefy",variables);
287                         string outCollectFile = getOutputFileName("phylodiv",variables);
288                         
289                         if (summary)    { m->openOutputFile(outSumFile, outSum); outputNames.push_back(outSumFile);             outputTypes["summary"].push_back(outSumFile);                   }
290                         if (rarefy)             { m->openOutputFile(outRareFile, outRare); outputNames.push_back(outRareFile);  outputTypes["rarefy"].push_back(outRareFile);                   }
291                         if (collect)    { m->openOutputFile(outCollectFile, outCollect); outputNames.push_back(outCollectFile);  outputTypes["phylodiv"].push_back(outCollectFile);  }
292                         
293                         int numLeafNodes = trees[i]->getNumLeaves();
294                         
295                         //create a vector containing indexes of leaf nodes, randomize it, select nodes to send to calculator
296                         vector<int> randomLeaf;
297                         for (int j = 0; j < numLeafNodes; j++) {  
298                                 if (m->inUsersGroups(trees[i]->tree[j].getGroup(), mGroups) == true) { //is this a node from the group the user selected.
299                                         randomLeaf.push_back(j); 
300                                 }
301                         }
302             
303            /* float sum = 0;
304             vector<float> sums; sums.resize(m->getGroups().size(), 0);
305             vector<float> sumsAboveRoot; sumsAboveRoot.resize(m->getGroups().size(), 0);
306             for (int j = 0; j < trees[i]->getNumNodes(); j++) { 
307                 if (trees[i]->tree[j].getBranchLength() < 0) { cout << j << '\t' << trees[i]->tree[j].getName() << '\t' << trees[i]->tree[j].getBranchLength() << endl; }
308                 
309                                 sum += abs(trees[i]->tree[j].getBranchLength());
310                 for (int k = 0; k < m->getGroups().size(); k++) {
311                     map<string, int>::iterator itGroup = trees[i]->tree[j].pcount.find(m->getGroups()[k]);
312                     if (itGroup != trees[i]->tree[j].pcount.end()) {  //this branch belongs to a group we care about
313                         if (j < rootForGroup[m->getGroups()[k]]) {
314                             sums[k] += abs(trees[i]->tree[j].getBranchLength());
315                         }else {
316                             sumsAboveRoot[k] += abs(trees[i]->tree[j].getBranchLength());
317                         }
318                     } 
319                 }
320                         }
321             cout << sum << endl; //exit(1);
322             
323             for (int k = 0; k < m->getGroups().size(); k++) {
324                 cout << m->getGroups()[k] << "root node = " << rootForGroup[m->getGroups()[k]] << "sum below root = " << sums[k] << "sum above root = " << sumsAboveRoot[k] << endl;
325             }
326              exit(1);  */ 
327                         
328                         numLeafNodes = randomLeaf.size();  //reset the number of leaf nodes you are using 
329                         
330                         //each group, each sampling, if no rarefy iters = 1;
331                         map<string, vector<float> > diversity;
332                         
333                         //each group, each sampling, if no rarefy iters = 1;
334                         map<string, vector<float> > sumDiversity;
335                         
336                         //find largest group total 
337                         int largestGroup = 0;
338                         for (int j = 0; j < mGroups.size(); j++) { 
339                 int numSeqsThisGroup = ct->getGroupCount(mGroups[j]);
340                                 if (numSeqsThisGroup > largestGroup) { largestGroup = numSeqsThisGroup; }
341                                 
342                                 //initialize diversity
343                                 diversity[mGroups[j]].resize(numSeqsThisGroup+1, 0.0);          //numSampled
344                                                                                                                                                                                                                         //groupA                0.0                     0.0
345                                                                                                                                                                                                                         
346                                 //initialize sumDiversity
347                                 sumDiversity[mGroups[j]].resize(numSeqsThisGroup+1, 0.0);
348                         }       
349
350                         //convert freq percentage to number
351                         int increment = 100;
352                         if (freq < 1.0) {  increment = largestGroup * freq;  
353                         }else { increment = freq;  }
354                         
355                         //initialize sampling spots
356                         set<int> numSampledList;
357                         for(int k = 1; k <= largestGroup; k++){  if((k == 1) || (k % increment == 0)){  numSampledList.insert(k); }   }
358                         if(largestGroup % increment != 0){      numSampledList.insert(largestGroup);   }
359                         
360                         //add other groups ending points
361                         for (int j = 0; j < mGroups.size(); j++) {  
362                                 if (numSampledList.count(diversity[mGroups[j]].size()-1) == 0) {  numSampledList.insert(diversity[mGroups[j]].size()-1); }
363                         }
364                         
365                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
366                                 if(processors == 1){
367                                         driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);      
368                                 }else{
369                                         if (rarefy) {
370                                                 vector<int> procIters;
371                                                 
372                                                 int numItersPerProcessor = iters / processors;
373                                                 
374                                                 //divide iters between processes
375                                                 for (int h = 0; h < processors; h++) {
376                                                         if(h == processors - 1){
377                                                                 numItersPerProcessor = iters - h * numItersPerProcessor;
378                                                         }
379                                                         procIters.push_back(numItersPerProcessor);
380                                                 }
381                                                 
382                                                 createProcesses(procIters, trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum); 
383                                                 
384                                         }else{ //no need to paralellize if you dont want to rarefy
385                                                 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);      
386                                         }
387                                 }
388
389                         #else
390                                 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);      
391                         #endif
392
393                         if (rarefy) {   printData(numSampledList, sumDiversity, outRare, iters);        }
394                 }
395                 
396         
397                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);        } return 0; }
398
399         m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run phylo.diversity."); m->mothurOutEndLine();
400
401         
402                 m->mothurOutEndLine();
403                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
404                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
405                 m->mothurOutEndLine();
406
407                 
408                 return 0;
409         }
410         catch(exception& e) {
411                 m->errorOut(e, "PhyloDiversityCommand", "execute");
412                 exit(1);
413         }
414 }
415 //**********************************************************************************************************************
416 int PhyloDiversityCommand::createProcesses(vector<int>& procIters, Tree* t, map< string, vector<float> >& div, map<string, vector<float> >& sumDiv, int numIters, int increment, vector<int>& randomLeaf, set<int>& numSampledList, ofstream& outCollect, ofstream& outSum){
417         try {
418                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
419                 int process = 1;
420                 
421                 vector<int> processIDS;
422                 map< string, vector<float> >::iterator itSum;
423                 
424                 //loop through and create all the processes you want
425                 while (process != processors) {
426                         int pid = fork();
427                         
428                         if (pid > 0) {
429                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
430                                 process++;
431                         }else if (pid == 0){
432                                 driver(t, div, sumDiv, procIters[process], increment, randomLeaf, numSampledList, outCollect, outSum, false);
433                                 
434                                 string outTemp = outputDir + toString(getpid()) + ".sumDiv.temp";
435                                 ofstream out;
436                                 m->openOutputFile(outTemp, out);
437                                 
438                                 //output the sumDIversity
439                                 for (itSum = sumDiv.begin(); itSum != sumDiv.end(); itSum++) {
440                                         out << itSum->first << '\t' << (itSum->second).size() << '\t';
441                                         for (int k = 0; k < (itSum->second).size(); k++) { 
442                                                 out << (itSum->second)[k] << '\t';
443                                         }
444                                         out << endl;
445                                 }
446                                 
447                                 out.close();
448                                 
449                                 exit(0);
450                         }else { 
451                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
452                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
453                                 exit(0);
454                         }
455                 }
456                 
457                 driver(t, div, sumDiv, procIters[0], increment, randomLeaf, numSampledList, outCollect, outSum, true);
458                 
459                 //force parent to wait until all the processes are done
460                 for (int i=0;i<(processors-1);i++) { 
461                         int temp = processIDS[i];
462                         wait(&temp);
463                 }
464                 
465                 //get data created by processes
466                 for (int i=0;i<(processors-1);i++) { 
467                         
468                         //input the sumDIversity
469                         string inTemp = outputDir + toString(processIDS[i]) + ".sumDiv.temp";
470                         ifstream in;
471                         m->openInputFile(inTemp, in);
472                                 
473                         //output the sumDIversity
474                         for (int j = 0; j < sumDiv.size(); j++) { 
475                                 string group = "";
476                                 int size = 0;
477                                 
478                                 in >> group >> size; m->gobble(in);
479                                 
480                                 for (int k = 0; k < size; k++) { 
481                                         float tempVal;
482                                         in >> tempVal;
483                                         
484                                         sumDiv[group][k] += tempVal;
485                                 }
486                                 m->gobble(in);
487                         }
488                                 
489                         in.close();
490                         m->mothurRemove(inTemp);
491                 }
492                 
493 #endif
494
495         return 0;               
496         
497         }
498         catch(exception& e) {
499                 m->errorOut(e, "PhyloDiversityCommand", "createProcesses");
500                 exit(1);
501         }
502 }
503 //**********************************************************************************************************************
504 int PhyloDiversityCommand::driver(Tree* t, map< string, vector<float> >& div, map<string, vector<float> >& sumDiv, int numIters, int increment, vector<int>& randomLeaf, set<int>& numSampledList, ofstream& outCollect, ofstream& outSum, bool doSumCollect){
505         try {
506                 int numLeafNodes = randomLeaf.size();
507                 vector<string> mGroups = m->getGroups();
508         
509         map<string, int> rootForGroup = getRootForGroups(t); //maps groupName to root node in tree. "root" for group may not be the trees root and we don't want to include the extra branches.
510         
511                 for (int l = 0; l < numIters; l++) {
512                                 random_shuffle(randomLeaf.begin(), randomLeaf.end());
513          
514                                 //initialize counts
515                                 map<string, int> counts;
516                 vector< map<string, bool> > countedBranch;
517                 for (int i = 0; i < t->getNumNodes(); i++) {
518                     map<string, bool> temp;
519                     for (int j = 0; j < mGroups.size(); j++) { temp[mGroups[j]] = false; }
520                     countedBranch.push_back(temp);
521                 }
522             
523                                 for (int j = 0; j < mGroups.size(); j++) {  counts[mGroups[j]] = 0;   }  
524                                 
525                                 for(int k = 0; k < numLeafNodes; k++){
526                                                 
527                                         if (m->control_pressed) { return 0; }
528                                         
529                                         //calc branch length of randomLeaf k
530                     vector<float> br = calcBranchLength(t, randomLeaf[k], countedBranch, rootForGroup);
531                         
532                                         //for each group in the groups update the total branch length accounting for the names file
533                                         vector<string> groups = t->tree[randomLeaf[k]].getGroup();
534                                         
535                                         for (int j = 0; j < groups.size(); j++) {
536                         
537                         if (m->inUsersGroups(groups[j], mGroups)) {
538                             int numSeqsInGroupJ = 0;
539                             map<string, int>::iterator it;
540                             it = t->tree[randomLeaf[k]].pcount.find(groups[j]);
541                             if (it != t->tree[randomLeaf[k]].pcount.end()) { //this leaf node contains seqs from group j
542                                 numSeqsInGroupJ = it->second;
543                             }
544                             
545                             if (numSeqsInGroupJ != 0) { div[groups[j]][(counts[groups[j]]+1)] = div[groups[j]][counts[groups[j]]] + br[j];  }
546                             
547                             for (int s = (counts[groups[j]]+2); s <= (counts[groups[j]]+numSeqsInGroupJ); s++) {
548                                 div[groups[j]][s] = div[groups[j]][s-1];  //update counts, but don't add in redundant branch lengths
549                             }
550                             counts[groups[j]] += numSeqsInGroupJ;
551                         }
552                                         }
553                                 }
554                                 
555                                 if (rarefy) {
556                                         //add this diversity to the sum
557                                         for (int j = 0; j < mGroups.size(); j++) {  
558                                                 for (int g = 0; g < div[mGroups[j]].size(); g++) {
559                                                         sumDiv[mGroups[j]][g] += div[mGroups[j]][g];
560                                                 }
561                                         }
562                                 }
563                                 
564                                 if ((collect) && (l == 0) && doSumCollect) {  printData(numSampledList, div, outCollect, 1);  }
565                                 if ((summary) && (l == 0) && doSumCollect) {  printSumData(div, outSum, 1);  }
566                         }
567                         
568                         return 0;
569
570         }
571         catch(exception& e) {
572                 m->errorOut(e, "PhyloDiversityCommand", "driver");
573                 exit(1);
574         }
575 }
576
577 //**********************************************************************************************************************
578
579 void PhyloDiversityCommand::printSumData(map< string, vector<float> >& div, ofstream& out, int numIters){
580         try {
581                 
582                 out << "Groups\tnumSampled\tphyloDiversity" << endl;
583                 
584                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
585                 
586                 vector<string> mGroups = m->getGroups();
587                 for (int j = 0; j < mGroups.size(); j++) {
588                         int numSampled = (div[mGroups[j]].size()-1);
589                         out << mGroups[j] << '\t' << numSampled << '\t';
590                 
591                          
592                         float score;
593                         if (scale)      {  score = (div[mGroups[j]][numSampled] / (float)numIters) / (float)numSampled; }
594                         else            {       score = div[mGroups[j]][numSampled] / (float)numIters;  }
595                                 
596                         out << setprecision(4) << score << endl;
597             cout << mGroups[j] << '\t' << numSampled << '\t'<< setprecision(4) << score << endl;
598                 }
599                                         
600                 out.close();
601                 
602         }
603         catch(exception& e) {
604                 m->errorOut(e, "PhyloDiversityCommand", "printSumData");
605                 exit(1);
606         }
607 }
608 //**********************************************************************************************************************
609
610 void PhyloDiversityCommand::printData(set<int>& num, map< string, vector<float> >& div, ofstream& out, int numIters){
611         try {
612                 
613                 out << "numSampled\t";
614                 vector<string> mGroups = m->getGroups();
615                 for (int i = 0; i < mGroups.size(); i++) { out << mGroups[i] << '\t';  }
616                 out << endl;
617                 
618                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
619                 
620                 for (set<int>::iterator it = num.begin(); it != num.end(); it++) {  
621                         int numSampled = *it;
622                         
623                         out << numSampled << '\t';  
624                 
625                         for (int j = 0; j < mGroups.size(); j++) {
626                                 if (numSampled < div[mGroups[j]].size()) { 
627                                         float score;
628                                         if (scale)      {  score = (div[mGroups[j]][numSampled] / (float)numIters) / (float)numSampled; }
629                                         else            {       score = div[mGroups[j]][numSampled] / (float)numIters;  }
630
631                                         out << setprecision(4) << score << '\t';
632                                 }else { out << "NA" << '\t'; }
633                         }
634                         out << endl;
635                 }
636                 
637                 out.close();
638                 
639         }
640         catch(exception& e) {
641                 m->errorOut(e, "PhyloDiversityCommand", "printData");
642                 exit(1);
643         }
644 }
645 //**********************************************************************************************************************
646 //need a vector of floats one branch length for every group the node represents.
647 vector<float> PhyloDiversityCommand::calcBranchLength(Tree* t, int leaf, vector< map<string, bool> >& counted, map<string, int> roots){
648         try {
649         
650                 //calc the branch length
651                 //while you aren't at root
652                 vector<float> sums; 
653                 int index = leaf;
654                 
655                 vector<string> groups = t->tree[leaf].getGroup();
656                 sums.resize(groups.size(), 0.0);
657                 
658         
659         //you are a leaf
660                 if(t->tree[index].getBranchLength() != -1){     
661                         for (int k = 0; k < groups.size(); k++) { 
662                 sums[k] += abs(t->tree[index].getBranchLength());       
663                         }
664                 }
665         
666         
667         index = t->tree[index].getParent();     
668         
669                 //while you aren't at root
670                 while(t->tree[index].getParent() != -1){
671             
672                         if (m->control_pressed) {  return sums; }
673                         
674                         for (int k = 0; k < groups.size(); k++) {
675                 
676                 if (index >= roots[groups[k]]) { counted[index][groups[k]] = true; } //if you are at this groups "root", then say we are done
677                 
678                 if (!counted[index][groups[k]]){ //if counted[index][groups[k] is true this groups has already added all br from here to root, so quit early
679                     if (t->tree[index].getBranchLength() != -1) {
680                         sums[k] += abs(t->tree[index].getBranchLength());
681                     }
682                     counted[index][groups[k]] = true;
683                 }
684             }
685             index = t->tree[index].getParent(); 
686         }
687         
688                 return sums;
689         
690         }
691         catch(exception& e) {
692                 m->errorOut(e, "PhyloDiversityCommand", "calcBranchLength");
693                 exit(1);
694         }
695 }
696 //**********************************************************************************************************************
697 map<string, int> PhyloDiversityCommand::getRootForGroups(Tree* t){
698         try {
699                 map<string, int> roots; //maps group to root for group, may not be root of tree
700                 map<string, bool> done;
701        
702                 //initialize root for all groups to -1
703                 for (int k = 0; k < (t->getCountTable())->getNamesOfGroups().size(); k++) { done[(t->getCountTable())->getNamesOfGroups()[k]] = false; }
704         
705         for (int i = 0; i < t->getNumLeaves(); i++) {
706             
707             vector<string> groups = t->tree[i].getGroup();
708             
709             int index = t->tree[i].getParent();
710             
711             for (int j = 0; j < groups.size(); j++) {
712                 
713                     if (done[groups[j]] == false) { //we haven't found the root for this group yet, initialize it
714                         done[groups[j]] = true;
715                         roots[groups[j]] = i; //set root to self to start
716                     }
717                     
718                     //while you aren't at root
719                     while(t->tree[index].getParent() != -1){
720                         
721                         if (m->control_pressed) {  return roots; }
722                         
723                         //do both your chidren have have descendants from the users groups? 
724                         int lc = t->tree[index].getLChild();
725                         int rc = t->tree[index].getRChild();
726                         
727                         int LpcountSize = 0;
728                         map<string, int>:: iterator itGroup = t->tree[lc].pcount.find(groups[j]);
729                         if (itGroup != t->tree[lc].pcount.end()) { LpcountSize++;  } 
730                         
731                         int RpcountSize = 0;
732                         itGroup = t->tree[rc].pcount.find(groups[j]);
733                         if (itGroup != t->tree[rc].pcount.end()) { RpcountSize++;  } 
734                         
735                         if ((LpcountSize != 0) && (RpcountSize != 0)) { //possible root
736                             if (index > roots[groups[j]]) {  roots[groups[j]] = index; }
737                         }else { ;}
738                         
739                         index = t->tree[index].getParent();     
740                     }
741                 //}
742             }
743         }
744         
745         
746         
747         return roots;
748         
749         }
750         catch(exception& e) {
751                 m->errorOut(e, "PhyloDiversityCommand", "getRootForGroups");
752                 exit(1);
753         }
754 }
755 //**********************************************************************************************************************
756
757
758