]> git.donarmstrong.com Git - mothur.git/blob - phylodiversitycommand.cpp
added multiple processors option for Windows users to align.seqs, dist.seqs, summary...
[mothur.git] / phylodiversitycommand.cpp
1 /*
2  *  phylodiversitycommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 4/30/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "phylodiversitycommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> PhyloDiversityCommand::setParameters(){  
14         try {
15
16                 CommandParameter ptree("tree", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptree);
17                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pgroup);
18                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
19                 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
20                 CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
21                 CommandParameter pfreq("freq", "Number", "", "100", "", "", "",false,false); parameters.push_back(pfreq);
22                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
23                 CommandParameter prarefy("rarefy", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(prarefy);
24                 CommandParameter psummary("summary", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(psummary);
25                 CommandParameter pcollect("collect", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pcollect);
26                 CommandParameter pscale("scale", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pscale);
27                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
28                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
29                 
30                 vector<string> myArray;
31                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
32                 return myArray;
33         }
34         catch(exception& e) {
35                 m->errorOut(e, "PhyloDiversityCommand", "setParameters");
36                 exit(1);
37         }
38 }
39 //**********************************************************************************************************************
40 string PhyloDiversityCommand::getHelpString(){  
41         try {
42                 string helpString = "";
43                 helpString += "The phylo.diversity command parameters are tree, group, name, groups, iters, freq, processors, scale, rarefy, collect and summary.  tree and group are required, unless you have valid current files.\n";
44                 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. The group names are separated by dashes. By default all groups are used.\n";
45                 helpString += "The iters parameter allows you to specify the number of randomizations to preform, by default iters=1000, if you set rarefy to true.\n";
46                 helpString += "The freq parameter is used indicate when to output your data, by default it is set to 100. But you can set it to a percentage of the number of sequence. For example freq=0.10, means 10%. \n";
47                 helpString += "The scale parameter is used indicate that you want your output scaled to the number of sequences sampled, default = false. \n";
48                 helpString += "The rarefy parameter allows you to create a rarefaction curve. The default is false.\n";
49                 helpString += "The collect parameter allows you to create a collectors curve. The default is false.\n";
50                 helpString += "The summary parameter allows you to create a .summary file. The default is true.\n";
51                 helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
52                 helpString += "The phylo.diversity command should be in the following format: phylo.diversity(groups=yourGroups, rarefy=yourRarefy, iters=yourIters).\n";
53                 helpString += "Example phylo.diversity(groups=A-B-C, rarefy=T, iters=500).\n";
54                 helpString += "The phylo.diversity command output two files: .phylo.diversity and if rarefy=T, .rarefaction.\n";
55                 helpString += "Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n";
56                 return helpString;
57         }
58         catch(exception& e) {
59                 m->errorOut(e, "PhyloDiversityCommand", "getHelpString");
60                 exit(1);
61         }
62 }
63
64
65 //**********************************************************************************************************************
66 PhyloDiversityCommand::PhyloDiversityCommand(){ 
67         try {
68                 abort = true; calledHelp = true; 
69                 setParameters();
70                 vector<string> tempOutNames;
71                 outputTypes["phylodiv"] = tempOutNames;
72                 outputTypes["rarefy"] = tempOutNames;
73                 outputTypes["summary"] = tempOutNames;
74         }
75         catch(exception& e) {
76                 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
77                 exit(1);
78         }
79 }
80 //**********************************************************************************************************************
81 PhyloDiversityCommand::PhyloDiversityCommand(string option)  {
82         try {
83                 abort = false; calledHelp = false;   
84                 
85                 //allow user to run help
86                 if(option == "help") { help(); abort = true; calledHelp = true; }
87                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
88                 
89                 else {
90                         vector<string> myArray = setParameters();;
91                         
92                         OptionParser parser(option);
93                         map<string,string> parameters = parser.getParameters();
94                         map<string,string>::iterator it;
95                         
96                         ValidParameters validParameter;
97                 
98                         //check to make sure all parameters are valid for command
99                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
100                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
101                         }
102                         
103                         //initialize outputTypes
104                         vector<string> tempOutNames;
105                         outputTypes["phylodiv"] = tempOutNames;
106                         outputTypes["rarefy"] = tempOutNames;
107                         outputTypes["summary"] = tempOutNames;
108                         
109                         //if the user changes the input directory command factory will send this info to us in the output parameter 
110                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
111                         if (inputDir == "not found"){   inputDir = "";          }
112                         else {
113                                 string path;
114                                 it = parameters.find("tree");
115                                 //user has given a template file
116                                 if(it != parameters.end()){ 
117                                         path = m->hasPath(it->second);
118                                         //if the user has not given a path then, add inputdir. else leave path alone.
119                                         if (path == "") {       parameters["tree"] = inputDir + it->second;             }
120                                 }
121                                 
122                                 it = parameters.find("group");
123                                 //user has given a template file
124                                 if(it != parameters.end()){ 
125                                         path = m->hasPath(it->second);
126                                         //if the user has not given a path then, add inputdir. else leave path alone.
127                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
128                                 }
129                                 
130                                 it = parameters.find("name");
131                                 //user has given a template file
132                                 if(it != parameters.end()){ 
133                                         path = m->hasPath(it->second);
134                                         //if the user has not given a path then, add inputdir. else leave path alone.
135                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
136                                 }
137                         }
138                         
139                         m->runParse = true;
140                         m->clearGroups();
141                         m->clearAllGroups();
142                         m->Treenames.clear();
143                         m->names.clear();
144                         
145                         //check for required parameters
146                         treefile = validParameter.validFile(parameters, "tree", true);
147                         if (treefile == "not open") { abort = true; }
148                         else if (treefile == "not found") {                             
149                                 //if there is a current design file, use it
150                                 treefile = m->getTreeFile(); 
151                                 if (treefile != "") { m->mothurOut("Using " + treefile + " as input file for the tree parameter."); m->mothurOutEndLine(); }
152                                 else {  m->mothurOut("You have no current tree file and the tree parameter is required."); m->mothurOutEndLine(); abort = true; }                                                               
153                         }else { m->setTreeFile(treefile); }     
154                         
155                         //check for required parameters
156                         groupfile = validParameter.validFile(parameters, "group", true);
157                         if (groupfile == "not open") { groupfile = ""; abort = true; }
158                         else if (groupfile == "not found") { groupfile = ""; }
159                         else { m->setGroupFile(groupfile); }
160                         
161                         namefile = validParameter.validFile(parameters, "name", true);
162                         if (namefile == "not open") { abort = true; }
163                         else if (namefile == "not found") { namefile = ""; }
164                         else { m->setNameFile(namefile); }
165                         
166                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(treefile);       }
167                         
168                         string temp;
169                         temp = validParameter.validFile(parameters, "freq", false);                     if (temp == "not found") { temp = "100"; }
170                         convert(temp, freq); 
171                         
172                         temp = validParameter.validFile(parameters, "iters", false);                    if (temp == "not found") { temp = "1000"; }
173                         convert(temp, iters); 
174                         
175                         temp = validParameter.validFile(parameters, "rarefy", false);                   if (temp == "not found") { temp = "F"; }
176                         rarefy = m->isTrue(temp);
177                         if (!rarefy) { iters = 1;  }
178                         
179                         temp = validParameter.validFile(parameters, "summary", false);                  if (temp == "not found") { temp = "T"; }
180                         summary = m->isTrue(temp);
181                         
182                         temp = validParameter.validFile(parameters, "scale", false);                    if (temp == "not found") { temp = "F"; }
183                         scale = m->isTrue(temp);
184                         
185                         temp = validParameter.validFile(parameters, "collect", false);                  if (temp == "not found") { temp = "F"; }
186                         collect = m->isTrue(temp);
187                         
188                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
189                         m->setProcessors(temp);
190                         convert(temp, processors); 
191                         
192                         groups = validParameter.validFile(parameters, "groups", false);                 
193                         if (groups == "not found") { groups = "";  }
194                         else { 
195                                 m->splitAtDash(groups, Groups);
196                                 m->setGroups(Groups);
197                         }
198                         
199                         if ((!collect) && (!rarefy) && (!summary)) { m->mothurOut("No outputs selected. You must set either collect, rarefy or summary to true, summary=T by default."); m->mothurOutEndLine(); abort=true; }
200                 }
201                 
202         }
203         catch(exception& e) {
204                 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
205                 exit(1);
206         }                       
207 }
208 //**********************************************************************************************************************
209
210 int PhyloDiversityCommand::execute(){
211         try {
212                 
213                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
214                 
215                 m->setTreeFile(treefile);
216                 
217                 if (groupfile != "") {
218                         //read in group map info.
219                         tmap = new TreeMap(groupfile);
220                         tmap->readMap();
221                 }else{ //fake out by putting everyone in one group
222                         Tree* tree = new Tree(treefile); delete tree;  //extracts names from tree to make faked out groupmap
223                         tmap = new TreeMap();
224                         
225                         for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); }
226                 }
227                 
228                 if (namefile != "") { readNamesFile(); }
229                 
230                 read = new ReadNewickTree(treefile);
231                 int readOk = read->read(tmap); 
232                 
233                 if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete tmap; delete read; return 0; }
234                 
235                 read->AssembleTrees();
236                 vector<Tree*> trees = read->getTrees();
237                 delete read;
238                 
239                 //make sure all files match
240                 //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size.
241                 int numNamesInTree;
242                 if (namefile != "")  {  
243                         if (numUniquesInName == m->Treenames.size()) {  numNamesInTree = nameMap.size();  }
244                         else {   numNamesInTree = m->Treenames.size();  }
245                 }else {  numNamesInTree = m->Treenames.size();  }
246                 
247                 
248                 //output any names that are in group file but not in tree
249                 if (numNamesInTree < tmap->getNumSeqs()) {
250                         for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
251                                 //is that name in the tree?
252                                 int count = 0;
253                                 for (int j = 0; j < m->Treenames.size(); j++) {
254                                         if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
255                                         count++;
256                                 }
257                                 
258                                 if (m->control_pressed) { 
259                                         delete tmap; for (int i = 0; i < trees.size(); i++) { delete trees[i]; }
260                                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } outputTypes.clear();
261                                         m->clearGroups();
262                                         return 0;
263                                 }
264                                 
265                                 //then you did not find it so report it 
266                                 if (count == m->Treenames.size()) { 
267                                         //if it is in your namefile then don't remove
268                                         map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
269                                         
270                                         if (it == nameMap.end()) {
271                                                 m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
272                                                 tmap->removeSeq(tmap->namesOfSeqs[i]);
273                                                 i--; //need this because removeSeq removes name from namesOfSeqs
274                                         }
275                                 }
276                         }
277                 }
278                 
279                 SharedUtil* util = new SharedUtil();
280                 vector<string> mGroups = m->getGroups();
281                 vector<string> tGroups = tmap->getNamesOfGroups();
282                 util->setGroups(mGroups, tGroups, "phylo.diversity");   //sets the groups the user wants to analyze
283                 delete util;
284                 
285                 //incase the user had some mismatches between the tree and group files we don't want group xxx to be analyzed
286                 for (int i = 0; i < mGroups.size(); i++) { if (mGroups[i] == "xxx") { mGroups.erase(mGroups.begin()+i);  break; }  }
287                 m->setGroups(mGroups);
288                  
289                 vector<string> outputNames;
290                 
291                 //for each of the users trees
292                 for(int i = 0; i < trees.size(); i++) {
293                 
294                         if (m->control_pressed) { delete tmap; for (int j = 0; j < trees.size(); j++) { delete trees[j]; } for (int j = 0; j < outputNames.size(); j++) {       m->mothurRemove(outputNames[j]);        } return 0; }
295                         
296                         ofstream outSum, outRare, outCollect;
297                         string outSumFile = outputDir + m->getRootName(m->getSimpleName(treefile))  + toString(i+1) + ".phylodiv.summary";
298                         string outRareFile = outputDir + m->getRootName(m->getSimpleName(treefile))  + toString(i+1) + ".phylodiv.rarefaction";
299                         string outCollectFile = outputDir + m->getRootName(m->getSimpleName(treefile))  + toString(i+1) + ".phylodiv";
300                         
301                         if (summary)    { m->openOutputFile(outSumFile, outSum); outputNames.push_back(outSumFile);             outputTypes["summary"].push_back(outSumFile);                   }
302                         if (rarefy)             { m->openOutputFile(outRareFile, outRare); outputNames.push_back(outRareFile);  outputTypes["rarefy"].push_back(outRareFile);                   }
303                         if (collect)    { m->openOutputFile(outCollectFile, outCollect); outputNames.push_back(outCollectFile);  outputTypes["phylodiv"].push_back(outCollectFile);  }
304                         
305                         int numLeafNodes = trees[i]->getNumLeaves();
306                         
307                         //create a vector containing indexes of leaf nodes, randomize it, select nodes to send to calculator
308                         vector<int> randomLeaf;
309                         for (int j = 0; j < numLeafNodes; j++) {  
310                                 if (m->inUsersGroups(trees[i]->tree[j].getGroup(), mGroups) == true) { //is this a node from the group the user selected.
311                                         randomLeaf.push_back(j); 
312                                 }
313                         }
314                         
315                         numLeafNodes = randomLeaf.size();  //reset the number of leaf nodes you are using 
316                         
317                         //each group, each sampling, if no rarefy iters = 1;
318                         map<string, vector<float> > diversity;
319                         
320                         //each group, each sampling, if no rarefy iters = 1;
321                         map<string, vector<float> > sumDiversity;
322                         
323                         //find largest group total 
324                         int largestGroup = 0;
325                         for (int j = 0; j < mGroups.size(); j++) {  
326                                 if (tmap->seqsPerGroup[mGroups[j]] > largestGroup) { largestGroup = tmap->seqsPerGroup[mGroups[j]]; }
327                                 
328                                 //initialize diversity
329                                 diversity[mGroups[j]].resize(tmap->seqsPerGroup[mGroups[j]]+1, 0.0);            //numSampled
330                                                                                                                                                                                                                         //groupA                0.0                     0.0
331                                                                                                                                                                                                                         
332                                 //initialize sumDiversity
333                                 sumDiversity[mGroups[j]].resize(tmap->seqsPerGroup[mGroups[j]]+1, 0.0);
334                         }       
335
336                         //convert freq percentage to number
337                         int increment = 100;
338                         if (freq < 1.0) {  increment = largestGroup * freq;  
339                         }else { increment = freq;  }
340                         
341                         //initialize sampling spots
342                         set<int> numSampledList;
343                         for(int k = 1; k <= largestGroup; k++){  if((k == 1) || (k % increment == 0)){  numSampledList.insert(k); }   }
344                         if(largestGroup % increment != 0){      numSampledList.insert(largestGroup);   }
345                         
346                         //add other groups ending points
347                         for (int j = 0; j < mGroups.size(); j++) {  
348                                 if (numSampledList.count(diversity[mGroups[j]].size()-1) == 0) {  numSampledList.insert(diversity[mGroups[j]].size()-1); }
349                         }
350                         
351                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
352                                 if(processors == 1){
353                                         driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);      
354                                 }else{
355                                         if (rarefy) {
356                                                 vector<int> procIters;
357                                                 
358                                                 int numItersPerProcessor = iters / processors;
359                                                 
360                                                 //divide iters between processes
361                                                 for (int h = 0; h < processors; h++) {
362                                                         if(h == processors - 1){
363                                                                 numItersPerProcessor = iters - h * numItersPerProcessor;
364                                                         }
365                                                         procIters.push_back(numItersPerProcessor);
366                                                 }
367                                                 
368                                                 createProcesses(procIters, trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum); 
369                                                 
370                                         }else{ //no need to paralellize if you dont want to rarefy
371                                                 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);      
372                                         }
373                                 }
374
375                         #else
376                                 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);      
377                         #endif
378
379                         if (rarefy) {   printData(numSampledList, sumDiversity, outRare, iters);        }
380                 }
381                 
382         
383                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);        } return 0; }
384
385                 m->mothurOutEndLine();
386                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
387                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
388                 m->mothurOutEndLine();
389
390                 
391                 return 0;
392         }
393         catch(exception& e) {
394                 m->errorOut(e, "PhyloDiversityCommand", "execute");
395                 exit(1);
396         }
397 }
398 //**********************************************************************************************************************
399 int PhyloDiversityCommand::createProcesses(vector<int>& procIters, Tree* t, map< string, vector<float> >& div, map<string, vector<float> >& sumDiv, int numIters, int increment, vector<int>& randomLeaf, set<int>& numSampledList, ofstream& outCollect, ofstream& outSum){
400         try {
401                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
402                 int process = 1;
403                 
404                 vector<int> processIDS;
405                 map< string, vector<float> >::iterator itSum;
406                 
407                 //loop through and create all the processes you want
408                 while (process != processors) {
409                         int pid = fork();
410                         
411                         if (pid > 0) {
412                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
413                                 process++;
414                         }else if (pid == 0){
415                                 driver(t, div, sumDiv, procIters[process], increment, randomLeaf, numSampledList, outCollect, outSum, false);
416                                 
417                                 string outTemp = outputDir + toString(getpid()) + ".sumDiv.temp";
418                                 ofstream out;
419                                 m->openOutputFile(outTemp, out);
420                                 
421                                 //output the sumDIversity
422                                 for (itSum = sumDiv.begin(); itSum != sumDiv.end(); itSum++) {
423                                         out << itSum->first << '\t' << (itSum->second).size() << '\t';
424                                         for (int k = 0; k < (itSum->second).size(); k++) { 
425                                                 out << (itSum->second)[k] << '\t';
426                                         }
427                                         out << endl;
428                                 }
429                                 
430                                 out.close();
431                                 
432                                 exit(0);
433                         }else { 
434                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
435                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
436                                 exit(0);
437                         }
438                 }
439                 
440                 driver(t, div, sumDiv, procIters[0], increment, randomLeaf, numSampledList, outCollect, outSum, true);
441                 
442                 //force parent to wait until all the processes are done
443                 for (int i=0;i<(processors-1);i++) { 
444                         int temp = processIDS[i];
445                         wait(&temp);
446                 }
447                 
448                 //get data created by processes
449                 for (int i=0;i<(processors-1);i++) { 
450                         
451                         //input the sumDIversity
452                         string inTemp = outputDir + toString(processIDS[i]) + ".sumDiv.temp";
453                         ifstream in;
454                         m->openInputFile(inTemp, in);
455                                 
456                         //output the sumDIversity
457                         for (int j = 0; j < sumDiv.size(); j++) { 
458                                 string group = "";
459                                 int size = 0;
460                                 
461                                 in >> group >> size; m->gobble(in);
462                                 
463                                 for (int k = 0; k < size; k++) { 
464                                         float tempVal;
465                                         in >> tempVal;
466                                         
467                                         sumDiv[group][k] += tempVal;
468                                 }
469                                 m->gobble(in);
470                         }
471                                 
472                         in.close();
473                         m->mothurRemove(inTemp);
474                 }
475                 
476 #endif
477
478         return 0;               
479         
480         }
481         catch(exception& e) {
482                 m->errorOut(e, "PhyloDiversityCommand", "createProcesses");
483                 exit(1);
484         }
485 }
486 //**********************************************************************************************************************
487 int PhyloDiversityCommand::driver(Tree* t, map< string, vector<float> >& div, map<string, vector<float> >& sumDiv, int numIters, int increment, vector<int>& randomLeaf, set<int>& numSampledList, ofstream& outCollect, ofstream& outSum, bool doSumCollect){
488         try {
489                 int numLeafNodes = randomLeaf.size();
490                 vector<string> mGroups = m->getGroups();
491         
492                 for (int l = 0; l < numIters; l++) {
493                                 random_shuffle(randomLeaf.begin(), randomLeaf.end());
494                 
495                                 //initialize counts
496                                 map<string, int> counts;
497                                 map< string, set<int> > countedBranch;  
498                                 for (int j = 0; j < mGroups.size(); j++) {  counts[mGroups[j]] = 0; countedBranch[mGroups[j]].insert(-2);  }  //add dummy index to initialize countedBranch sets
499                                 
500                                 for(int k = 0; k < numLeafNodes; k++){
501                                                 
502                                         if (m->control_pressed) { return 0; }
503                                         
504                                         //calc branch length of randomLeaf k
505                                         vector<float> br = calcBranchLength(t, randomLeaf[k], countedBranch);
506                         
507                                         //for each group in the groups update the total branch length accounting for the names file
508                                         vector<string> groups = t->tree[randomLeaf[k]].getGroup();
509                                         
510                                         for (int j = 0; j < groups.size(); j++) {
511                                                 int numSeqsInGroupJ = 0;
512                                                 map<string, int>::iterator it;
513                                                 it = t->tree[randomLeaf[k]].pcount.find(groups[j]);
514                                                 if (it != t->tree[randomLeaf[k]].pcount.end()) { //this leaf node contains seqs from group j
515                                                         numSeqsInGroupJ = it->second;
516                                                 }
517                                                 
518                                                 if (numSeqsInGroupJ != 0) {     div[groups[j]][(counts[groups[j]]+1)] = div[groups[j]][counts[groups[j]]] + br[j];  }
519                                                 
520                                                 for (int s = (counts[groups[j]]+2); s <= (counts[groups[j]]+numSeqsInGroupJ); s++) {
521                                                         div[groups[j]][s] = div[groups[j]][s-1];  //update counts, but don't add in redundant branch lengths
522                                                 }
523                                                 counts[groups[j]] += numSeqsInGroupJ;
524                                         }
525                                 }
526                                 
527                                 if (rarefy) {
528                                         //add this diversity to the sum
529                                         for (int j = 0; j < mGroups.size(); j++) {  
530                                                 for (int g = 0; g < div[mGroups[j]].size(); g++) {
531                                                         sumDiv[mGroups[j]][g] += div[mGroups[j]][g];
532                                                 }
533                                         }
534                                 }
535                                 
536                                 if ((collect) && (l == 0) && doSumCollect) {  printData(numSampledList, div, outCollect, 1);  }
537                                 if ((summary) && (l == 0) && doSumCollect) {  printSumData(div, outSum, 1);  }
538                         }
539                         
540                         return 0;
541
542         }
543         catch(exception& e) {
544                 m->errorOut(e, "PhyloDiversityCommand", "driver");
545                 exit(1);
546         }
547 }
548
549 //**********************************************************************************************************************
550
551 void PhyloDiversityCommand::printSumData(map< string, vector<float> >& div, ofstream& out, int numIters){
552         try {
553                 
554                 out << "Groups\tnumSampled\tphyloDiversity" << endl;
555                 
556                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
557                 
558                 vector<string> mGroups = m->getGroups();
559                 for (int j = 0; j < mGroups.size(); j++) {
560                         int numSampled = (div[mGroups[j]].size()-1);
561                         out << mGroups[j] << '\t' << numSampled << '\t';
562                 
563                          
564                         float score;
565                         if (scale)      {  score = (div[mGroups[j]][numSampled] / (float)numIters) / (float)numSampled; }
566                         else            {       score = div[mGroups[j]][numSampled] / (float)numIters;  }
567                                 
568                         out << setprecision(4) << score << endl;
569                 }
570                                         
571                 out.close();
572                 
573         }
574         catch(exception& e) {
575                 m->errorOut(e, "PhyloDiversityCommand", "printSumData");
576                 exit(1);
577         }
578 }
579 //**********************************************************************************************************************
580
581 void PhyloDiversityCommand::printData(set<int>& num, map< string, vector<float> >& div, ofstream& out, int numIters){
582         try {
583                 
584                 out << "numSampled\t";
585                 vector<string> mGroups = m->getGroups();
586                 for (int i = 0; i < mGroups.size(); i++) { out << mGroups[i] << '\t';  }
587                 out << endl;
588                 
589                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
590                 
591                 for (set<int>::iterator it = num.begin(); it != num.end(); it++) {  
592                         int numSampled = *it;
593                         
594                         out << numSampled << '\t';  
595                 
596                         for (int j = 0; j < mGroups.size(); j++) {
597                                 if (numSampled < div[mGroups[j]].size()) { 
598                                         float score;
599                                         if (scale)      {  score = (div[mGroups[j]][numSampled] / (float)numIters) / (float)numSampled; }
600                                         else            {       score = div[mGroups[j]][numSampled] / (float)numIters;  }
601
602                                         out << setprecision(4) << score << '\t';
603                                 }else { out << "NA" << '\t'; }
604                         }
605                         out << endl;
606                 }
607                 
608                 out.close();
609                 
610         }
611         catch(exception& e) {
612                 m->errorOut(e, "PhyloDiversityCommand", "printData");
613                 exit(1);
614         }
615 }
616 //**********************************************************************************************************************
617 //need a vector of floats one branch length for every group the node represents.
618 vector<float> PhyloDiversityCommand::calcBranchLength(Tree* t, int leaf, map< string, set<int> >& counted){
619         try {
620
621                 //calc the branch length
622                 //while you aren't at root
623                 vector<float> sums; 
624                 int index = leaf;
625                 
626                 vector<string> groups = t->tree[leaf].getGroup();
627                 sums.resize(groups.size(), 0.0);
628                 
629                 map<string, map<int, double> > tempTotals; //maps node to total Branch Length
630                 map< string, set<int> > tempCounted;
631                 set<int>::iterator it;
632         
633                 //you are a leaf
634                 if(t->tree[index].getBranchLength() != -1){     
635                         for (int k = 0; k < groups.size(); k++) { 
636                                 sums[k] += abs(t->tree[index].getBranchLength());       
637                                 counted[groups[k]].insert(index);
638                         }
639                 }
640                 
641                 for (int k = 0; k < groups.size(); k++) { 
642                         tempTotals[groups[k]][index] = 0.0;     
643                 }
644                 
645                 index = t->tree[index].getParent();     
646                         
647                 //while you aren't at root
648                 while(t->tree[index].getParent() != -1){
649
650                         if (m->control_pressed) {  return sums; }
651                         
652                         int pcountSize = 0;     
653                         for (int k = 0; k < groups.size(); k++) {
654                                 map<string, int>::iterator itGroup = t->tree[index].pcount.find(groups[k]);
655                                 if (itGroup != t->tree[index].pcount.end()) { pcountSize++;  } 
656                         
657                                 //do both your chidren have have descendants from the users groups? 
658                                 int lc = t->tree[index].getLChild();
659                                 int rc = t->tree[index].getRChild();
660                         
661                                 int LpcountSize = 0;
662                                 itGroup = t->tree[lc].pcount.find(groups[k]);
663                                 if (itGroup != t->tree[lc].pcount.end()) { LpcountSize++;  } 
664                                                         
665                                 int RpcountSize = 0;
666                                 itGroup = t->tree[rc].pcount.find(groups[k]);
667                                 if (itGroup != t->tree[rc].pcount.end()) { RpcountSize++;  } 
668                                                                 
669                                 //if yes, add your childrens tempTotals
670                                 if ((LpcountSize != 0) && (RpcountSize != 0)) {
671                                         sums[k] += tempTotals[groups[k]][lc] + tempTotals[groups[k]][rc]; 
672                                         
673                                         for (it = tempCounted[groups[k]].begin(); it != tempCounted[groups[k]].end(); it++) { counted[groups[k]].insert(*it); }
674
675                                         //cout << "added to total " << tempTotals[lc] << '\t' << tempTotals[rc] << endl;
676                                         if (t->tree[index].getBranchLength() != -1) {
677                                                 if (counted[groups[k]].count(index) == 0) {
678                                                         tempTotals[groups[k]][index] = abs(t->tree[index].getBranchLength());
679                                                         tempCounted[groups[k]].insert(index);
680                                                 }else{
681                                                         tempTotals[groups[k]][index] = 0.0;
682                                                 }
683                                         }else {
684                                                 tempTotals[groups[k]][index] = 0.0;
685                                         }
686                                 }else { //if no, your tempTotal is your childrens temp totals + your branch length
687                                         tempTotals[groups[k]][index] = tempTotals[groups[k]][lc] + tempTotals[groups[k]][rc]; 
688                                                                         
689                                         if (counted[groups[k]].count(index) == 0) {
690                                                 tempTotals[groups[k]][index] += abs(t->tree[index].getBranchLength());
691                                                 tempCounted[groups[k]].insert(index);
692                                         }
693
694                                 }
695                                 //cout << "temptotal = "<< tempTotals[i] << endl;
696                         }
697                         
698                         index = t->tree[index].getParent();     
699                 }
700
701                 return sums;
702
703         }
704         catch(exception& e) {
705                 m->errorOut(e, "PhyloDiversityCommand", "calcBranchLength");
706                 exit(1);
707         }
708 }
709 /*****************************************************************/
710 int PhyloDiversityCommand::readNamesFile() {
711         try {
712                 m->names.clear();
713                 numUniquesInName = 0;
714                 
715                 ifstream in;
716                 m->openInputFile(namefile, in);
717                 
718                 string first, second;
719                 map<string, string>::iterator itNames;
720                 
721                 while(!in.eof()) {
722                         in >> first >> second; m->gobble(in);
723                         
724                         numUniquesInName++;
725                         
726                         itNames = m->names.find(first);
727                         if (itNames == m->names.end()) {  
728                                 m->names[first] = second; 
729                                 
730                                 //we need a list of names in your namefile to use above when removing extra seqs above so we don't remove them
731                                 vector<string> dupNames;
732                                 m->splitAtComma(second, dupNames);
733                                 
734                                 for (int i = 0; i < dupNames.size(); i++) {     
735                                         nameMap[dupNames[i]] = dupNames[i]; 
736                                         if ((groupfile == "") && (i != 0)) { tmap->addSeq(dupNames[i], "Group1"); } 
737                                 }
738                         }else {  m->mothurOut(first + " has already been seen in namefile, disregarding names file."); m->mothurOutEndLine(); in.close(); m->names.clear(); namefile = ""; return 1; }                  
739                 }
740                 in.close();
741                 
742                 return 0;
743         }
744         catch(exception& e) {
745                 m->errorOut(e, "PhyloDiversityCommand", "readNamesFile");
746                 exit(1);
747         }
748 }
749
750 //**********************************************************************************************************************
751
752
753