]> git.donarmstrong.com Git - mothur.git/blob - phylodiversitycommand.cpp
fixed bug with tree reading
[mothur.git] / phylodiversitycommand.cpp
1 /*
2  *  phylodiversitycommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 4/30/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "phylodiversitycommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> PhyloDiversityCommand::setParameters(){  
14         try {
15
16                 CommandParameter ptree("tree", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptree);
17                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pgroup);
18                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
19                 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
20                 CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
21                 CommandParameter pfreq("freq", "Number", "", "100", "", "", "",false,false); parameters.push_back(pfreq);
22                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
23                 CommandParameter prarefy("rarefy", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(prarefy);
24                 CommandParameter psummary("summary", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(psummary);
25                 CommandParameter pcollect("collect", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pcollect);
26                 CommandParameter pscale("scale", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pscale);
27                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
28                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
29                 
30                 vector<string> myArray;
31                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
32                 return myArray;
33         }
34         catch(exception& e) {
35                 m->errorOut(e, "PhyloDiversityCommand", "setParameters");
36                 exit(1);
37         }
38 }
39 //**********************************************************************************************************************
40 string PhyloDiversityCommand::getHelpString(){  
41         try {
42                 string helpString = "";
43                 helpString += "The phylo.diversity command parameters are tree, group, name, groups, iters, freq, processors, scale, rarefy, collect and summary.  tree and group are required, unless you have valid current files.\n";
44                 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. The group names are separated by dashes. By default all groups are used.\n";
45                 helpString += "The iters parameter allows you to specify the number of randomizations to preform, by default iters=1000, if you set rarefy to true.\n";
46                 helpString += "The freq parameter is used indicate when to output your data, by default it is set to 100. But you can set it to a percentage of the number of sequence. For example freq=0.10, means 10%. \n";
47                 helpString += "The scale parameter is used indicate that you want your output scaled to the number of sequences sampled, default = false. \n";
48                 helpString += "The rarefy parameter allows you to create a rarefaction curve. The default is false.\n";
49                 helpString += "The collect parameter allows you to create a collectors curve. The default is false.\n";
50                 helpString += "The summary parameter allows you to create a .summary file. The default is true.\n";
51                 helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
52                 helpString += "The phylo.diversity command should be in the following format: phylo.diversity(groups=yourGroups, rarefy=yourRarefy, iters=yourIters).\n";
53                 helpString += "Example phylo.diversity(groups=A-B-C, rarefy=T, iters=500).\n";
54                 helpString += "The phylo.diversity command output two files: .phylo.diversity and if rarefy=T, .rarefaction.\n";
55                 helpString += "Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n";
56                 return helpString;
57         }
58         catch(exception& e) {
59                 m->errorOut(e, "PhyloDiversityCommand", "getHelpString");
60                 exit(1);
61         }
62 }
63
64
65 //**********************************************************************************************************************
66 PhyloDiversityCommand::PhyloDiversityCommand(){ 
67         try {
68                 abort = true; calledHelp = true; 
69                 setParameters();
70                 vector<string> tempOutNames;
71                 outputTypes["phylodiv"] = tempOutNames;
72                 outputTypes["rarefy"] = tempOutNames;
73                 outputTypes["summary"] = tempOutNames;
74         }
75         catch(exception& e) {
76                 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
77                 exit(1);
78         }
79 }
80 //**********************************************************************************************************************
81 PhyloDiversityCommand::PhyloDiversityCommand(string option)  {
82         try {
83                 abort = false; calledHelp = false;   
84                 
85                 //allow user to run help
86                 if(option == "help") { help(); abort = true; calledHelp = true; }
87                 
88                 else {
89                         vector<string> myArray = setParameters();;
90                         
91                         OptionParser parser(option);
92                         map<string,string> parameters = parser.getParameters();
93                         map<string,string>::iterator it;
94                         
95                         ValidParameters validParameter;
96                 
97                         //check to make sure all parameters are valid for command
98                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
99                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
100                         }
101                         
102                         //initialize outputTypes
103                         vector<string> tempOutNames;
104                         outputTypes["phylodiv"] = tempOutNames;
105                         outputTypes["rarefy"] = tempOutNames;
106                         outputTypes["summary"] = tempOutNames;
107                         
108                         //if the user changes the input directory command factory will send this info to us in the output parameter 
109                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
110                         if (inputDir == "not found"){   inputDir = "";          }
111                         else {
112                                 string path;
113                                 it = parameters.find("tree");
114                                 //user has given a template file
115                                 if(it != parameters.end()){ 
116                                         path = m->hasPath(it->second);
117                                         //if the user has not given a path then, add inputdir. else leave path alone.
118                                         if (path == "") {       parameters["tree"] = inputDir + it->second;             }
119                                 }
120                                 
121                                 it = parameters.find("group");
122                                 //user has given a template file
123                                 if(it != parameters.end()){ 
124                                         path = m->hasPath(it->second);
125                                         //if the user has not given a path then, add inputdir. else leave path alone.
126                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
127                                 }
128                                 
129                                 it = parameters.find("name");
130                                 //user has given a template file
131                                 if(it != parameters.end()){ 
132                                         path = m->hasPath(it->second);
133                                         //if the user has not given a path then, add inputdir. else leave path alone.
134                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
135                                 }
136                         }
137                         
138                         m->runParse = true;
139                         m->Groups.clear();
140                         m->namesOfGroups.clear();
141                         m->Treenames.clear();
142                         m->names.clear();
143                         
144                         //check for required parameters
145                         treefile = validParameter.validFile(parameters, "tree", true);
146                         if (treefile == "not open") { abort = true; }
147                         else if (treefile == "not found") {                             
148                                 //if there is a current design file, use it
149                                 treefile = m->getTreeFile(); 
150                                 if (treefile != "") { m->mothurOut("Using " + treefile + " as input file for the tree parameter."); m->mothurOutEndLine(); }
151                                 else {  m->mothurOut("You have no current tree file and the tree parameter is required."); m->mothurOutEndLine(); abort = true; }                                                               
152                         }       
153                         
154                         //check for required parameters
155                         groupfile = validParameter.validFile(parameters, "group", true);
156                         if (groupfile == "not open") { abort = true; }
157                         else if (groupfile == "not found") { 
158                                 //if there is a current design file, use it
159                                 groupfile = m->getGroupFile(); 
160                                 if (groupfile != "") { m->mothurOut("Using " + groupfile + " as input file for the group parameter."); m->mothurOutEndLine(); }
161                                 else {  m->mothurOut("You have no current group file and the group parameter is required."); m->mothurOutEndLine(); abort = true; }                                                             
162                         }
163                         
164                         namefile = validParameter.validFile(parameters, "name", true);
165                         if (namefile == "not open") { abort = true; }
166                         else if (namefile == "not found") { namefile = ""; }
167                         
168                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(treefile);       }
169                         
170                         string temp;
171                         temp = validParameter.validFile(parameters, "freq", false);                     if (temp == "not found") { temp = "100"; }
172                         convert(temp, freq); 
173                         
174                         temp = validParameter.validFile(parameters, "iters", false);                    if (temp == "not found") { temp = "1000"; }
175                         convert(temp, iters); 
176                         
177                         temp = validParameter.validFile(parameters, "rarefy", false);                   if (temp == "not found") { temp = "F"; }
178                         rarefy = m->isTrue(temp);
179                         if (!rarefy) { iters = 1;  }
180                         
181                         temp = validParameter.validFile(parameters, "summary", false);                  if (temp == "not found") { temp = "T"; }
182                         summary = m->isTrue(temp);
183                         
184                         temp = validParameter.validFile(parameters, "scale", false);                    if (temp == "not found") { temp = "F"; }
185                         scale = m->isTrue(temp);
186                         
187                         temp = validParameter.validFile(parameters, "collect", false);                  if (temp == "not found") { temp = "F"; }
188                         collect = m->isTrue(temp);
189                         
190                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
191                         m->setProcessors(temp);
192                         convert(temp, processors); 
193                         
194                         groups = validParameter.validFile(parameters, "groups", false);                 
195                         if (groups == "not found") { groups = "";  }
196                         else { 
197                                 m->splitAtDash(groups, Groups);
198                                 m->Groups = Groups;
199                         }
200                         
201                         if ((!collect) && (!rarefy) && (!summary)) { m->mothurOut("No outputs selected. You must set either collect, rarefy or summary to true, summary=T by default."); m->mothurOutEndLine(); abort=true; }
202                 }
203                 
204         }
205         catch(exception& e) {
206                 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
207                 exit(1);
208         }                       
209 }
210 //**********************************************************************************************************************
211
212 int PhyloDiversityCommand::execute(){
213         try {
214                 
215                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
216                 
217                 m->setTreeFile(treefile);
218                 
219                 //read in group map info.
220                 tmap = new TreeMap(groupfile);
221                 tmap->readMap();
222                 
223                 if (namefile != "") { readNamesFile(); }
224                 
225                 read = new ReadNewickTree(treefile);
226                 int readOk = read->read(tmap); 
227                 
228                 if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete tmap; delete read; return 0; }
229                 
230                 read->AssembleTrees();
231                 vector<Tree*> trees = read->getTrees();
232                 delete read;
233                 
234                 //make sure all files match
235                 //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size.
236                 int numNamesInTree;
237                 if (namefile != "")  {  
238                         if (numUniquesInName == m->Treenames.size()) {  numNamesInTree = nameMap.size();  }
239                         else {   numNamesInTree = m->Treenames.size();  }
240                 }else {  numNamesInTree = m->Treenames.size();  }
241                 
242                 
243                 //output any names that are in group file but not in tree
244                 if (numNamesInTree < tmap->getNumSeqs()) {
245                         for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
246                                 //is that name in the tree?
247                                 int count = 0;
248                                 for (int j = 0; j < m->Treenames.size(); j++) {
249                                         if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
250                                         count++;
251                                 }
252                                 
253                                 if (m->control_pressed) { 
254                                         delete tmap; for (int i = 0; i < trees.size(); i++) { delete trees[i]; }
255                                         for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } outputTypes.clear();
256                                         m->Groups.clear();
257                                         return 0;
258                                 }
259                                 
260                                 //then you did not find it so report it 
261                                 if (count == m->Treenames.size()) { 
262                                         //if it is in your namefile then don't remove
263                                         map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
264                                         
265                                         if (it == nameMap.end()) {
266                                                 m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
267                                                 tmap->removeSeq(tmap->namesOfSeqs[i]);
268                                                 i--; //need this because removeSeq removes name from namesOfSeqs
269                                         }
270                                 }
271                         }
272                 }
273                 
274                 SharedUtil* util = new SharedUtil();
275                 util->setGroups(m->Groups, tmap->namesOfGroups, "treegroup");   //sets the groups the user wants to analyze
276                 delete util;
277                 
278                 //incase the user had some mismatches between the tree and group files we don't want group xxx to be analyzed
279                 for (int i = 0; i < m->Groups.size(); i++) { if (m->Groups[i] == "xxx") { m->Groups.erase(m->Groups.begin()+i);  break; }  }
280                  
281                 vector<string> outputNames;
282                 
283                 //for each of the users trees
284                 for(int i = 0; i < trees.size(); i++) {
285                 
286                         if (m->control_pressed) { delete tmap; for (int j = 0; j < trees.size(); j++) { delete trees[j]; } for (int j = 0; j < outputNames.size(); j++) {       remove(outputNames[j].c_str());         } return 0; }
287                         
288                         ofstream outSum, outRare, outCollect;
289                         string outSumFile = outputDir + m->getRootName(m->getSimpleName(treefile))  + toString(i+1) + ".phylodiv.summary";
290                         string outRareFile = outputDir + m->getRootName(m->getSimpleName(treefile))  + toString(i+1) + ".phylodiv.rarefaction";
291                         string outCollectFile = outputDir + m->getRootName(m->getSimpleName(treefile))  + toString(i+1) + ".phylodiv";
292                         
293                         if (summary)    { m->openOutputFile(outSumFile, outSum); outputNames.push_back(outSumFile);             outputTypes["summary"].push_back(outSumFile);                   }
294                         if (rarefy)             { m->openOutputFile(outRareFile, outRare); outputNames.push_back(outRareFile);  outputTypes["rarefy"].push_back(outRareFile);                   }
295                         if (collect)    { m->openOutputFile(outCollectFile, outCollect); outputNames.push_back(outCollectFile);  outputTypes["phylodiv"].push_back(outCollectFile);  }
296                         
297                         int numLeafNodes = trees[i]->getNumLeaves();
298                         
299                         //create a vector containing indexes of leaf nodes, randomize it, select nodes to send to calculator
300                         vector<int> randomLeaf;
301                         for (int j = 0; j < numLeafNodes; j++) {  
302                                 if (m->inUsersGroups(trees[i]->tree[j].getGroup(), m->Groups) == true) { //is this a node from the group the user selected.
303                                         randomLeaf.push_back(j); 
304                                 }
305                         }
306                         
307                         numLeafNodes = randomLeaf.size();  //reset the number of leaf nodes you are using 
308                         
309                         //each group, each sampling, if no rarefy iters = 1;
310                         map<string, vector<float> > diversity;
311                         
312                         //each group, each sampling, if no rarefy iters = 1;
313                         map<string, vector<float> > sumDiversity;
314                         
315                         //find largest group total 
316                         int largestGroup = 0;
317                         for (int j = 0; j < m->Groups.size(); j++) {  
318                                 if (tmap->seqsPerGroup[m->Groups[j]] > largestGroup) { largestGroup = tmap->seqsPerGroup[m->Groups[j]]; }
319                                 
320                                 //initialize diversity
321                                 diversity[m->Groups[j]].resize(tmap->seqsPerGroup[m->Groups[j]]+1, 0.0);                //numSampled
322                                                                                                                                                                                                                         //groupA                0.0                     0.0
323                                                                                                                                                                                                                         
324                                 //initialize sumDiversity
325                                 sumDiversity[m->Groups[j]].resize(tmap->seqsPerGroup[m->Groups[j]]+1, 0.0);
326                         }       
327
328                         //convert freq percentage to number
329                         int increment = 100;
330                         if (freq < 1.0) {  increment = largestGroup * freq;  
331                         }else { increment = freq;  }
332                         
333                         //initialize sampling spots
334                         set<int> numSampledList;
335                         for(int k = 1; k <= largestGroup; k++){  if((k == 1) || (k % increment == 0)){  numSampledList.insert(k); }   }
336                         if(largestGroup % increment != 0){      numSampledList.insert(largestGroup);   }
337                         
338                         //add other groups ending points
339                         for (int j = 0; j < m->Groups.size(); j++) {  
340                                 if (numSampledList.count(diversity[m->Groups[j]].size()-1) == 0) {  numSampledList.insert(diversity[m->Groups[j]].size()-1); }
341                         }
342                         
343                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
344                                 if(processors == 1){
345                                         driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);      
346                                 }else{
347                                         if (rarefy) {
348                                                 vector<int> procIters;
349                                                 
350                                                 int numItersPerProcessor = iters / processors;
351                                                 
352                                                 //divide iters between processes
353                                                 for (int h = 0; h < processors; h++) {
354                                                         if(h == processors - 1){
355                                                                 numItersPerProcessor = iters - h * numItersPerProcessor;
356                                                         }
357                                                         procIters.push_back(numItersPerProcessor);
358                                                 }
359                                                 
360                                                 createProcesses(procIters, trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum); 
361                                                 
362                                         }else{ //no need to paralellize if you dont want to rarefy
363                                                 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);      
364                                         }
365                                 }
366
367                         #else
368                                 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);      
369                         #endif
370
371                         if (rarefy) {   printData(numSampledList, sumDiversity, outRare, iters);        }
372                 }
373                 
374         
375                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        remove(outputNames[i].c_str());         } return 0; }
376
377                 m->mothurOutEndLine();
378                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
379                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
380                 m->mothurOutEndLine();
381
382                 
383                 return 0;
384         }
385         catch(exception& e) {
386                 m->errorOut(e, "PhyloDiversityCommand", "execute");
387                 exit(1);
388         }
389 }
390 //**********************************************************************************************************************
391 int PhyloDiversityCommand::createProcesses(vector<int>& procIters, Tree* t, map< string, vector<float> >& div, map<string, vector<float> >& sumDiv, int numIters, int increment, vector<int>& randomLeaf, set<int>& numSampledList, ofstream& outCollect, ofstream& outSum){
392         try {
393                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
394                 int process = 1;
395                 
396                 vector<int> processIDS;
397                 map< string, vector<float> >::iterator itSum;
398                 
399                 //loop through and create all the processes you want
400                 while (process != processors) {
401                         int pid = fork();
402                         
403                         if (pid > 0) {
404                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
405                                 process++;
406                         }else if (pid == 0){
407                                 driver(t, div, sumDiv, procIters[process], increment, randomLeaf, numSampledList, outCollect, outSum, false);
408                                 
409                                 string outTemp = outputDir + toString(getpid()) + ".sumDiv.temp";
410                                 ofstream out;
411                                 m->openOutputFile(outTemp, out);
412                                 
413                                 //output the sumDIversity
414                                 for (itSum = sumDiv.begin(); itSum != sumDiv.end(); itSum++) {
415                                         out << itSum->first << '\t' << (itSum->second).size() << '\t';
416                                         for (int k = 0; k < (itSum->second).size(); k++) { 
417                                                 out << (itSum->second)[k] << '\t';
418                                         }
419                                         out << endl;
420                                 }
421                                 
422                                 out.close();
423                                 
424                                 exit(0);
425                         }else { 
426                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
427                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
428                                 exit(0);
429                         }
430                 }
431                 
432                 driver(t, div, sumDiv, procIters[0], increment, randomLeaf, numSampledList, outCollect, outSum, true);
433                 
434                 //force parent to wait until all the processes are done
435                 for (int i=0;i<(processors-1);i++) { 
436                         int temp = processIDS[i];
437                         wait(&temp);
438                 }
439                 
440                 //get data created by processes
441                 for (int i=0;i<(processors-1);i++) { 
442                         
443                         //input the sumDIversity
444                         string inTemp = outputDir + toString(processIDS[i]) + ".sumDiv.temp";
445                         ifstream in;
446                         m->openInputFile(inTemp, in);
447                                 
448                         //output the sumDIversity
449                         for (int j = 0; j < sumDiv.size(); j++) { 
450                                 string group = "";
451                                 int size = 0;
452                                 
453                                 in >> group >> size; m->gobble(in);
454                                 
455                                 for (int k = 0; k < size; k++) { 
456                                         float tempVal;
457                                         in >> tempVal;
458                                         
459                                         sumDiv[group][k] += tempVal;
460                                 }
461                                 m->gobble(in);
462                         }
463                                 
464                         in.close();
465                         remove(inTemp.c_str());
466                 }
467                 
468 #endif
469
470         return 0;               
471         
472         }
473         catch(exception& e) {
474                 m->errorOut(e, "PhyloDiversityCommand", "createProcesses");
475                 exit(1);
476         }
477 }
478 //**********************************************************************************************************************
479 int PhyloDiversityCommand::driver(Tree* t, map< string, vector<float> >& div, map<string, vector<float> >& sumDiv, int numIters, int increment, vector<int>& randomLeaf, set<int>& numSampledList, ofstream& outCollect, ofstream& outSum, bool doSumCollect){
480         try {
481                 int numLeafNodes = randomLeaf.size();
482         
483                 for (int l = 0; l < numIters; l++) {
484                                 random_shuffle(randomLeaf.begin(), randomLeaf.end());
485                 
486                                 //initialize counts
487                                 map<string, int> counts;
488                                 map< string, set<int> > countedBranch;  
489                                 for (int j = 0; j < m->Groups.size(); j++) {  counts[m->Groups[j]] = 0; countedBranch[m->Groups[j]].insert(-2);  }  //add dummy index to initialize countedBranch sets
490                                 
491                                 for(int k = 0; k < numLeafNodes; k++){
492                                                 
493                                         if (m->control_pressed) { return 0; }
494                                         
495                                         //calc branch length of randomLeaf k
496                                         vector<float> br = calcBranchLength(t, randomLeaf[k], countedBranch);
497                         
498                                         //for each group in the groups update the total branch length accounting for the names file
499                                         vector<string> groups = t->tree[randomLeaf[k]].getGroup();
500                                         
501                                         for (int j = 0; j < groups.size(); j++) {
502                                                 int numSeqsInGroupJ = 0;
503                                                 map<string, int>::iterator it;
504                                                 it = t->tree[randomLeaf[k]].pcount.find(groups[j]);
505                                                 if (it != t->tree[randomLeaf[k]].pcount.end()) { //this leaf node contains seqs from group j
506                                                         numSeqsInGroupJ = it->second;
507                                                 }
508                                                 
509                                                 if (numSeqsInGroupJ != 0) {     div[groups[j]][(counts[groups[j]]+1)] = div[groups[j]][counts[groups[j]]] + br[j];  }
510                                                 
511                                                 for (int s = (counts[groups[j]]+2); s <= (counts[groups[j]]+numSeqsInGroupJ); s++) {
512                                                         div[groups[j]][s] = div[groups[j]][s-1];  //update counts, but don't add in redundant branch lengths
513                                                 }
514                                                 counts[groups[j]] += numSeqsInGroupJ;
515                                         }
516                                 }
517                                 
518                                 if (rarefy) {
519                                         //add this diversity to the sum
520                                         for (int j = 0; j < m->Groups.size(); j++) {  
521                                                 for (int g = 0; g < div[m->Groups[j]].size(); g++) {
522                                                         sumDiv[m->Groups[j]][g] += div[m->Groups[j]][g];
523                                                 }
524                                         }
525                                 }
526                                 
527                                 if ((collect) && (l == 0) && doSumCollect) {  printData(numSampledList, div, outCollect, 1);  }
528                                 if ((summary) && (l == 0) && doSumCollect) {  printSumData(div, outSum, 1);  }
529                         }
530                         
531                         return 0;
532
533         }
534         catch(exception& e) {
535                 m->errorOut(e, "PhyloDiversityCommand", "driver");
536                 exit(1);
537         }
538 }
539
540 //**********************************************************************************************************************
541
542 void PhyloDiversityCommand::printSumData(map< string, vector<float> >& div, ofstream& out, int numIters){
543         try {
544                 
545                 out << "Groups\tnumSampled\tphyloDiversity" << endl;
546                 
547                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
548                         
549                 for (int j = 0; j < m->Groups.size(); j++) {
550                         int numSampled = (div[m->Groups[j]].size()-1);
551                         out << m->Groups[j] << '\t' << numSampled << '\t';
552                 
553                          
554                         float score;
555                         if (scale)      {  score = (div[m->Groups[j]][numSampled] / (float)numIters) / (float)numSampled;       }
556                         else            {       score = div[m->Groups[j]][numSampled] / (float)numIters;        }
557                                 
558                         out << setprecision(4) << score << endl;
559                 }
560                                         
561                 out.close();
562                 
563         }
564         catch(exception& e) {
565                 m->errorOut(e, "PhyloDiversityCommand", "printSumData");
566                 exit(1);
567         }
568 }
569 //**********************************************************************************************************************
570
571 void PhyloDiversityCommand::printData(set<int>& num, map< string, vector<float> >& div, ofstream& out, int numIters){
572         try {
573                 
574                 out << "numSampled\t";
575                 for (int i = 0; i < m->Groups.size(); i++) { out << m->Groups[i] << '\t';  }
576                 out << endl;
577                 
578                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
579                 
580                 for (set<int>::iterator it = num.begin(); it != num.end(); it++) {  
581                         int numSampled = *it;
582                         
583                         out << numSampled << '\t';  
584                         
585                         for (int j = 0; j < m->Groups.size(); j++) {
586                                 if (numSampled < div[m->Groups[j]].size()) { 
587                                         float score;
588                                         if (scale)      {  score = (div[m->Groups[j]][numSampled] / (float)numIters) / (float)numSampled;       }
589                                         else            {       score = div[m->Groups[j]][numSampled] / (float)numIters;        }
590
591                                         out << setprecision(4) << score << '\t';
592                                 }else { out << "NA" << '\t'; }
593                         }
594                         out << endl;
595                 }
596                 
597                 out.close();
598                 
599         }
600         catch(exception& e) {
601                 m->errorOut(e, "PhyloDiversityCommand", "printData");
602                 exit(1);
603         }
604 }
605 //**********************************************************************************************************************
606 //need a vector of floats one branch length for every group the node represents.
607 vector<float> PhyloDiversityCommand::calcBranchLength(Tree* t, int leaf, map< string, set<int> >& counted){
608         try {
609
610                 //calc the branch length
611                 //while you aren't at root
612                 vector<float> sums; 
613                 int index = leaf;
614                 
615                 vector<string> groups = t->tree[leaf].getGroup();
616                 sums.resize(groups.size(), 0.0);
617                 
618                 map<string, map<int, double> > tempTotals; //maps node to total Branch Length
619                 map< string, set<int> > tempCounted;
620                 set<int>::iterator it;
621         
622                 //you are a leaf
623                 if(t->tree[index].getBranchLength() != -1){     
624                         for (int k = 0; k < groups.size(); k++) { 
625                                 sums[k] += abs(t->tree[index].getBranchLength());       
626                                 counted[groups[k]].insert(index);
627                         }
628                 }
629                 
630                 for (int k = 0; k < groups.size(); k++) { 
631                         tempTotals[groups[k]][index] = 0.0;     
632                 }
633                 
634                 index = t->tree[index].getParent();     
635                         
636                 //while you aren't at root
637                 while(t->tree[index].getParent() != -1){
638
639                         if (m->control_pressed) {  return sums; }
640                         
641                         int pcountSize = 0;     
642                         for (int k = 0; k < groups.size(); k++) {
643                                 map<string, int>::iterator itGroup = t->tree[index].pcount.find(groups[k]);
644                                 if (itGroup != t->tree[index].pcount.end()) { pcountSize++;  } 
645                         
646                                 //do both your chidren have have descendants from the users groups? 
647                                 int lc = t->tree[index].getLChild();
648                                 int rc = t->tree[index].getRChild();
649                         
650                                 int LpcountSize = 0;
651                                 itGroup = t->tree[lc].pcount.find(groups[k]);
652                                 if (itGroup != t->tree[lc].pcount.end()) { LpcountSize++;  } 
653                                                         
654                                 int RpcountSize = 0;
655                                 itGroup = t->tree[rc].pcount.find(groups[k]);
656                                 if (itGroup != t->tree[rc].pcount.end()) { RpcountSize++;  } 
657                                                                 
658                                 //if yes, add your childrens tempTotals
659                                 if ((LpcountSize != 0) && (RpcountSize != 0)) {
660                                         sums[k] += tempTotals[groups[k]][lc] + tempTotals[groups[k]][rc]; 
661                                         
662                                         for (it = tempCounted[groups[k]].begin(); it != tempCounted[groups[k]].end(); it++) { counted[groups[k]].insert(*it); }
663
664                                         //cout << "added to total " << tempTotals[lc] << '\t' << tempTotals[rc] << endl;
665                                         if (t->tree[index].getBranchLength() != -1) {
666                                                 if (counted[groups[k]].count(index) == 0) {
667                                                         tempTotals[groups[k]][index] = abs(t->tree[index].getBranchLength());
668                                                         tempCounted[groups[k]].insert(index);
669                                                 }else{
670                                                         tempTotals[groups[k]][index] = 0.0;
671                                                 }
672                                         }else {
673                                                 tempTotals[groups[k]][index] = 0.0;
674                                         }
675                                 }else { //if no, your tempTotal is your childrens temp totals + your branch length
676                                         tempTotals[groups[k]][index] = tempTotals[groups[k]][lc] + tempTotals[groups[k]][rc]; 
677                                                                         
678                                         if (counted[groups[k]].count(index) == 0) {
679                                                 tempTotals[groups[k]][index] += abs(t->tree[index].getBranchLength());
680                                                 tempCounted[groups[k]].insert(index);
681                                         }
682
683                                 }
684                                 //cout << "temptotal = "<< tempTotals[i] << endl;
685                         }
686                         
687                         index = t->tree[index].getParent();     
688                 }
689
690                 return sums;
691
692         }
693         catch(exception& e) {
694                 m->errorOut(e, "PhyloDiversityCommand", "calcBranchLength");
695                 exit(1);
696         }
697 }
698 /*****************************************************************/
699 int PhyloDiversityCommand::readNamesFile() {
700         try {
701                 m->names.clear();
702                 numUniquesInName = 0;
703                 
704                 ifstream in;
705                 m->openInputFile(namefile, in);
706                 
707                 string first, second;
708                 map<string, string>::iterator itNames;
709                 
710                 while(!in.eof()) {
711                         in >> first >> second; m->gobble(in);
712                         
713                         numUniquesInName++;
714                         
715                         itNames = m->names.find(first);
716                         if (itNames == m->names.end()) {  
717                                 m->names[first] = second; 
718                                 
719                                 //we need a list of names in your namefile to use above when removing extra seqs above so we don't remove them
720                                 vector<string> dupNames;
721                                 m->splitAtComma(second, dupNames);
722                                 
723                                 for (int i = 0; i < dupNames.size(); i++) {     
724                                         nameMap[dupNames[i]] = dupNames[i]; 
725                                         if ((groupfile == "") && (i != 0)) { tmap->addSeq(dupNames[i], "Group1"); } 
726                                 }
727                         }else {  m->mothurOut(first + " has already been seen in namefile, disregarding names file."); m->mothurOutEndLine(); in.close(); m->names.clear(); namefile = ""; return 1; }                  
728                 }
729                 in.close();
730                 
731                 return 0;
732         }
733         catch(exception& e) {
734                 m->errorOut(e, "PhyloDiversityCommand", "readNamesFile");
735                 exit(1);
736         }
737 }
738
739 //**********************************************************************************************************************
740
741
742