]> git.donarmstrong.com Git - mothur.git/blob - phylodiversitycommand.cpp
added citation function to commands
[mothur.git] / phylodiversitycommand.cpp
1 /*
2  *  phylodiversitycommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 4/30/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "phylodiversitycommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> PhyloDiversityCommand::setParameters(){  
14         try {
15
16                 CommandParameter ptree("tree", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptree);
17                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pgroup);
18                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
19                 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
20                 CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
21                 CommandParameter pfreq("freq", "Number", "", "100", "", "", "",false,false); parameters.push_back(pfreq);
22                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
23                 CommandParameter prarefy("rarefy", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(prarefy);
24                 CommandParameter psummary("summary", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(psummary);
25                 CommandParameter pcollect("collect", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pcollect);
26                 CommandParameter pscale("scale", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pscale);
27                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
28                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
29                 
30                 vector<string> myArray;
31                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
32                 return myArray;
33         }
34         catch(exception& e) {
35                 m->errorOut(e, "PhyloDiversityCommand", "setParameters");
36                 exit(1);
37         }
38 }
39 //**********************************************************************************************************************
40 string PhyloDiversityCommand::getHelpString(){  
41         try {
42                 string helpString = "";
43                 helpString += "The phylo.diversity command parameters are tree, group, name, groups, iters, freq, processors, scale, rarefy, collect and summary.  tree and group are required, unless you have valid current files.\n";
44                 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. The group names are separated by dashes. By default all groups are used.\n";
45                 helpString += "The iters parameter allows you to specify the number of randomizations to preform, by default iters=1000, if you set rarefy to true.\n";
46                 helpString += "The freq parameter is used indicate when to output your data, by default it is set to 100. But you can set it to a percentage of the number of sequence. For example freq=0.10, means 10%. \n";
47                 helpString += "The scale parameter is used indicate that you want your output scaled to the number of sequences sampled, default = false. \n";
48                 helpString += "The rarefy parameter allows you to create a rarefaction curve. The default is false.\n";
49                 helpString += "The collect parameter allows you to create a collectors curve. The default is false.\n";
50                 helpString += "The summary parameter allows you to create a .summary file. The default is true.\n";
51                 helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
52                 helpString += "The phylo.diversity command should be in the following format: phylo.diversity(groups=yourGroups, rarefy=yourRarefy, iters=yourIters).\n";
53                 helpString += "Example phylo.diversity(groups=A-B-C, rarefy=T, iters=500).\n";
54                 helpString += "The phylo.diversity command output two files: .phylo.diversity and if rarefy=T, .rarefaction.\n";
55                 helpString += "Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n";
56                 return helpString;
57         }
58         catch(exception& e) {
59                 m->errorOut(e, "PhyloDiversityCommand", "getHelpString");
60                 exit(1);
61         }
62 }
63
64
65 //**********************************************************************************************************************
66 PhyloDiversityCommand::PhyloDiversityCommand(){ 
67         try {
68                 abort = true; calledHelp = true; 
69                 setParameters();
70                 vector<string> tempOutNames;
71                 outputTypes["phylodiv"] = tempOutNames;
72                 outputTypes["rarefy"] = tempOutNames;
73                 outputTypes["summary"] = tempOutNames;
74         }
75         catch(exception& e) {
76                 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
77                 exit(1);
78         }
79 }
80 //**********************************************************************************************************************
81 PhyloDiversityCommand::PhyloDiversityCommand(string option)  {
82         try {
83                 abort = false; calledHelp = false;   
84                 
85                 //allow user to run help
86                 if(option == "help") { help(); abort = true; calledHelp = true; }
87                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
88                 
89                 else {
90                         vector<string> myArray = setParameters();;
91                         
92                         OptionParser parser(option);
93                         map<string,string> parameters = parser.getParameters();
94                         map<string,string>::iterator it;
95                         
96                         ValidParameters validParameter;
97                 
98                         //check to make sure all parameters are valid for command
99                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
100                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
101                         }
102                         
103                         //initialize outputTypes
104                         vector<string> tempOutNames;
105                         outputTypes["phylodiv"] = tempOutNames;
106                         outputTypes["rarefy"] = tempOutNames;
107                         outputTypes["summary"] = tempOutNames;
108                         
109                         //if the user changes the input directory command factory will send this info to us in the output parameter 
110                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
111                         if (inputDir == "not found"){   inputDir = "";          }
112                         else {
113                                 string path;
114                                 it = parameters.find("tree");
115                                 //user has given a template file
116                                 if(it != parameters.end()){ 
117                                         path = m->hasPath(it->second);
118                                         //if the user has not given a path then, add inputdir. else leave path alone.
119                                         if (path == "") {       parameters["tree"] = inputDir + it->second;             }
120                                 }
121                                 
122                                 it = parameters.find("group");
123                                 //user has given a template file
124                                 if(it != parameters.end()){ 
125                                         path = m->hasPath(it->second);
126                                         //if the user has not given a path then, add inputdir. else leave path alone.
127                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
128                                 }
129                                 
130                                 it = parameters.find("name");
131                                 //user has given a template file
132                                 if(it != parameters.end()){ 
133                                         path = m->hasPath(it->second);
134                                         //if the user has not given a path then, add inputdir. else leave path alone.
135                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
136                                 }
137                         }
138                         
139                         m->runParse = true;
140                         m->Groups.clear();
141                         m->namesOfGroups.clear();
142                         m->Treenames.clear();
143                         m->names.clear();
144                         
145                         //check for required parameters
146                         treefile = validParameter.validFile(parameters, "tree", true);
147                         if (treefile == "not open") { abort = true; }
148                         else if (treefile == "not found") {                             
149                                 //if there is a current design file, use it
150                                 treefile = m->getTreeFile(); 
151                                 if (treefile != "") { m->mothurOut("Using " + treefile + " as input file for the tree parameter."); m->mothurOutEndLine(); }
152                                 else {  m->mothurOut("You have no current tree file and the tree parameter is required."); m->mothurOutEndLine(); abort = true; }                                                               
153                         }       
154                         
155                         //check for required parameters
156                         groupfile = validParameter.validFile(parameters, "group", true);
157                         if (groupfile == "not open") { abort = true; }
158                         else if (groupfile == "not found") { 
159                                 //if there is a current design file, use it
160                                 groupfile = m->getGroupFile(); 
161                                 if (groupfile != "") { m->mothurOut("Using " + groupfile + " as input file for the group parameter."); m->mothurOutEndLine(); }
162                                 else {  m->mothurOut("You have no current group file and the group parameter is required."); m->mothurOutEndLine(); abort = true; }                                                             
163                         }
164                         
165                         namefile = validParameter.validFile(parameters, "name", true);
166                         if (namefile == "not open") { abort = true; }
167                         else if (namefile == "not found") { namefile = ""; }
168                         
169                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(treefile);       }
170                         
171                         string temp;
172                         temp = validParameter.validFile(parameters, "freq", false);                     if (temp == "not found") { temp = "100"; }
173                         convert(temp, freq); 
174                         
175                         temp = validParameter.validFile(parameters, "iters", false);                    if (temp == "not found") { temp = "1000"; }
176                         convert(temp, iters); 
177                         
178                         temp = validParameter.validFile(parameters, "rarefy", false);                   if (temp == "not found") { temp = "F"; }
179                         rarefy = m->isTrue(temp);
180                         if (!rarefy) { iters = 1;  }
181                         
182                         temp = validParameter.validFile(parameters, "summary", false);                  if (temp == "not found") { temp = "T"; }
183                         summary = m->isTrue(temp);
184                         
185                         temp = validParameter.validFile(parameters, "scale", false);                    if (temp == "not found") { temp = "F"; }
186                         scale = m->isTrue(temp);
187                         
188                         temp = validParameter.validFile(parameters, "collect", false);                  if (temp == "not found") { temp = "F"; }
189                         collect = m->isTrue(temp);
190                         
191                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
192                         m->setProcessors(temp);
193                         convert(temp, processors); 
194                         
195                         groups = validParameter.validFile(parameters, "groups", false);                 
196                         if (groups == "not found") { groups = "";  }
197                         else { 
198                                 m->splitAtDash(groups, Groups);
199                                 m->Groups = Groups;
200                         }
201                         
202                         if ((!collect) && (!rarefy) && (!summary)) { m->mothurOut("No outputs selected. You must set either collect, rarefy or summary to true, summary=T by default."); m->mothurOutEndLine(); abort=true; }
203                 }
204                 
205         }
206         catch(exception& e) {
207                 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
208                 exit(1);
209         }                       
210 }
211 //**********************************************************************************************************************
212
213 int PhyloDiversityCommand::execute(){
214         try {
215                 
216                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
217                 
218                 m->setTreeFile(treefile);
219                 
220                 //read in group map info.
221                 tmap = new TreeMap(groupfile);
222                 tmap->readMap();
223                 
224                 if (namefile != "") { readNamesFile(); }
225                 
226                 read = new ReadNewickTree(treefile);
227                 int readOk = read->read(tmap); 
228                 
229                 if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete tmap; delete read; return 0; }
230                 
231                 read->AssembleTrees();
232                 vector<Tree*> trees = read->getTrees();
233                 delete read;
234                 
235                 //make sure all files match
236                 //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size.
237                 int numNamesInTree;
238                 if (namefile != "")  {  
239                         if (numUniquesInName == m->Treenames.size()) {  numNamesInTree = nameMap.size();  }
240                         else {   numNamesInTree = m->Treenames.size();  }
241                 }else {  numNamesInTree = m->Treenames.size();  }
242                 
243                 
244                 //output any names that are in group file but not in tree
245                 if (numNamesInTree < tmap->getNumSeqs()) {
246                         for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
247                                 //is that name in the tree?
248                                 int count = 0;
249                                 for (int j = 0; j < m->Treenames.size(); j++) {
250                                         if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
251                                         count++;
252                                 }
253                                 
254                                 if (m->control_pressed) { 
255                                         delete tmap; for (int i = 0; i < trees.size(); i++) { delete trees[i]; }
256                                         for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } outputTypes.clear();
257                                         m->Groups.clear();
258                                         return 0;
259                                 }
260                                 
261                                 //then you did not find it so report it 
262                                 if (count == m->Treenames.size()) { 
263                                         //if it is in your namefile then don't remove
264                                         map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
265                                         
266                                         if (it == nameMap.end()) {
267                                                 m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
268                                                 tmap->removeSeq(tmap->namesOfSeqs[i]);
269                                                 i--; //need this because removeSeq removes name from namesOfSeqs
270                                         }
271                                 }
272                         }
273                 }
274                 
275                 SharedUtil* util = new SharedUtil();
276                 util->setGroups(m->Groups, tmap->namesOfGroups, "treegroup");   //sets the groups the user wants to analyze
277                 delete util;
278                 
279                 //incase the user had some mismatches between the tree and group files we don't want group xxx to be analyzed
280                 for (int i = 0; i < m->Groups.size(); i++) { if (m->Groups[i] == "xxx") { m->Groups.erase(m->Groups.begin()+i);  break; }  }
281                  
282                 vector<string> outputNames;
283                 
284                 //for each of the users trees
285                 for(int i = 0; i < trees.size(); i++) {
286                 
287                         if (m->control_pressed) { delete tmap; for (int j = 0; j < trees.size(); j++) { delete trees[j]; } for (int j = 0; j < outputNames.size(); j++) {       remove(outputNames[j].c_str());         } return 0; }
288                         
289                         ofstream outSum, outRare, outCollect;
290                         string outSumFile = outputDir + m->getRootName(m->getSimpleName(treefile))  + toString(i+1) + ".phylodiv.summary";
291                         string outRareFile = outputDir + m->getRootName(m->getSimpleName(treefile))  + toString(i+1) + ".phylodiv.rarefaction";
292                         string outCollectFile = outputDir + m->getRootName(m->getSimpleName(treefile))  + toString(i+1) + ".phylodiv";
293                         
294                         if (summary)    { m->openOutputFile(outSumFile, outSum); outputNames.push_back(outSumFile);             outputTypes["summary"].push_back(outSumFile);                   }
295                         if (rarefy)             { m->openOutputFile(outRareFile, outRare); outputNames.push_back(outRareFile);  outputTypes["rarefy"].push_back(outRareFile);                   }
296                         if (collect)    { m->openOutputFile(outCollectFile, outCollect); outputNames.push_back(outCollectFile);  outputTypes["phylodiv"].push_back(outCollectFile);  }
297                         
298                         int numLeafNodes = trees[i]->getNumLeaves();
299                         
300                         //create a vector containing indexes of leaf nodes, randomize it, select nodes to send to calculator
301                         vector<int> randomLeaf;
302                         for (int j = 0; j < numLeafNodes; j++) {  
303                                 if (m->inUsersGroups(trees[i]->tree[j].getGroup(), m->Groups) == true) { //is this a node from the group the user selected.
304                                         randomLeaf.push_back(j); 
305                                 }
306                         }
307                         
308                         numLeafNodes = randomLeaf.size();  //reset the number of leaf nodes you are using 
309                         
310                         //each group, each sampling, if no rarefy iters = 1;
311                         map<string, vector<float> > diversity;
312                         
313                         //each group, each sampling, if no rarefy iters = 1;
314                         map<string, vector<float> > sumDiversity;
315                         
316                         //find largest group total 
317                         int largestGroup = 0;
318                         for (int j = 0; j < m->Groups.size(); j++) {  
319                                 if (tmap->seqsPerGroup[m->Groups[j]] > largestGroup) { largestGroup = tmap->seqsPerGroup[m->Groups[j]]; }
320                                 
321                                 //initialize diversity
322                                 diversity[m->Groups[j]].resize(tmap->seqsPerGroup[m->Groups[j]]+1, 0.0);                //numSampled
323                                                                                                                                                                                                                         //groupA                0.0                     0.0
324                                                                                                                                                                                                                         
325                                 //initialize sumDiversity
326                                 sumDiversity[m->Groups[j]].resize(tmap->seqsPerGroup[m->Groups[j]]+1, 0.0);
327                         }       
328
329                         //convert freq percentage to number
330                         int increment = 100;
331                         if (freq < 1.0) {  increment = largestGroup * freq;  
332                         }else { increment = freq;  }
333                         
334                         //initialize sampling spots
335                         set<int> numSampledList;
336                         for(int k = 1; k <= largestGroup; k++){  if((k == 1) || (k % increment == 0)){  numSampledList.insert(k); }   }
337                         if(largestGroup % increment != 0){      numSampledList.insert(largestGroup);   }
338                         
339                         //add other groups ending points
340                         for (int j = 0; j < m->Groups.size(); j++) {  
341                                 if (numSampledList.count(diversity[m->Groups[j]].size()-1) == 0) {  numSampledList.insert(diversity[m->Groups[j]].size()-1); }
342                         }
343                         
344                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
345                                 if(processors == 1){
346                                         driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);      
347                                 }else{
348                                         if (rarefy) {
349                                                 vector<int> procIters;
350                                                 
351                                                 int numItersPerProcessor = iters / processors;
352                                                 
353                                                 //divide iters between processes
354                                                 for (int h = 0; h < processors; h++) {
355                                                         if(h == processors - 1){
356                                                                 numItersPerProcessor = iters - h * numItersPerProcessor;
357                                                         }
358                                                         procIters.push_back(numItersPerProcessor);
359                                                 }
360                                                 
361                                                 createProcesses(procIters, trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum); 
362                                                 
363                                         }else{ //no need to paralellize if you dont want to rarefy
364                                                 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);      
365                                         }
366                                 }
367
368                         #else
369                                 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);      
370                         #endif
371
372                         if (rarefy) {   printData(numSampledList, sumDiversity, outRare, iters);        }
373                 }
374                 
375         
376                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        remove(outputNames[i].c_str());         } return 0; }
377
378                 m->mothurOutEndLine();
379                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
380                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
381                 m->mothurOutEndLine();
382
383                 
384                 return 0;
385         }
386         catch(exception& e) {
387                 m->errorOut(e, "PhyloDiversityCommand", "execute");
388                 exit(1);
389         }
390 }
391 //**********************************************************************************************************************
392 int PhyloDiversityCommand::createProcesses(vector<int>& procIters, Tree* t, map< string, vector<float> >& div, map<string, vector<float> >& sumDiv, int numIters, int increment, vector<int>& randomLeaf, set<int>& numSampledList, ofstream& outCollect, ofstream& outSum){
393         try {
394                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
395                 int process = 1;
396                 
397                 vector<int> processIDS;
398                 map< string, vector<float> >::iterator itSum;
399                 
400                 //loop through and create all the processes you want
401                 while (process != processors) {
402                         int pid = fork();
403                         
404                         if (pid > 0) {
405                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
406                                 process++;
407                         }else if (pid == 0){
408                                 driver(t, div, sumDiv, procIters[process], increment, randomLeaf, numSampledList, outCollect, outSum, false);
409                                 
410                                 string outTemp = outputDir + toString(getpid()) + ".sumDiv.temp";
411                                 ofstream out;
412                                 m->openOutputFile(outTemp, out);
413                                 
414                                 //output the sumDIversity
415                                 for (itSum = sumDiv.begin(); itSum != sumDiv.end(); itSum++) {
416                                         out << itSum->first << '\t' << (itSum->second).size() << '\t';
417                                         for (int k = 0; k < (itSum->second).size(); k++) { 
418                                                 out << (itSum->second)[k] << '\t';
419                                         }
420                                         out << endl;
421                                 }
422                                 
423                                 out.close();
424                                 
425                                 exit(0);
426                         }else { 
427                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
428                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
429                                 exit(0);
430                         }
431                 }
432                 
433                 driver(t, div, sumDiv, procIters[0], increment, randomLeaf, numSampledList, outCollect, outSum, true);
434                 
435                 //force parent to wait until all the processes are done
436                 for (int i=0;i<(processors-1);i++) { 
437                         int temp = processIDS[i];
438                         wait(&temp);
439                 }
440                 
441                 //get data created by processes
442                 for (int i=0;i<(processors-1);i++) { 
443                         
444                         //input the sumDIversity
445                         string inTemp = outputDir + toString(processIDS[i]) + ".sumDiv.temp";
446                         ifstream in;
447                         m->openInputFile(inTemp, in);
448                                 
449                         //output the sumDIversity
450                         for (int j = 0; j < sumDiv.size(); j++) { 
451                                 string group = "";
452                                 int size = 0;
453                                 
454                                 in >> group >> size; m->gobble(in);
455                                 
456                                 for (int k = 0; k < size; k++) { 
457                                         float tempVal;
458                                         in >> tempVal;
459                                         
460                                         sumDiv[group][k] += tempVal;
461                                 }
462                                 m->gobble(in);
463                         }
464                                 
465                         in.close();
466                         remove(inTemp.c_str());
467                 }
468                 
469 #endif
470
471         return 0;               
472         
473         }
474         catch(exception& e) {
475                 m->errorOut(e, "PhyloDiversityCommand", "createProcesses");
476                 exit(1);
477         }
478 }
479 //**********************************************************************************************************************
480 int PhyloDiversityCommand::driver(Tree* t, map< string, vector<float> >& div, map<string, vector<float> >& sumDiv, int numIters, int increment, vector<int>& randomLeaf, set<int>& numSampledList, ofstream& outCollect, ofstream& outSum, bool doSumCollect){
481         try {
482                 int numLeafNodes = randomLeaf.size();
483         
484                 for (int l = 0; l < numIters; l++) {
485                                 random_shuffle(randomLeaf.begin(), randomLeaf.end());
486                 
487                                 //initialize counts
488                                 map<string, int> counts;
489                                 map< string, set<int> > countedBranch;  
490                                 for (int j = 0; j < m->Groups.size(); j++) {  counts[m->Groups[j]] = 0; countedBranch[m->Groups[j]].insert(-2);  }  //add dummy index to initialize countedBranch sets
491                                 
492                                 for(int k = 0; k < numLeafNodes; k++){
493                                                 
494                                         if (m->control_pressed) { return 0; }
495                                         
496                                         //calc branch length of randomLeaf k
497                                         vector<float> br = calcBranchLength(t, randomLeaf[k], countedBranch);
498                         
499                                         //for each group in the groups update the total branch length accounting for the names file
500                                         vector<string> groups = t->tree[randomLeaf[k]].getGroup();
501                                         
502                                         for (int j = 0; j < groups.size(); j++) {
503                                                 int numSeqsInGroupJ = 0;
504                                                 map<string, int>::iterator it;
505                                                 it = t->tree[randomLeaf[k]].pcount.find(groups[j]);
506                                                 if (it != t->tree[randomLeaf[k]].pcount.end()) { //this leaf node contains seqs from group j
507                                                         numSeqsInGroupJ = it->second;
508                                                 }
509                                                 
510                                                 if (numSeqsInGroupJ != 0) {     div[groups[j]][(counts[groups[j]]+1)] = div[groups[j]][counts[groups[j]]] + br[j];  }
511                                                 
512                                                 for (int s = (counts[groups[j]]+2); s <= (counts[groups[j]]+numSeqsInGroupJ); s++) {
513                                                         div[groups[j]][s] = div[groups[j]][s-1];  //update counts, but don't add in redundant branch lengths
514                                                 }
515                                                 counts[groups[j]] += numSeqsInGroupJ;
516                                         }
517                                 }
518                                 
519                                 if (rarefy) {
520                                         //add this diversity to the sum
521                                         for (int j = 0; j < m->Groups.size(); j++) {  
522                                                 for (int g = 0; g < div[m->Groups[j]].size(); g++) {
523                                                         sumDiv[m->Groups[j]][g] += div[m->Groups[j]][g];
524                                                 }
525                                         }
526                                 }
527                                 
528                                 if ((collect) && (l == 0) && doSumCollect) {  printData(numSampledList, div, outCollect, 1);  }
529                                 if ((summary) && (l == 0) && doSumCollect) {  printSumData(div, outSum, 1);  }
530                         }
531                         
532                         return 0;
533
534         }
535         catch(exception& e) {
536                 m->errorOut(e, "PhyloDiversityCommand", "driver");
537                 exit(1);
538         }
539 }
540
541 //**********************************************************************************************************************
542
543 void PhyloDiversityCommand::printSumData(map< string, vector<float> >& div, ofstream& out, int numIters){
544         try {
545                 
546                 out << "Groups\tnumSampled\tphyloDiversity" << endl;
547                 
548                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
549                         
550                 for (int j = 0; j < m->Groups.size(); j++) {
551                         int numSampled = (div[m->Groups[j]].size()-1);
552                         out << m->Groups[j] << '\t' << numSampled << '\t';
553                 
554                          
555                         float score;
556                         if (scale)      {  score = (div[m->Groups[j]][numSampled] / (float)numIters) / (float)numSampled;       }
557                         else            {       score = div[m->Groups[j]][numSampled] / (float)numIters;        }
558                                 
559                         out << setprecision(4) << score << endl;
560                 }
561                                         
562                 out.close();
563                 
564         }
565         catch(exception& e) {
566                 m->errorOut(e, "PhyloDiversityCommand", "printSumData");
567                 exit(1);
568         }
569 }
570 //**********************************************************************************************************************
571
572 void PhyloDiversityCommand::printData(set<int>& num, map< string, vector<float> >& div, ofstream& out, int numIters){
573         try {
574                 
575                 out << "numSampled\t";
576                 for (int i = 0; i < m->Groups.size(); i++) { out << m->Groups[i] << '\t';  }
577                 out << endl;
578                 
579                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
580                 
581                 for (set<int>::iterator it = num.begin(); it != num.end(); it++) {  
582                         int numSampled = *it;
583                         
584                         out << numSampled << '\t';  
585                         
586                         for (int j = 0; j < m->Groups.size(); j++) {
587                                 if (numSampled < div[m->Groups[j]].size()) { 
588                                         float score;
589                                         if (scale)      {  score = (div[m->Groups[j]][numSampled] / (float)numIters) / (float)numSampled;       }
590                                         else            {       score = div[m->Groups[j]][numSampled] / (float)numIters;        }
591
592                                         out << setprecision(4) << score << '\t';
593                                 }else { out << "NA" << '\t'; }
594                         }
595                         out << endl;
596                 }
597                 
598                 out.close();
599                 
600         }
601         catch(exception& e) {
602                 m->errorOut(e, "PhyloDiversityCommand", "printData");
603                 exit(1);
604         }
605 }
606 //**********************************************************************************************************************
607 //need a vector of floats one branch length for every group the node represents.
608 vector<float> PhyloDiversityCommand::calcBranchLength(Tree* t, int leaf, map< string, set<int> >& counted){
609         try {
610
611                 //calc the branch length
612                 //while you aren't at root
613                 vector<float> sums; 
614                 int index = leaf;
615                 
616                 vector<string> groups = t->tree[leaf].getGroup();
617                 sums.resize(groups.size(), 0.0);
618                 
619                 map<string, map<int, double> > tempTotals; //maps node to total Branch Length
620                 map< string, set<int> > tempCounted;
621                 set<int>::iterator it;
622         
623                 //you are a leaf
624                 if(t->tree[index].getBranchLength() != -1){     
625                         for (int k = 0; k < groups.size(); k++) { 
626                                 sums[k] += abs(t->tree[index].getBranchLength());       
627                                 counted[groups[k]].insert(index);
628                         }
629                 }
630                 
631                 for (int k = 0; k < groups.size(); k++) { 
632                         tempTotals[groups[k]][index] = 0.0;     
633                 }
634                 
635                 index = t->tree[index].getParent();     
636                         
637                 //while you aren't at root
638                 while(t->tree[index].getParent() != -1){
639
640                         if (m->control_pressed) {  return sums; }
641                         
642                         int pcountSize = 0;     
643                         for (int k = 0; k < groups.size(); k++) {
644                                 map<string, int>::iterator itGroup = t->tree[index].pcount.find(groups[k]);
645                                 if (itGroup != t->tree[index].pcount.end()) { pcountSize++;  } 
646                         
647                                 //do both your chidren have have descendants from the users groups? 
648                                 int lc = t->tree[index].getLChild();
649                                 int rc = t->tree[index].getRChild();
650                         
651                                 int LpcountSize = 0;
652                                 itGroup = t->tree[lc].pcount.find(groups[k]);
653                                 if (itGroup != t->tree[lc].pcount.end()) { LpcountSize++;  } 
654                                                         
655                                 int RpcountSize = 0;
656                                 itGroup = t->tree[rc].pcount.find(groups[k]);
657                                 if (itGroup != t->tree[rc].pcount.end()) { RpcountSize++;  } 
658                                                                 
659                                 //if yes, add your childrens tempTotals
660                                 if ((LpcountSize != 0) && (RpcountSize != 0)) {
661                                         sums[k] += tempTotals[groups[k]][lc] + tempTotals[groups[k]][rc]; 
662                                         
663                                         for (it = tempCounted[groups[k]].begin(); it != tempCounted[groups[k]].end(); it++) { counted[groups[k]].insert(*it); }
664
665                                         //cout << "added to total " << tempTotals[lc] << '\t' << tempTotals[rc] << endl;
666                                         if (t->tree[index].getBranchLength() != -1) {
667                                                 if (counted[groups[k]].count(index) == 0) {
668                                                         tempTotals[groups[k]][index] = abs(t->tree[index].getBranchLength());
669                                                         tempCounted[groups[k]].insert(index);
670                                                 }else{
671                                                         tempTotals[groups[k]][index] = 0.0;
672                                                 }
673                                         }else {
674                                                 tempTotals[groups[k]][index] = 0.0;
675                                         }
676                                 }else { //if no, your tempTotal is your childrens temp totals + your branch length
677                                         tempTotals[groups[k]][index] = tempTotals[groups[k]][lc] + tempTotals[groups[k]][rc]; 
678                                                                         
679                                         if (counted[groups[k]].count(index) == 0) {
680                                                 tempTotals[groups[k]][index] += abs(t->tree[index].getBranchLength());
681                                                 tempCounted[groups[k]].insert(index);
682                                         }
683
684                                 }
685                                 //cout << "temptotal = "<< tempTotals[i] << endl;
686                         }
687                         
688                         index = t->tree[index].getParent();     
689                 }
690
691                 return sums;
692
693         }
694         catch(exception& e) {
695                 m->errorOut(e, "PhyloDiversityCommand", "calcBranchLength");
696                 exit(1);
697         }
698 }
699 /*****************************************************************/
700 int PhyloDiversityCommand::readNamesFile() {
701         try {
702                 m->names.clear();
703                 numUniquesInName = 0;
704                 
705                 ifstream in;
706                 m->openInputFile(namefile, in);
707                 
708                 string first, second;
709                 map<string, string>::iterator itNames;
710                 
711                 while(!in.eof()) {
712                         in >> first >> second; m->gobble(in);
713                         
714                         numUniquesInName++;
715                         
716                         itNames = m->names.find(first);
717                         if (itNames == m->names.end()) {  
718                                 m->names[first] = second; 
719                                 
720                                 //we need a list of names in your namefile to use above when removing extra seqs above so we don't remove them
721                                 vector<string> dupNames;
722                                 m->splitAtComma(second, dupNames);
723                                 
724                                 for (int i = 0; i < dupNames.size(); i++) {     
725                                         nameMap[dupNames[i]] = dupNames[i]; 
726                                         if ((groupfile == "") && (i != 0)) { tmap->addSeq(dupNames[i], "Group1"); } 
727                                 }
728                         }else {  m->mothurOut(first + " has already been seen in namefile, disregarding names file."); m->mothurOutEndLine(); in.close(); m->names.clear(); namefile = ""; return 1; }                  
729                 }
730                 in.close();
731                 
732                 return 0;
733         }
734         catch(exception& e) {
735                 m->errorOut(e, "PhyloDiversityCommand", "readNamesFile");
736                 exit(1);
737         }
738 }
739
740 //**********************************************************************************************************************
741
742
743