]> git.donarmstrong.com Git - mothur.git/blob - phylodiversitycommand.cpp
changed blastdb.cpp to no include sequence names in blast input filenames. modified...
[mothur.git] / phylodiversitycommand.cpp
1 /*
2  *  phylodiversitycommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 4/30/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "phylodiversitycommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> PhyloDiversityCommand::setParameters(){  
14         try {
15
16                 CommandParameter ptree("tree", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptree);
17                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pgroup);
18                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
19                 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
20                 CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
21                 CommandParameter pfreq("freq", "Number", "", "100", "", "", "",false,false); parameters.push_back(pfreq);
22                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
23                 CommandParameter prarefy("rarefy", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(prarefy);
24                 CommandParameter psummary("summary", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(psummary);
25                 CommandParameter pcollect("collect", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pcollect);
26                 CommandParameter pscale("scale", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pscale);
27                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
28                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
29                 
30                 vector<string> myArray;
31                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
32                 return myArray;
33         }
34         catch(exception& e) {
35                 m->errorOut(e, "PhyloDiversityCommand", "setParameters");
36                 exit(1);
37         }
38 }
39 //**********************************************************************************************************************
40 string PhyloDiversityCommand::getHelpString(){  
41         try {
42                 string helpString = "";
43                 helpString += "The phylo.diversity command parameters are tree, group, name, groups, iters, freq, processors, scale, rarefy, collect and summary.  tree and group are required, unless you have valid current files.\n";
44                 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. The group names are separated by dashes. By default all groups are used.\n";
45                 helpString += "The iters parameter allows you to specify the number of randomizations to preform, by default iters=1000, if you set rarefy to true.\n";
46                 helpString += "The freq parameter is used indicate when to output your data, by default it is set to 100. But you can set it to a percentage of the number of sequence. For example freq=0.10, means 10%. \n";
47                 helpString += "The scale parameter is used indicate that you want your output scaled to the number of sequences sampled, default = false. \n";
48                 helpString += "The rarefy parameter allows you to create a rarefaction curve. The default is false.\n";
49                 helpString += "The collect parameter allows you to create a collectors curve. The default is false.\n";
50                 helpString += "The summary parameter allows you to create a .summary file. The default is true.\n";
51                 helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
52                 helpString += "The phylo.diversity command should be in the following format: phylo.diversity(groups=yourGroups, rarefy=yourRarefy, iters=yourIters).\n";
53                 helpString += "Example phylo.diversity(groups=A-B-C, rarefy=T, iters=500).\n";
54                 helpString += "The phylo.diversity command output two files: .phylo.diversity and if rarefy=T, .rarefaction.\n";
55                 helpString += "Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n";
56                 return helpString;
57         }
58         catch(exception& e) {
59                 m->errorOut(e, "PhyloDiversityCommand", "getHelpString");
60                 exit(1);
61         }
62 }
63
64
65 //**********************************************************************************************************************
66 PhyloDiversityCommand::PhyloDiversityCommand(){ 
67         try {
68                 abort = true; calledHelp = true; 
69                 setParameters();
70                 vector<string> tempOutNames;
71                 outputTypes["phylodiv"] = tempOutNames;
72                 outputTypes["rarefy"] = tempOutNames;
73                 outputTypes["summary"] = tempOutNames;
74         }
75         catch(exception& e) {
76                 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
77                 exit(1);
78         }
79 }
80 //**********************************************************************************************************************
81 PhyloDiversityCommand::PhyloDiversityCommand(string option)  {
82         try {
83                 abort = false; calledHelp = false;   
84                 
85                 //allow user to run help
86                 if(option == "help") { help(); abort = true; calledHelp = true; }
87                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
88                 
89                 else {
90                         vector<string> myArray = setParameters();;
91                         
92                         OptionParser parser(option);
93                         map<string,string> parameters = parser.getParameters();
94                         map<string,string>::iterator it;
95                         
96                         ValidParameters validParameter;
97                 
98                         //check to make sure all parameters are valid for command
99                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
100                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
101                         }
102                         
103                         //initialize outputTypes
104                         vector<string> tempOutNames;
105                         outputTypes["phylodiv"] = tempOutNames;
106                         outputTypes["rarefy"] = tempOutNames;
107                         outputTypes["summary"] = tempOutNames;
108                         
109                         //if the user changes the input directory command factory will send this info to us in the output parameter 
110                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
111                         if (inputDir == "not found"){   inputDir = "";          }
112                         else {
113                                 string path;
114                                 it = parameters.find("tree");
115                                 //user has given a template file
116                                 if(it != parameters.end()){ 
117                                         path = m->hasPath(it->second);
118                                         //if the user has not given a path then, add inputdir. else leave path alone.
119                                         if (path == "") {       parameters["tree"] = inputDir + it->second;             }
120                                 }
121                                 
122                                 it = parameters.find("group");
123                                 //user has given a template file
124                                 if(it != parameters.end()){ 
125                                         path = m->hasPath(it->second);
126                                         //if the user has not given a path then, add inputdir. else leave path alone.
127                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
128                                 }
129                                 
130                                 it = parameters.find("name");
131                                 //user has given a template file
132                                 if(it != parameters.end()){ 
133                                         path = m->hasPath(it->second);
134                                         //if the user has not given a path then, add inputdir. else leave path alone.
135                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
136                                 }
137                         }
138                         
139                         m->runParse = true;
140                         m->Groups.clear();
141                         m->namesOfGroups.clear();
142                         m->Treenames.clear();
143                         m->names.clear();
144                         
145                         //check for required parameters
146                         treefile = validParameter.validFile(parameters, "tree", true);
147                         if (treefile == "not open") { abort = true; }
148                         else if (treefile == "not found") {                             
149                                 //if there is a current design file, use it
150                                 treefile = m->getTreeFile(); 
151                                 if (treefile != "") { m->mothurOut("Using " + treefile + " as input file for the tree parameter."); m->mothurOutEndLine(); }
152                                 else {  m->mothurOut("You have no current tree file and the tree parameter is required."); m->mothurOutEndLine(); abort = true; }                                                               
153                         }       
154                         
155                         //check for required parameters
156                         groupfile = validParameter.validFile(parameters, "group", true);
157                         if (groupfile == "not open") { groupfile = ""; abort = true; }
158                         else if (groupfile == "not found") { groupfile = ""; }
159                         
160                         namefile = validParameter.validFile(parameters, "name", true);
161                         if (namefile == "not open") { abort = true; }
162                         else if (namefile == "not found") { namefile = ""; }
163                         
164                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(treefile);       }
165                         
166                         string temp;
167                         temp = validParameter.validFile(parameters, "freq", false);                     if (temp == "not found") { temp = "100"; }
168                         convert(temp, freq); 
169                         
170                         temp = validParameter.validFile(parameters, "iters", false);                    if (temp == "not found") { temp = "1000"; }
171                         convert(temp, iters); 
172                         
173                         temp = validParameter.validFile(parameters, "rarefy", false);                   if (temp == "not found") { temp = "F"; }
174                         rarefy = m->isTrue(temp);
175                         if (!rarefy) { iters = 1;  }
176                         
177                         temp = validParameter.validFile(parameters, "summary", false);                  if (temp == "not found") { temp = "T"; }
178                         summary = m->isTrue(temp);
179                         
180                         temp = validParameter.validFile(parameters, "scale", false);                    if (temp == "not found") { temp = "F"; }
181                         scale = m->isTrue(temp);
182                         
183                         temp = validParameter.validFile(parameters, "collect", false);                  if (temp == "not found") { temp = "F"; }
184                         collect = m->isTrue(temp);
185                         
186                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
187                         m->setProcessors(temp);
188                         convert(temp, processors); 
189                         
190                         groups = validParameter.validFile(parameters, "groups", false);                 
191                         if (groups == "not found") { groups = "";  }
192                         else { 
193                                 m->splitAtDash(groups, Groups);
194                                 m->Groups = Groups;
195                         }
196                         
197                         if ((!collect) && (!rarefy) && (!summary)) { m->mothurOut("No outputs selected. You must set either collect, rarefy or summary to true, summary=T by default."); m->mothurOutEndLine(); abort=true; }
198                 }
199                 
200         }
201         catch(exception& e) {
202                 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
203                 exit(1);
204         }                       
205 }
206 //**********************************************************************************************************************
207
208 int PhyloDiversityCommand::execute(){
209         try {
210                 
211                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
212                 
213                 m->setTreeFile(treefile);
214                 
215                 if (groupfile != "") {
216                         //read in group map info.
217                         tmap = new TreeMap(groupfile);
218                         tmap->readMap();
219                 }else{ //fake out by putting everyone in one group
220                         Tree* tree = new Tree(treefile); delete tree;  //extracts names from tree to make faked out groupmap
221                         tmap = new TreeMap();
222                         
223                         for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); }
224                 }
225                 
226                 if (namefile != "") { readNamesFile(); }
227                 
228                 read = new ReadNewickTree(treefile);
229                 int readOk = read->read(tmap); 
230                 
231                 if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete tmap; delete read; return 0; }
232                 
233                 read->AssembleTrees();
234                 vector<Tree*> trees = read->getTrees();
235                 delete read;
236                 
237                 //make sure all files match
238                 //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size.
239                 int numNamesInTree;
240                 if (namefile != "")  {  
241                         if (numUniquesInName == m->Treenames.size()) {  numNamesInTree = nameMap.size();  }
242                         else {   numNamesInTree = m->Treenames.size();  }
243                 }else {  numNamesInTree = m->Treenames.size();  }
244                 
245                 
246                 //output any names that are in group file but not in tree
247                 if (numNamesInTree < tmap->getNumSeqs()) {
248                         for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
249                                 //is that name in the tree?
250                                 int count = 0;
251                                 for (int j = 0; j < m->Treenames.size(); j++) {
252                                         if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
253                                         count++;
254                                 }
255                                 
256                                 if (m->control_pressed) { 
257                                         delete tmap; for (int i = 0; i < trees.size(); i++) { delete trees[i]; }
258                                         for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } outputTypes.clear();
259                                         m->Groups.clear();
260                                         return 0;
261                                 }
262                                 
263                                 //then you did not find it so report it 
264                                 if (count == m->Treenames.size()) { 
265                                         //if it is in your namefile then don't remove
266                                         map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
267                                         
268                                         if (it == nameMap.end()) {
269                                                 m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
270                                                 tmap->removeSeq(tmap->namesOfSeqs[i]);
271                                                 i--; //need this because removeSeq removes name from namesOfSeqs
272                                         }
273                                 }
274                         }
275                 }
276                 
277                 SharedUtil* util = new SharedUtil();
278                 util->setGroups(m->Groups, tmap->namesOfGroups, "phylo.diversity");     //sets the groups the user wants to analyze
279                 delete util;
280                 
281                 //incase the user had some mismatches between the tree and group files we don't want group xxx to be analyzed
282                 for (int i = 0; i < m->Groups.size(); i++) { if (m->Groups[i] == "xxx") { m->Groups.erase(m->Groups.begin()+i);  break; }  }
283                  
284                 vector<string> outputNames;
285                 
286                 //for each of the users trees
287                 for(int i = 0; i < trees.size(); i++) {
288                 
289                         if (m->control_pressed) { delete tmap; for (int j = 0; j < trees.size(); j++) { delete trees[j]; } for (int j = 0; j < outputNames.size(); j++) {       remove(outputNames[j].c_str());         } return 0; }
290                         
291                         ofstream outSum, outRare, outCollect;
292                         string outSumFile = outputDir + m->getRootName(m->getSimpleName(treefile))  + toString(i+1) + ".phylodiv.summary";
293                         string outRareFile = outputDir + m->getRootName(m->getSimpleName(treefile))  + toString(i+1) + ".phylodiv.rarefaction";
294                         string outCollectFile = outputDir + m->getRootName(m->getSimpleName(treefile))  + toString(i+1) + ".phylodiv";
295                         
296                         if (summary)    { m->openOutputFile(outSumFile, outSum); outputNames.push_back(outSumFile);             outputTypes["summary"].push_back(outSumFile);                   }
297                         if (rarefy)             { m->openOutputFile(outRareFile, outRare); outputNames.push_back(outRareFile);  outputTypes["rarefy"].push_back(outRareFile);                   }
298                         if (collect)    { m->openOutputFile(outCollectFile, outCollect); outputNames.push_back(outCollectFile);  outputTypes["phylodiv"].push_back(outCollectFile);  }
299                         
300                         int numLeafNodes = trees[i]->getNumLeaves();
301                         
302                         //create a vector containing indexes of leaf nodes, randomize it, select nodes to send to calculator
303                         vector<int> randomLeaf;
304                         for (int j = 0; j < numLeafNodes; j++) {  
305                                 if (m->inUsersGroups(trees[i]->tree[j].getGroup(), m->Groups) == true) { //is this a node from the group the user selected.
306                                         randomLeaf.push_back(j); 
307                                 }
308                         }
309                         
310                         numLeafNodes = randomLeaf.size();  //reset the number of leaf nodes you are using 
311                         
312                         //each group, each sampling, if no rarefy iters = 1;
313                         map<string, vector<float> > diversity;
314                         
315                         //each group, each sampling, if no rarefy iters = 1;
316                         map<string, vector<float> > sumDiversity;
317                         
318                         //find largest group total 
319                         int largestGroup = 0;
320                         for (int j = 0; j < m->Groups.size(); j++) {  
321                                 if (tmap->seqsPerGroup[m->Groups[j]] > largestGroup) { largestGroup = tmap->seqsPerGroup[m->Groups[j]]; }
322                                 
323                                 //initialize diversity
324                                 diversity[m->Groups[j]].resize(tmap->seqsPerGroup[m->Groups[j]]+1, 0.0);                //numSampled
325                                                                                                                                                                                                                         //groupA                0.0                     0.0
326                                                                                                                                                                                                                         
327                                 //initialize sumDiversity
328                                 sumDiversity[m->Groups[j]].resize(tmap->seqsPerGroup[m->Groups[j]]+1, 0.0);
329                         }       
330
331                         //convert freq percentage to number
332                         int increment = 100;
333                         if (freq < 1.0) {  increment = largestGroup * freq;  
334                         }else { increment = freq;  }
335                         
336                         //initialize sampling spots
337                         set<int> numSampledList;
338                         for(int k = 1; k <= largestGroup; k++){  if((k == 1) || (k % increment == 0)){  numSampledList.insert(k); }   }
339                         if(largestGroup % increment != 0){      numSampledList.insert(largestGroup);   }
340                         
341                         //add other groups ending points
342                         for (int j = 0; j < m->Groups.size(); j++) {  
343                                 if (numSampledList.count(diversity[m->Groups[j]].size()-1) == 0) {  numSampledList.insert(diversity[m->Groups[j]].size()-1); }
344                         }
345                         
346                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
347                                 if(processors == 1){
348                                         driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);      
349                                 }else{
350                                         if (rarefy) {
351                                                 vector<int> procIters;
352                                                 
353                                                 int numItersPerProcessor = iters / processors;
354                                                 
355                                                 //divide iters between processes
356                                                 for (int h = 0; h < processors; h++) {
357                                                         if(h == processors - 1){
358                                                                 numItersPerProcessor = iters - h * numItersPerProcessor;
359                                                         }
360                                                         procIters.push_back(numItersPerProcessor);
361                                                 }
362                                                 
363                                                 createProcesses(procIters, trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum); 
364                                                 
365                                         }else{ //no need to paralellize if you dont want to rarefy
366                                                 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);      
367                                         }
368                                 }
369
370                         #else
371                                 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);      
372                         #endif
373
374                         if (rarefy) {   printData(numSampledList, sumDiversity, outRare, iters);        }
375                 }
376                 
377         
378                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        remove(outputNames[i].c_str());         } return 0; }
379
380                 m->mothurOutEndLine();
381                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
382                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
383                 m->mothurOutEndLine();
384
385                 
386                 return 0;
387         }
388         catch(exception& e) {
389                 m->errorOut(e, "PhyloDiversityCommand", "execute");
390                 exit(1);
391         }
392 }
393 //**********************************************************************************************************************
394 int PhyloDiversityCommand::createProcesses(vector<int>& procIters, Tree* t, map< string, vector<float> >& div, map<string, vector<float> >& sumDiv, int numIters, int increment, vector<int>& randomLeaf, set<int>& numSampledList, ofstream& outCollect, ofstream& outSum){
395         try {
396                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
397                 int process = 1;
398                 
399                 vector<int> processIDS;
400                 map< string, vector<float> >::iterator itSum;
401                 
402                 //loop through and create all the processes you want
403                 while (process != processors) {
404                         int pid = fork();
405                         
406                         if (pid > 0) {
407                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
408                                 process++;
409                         }else if (pid == 0){
410                                 driver(t, div, sumDiv, procIters[process], increment, randomLeaf, numSampledList, outCollect, outSum, false);
411                                 
412                                 string outTemp = outputDir + toString(getpid()) + ".sumDiv.temp";
413                                 ofstream out;
414                                 m->openOutputFile(outTemp, out);
415                                 
416                                 //output the sumDIversity
417                                 for (itSum = sumDiv.begin(); itSum != sumDiv.end(); itSum++) {
418                                         out << itSum->first << '\t' << (itSum->second).size() << '\t';
419                                         for (int k = 0; k < (itSum->second).size(); k++) { 
420                                                 out << (itSum->second)[k] << '\t';
421                                         }
422                                         out << endl;
423                                 }
424                                 
425                                 out.close();
426                                 
427                                 exit(0);
428                         }else { 
429                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
430                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
431                                 exit(0);
432                         }
433                 }
434                 
435                 driver(t, div, sumDiv, procIters[0], increment, randomLeaf, numSampledList, outCollect, outSum, true);
436                 
437                 //force parent to wait until all the processes are done
438                 for (int i=0;i<(processors-1);i++) { 
439                         int temp = processIDS[i];
440                         wait(&temp);
441                 }
442                 
443                 //get data created by processes
444                 for (int i=0;i<(processors-1);i++) { 
445                         
446                         //input the sumDIversity
447                         string inTemp = outputDir + toString(processIDS[i]) + ".sumDiv.temp";
448                         ifstream in;
449                         m->openInputFile(inTemp, in);
450                                 
451                         //output the sumDIversity
452                         for (int j = 0; j < sumDiv.size(); j++) { 
453                                 string group = "";
454                                 int size = 0;
455                                 
456                                 in >> group >> size; m->gobble(in);
457                                 
458                                 for (int k = 0; k < size; k++) { 
459                                         float tempVal;
460                                         in >> tempVal;
461                                         
462                                         sumDiv[group][k] += tempVal;
463                                 }
464                                 m->gobble(in);
465                         }
466                                 
467                         in.close();
468                         remove(inTemp.c_str());
469                 }
470                 
471 #endif
472
473         return 0;               
474         
475         }
476         catch(exception& e) {
477                 m->errorOut(e, "PhyloDiversityCommand", "createProcesses");
478                 exit(1);
479         }
480 }
481 //**********************************************************************************************************************
482 int PhyloDiversityCommand::driver(Tree* t, map< string, vector<float> >& div, map<string, vector<float> >& sumDiv, int numIters, int increment, vector<int>& randomLeaf, set<int>& numSampledList, ofstream& outCollect, ofstream& outSum, bool doSumCollect){
483         try {
484                 int numLeafNodes = randomLeaf.size();
485         
486                 for (int l = 0; l < numIters; l++) {
487                                 random_shuffle(randomLeaf.begin(), randomLeaf.end());
488                 
489                                 //initialize counts
490                                 map<string, int> counts;
491                                 map< string, set<int> > countedBranch;  
492                                 for (int j = 0; j < m->Groups.size(); j++) {  counts[m->Groups[j]] = 0; countedBranch[m->Groups[j]].insert(-2);  }  //add dummy index to initialize countedBranch sets
493                                 
494                                 for(int k = 0; k < numLeafNodes; k++){
495                                                 
496                                         if (m->control_pressed) { return 0; }
497                                         
498                                         //calc branch length of randomLeaf k
499                                         vector<float> br = calcBranchLength(t, randomLeaf[k], countedBranch);
500                         
501                                         //for each group in the groups update the total branch length accounting for the names file
502                                         vector<string> groups = t->tree[randomLeaf[k]].getGroup();
503                                         
504                                         for (int j = 0; j < groups.size(); j++) {
505                                                 int numSeqsInGroupJ = 0;
506                                                 map<string, int>::iterator it;
507                                                 it = t->tree[randomLeaf[k]].pcount.find(groups[j]);
508                                                 if (it != t->tree[randomLeaf[k]].pcount.end()) { //this leaf node contains seqs from group j
509                                                         numSeqsInGroupJ = it->second;
510                                                 }
511                                                 
512                                                 if (numSeqsInGroupJ != 0) {     div[groups[j]][(counts[groups[j]]+1)] = div[groups[j]][counts[groups[j]]] + br[j];  }
513                                                 
514                                                 for (int s = (counts[groups[j]]+2); s <= (counts[groups[j]]+numSeqsInGroupJ); s++) {
515                                                         div[groups[j]][s] = div[groups[j]][s-1];  //update counts, but don't add in redundant branch lengths
516                                                 }
517                                                 counts[groups[j]] += numSeqsInGroupJ;
518                                         }
519                                 }
520                                 
521                                 if (rarefy) {
522                                         //add this diversity to the sum
523                                         for (int j = 0; j < m->Groups.size(); j++) {  
524                                                 for (int g = 0; g < div[m->Groups[j]].size(); g++) {
525                                                         sumDiv[m->Groups[j]][g] += div[m->Groups[j]][g];
526                                                 }
527                                         }
528                                 }
529                                 
530                                 if ((collect) && (l == 0) && doSumCollect) {  printData(numSampledList, div, outCollect, 1);  }
531                                 if ((summary) && (l == 0) && doSumCollect) {  printSumData(div, outSum, 1);  }
532                         }
533                         
534                         return 0;
535
536         }
537         catch(exception& e) {
538                 m->errorOut(e, "PhyloDiversityCommand", "driver");
539                 exit(1);
540         }
541 }
542
543 //**********************************************************************************************************************
544
545 void PhyloDiversityCommand::printSumData(map< string, vector<float> >& div, ofstream& out, int numIters){
546         try {
547                 
548                 out << "Groups\tnumSampled\tphyloDiversity" << endl;
549                 
550                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
551                         
552                 for (int j = 0; j < m->Groups.size(); j++) {
553                         int numSampled = (div[m->Groups[j]].size()-1);
554                         out << m->Groups[j] << '\t' << numSampled << '\t';
555                 
556                          
557                         float score;
558                         if (scale)      {  score = (div[m->Groups[j]][numSampled] / (float)numIters) / (float)numSampled;       }
559                         else            {       score = div[m->Groups[j]][numSampled] / (float)numIters;        }
560                                 
561                         out << setprecision(4) << score << endl;
562                 }
563                                         
564                 out.close();
565                 
566         }
567         catch(exception& e) {
568                 m->errorOut(e, "PhyloDiversityCommand", "printSumData");
569                 exit(1);
570         }
571 }
572 //**********************************************************************************************************************
573
574 void PhyloDiversityCommand::printData(set<int>& num, map< string, vector<float> >& div, ofstream& out, int numIters){
575         try {
576                 
577                 out << "numSampled\t";
578                 for (int i = 0; i < m->Groups.size(); i++) { out << m->Groups[i] << '\t';  }
579                 out << endl;
580                 
581                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
582                 
583                 for (set<int>::iterator it = num.begin(); it != num.end(); it++) {  
584                         int numSampled = *it;
585                         
586                         out << numSampled << '\t';  
587                         
588                         for (int j = 0; j < m->Groups.size(); j++) {
589                                 if (numSampled < div[m->Groups[j]].size()) { 
590                                         float score;
591                                         if (scale)      {  score = (div[m->Groups[j]][numSampled] / (float)numIters) / (float)numSampled;       }
592                                         else            {       score = div[m->Groups[j]][numSampled] / (float)numIters;        }
593
594                                         out << setprecision(4) << score << '\t';
595                                 }else { out << "NA" << '\t'; }
596                         }
597                         out << endl;
598                 }
599                 
600                 out.close();
601                 
602         }
603         catch(exception& e) {
604                 m->errorOut(e, "PhyloDiversityCommand", "printData");
605                 exit(1);
606         }
607 }
608 //**********************************************************************************************************************
609 //need a vector of floats one branch length for every group the node represents.
610 vector<float> PhyloDiversityCommand::calcBranchLength(Tree* t, int leaf, map< string, set<int> >& counted){
611         try {
612
613                 //calc the branch length
614                 //while you aren't at root
615                 vector<float> sums; 
616                 int index = leaf;
617                 
618                 vector<string> groups = t->tree[leaf].getGroup();
619                 sums.resize(groups.size(), 0.0);
620                 
621                 map<string, map<int, double> > tempTotals; //maps node to total Branch Length
622                 map< string, set<int> > tempCounted;
623                 set<int>::iterator it;
624         
625                 //you are a leaf
626                 if(t->tree[index].getBranchLength() != -1){     
627                         for (int k = 0; k < groups.size(); k++) { 
628                                 sums[k] += abs(t->tree[index].getBranchLength());       
629                                 counted[groups[k]].insert(index);
630                         }
631                 }
632                 
633                 for (int k = 0; k < groups.size(); k++) { 
634                         tempTotals[groups[k]][index] = 0.0;     
635                 }
636                 
637                 index = t->tree[index].getParent();     
638                         
639                 //while you aren't at root
640                 while(t->tree[index].getParent() != -1){
641
642                         if (m->control_pressed) {  return sums; }
643                         
644                         int pcountSize = 0;     
645                         for (int k = 0; k < groups.size(); k++) {
646                                 map<string, int>::iterator itGroup = t->tree[index].pcount.find(groups[k]);
647                                 if (itGroup != t->tree[index].pcount.end()) { pcountSize++;  } 
648                         
649                                 //do both your chidren have have descendants from the users groups? 
650                                 int lc = t->tree[index].getLChild();
651                                 int rc = t->tree[index].getRChild();
652                         
653                                 int LpcountSize = 0;
654                                 itGroup = t->tree[lc].pcount.find(groups[k]);
655                                 if (itGroup != t->tree[lc].pcount.end()) { LpcountSize++;  } 
656                                                         
657                                 int RpcountSize = 0;
658                                 itGroup = t->tree[rc].pcount.find(groups[k]);
659                                 if (itGroup != t->tree[rc].pcount.end()) { RpcountSize++;  } 
660                                                                 
661                                 //if yes, add your childrens tempTotals
662                                 if ((LpcountSize != 0) && (RpcountSize != 0)) {
663                                         sums[k] += tempTotals[groups[k]][lc] + tempTotals[groups[k]][rc]; 
664                                         
665                                         for (it = tempCounted[groups[k]].begin(); it != tempCounted[groups[k]].end(); it++) { counted[groups[k]].insert(*it); }
666
667                                         //cout << "added to total " << tempTotals[lc] << '\t' << tempTotals[rc] << endl;
668                                         if (t->tree[index].getBranchLength() != -1) {
669                                                 if (counted[groups[k]].count(index) == 0) {
670                                                         tempTotals[groups[k]][index] = abs(t->tree[index].getBranchLength());
671                                                         tempCounted[groups[k]].insert(index);
672                                                 }else{
673                                                         tempTotals[groups[k]][index] = 0.0;
674                                                 }
675                                         }else {
676                                                 tempTotals[groups[k]][index] = 0.0;
677                                         }
678                                 }else { //if no, your tempTotal is your childrens temp totals + your branch length
679                                         tempTotals[groups[k]][index] = tempTotals[groups[k]][lc] + tempTotals[groups[k]][rc]; 
680                                                                         
681                                         if (counted[groups[k]].count(index) == 0) {
682                                                 tempTotals[groups[k]][index] += abs(t->tree[index].getBranchLength());
683                                                 tempCounted[groups[k]].insert(index);
684                                         }
685
686                                 }
687                                 //cout << "temptotal = "<< tempTotals[i] << endl;
688                         }
689                         
690                         index = t->tree[index].getParent();     
691                 }
692
693                 return sums;
694
695         }
696         catch(exception& e) {
697                 m->errorOut(e, "PhyloDiversityCommand", "calcBranchLength");
698                 exit(1);
699         }
700 }
701 /*****************************************************************/
702 int PhyloDiversityCommand::readNamesFile() {
703         try {
704                 m->names.clear();
705                 numUniquesInName = 0;
706                 
707                 ifstream in;
708                 m->openInputFile(namefile, in);
709                 
710                 string first, second;
711                 map<string, string>::iterator itNames;
712                 
713                 while(!in.eof()) {
714                         in >> first >> second; m->gobble(in);
715                         
716                         numUniquesInName++;
717                         
718                         itNames = m->names.find(first);
719                         if (itNames == m->names.end()) {  
720                                 m->names[first] = second; 
721                                 
722                                 //we need a list of names in your namefile to use above when removing extra seqs above so we don't remove them
723                                 vector<string> dupNames;
724                                 m->splitAtComma(second, dupNames);
725                                 
726                                 for (int i = 0; i < dupNames.size(); i++) {     
727                                         nameMap[dupNames[i]] = dupNames[i]; 
728                                         if ((groupfile == "") && (i != 0)) { tmap->addSeq(dupNames[i], "Group1"); } 
729                                 }
730                         }else {  m->mothurOut(first + " has already been seen in namefile, disregarding names file."); m->mothurOutEndLine(); in.close(); m->names.clear(); namefile = ""; return 1; }                  
731                 }
732                 in.close();
733                 
734                 return 0;
735         }
736         catch(exception& e) {
737                 m->errorOut(e, "PhyloDiversityCommand", "readNamesFile");
738                 exit(1);
739         }
740 }
741
742 //**********************************************************************************************************************
743
744
745