]> git.donarmstrong.com Git - mothur.git/blob - phylodiversitycommand.cpp
Merge remote-tracking branch 'mothur/master'
[mothur.git] / phylodiversitycommand.cpp
1 /*
2  *  phylodiversitycommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 4/30/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "phylodiversitycommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> PhyloDiversityCommand::setParameters(){  
14         try {
15
16                 CommandParameter ptree("tree", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptree);
17                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pgroup);
18                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
19                 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
20                 CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
21                 CommandParameter pfreq("freq", "Number", "", "100", "", "", "",false,false); parameters.push_back(pfreq);
22                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
23                 CommandParameter prarefy("rarefy", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(prarefy);
24                 CommandParameter psummary("summary", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(psummary);
25                 CommandParameter pcollect("collect", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pcollect);
26                 CommandParameter pscale("scale", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pscale);
27                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
28                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
29                 
30                 vector<string> myArray;
31                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
32                 return myArray;
33         }
34         catch(exception& e) {
35                 m->errorOut(e, "PhyloDiversityCommand", "setParameters");
36                 exit(1);
37         }
38 }
39 //**********************************************************************************************************************
40 string PhyloDiversityCommand::getHelpString(){  
41         try {
42                 string helpString = "";
43                 helpString += "The phylo.diversity command parameters are tree, group, name, groups, iters, freq, processors, scale, rarefy, collect and summary.  tree and group are required, unless you have valid current files.\n";
44                 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. The group names are separated by dashes. By default all groups are used.\n";
45                 helpString += "The iters parameter allows you to specify the number of randomizations to preform, by default iters=1000, if you set rarefy to true.\n";
46                 helpString += "The freq parameter is used indicate when to output your data, by default it is set to 100. But you can set it to a percentage of the number of sequence. For example freq=0.10, means 10%. \n";
47                 helpString += "The scale parameter is used indicate that you want your output scaled to the number of sequences sampled, default = false. \n";
48                 helpString += "The rarefy parameter allows you to create a rarefaction curve. The default is false.\n";
49                 helpString += "The collect parameter allows you to create a collectors curve. The default is false.\n";
50                 helpString += "The summary parameter allows you to create a .summary file. The default is true.\n";
51                 helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
52                 helpString += "The phylo.diversity command should be in the following format: phylo.diversity(groups=yourGroups, rarefy=yourRarefy, iters=yourIters).\n";
53                 helpString += "Example phylo.diversity(groups=A-B-C, rarefy=T, iters=500).\n";
54                 helpString += "The phylo.diversity command output two files: .phylo.diversity and if rarefy=T, .rarefaction.\n";
55                 helpString += "Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n";
56                 return helpString;
57         }
58         catch(exception& e) {
59                 m->errorOut(e, "PhyloDiversityCommand", "getHelpString");
60                 exit(1);
61         }
62 }
63
64
65 //**********************************************************************************************************************
66 PhyloDiversityCommand::PhyloDiversityCommand(){ 
67         try {
68                 abort = true; calledHelp = true; 
69                 setParameters();
70                 vector<string> tempOutNames;
71                 outputTypes["phylodiv"] = tempOutNames;
72                 outputTypes["rarefy"] = tempOutNames;
73                 outputTypes["summary"] = tempOutNames;
74         }
75         catch(exception& e) {
76                 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
77                 exit(1);
78         }
79 }
80 //**********************************************************************************************************************
81 PhyloDiversityCommand::PhyloDiversityCommand(string option)  {
82         try {
83                 abort = false; calledHelp = false;   
84                 
85                 //allow user to run help
86                 if(option == "help") { help(); abort = true; calledHelp = true; }
87                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
88                 
89                 else {
90                         vector<string> myArray = setParameters();;
91                         
92                         OptionParser parser(option);
93                         map<string,string> parameters = parser.getParameters();
94                         map<string,string>::iterator it;
95                         
96                         ValidParameters validParameter;
97                 
98                         //check to make sure all parameters are valid for command
99                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
100                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
101                         }
102                         
103                         //initialize outputTypes
104                         vector<string> tempOutNames;
105                         outputTypes["phylodiv"] = tempOutNames;
106                         outputTypes["rarefy"] = tempOutNames;
107                         outputTypes["summary"] = tempOutNames;
108                         
109                         //if the user changes the input directory command factory will send this info to us in the output parameter 
110                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
111                         if (inputDir == "not found"){   inputDir = "";          }
112                         else {
113                                 string path;
114                                 it = parameters.find("tree");
115                                 //user has given a template file
116                                 if(it != parameters.end()){ 
117                                         path = m->hasPath(it->second);
118                                         //if the user has not given a path then, add inputdir. else leave path alone.
119                                         if (path == "") {       parameters["tree"] = inputDir + it->second;             }
120                                 }
121                                 
122                                 it = parameters.find("group");
123                                 //user has given a template file
124                                 if(it != parameters.end()){ 
125                                         path = m->hasPath(it->second);
126                                         //if the user has not given a path then, add inputdir. else leave path alone.
127                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
128                                 }
129                                 
130                                 it = parameters.find("name");
131                                 //user has given a template file
132                                 if(it != parameters.end()){ 
133                                         path = m->hasPath(it->second);
134                                         //if the user has not given a path then, add inputdir. else leave path alone.
135                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
136                                 }
137                         }
138                         
139                         m->runParse = true;
140                         m->clearGroups();
141                         m->clearAllGroups();
142                         m->Treenames.clear();
143                         m->names.clear();
144                         
145                         //check for required parameters
146                         treefile = validParameter.validFile(parameters, "tree", true);
147                         if (treefile == "not open") { treefile = ""; abort = true; }
148                         else if (treefile == "not found") {                             
149                                 //if there is a current design file, use it
150                                 treefile = m->getTreeFile(); 
151                                 if (treefile != "") { m->mothurOut("Using " + treefile + " as input file for the tree parameter."); m->mothurOutEndLine(); }
152                                 else {  m->mothurOut("You have no current tree file and the tree parameter is required."); m->mothurOutEndLine(); abort = true; }                                                               
153                         }else { m->setTreeFile(treefile); }     
154                         
155                         //check for required parameters
156                         groupfile = validParameter.validFile(parameters, "group", true);
157                         if (groupfile == "not open") { groupfile = ""; abort = true; }
158                         else if (groupfile == "not found") { groupfile = ""; }
159                         else { m->setGroupFile(groupfile); }
160                         
161                         namefile = validParameter.validFile(parameters, "name", true);
162                         if (namefile == "not open") { namefile = ""; abort = true; }
163                         else if (namefile == "not found") { namefile = ""; }
164                         else { m->setNameFile(namefile); }
165                         
166                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(treefile);       }
167                         
168                         string temp;
169                         temp = validParameter.validFile(parameters, "freq", false);                     if (temp == "not found") { temp = "100"; }
170                         m->mothurConvert(temp, freq); 
171                         
172                         temp = validParameter.validFile(parameters, "iters", false);                    if (temp == "not found") { temp = "1000"; }
173                         m->mothurConvert(temp, iters); 
174                         
175                         temp = validParameter.validFile(parameters, "rarefy", false);                   if (temp == "not found") { temp = "F"; }
176                         rarefy = m->isTrue(temp);
177                         if (!rarefy) { iters = 1;  }
178                         
179                         temp = validParameter.validFile(parameters, "summary", false);                  if (temp == "not found") { temp = "T"; }
180                         summary = m->isTrue(temp);
181                         
182                         temp = validParameter.validFile(parameters, "scale", false);                    if (temp == "not found") { temp = "F"; }
183                         scale = m->isTrue(temp);
184                         
185                         temp = validParameter.validFile(parameters, "collect", false);                  if (temp == "not found") { temp = "F"; }
186                         collect = m->isTrue(temp);
187                         
188                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
189                         m->setProcessors(temp);
190                         m->mothurConvert(temp, processors); 
191                         
192                         groups = validParameter.validFile(parameters, "groups", false);                 
193                         if (groups == "not found") { groups = "";  }
194                         else { 
195                                 m->splitAtDash(groups, Groups);
196                                 m->setGroups(Groups);
197                         }
198                         
199                         if ((!collect) && (!rarefy) && (!summary)) { m->mothurOut("No outputs selected. You must set either collect, rarefy or summary to true, summary=T by default."); m->mothurOutEndLine(); abort=true; }
200                         
201                         if (namefile == "") {
202                                 vector<string> files; files.push_back(treefile);
203                                 parser.getNameFile(files);
204                         }
205                 }
206                 
207         }
208         catch(exception& e) {
209                 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
210                 exit(1);
211         }                       
212 }
213 //**********************************************************************************************************************
214
215 int PhyloDiversityCommand::execute(){
216         try {
217                 
218                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
219                 
220                 m->setTreeFile(treefile);
221                 
222                 if (groupfile != "") {
223                         //read in group map info.
224                         tmap = new TreeMap(groupfile);
225                         tmap->readMap();
226                 }else{ //fake out by putting everyone in one group
227                         Tree* tree = new Tree(treefile); delete tree;  //extracts names from tree to make faked out groupmap
228                         tmap = new TreeMap();
229                         
230                         for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); }
231                 }
232                 
233                 if (namefile != "") { readNamesFile(); }
234                 
235                 read = new ReadNewickTree(treefile);
236                 int readOk = read->read(tmap); 
237                 
238                 if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete tmap; delete read; return 0; }
239                 
240                 read->AssembleTrees();
241                 vector<Tree*> trees = read->getTrees();
242                 delete read;
243                 
244                 //make sure all files match
245                 //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size.
246                 int numNamesInTree;
247                 if (namefile != "")  {  
248                         if (numUniquesInName == m->Treenames.size()) {  numNamesInTree = nameMap.size();  }
249                         else {   numNamesInTree = m->Treenames.size();  }
250                 }else {  numNamesInTree = m->Treenames.size();  }
251                 
252                 
253                 //output any names that are in group file but not in tree
254                 if (numNamesInTree < tmap->getNumSeqs()) {
255                         for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
256                                 //is that name in the tree?
257                                 int count = 0;
258                                 for (int j = 0; j < m->Treenames.size(); j++) {
259                                         if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
260                                         count++;
261                                 }
262                                 
263                                 if (m->control_pressed) { 
264                                         delete tmap; for (int i = 0; i < trees.size(); i++) { delete trees[i]; }
265                                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } outputTypes.clear();
266                                         m->clearGroups();
267                                         return 0;
268                                 }
269                                 
270                                 //then you did not find it so report it 
271                                 if (count == m->Treenames.size()) { 
272                                         //if it is in your namefile then don't remove
273                                         map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
274                                         
275                                         if (it == nameMap.end()) {
276                                                 m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
277                                                 tmap->removeSeq(tmap->namesOfSeqs[i]);
278                                                 i--; //need this because removeSeq removes name from namesOfSeqs
279                                         }
280                                 }
281                         }
282                 }
283                 
284                 SharedUtil* util = new SharedUtil();
285                 vector<string> mGroups = m->getGroups();
286                 vector<string> tGroups = tmap->getNamesOfGroups();
287                 util->setGroups(mGroups, tGroups, "phylo.diversity");   //sets the groups the user wants to analyze
288                 delete util;
289                 
290                 //incase the user had some mismatches between the tree and group files we don't want group xxx to be analyzed
291                 for (int i = 0; i < mGroups.size(); i++) { if (mGroups[i] == "xxx") { mGroups.erase(mGroups.begin()+i);  break; }  }
292                 m->setGroups(mGroups);
293                  
294                 vector<string> outputNames;
295                 
296                 //for each of the users trees
297                 for(int i = 0; i < trees.size(); i++) {
298                 
299                         if (m->control_pressed) { delete tmap; for (int j = 0; j < trees.size(); j++) { delete trees[j]; } for (int j = 0; j < outputNames.size(); j++) {       m->mothurRemove(outputNames[j]);        } return 0; }
300                         
301                         ofstream outSum, outRare, outCollect;
302                         string outSumFile = outputDir + m->getRootName(m->getSimpleName(treefile))  + toString(i+1) + ".phylodiv.summary";
303                         string outRareFile = outputDir + m->getRootName(m->getSimpleName(treefile))  + toString(i+1) + ".phylodiv.rarefaction";
304                         string outCollectFile = outputDir + m->getRootName(m->getSimpleName(treefile))  + toString(i+1) + ".phylodiv";
305                         
306                         if (summary)    { m->openOutputFile(outSumFile, outSum); outputNames.push_back(outSumFile);             outputTypes["summary"].push_back(outSumFile);                   }
307                         if (rarefy)             { m->openOutputFile(outRareFile, outRare); outputNames.push_back(outRareFile);  outputTypes["rarefy"].push_back(outRareFile);                   }
308                         if (collect)    { m->openOutputFile(outCollectFile, outCollect); outputNames.push_back(outCollectFile);  outputTypes["phylodiv"].push_back(outCollectFile);  }
309                         
310                         int numLeafNodes = trees[i]->getNumLeaves();
311                         
312                         //create a vector containing indexes of leaf nodes, randomize it, select nodes to send to calculator
313                         vector<int> randomLeaf;
314                         for (int j = 0; j < numLeafNodes; j++) {  
315                                 if (m->inUsersGroups(trees[i]->tree[j].getGroup(), mGroups) == true) { //is this a node from the group the user selected.
316                                         randomLeaf.push_back(j); 
317                                 }
318                         }
319                         
320                         numLeafNodes = randomLeaf.size();  //reset the number of leaf nodes you are using 
321                         
322                         //each group, each sampling, if no rarefy iters = 1;
323                         map<string, vector<float> > diversity;
324                         
325                         //each group, each sampling, if no rarefy iters = 1;
326                         map<string, vector<float> > sumDiversity;
327                         
328                         //find largest group total 
329                         int largestGroup = 0;
330                         for (int j = 0; j < mGroups.size(); j++) {  
331                                 if (tmap->seqsPerGroup[mGroups[j]] > largestGroup) { largestGroup = tmap->seqsPerGroup[mGroups[j]]; }
332                                 
333                                 //initialize diversity
334                                 diversity[mGroups[j]].resize(tmap->seqsPerGroup[mGroups[j]]+1, 0.0);            //numSampled
335                                                                                                                                                                                                                         //groupA                0.0                     0.0
336                                                                                                                                                                                                                         
337                                 //initialize sumDiversity
338                                 sumDiversity[mGroups[j]].resize(tmap->seqsPerGroup[mGroups[j]]+1, 0.0);
339                         }       
340
341                         //convert freq percentage to number
342                         int increment = 100;
343                         if (freq < 1.0) {  increment = largestGroup * freq;  
344                         }else { increment = freq;  }
345                         
346                         //initialize sampling spots
347                         set<int> numSampledList;
348                         for(int k = 1; k <= largestGroup; k++){  if((k == 1) || (k % increment == 0)){  numSampledList.insert(k); }   }
349                         if(largestGroup % increment != 0){      numSampledList.insert(largestGroup);   }
350                         
351                         //add other groups ending points
352                         for (int j = 0; j < mGroups.size(); j++) {  
353                                 if (numSampledList.count(diversity[mGroups[j]].size()-1) == 0) {  numSampledList.insert(diversity[mGroups[j]].size()-1); }
354                         }
355                         
356                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
357                                 if(processors == 1){
358                                         driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);      
359                                 }else{
360                                         if (rarefy) {
361                                                 vector<int> procIters;
362                                                 
363                                                 int numItersPerProcessor = iters / processors;
364                                                 
365                                                 //divide iters between processes
366                                                 for (int h = 0; h < processors; h++) {
367                                                         if(h == processors - 1){
368                                                                 numItersPerProcessor = iters - h * numItersPerProcessor;
369                                                         }
370                                                         procIters.push_back(numItersPerProcessor);
371                                                 }
372                                                 
373                                                 createProcesses(procIters, trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum); 
374                                                 
375                                         }else{ //no need to paralellize if you dont want to rarefy
376                                                 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);      
377                                         }
378                                 }
379
380                         #else
381                                 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);      
382                         #endif
383
384                         if (rarefy) {   printData(numSampledList, sumDiversity, outRare, iters);        }
385                 }
386                 
387         
388                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);        } return 0; }
389
390                 m->mothurOutEndLine();
391                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
392                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
393                 m->mothurOutEndLine();
394
395                 
396                 return 0;
397         }
398         catch(exception& e) {
399                 m->errorOut(e, "PhyloDiversityCommand", "execute");
400                 exit(1);
401         }
402 }
403 //**********************************************************************************************************************
404 int PhyloDiversityCommand::createProcesses(vector<int>& procIters, Tree* t, map< string, vector<float> >& div, map<string, vector<float> >& sumDiv, int numIters, int increment, vector<int>& randomLeaf, set<int>& numSampledList, ofstream& outCollect, ofstream& outSum){
405         try {
406                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
407                 int process = 1;
408                 
409                 vector<int> processIDS;
410                 map< string, vector<float> >::iterator itSum;
411                 
412                 //loop through and create all the processes you want
413                 while (process != processors) {
414                         int pid = fork();
415                         
416                         if (pid > 0) {
417                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
418                                 process++;
419                         }else if (pid == 0){
420                                 driver(t, div, sumDiv, procIters[process], increment, randomLeaf, numSampledList, outCollect, outSum, false);
421                                 
422                                 string outTemp = outputDir + toString(getpid()) + ".sumDiv.temp";
423                                 ofstream out;
424                                 m->openOutputFile(outTemp, out);
425                                 
426                                 //output the sumDIversity
427                                 for (itSum = sumDiv.begin(); itSum != sumDiv.end(); itSum++) {
428                                         out << itSum->first << '\t' << (itSum->second).size() << '\t';
429                                         for (int k = 0; k < (itSum->second).size(); k++) { 
430                                                 out << (itSum->second)[k] << '\t';
431                                         }
432                                         out << endl;
433                                 }
434                                 
435                                 out.close();
436                                 
437                                 exit(0);
438                         }else { 
439                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
440                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
441                                 exit(0);
442                         }
443                 }
444                 
445                 driver(t, div, sumDiv, procIters[0], increment, randomLeaf, numSampledList, outCollect, outSum, true);
446                 
447                 //force parent to wait until all the processes are done
448                 for (int i=0;i<(processors-1);i++) { 
449                         int temp = processIDS[i];
450                         wait(&temp);
451                 }
452                 
453                 //get data created by processes
454                 for (int i=0;i<(processors-1);i++) { 
455                         
456                         //input the sumDIversity
457                         string inTemp = outputDir + toString(processIDS[i]) + ".sumDiv.temp";
458                         ifstream in;
459                         m->openInputFile(inTemp, in);
460                                 
461                         //output the sumDIversity
462                         for (int j = 0; j < sumDiv.size(); j++) { 
463                                 string group = "";
464                                 int size = 0;
465                                 
466                                 in >> group >> size; m->gobble(in);
467                                 
468                                 for (int k = 0; k < size; k++) { 
469                                         float tempVal;
470                                         in >> tempVal;
471                                         
472                                         sumDiv[group][k] += tempVal;
473                                 }
474                                 m->gobble(in);
475                         }
476                                 
477                         in.close();
478                         m->mothurRemove(inTemp);
479                 }
480                 
481 #endif
482
483         return 0;               
484         
485         }
486         catch(exception& e) {
487                 m->errorOut(e, "PhyloDiversityCommand", "createProcesses");
488                 exit(1);
489         }
490 }
491 //**********************************************************************************************************************
492 int PhyloDiversityCommand::driver(Tree* t, map< string, vector<float> >& div, map<string, vector<float> >& sumDiv, int numIters, int increment, vector<int>& randomLeaf, set<int>& numSampledList, ofstream& outCollect, ofstream& outSum, bool doSumCollect){
493         try {
494                 int numLeafNodes = randomLeaf.size();
495                 vector<string> mGroups = m->getGroups();
496         
497                 for (int l = 0; l < numIters; l++) {
498                                 random_shuffle(randomLeaf.begin(), randomLeaf.end());
499                 
500                                 //initialize counts
501                                 map<string, int> counts;
502                                 map< string, set<int> > countedBranch;  
503                                 for (int j = 0; j < mGroups.size(); j++) {  counts[mGroups[j]] = 0; countedBranch[mGroups[j]].insert(-2);  }  //add dummy index to initialize countedBranch sets
504                                 
505                                 for(int k = 0; k < numLeafNodes; k++){
506                                                 
507                                         if (m->control_pressed) { return 0; }
508                                         
509                                         //calc branch length of randomLeaf k
510                                         vector<float> br = calcBranchLength(t, randomLeaf[k], countedBranch);
511                         
512                                         //for each group in the groups update the total branch length accounting for the names file
513                                         vector<string> groups = t->tree[randomLeaf[k]].getGroup();
514                                         
515                                         for (int j = 0; j < groups.size(); j++) {
516                                                 int numSeqsInGroupJ = 0;
517                                                 map<string, int>::iterator it;
518                                                 it = t->tree[randomLeaf[k]].pcount.find(groups[j]);
519                                                 if (it != t->tree[randomLeaf[k]].pcount.end()) { //this leaf node contains seqs from group j
520                                                         numSeqsInGroupJ = it->second;
521                                                 }
522                                                 
523                                                 if (numSeqsInGroupJ != 0) {     div[groups[j]][(counts[groups[j]]+1)] = div[groups[j]][counts[groups[j]]] + br[j];  }
524                                                 
525                                                 for (int s = (counts[groups[j]]+2); s <= (counts[groups[j]]+numSeqsInGroupJ); s++) {
526                                                         div[groups[j]][s] = div[groups[j]][s-1];  //update counts, but don't add in redundant branch lengths
527                                                 }
528                                                 counts[groups[j]] += numSeqsInGroupJ;
529                                         }
530                                 }
531                                 
532                                 if (rarefy) {
533                                         //add this diversity to the sum
534                                         for (int j = 0; j < mGroups.size(); j++) {  
535                                                 for (int g = 0; g < div[mGroups[j]].size(); g++) {
536                                                         sumDiv[mGroups[j]][g] += div[mGroups[j]][g];
537                                                 }
538                                         }
539                                 }
540                                 
541                                 if ((collect) && (l == 0) && doSumCollect) {  printData(numSampledList, div, outCollect, 1);  }
542                                 if ((summary) && (l == 0) && doSumCollect) {  printSumData(div, outSum, 1);  }
543                         }
544                         
545                         return 0;
546
547         }
548         catch(exception& e) {
549                 m->errorOut(e, "PhyloDiversityCommand", "driver");
550                 exit(1);
551         }
552 }
553
554 //**********************************************************************************************************************
555
556 void PhyloDiversityCommand::printSumData(map< string, vector<float> >& div, ofstream& out, int numIters){
557         try {
558                 
559                 out << "Groups\tnumSampled\tphyloDiversity" << endl;
560                 
561                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
562                 
563                 vector<string> mGroups = m->getGroups();
564                 for (int j = 0; j < mGroups.size(); j++) {
565                         int numSampled = (div[mGroups[j]].size()-1);
566                         out << mGroups[j] << '\t' << numSampled << '\t';
567                 
568                          
569                         float score;
570                         if (scale)      {  score = (div[mGroups[j]][numSampled] / (float)numIters) / (float)numSampled; }
571                         else            {       score = div[mGroups[j]][numSampled] / (float)numIters;  }
572                                 
573                         out << setprecision(4) << score << endl;
574                 }
575                                         
576                 out.close();
577                 
578         }
579         catch(exception& e) {
580                 m->errorOut(e, "PhyloDiversityCommand", "printSumData");
581                 exit(1);
582         }
583 }
584 //**********************************************************************************************************************
585
586 void PhyloDiversityCommand::printData(set<int>& num, map< string, vector<float> >& div, ofstream& out, int numIters){
587         try {
588                 
589                 out << "numSampled\t";
590                 vector<string> mGroups = m->getGroups();
591                 for (int i = 0; i < mGroups.size(); i++) { out << mGroups[i] << '\t';  }
592                 out << endl;
593                 
594                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
595                 
596                 for (set<int>::iterator it = num.begin(); it != num.end(); it++) {  
597                         int numSampled = *it;
598                         
599                         out << numSampled << '\t';  
600                 
601                         for (int j = 0; j < mGroups.size(); j++) {
602                                 if (numSampled < div[mGroups[j]].size()) { 
603                                         float score;
604                                         if (scale)      {  score = (div[mGroups[j]][numSampled] / (float)numIters) / (float)numSampled; }
605                                         else            {       score = div[mGroups[j]][numSampled] / (float)numIters;  }
606
607                                         out << setprecision(4) << score << '\t';
608                                 }else { out << "NA" << '\t'; }
609                         }
610                         out << endl;
611                 }
612                 
613                 out.close();
614                 
615         }
616         catch(exception& e) {
617                 m->errorOut(e, "PhyloDiversityCommand", "printData");
618                 exit(1);
619         }
620 }
621 //**********************************************************************************************************************
622 //need a vector of floats one branch length for every group the node represents.
623 vector<float> PhyloDiversityCommand::calcBranchLength(Tree* t, int leaf, map< string, set<int> >& counted){
624         try {
625
626                 //calc the branch length
627                 //while you aren't at root
628                 vector<float> sums; 
629                 int index = leaf;
630                 
631                 vector<string> groups = t->tree[leaf].getGroup();
632                 sums.resize(groups.size(), 0.0);
633                 
634                 map<string, map<int, double> > tempTotals; //maps node to total Branch Length
635                 map< string, set<int> > tempCounted;
636                 set<int>::iterator it;
637         
638                 //you are a leaf
639                 if(t->tree[index].getBranchLength() != -1){     
640                         for (int k = 0; k < groups.size(); k++) { 
641                                 sums[k] += abs(t->tree[index].getBranchLength());       
642                                 counted[groups[k]].insert(index);
643                         }
644                 }
645                 
646                 for (int k = 0; k < groups.size(); k++) { 
647                         tempTotals[groups[k]][index] = 0.0;     
648                 }
649                 
650                 index = t->tree[index].getParent();     
651                         
652                 //while you aren't at root
653                 while(t->tree[index].getParent() != -1){
654
655                         if (m->control_pressed) {  return sums; }
656                         
657                         int pcountSize = 0;     
658                         for (int k = 0; k < groups.size(); k++) {
659                                 map<string, int>::iterator itGroup = t->tree[index].pcount.find(groups[k]);
660                                 if (itGroup != t->tree[index].pcount.end()) { pcountSize++;  } 
661                         
662                                 //do both your chidren have have descendants from the users groups? 
663                                 int lc = t->tree[index].getLChild();
664                                 int rc = t->tree[index].getRChild();
665                         
666                                 int LpcountSize = 0;
667                                 itGroup = t->tree[lc].pcount.find(groups[k]);
668                                 if (itGroup != t->tree[lc].pcount.end()) { LpcountSize++;  } 
669                                                         
670                                 int RpcountSize = 0;
671                                 itGroup = t->tree[rc].pcount.find(groups[k]);
672                                 if (itGroup != t->tree[rc].pcount.end()) { RpcountSize++;  } 
673                                                                 
674                                 //if yes, add your childrens tempTotals
675                                 if ((LpcountSize != 0) && (RpcountSize != 0)) {
676                                         sums[k] += tempTotals[groups[k]][lc] + tempTotals[groups[k]][rc]; 
677                                         
678                                         for (it = tempCounted[groups[k]].begin(); it != tempCounted[groups[k]].end(); it++) { counted[groups[k]].insert(*it); }
679
680                                         //cout << "added to total " << tempTotals[lc] << '\t' << tempTotals[rc] << endl;
681                                         if (t->tree[index].getBranchLength() != -1) {
682                                                 if (counted[groups[k]].count(index) == 0) {
683                                                         tempTotals[groups[k]][index] = abs(t->tree[index].getBranchLength());
684                                                         tempCounted[groups[k]].insert(index);
685                                                 }else{
686                                                         tempTotals[groups[k]][index] = 0.0;
687                                                 }
688                                         }else {
689                                                 tempTotals[groups[k]][index] = 0.0;
690                                         }
691                                 }else { //if no, your tempTotal is your childrens temp totals + your branch length
692                                         tempTotals[groups[k]][index] = tempTotals[groups[k]][lc] + tempTotals[groups[k]][rc]; 
693                                                                         
694                                         if (counted[groups[k]].count(index) == 0) {
695                                                 tempTotals[groups[k]][index] += abs(t->tree[index].getBranchLength());
696                                                 tempCounted[groups[k]].insert(index);
697                                         }
698
699                                 }
700                                 //cout << "temptotal = "<< tempTotals[i] << endl;
701                         }
702                         
703                         index = t->tree[index].getParent();     
704                 }
705
706                 return sums;
707
708         }
709         catch(exception& e) {
710                 m->errorOut(e, "PhyloDiversityCommand", "calcBranchLength");
711                 exit(1);
712         }
713 }
714 /*****************************************************************/
715 int PhyloDiversityCommand::readNamesFile() {
716         try {
717                 m->names.clear();
718                 numUniquesInName = 0;
719                 
720                 ifstream in;
721                 m->openInputFile(namefile, in);
722                 
723                 string first, second;
724                 map<string, string>::iterator itNames;
725                 
726                 while(!in.eof()) {
727                         in >> first >> second; m->gobble(in);
728                         
729                         numUniquesInName++;
730                         
731                         itNames = m->names.find(first);
732                         if (itNames == m->names.end()) {  
733                                 m->names[first] = second; 
734                                 
735                                 //we need a list of names in your namefile to use above when removing extra seqs above so we don't remove them
736                                 vector<string> dupNames;
737                                 m->splitAtComma(second, dupNames);
738                                 
739                                 for (int i = 0; i < dupNames.size(); i++) {     
740                                         nameMap[dupNames[i]] = dupNames[i]; 
741                                         if ((groupfile == "") && (i != 0)) { tmap->addSeq(dupNames[i], "Group1"); } 
742                                 }
743                         }else {  m->mothurOut(first + " has already been seen in namefile, disregarding names file."); m->mothurOutEndLine(); in.close(); m->names.clear(); namefile = ""; return 1; }                  
744                 }
745                 in.close();
746                 
747                 return 0;
748         }
749         catch(exception& e) {
750                 m->errorOut(e, "PhyloDiversityCommand", "readNamesFile");
751                 exit(1);
752         }
753 }
754
755 //**********************************************************************************************************************
756
757
758