]> git.donarmstrong.com Git - mothur.git/blob - phylodiversitycommand.cpp
rewrote metastats command in c++, added mothurRemove function to handle ~ error....
[mothur.git] / phylodiversitycommand.cpp
1 /*
2  *  phylodiversitycommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 4/30/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "phylodiversitycommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> PhyloDiversityCommand::setParameters(){  
14         try {
15
16                 CommandParameter ptree("tree", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptree);
17                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pgroup);
18                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
19                 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
20                 CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
21                 CommandParameter pfreq("freq", "Number", "", "100", "", "", "",false,false); parameters.push_back(pfreq);
22                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
23                 CommandParameter prarefy("rarefy", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(prarefy);
24                 CommandParameter psummary("summary", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(psummary);
25                 CommandParameter pcollect("collect", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pcollect);
26                 CommandParameter pscale("scale", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pscale);
27                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
28                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
29                 
30                 vector<string> myArray;
31                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
32                 return myArray;
33         }
34         catch(exception& e) {
35                 m->errorOut(e, "PhyloDiversityCommand", "setParameters");
36                 exit(1);
37         }
38 }
39 //**********************************************************************************************************************
40 string PhyloDiversityCommand::getHelpString(){  
41         try {
42                 string helpString = "";
43                 helpString += "The phylo.diversity command parameters are tree, group, name, groups, iters, freq, processors, scale, rarefy, collect and summary.  tree and group are required, unless you have valid current files.\n";
44                 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. The group names are separated by dashes. By default all groups are used.\n";
45                 helpString += "The iters parameter allows you to specify the number of randomizations to preform, by default iters=1000, if you set rarefy to true.\n";
46                 helpString += "The freq parameter is used indicate when to output your data, by default it is set to 100. But you can set it to a percentage of the number of sequence. For example freq=0.10, means 10%. \n";
47                 helpString += "The scale parameter is used indicate that you want your output scaled to the number of sequences sampled, default = false. \n";
48                 helpString += "The rarefy parameter allows you to create a rarefaction curve. The default is false.\n";
49                 helpString += "The collect parameter allows you to create a collectors curve. The default is false.\n";
50                 helpString += "The summary parameter allows you to create a .summary file. The default is true.\n";
51                 helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
52                 helpString += "The phylo.diversity command should be in the following format: phylo.diversity(groups=yourGroups, rarefy=yourRarefy, iters=yourIters).\n";
53                 helpString += "Example phylo.diversity(groups=A-B-C, rarefy=T, iters=500).\n";
54                 helpString += "The phylo.diversity command output two files: .phylo.diversity and if rarefy=T, .rarefaction.\n";
55                 helpString += "Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n";
56                 return helpString;
57         }
58         catch(exception& e) {
59                 m->errorOut(e, "PhyloDiversityCommand", "getHelpString");
60                 exit(1);
61         }
62 }
63
64
65 //**********************************************************************************************************************
66 PhyloDiversityCommand::PhyloDiversityCommand(){ 
67         try {
68                 abort = true; calledHelp = true; 
69                 setParameters();
70                 vector<string> tempOutNames;
71                 outputTypes["phylodiv"] = tempOutNames;
72                 outputTypes["rarefy"] = tempOutNames;
73                 outputTypes["summary"] = tempOutNames;
74         }
75         catch(exception& e) {
76                 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
77                 exit(1);
78         }
79 }
80 //**********************************************************************************************************************
81 PhyloDiversityCommand::PhyloDiversityCommand(string option)  {
82         try {
83                 abort = false; calledHelp = false;   
84                 
85                 //allow user to run help
86                 if(option == "help") { help(); abort = true; calledHelp = true; }
87                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
88                 
89                 else {
90                         vector<string> myArray = setParameters();;
91                         
92                         OptionParser parser(option);
93                         map<string,string> parameters = parser.getParameters();
94                         map<string,string>::iterator it;
95                         
96                         ValidParameters validParameter;
97                 
98                         //check to make sure all parameters are valid for command
99                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
100                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
101                         }
102                         
103                         //initialize outputTypes
104                         vector<string> tempOutNames;
105                         outputTypes["phylodiv"] = tempOutNames;
106                         outputTypes["rarefy"] = tempOutNames;
107                         outputTypes["summary"] = tempOutNames;
108                         
109                         //if the user changes the input directory command factory will send this info to us in the output parameter 
110                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
111                         if (inputDir == "not found"){   inputDir = "";          }
112                         else {
113                                 string path;
114                                 it = parameters.find("tree");
115                                 //user has given a template file
116                                 if(it != parameters.end()){ 
117                                         path = m->hasPath(it->second);
118                                         //if the user has not given a path then, add inputdir. else leave path alone.
119                                         if (path == "") {       parameters["tree"] = inputDir + it->second;             }
120                                 }
121                                 
122                                 it = parameters.find("group");
123                                 //user has given a template file
124                                 if(it != parameters.end()){ 
125                                         path = m->hasPath(it->second);
126                                         //if the user has not given a path then, add inputdir. else leave path alone.
127                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
128                                 }
129                                 
130                                 it = parameters.find("name");
131                                 //user has given a template file
132                                 if(it != parameters.end()){ 
133                                         path = m->hasPath(it->second);
134                                         //if the user has not given a path then, add inputdir. else leave path alone.
135                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
136                                 }
137                         }
138                         
139                         m->runParse = true;
140                         m->Groups.clear();
141                         m->namesOfGroups.clear();
142                         m->Treenames.clear();
143                         m->names.clear();
144                         
145                         //check for required parameters
146                         treefile = validParameter.validFile(parameters, "tree", true);
147                         if (treefile == "not open") { abort = true; }
148                         else if (treefile == "not found") {                             
149                                 //if there is a current design file, use it
150                                 treefile = m->getTreeFile(); 
151                                 if (treefile != "") { m->mothurOut("Using " + treefile + " as input file for the tree parameter."); m->mothurOutEndLine(); }
152                                 else {  m->mothurOut("You have no current tree file and the tree parameter is required."); m->mothurOutEndLine(); abort = true; }                                                               
153                         }else { m->setTreeFile(treefile); }     
154                         
155                         //check for required parameters
156                         groupfile = validParameter.validFile(parameters, "group", true);
157                         if (groupfile == "not open") { groupfile = ""; abort = true; }
158                         else if (groupfile == "not found") { groupfile = ""; }
159                         else { m->setGroupFile(groupfile); }
160                         
161                         namefile = validParameter.validFile(parameters, "name", true);
162                         if (namefile == "not open") { abort = true; }
163                         else if (namefile == "not found") { namefile = ""; }
164                         else { m->setNameFile(namefile); }
165                         
166                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(treefile);       }
167                         
168                         string temp;
169                         temp = validParameter.validFile(parameters, "freq", false);                     if (temp == "not found") { temp = "100"; }
170                         convert(temp, freq); 
171                         
172                         temp = validParameter.validFile(parameters, "iters", false);                    if (temp == "not found") { temp = "1000"; }
173                         convert(temp, iters); 
174                         
175                         temp = validParameter.validFile(parameters, "rarefy", false);                   if (temp == "not found") { temp = "F"; }
176                         rarefy = m->isTrue(temp);
177                         if (!rarefy) { iters = 1;  }
178                         
179                         temp = validParameter.validFile(parameters, "summary", false);                  if (temp == "not found") { temp = "T"; }
180                         summary = m->isTrue(temp);
181                         
182                         temp = validParameter.validFile(parameters, "scale", false);                    if (temp == "not found") { temp = "F"; }
183                         scale = m->isTrue(temp);
184                         
185                         temp = validParameter.validFile(parameters, "collect", false);                  if (temp == "not found") { temp = "F"; }
186                         collect = m->isTrue(temp);
187                         
188                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
189                         m->setProcessors(temp);
190                         convert(temp, processors); 
191                         
192                         groups = validParameter.validFile(parameters, "groups", false);                 
193                         if (groups == "not found") { groups = "";  }
194                         else { 
195                                 m->splitAtDash(groups, Groups);
196                                 m->Groups = Groups;
197                         }
198                         
199                         if ((!collect) && (!rarefy) && (!summary)) { m->mothurOut("No outputs selected. You must set either collect, rarefy or summary to true, summary=T by default."); m->mothurOutEndLine(); abort=true; }
200                 }
201                 
202         }
203         catch(exception& e) {
204                 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
205                 exit(1);
206         }                       
207 }
208 //**********************************************************************************************************************
209
210 int PhyloDiversityCommand::execute(){
211         try {
212                 
213                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
214                 
215                 m->setTreeFile(treefile);
216                 
217                 if (groupfile != "") {
218                         //read in group map info.
219                         tmap = new TreeMap(groupfile);
220                         tmap->readMap();
221                 }else{ //fake out by putting everyone in one group
222                         Tree* tree = new Tree(treefile); delete tree;  //extracts names from tree to make faked out groupmap
223                         tmap = new TreeMap();
224                         
225                         for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); }
226                 }
227                 
228                 if (namefile != "") { readNamesFile(); }
229                 
230                 read = new ReadNewickTree(treefile);
231                 int readOk = read->read(tmap); 
232                 
233                 if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete tmap; delete read; return 0; }
234                 
235                 read->AssembleTrees();
236                 vector<Tree*> trees = read->getTrees();
237                 delete read;
238                 
239                 //make sure all files match
240                 //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size.
241                 int numNamesInTree;
242                 if (namefile != "")  {  
243                         if (numUniquesInName == m->Treenames.size()) {  numNamesInTree = nameMap.size();  }
244                         else {   numNamesInTree = m->Treenames.size();  }
245                 }else {  numNamesInTree = m->Treenames.size();  }
246                 
247                 
248                 //output any names that are in group file but not in tree
249                 if (numNamesInTree < tmap->getNumSeqs()) {
250                         for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
251                                 //is that name in the tree?
252                                 int count = 0;
253                                 for (int j = 0; j < m->Treenames.size(); j++) {
254                                         if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
255                                         count++;
256                                 }
257                                 
258                                 if (m->control_pressed) { 
259                                         delete tmap; for (int i = 0; i < trees.size(); i++) { delete trees[i]; }
260                                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } outputTypes.clear();
261                                         m->Groups.clear();
262                                         return 0;
263                                 }
264                                 
265                                 //then you did not find it so report it 
266                                 if (count == m->Treenames.size()) { 
267                                         //if it is in your namefile then don't remove
268                                         map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
269                                         
270                                         if (it == nameMap.end()) {
271                                                 m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
272                                                 tmap->removeSeq(tmap->namesOfSeqs[i]);
273                                                 i--; //need this because removeSeq removes name from namesOfSeqs
274                                         }
275                                 }
276                         }
277                 }
278                 
279                 SharedUtil* util = new SharedUtil();
280                 util->setGroups(m->Groups, tmap->namesOfGroups, "phylo.diversity");     //sets the groups the user wants to analyze
281                 delete util;
282                 
283                 //incase the user had some mismatches between the tree and group files we don't want group xxx to be analyzed
284                 for (int i = 0; i < m->Groups.size(); i++) { if (m->Groups[i] == "xxx") { m->Groups.erase(m->Groups.begin()+i);  break; }  }
285                  
286                 vector<string> outputNames;
287                 
288                 //for each of the users trees
289                 for(int i = 0; i < trees.size(); i++) {
290                 
291                         if (m->control_pressed) { delete tmap; for (int j = 0; j < trees.size(); j++) { delete trees[j]; } for (int j = 0; j < outputNames.size(); j++) {       m->mothurRemove(outputNames[j]);        } return 0; }
292                         
293                         ofstream outSum, outRare, outCollect;
294                         string outSumFile = outputDir + m->getRootName(m->getSimpleName(treefile))  + toString(i+1) + ".phylodiv.summary";
295                         string outRareFile = outputDir + m->getRootName(m->getSimpleName(treefile))  + toString(i+1) + ".phylodiv.rarefaction";
296                         string outCollectFile = outputDir + m->getRootName(m->getSimpleName(treefile))  + toString(i+1) + ".phylodiv";
297                         
298                         if (summary)    { m->openOutputFile(outSumFile, outSum); outputNames.push_back(outSumFile);             outputTypes["summary"].push_back(outSumFile);                   }
299                         if (rarefy)             { m->openOutputFile(outRareFile, outRare); outputNames.push_back(outRareFile);  outputTypes["rarefy"].push_back(outRareFile);                   }
300                         if (collect)    { m->openOutputFile(outCollectFile, outCollect); outputNames.push_back(outCollectFile);  outputTypes["phylodiv"].push_back(outCollectFile);  }
301                         
302                         int numLeafNodes = trees[i]->getNumLeaves();
303                         
304                         //create a vector containing indexes of leaf nodes, randomize it, select nodes to send to calculator
305                         vector<int> randomLeaf;
306                         for (int j = 0; j < numLeafNodes; j++) {  
307                                 if (m->inUsersGroups(trees[i]->tree[j].getGroup(), m->Groups) == true) { //is this a node from the group the user selected.
308                                         randomLeaf.push_back(j); 
309                                 }
310                         }
311                         
312                         numLeafNodes = randomLeaf.size();  //reset the number of leaf nodes you are using 
313                         
314                         //each group, each sampling, if no rarefy iters = 1;
315                         map<string, vector<float> > diversity;
316                         
317                         //each group, each sampling, if no rarefy iters = 1;
318                         map<string, vector<float> > sumDiversity;
319                         
320                         //find largest group total 
321                         int largestGroup = 0;
322                         for (int j = 0; j < m->Groups.size(); j++) {  
323                                 if (tmap->seqsPerGroup[m->Groups[j]] > largestGroup) { largestGroup = tmap->seqsPerGroup[m->Groups[j]]; }
324                                 
325                                 //initialize diversity
326                                 diversity[m->Groups[j]].resize(tmap->seqsPerGroup[m->Groups[j]]+1, 0.0);                //numSampled
327                                                                                                                                                                                                                         //groupA                0.0                     0.0
328                                                                                                                                                                                                                         
329                                 //initialize sumDiversity
330                                 sumDiversity[m->Groups[j]].resize(tmap->seqsPerGroup[m->Groups[j]]+1, 0.0);
331                         }       
332
333                         //convert freq percentage to number
334                         int increment = 100;
335                         if (freq < 1.0) {  increment = largestGroup * freq;  
336                         }else { increment = freq;  }
337                         
338                         //initialize sampling spots
339                         set<int> numSampledList;
340                         for(int k = 1; k <= largestGroup; k++){  if((k == 1) || (k % increment == 0)){  numSampledList.insert(k); }   }
341                         if(largestGroup % increment != 0){      numSampledList.insert(largestGroup);   }
342                         
343                         //add other groups ending points
344                         for (int j = 0; j < m->Groups.size(); j++) {  
345                                 if (numSampledList.count(diversity[m->Groups[j]].size()-1) == 0) {  numSampledList.insert(diversity[m->Groups[j]].size()-1); }
346                         }
347                         
348                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
349                                 if(processors == 1){
350                                         driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);      
351                                 }else{
352                                         if (rarefy) {
353                                                 vector<int> procIters;
354                                                 
355                                                 int numItersPerProcessor = iters / processors;
356                                                 
357                                                 //divide iters between processes
358                                                 for (int h = 0; h < processors; h++) {
359                                                         if(h == processors - 1){
360                                                                 numItersPerProcessor = iters - h * numItersPerProcessor;
361                                                         }
362                                                         procIters.push_back(numItersPerProcessor);
363                                                 }
364                                                 
365                                                 createProcesses(procIters, trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum); 
366                                                 
367                                         }else{ //no need to paralellize if you dont want to rarefy
368                                                 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);      
369                                         }
370                                 }
371
372                         #else
373                                 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);      
374                         #endif
375
376                         if (rarefy) {   printData(numSampledList, sumDiversity, outRare, iters);        }
377                 }
378                 
379         
380                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);        } return 0; }
381
382                 m->mothurOutEndLine();
383                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
384                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
385                 m->mothurOutEndLine();
386
387                 
388                 return 0;
389         }
390         catch(exception& e) {
391                 m->errorOut(e, "PhyloDiversityCommand", "execute");
392                 exit(1);
393         }
394 }
395 //**********************************************************************************************************************
396 int PhyloDiversityCommand::createProcesses(vector<int>& procIters, Tree* t, map< string, vector<float> >& div, map<string, vector<float> >& sumDiv, int numIters, int increment, vector<int>& randomLeaf, set<int>& numSampledList, ofstream& outCollect, ofstream& outSum){
397         try {
398                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
399                 int process = 1;
400                 
401                 vector<int> processIDS;
402                 map< string, vector<float> >::iterator itSum;
403                 
404                 //loop through and create all the processes you want
405                 while (process != processors) {
406                         int pid = fork();
407                         
408                         if (pid > 0) {
409                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
410                                 process++;
411                         }else if (pid == 0){
412                                 driver(t, div, sumDiv, procIters[process], increment, randomLeaf, numSampledList, outCollect, outSum, false);
413                                 
414                                 string outTemp = outputDir + toString(getpid()) + ".sumDiv.temp";
415                                 ofstream out;
416                                 m->openOutputFile(outTemp, out);
417                                 
418                                 //output the sumDIversity
419                                 for (itSum = sumDiv.begin(); itSum != sumDiv.end(); itSum++) {
420                                         out << itSum->first << '\t' << (itSum->second).size() << '\t';
421                                         for (int k = 0; k < (itSum->second).size(); k++) { 
422                                                 out << (itSum->second)[k] << '\t';
423                                         }
424                                         out << endl;
425                                 }
426                                 
427                                 out.close();
428                                 
429                                 exit(0);
430                         }else { 
431                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
432                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
433                                 exit(0);
434                         }
435                 }
436                 
437                 driver(t, div, sumDiv, procIters[0], increment, randomLeaf, numSampledList, outCollect, outSum, true);
438                 
439                 //force parent to wait until all the processes are done
440                 for (int i=0;i<(processors-1);i++) { 
441                         int temp = processIDS[i];
442                         wait(&temp);
443                 }
444                 
445                 //get data created by processes
446                 for (int i=0;i<(processors-1);i++) { 
447                         
448                         //input the sumDIversity
449                         string inTemp = outputDir + toString(processIDS[i]) + ".sumDiv.temp";
450                         ifstream in;
451                         m->openInputFile(inTemp, in);
452                                 
453                         //output the sumDIversity
454                         for (int j = 0; j < sumDiv.size(); j++) { 
455                                 string group = "";
456                                 int size = 0;
457                                 
458                                 in >> group >> size; m->gobble(in);
459                                 
460                                 for (int k = 0; k < size; k++) { 
461                                         float tempVal;
462                                         in >> tempVal;
463                                         
464                                         sumDiv[group][k] += tempVal;
465                                 }
466                                 m->gobble(in);
467                         }
468                                 
469                         in.close();
470                         m->mothurRemove(inTemp);
471                 }
472                 
473 #endif
474
475         return 0;               
476         
477         }
478         catch(exception& e) {
479                 m->errorOut(e, "PhyloDiversityCommand", "createProcesses");
480                 exit(1);
481         }
482 }
483 //**********************************************************************************************************************
484 int PhyloDiversityCommand::driver(Tree* t, map< string, vector<float> >& div, map<string, vector<float> >& sumDiv, int numIters, int increment, vector<int>& randomLeaf, set<int>& numSampledList, ofstream& outCollect, ofstream& outSum, bool doSumCollect){
485         try {
486                 int numLeafNodes = randomLeaf.size();
487         
488                 for (int l = 0; l < numIters; l++) {
489                                 random_shuffle(randomLeaf.begin(), randomLeaf.end());
490                 
491                                 //initialize counts
492                                 map<string, int> counts;
493                                 map< string, set<int> > countedBranch;  
494                                 for (int j = 0; j < m->Groups.size(); j++) {  counts[m->Groups[j]] = 0; countedBranch[m->Groups[j]].insert(-2);  }  //add dummy index to initialize countedBranch sets
495                                 
496                                 for(int k = 0; k < numLeafNodes; k++){
497                                                 
498                                         if (m->control_pressed) { return 0; }
499                                         
500                                         //calc branch length of randomLeaf k
501                                         vector<float> br = calcBranchLength(t, randomLeaf[k], countedBranch);
502                         
503                                         //for each group in the groups update the total branch length accounting for the names file
504                                         vector<string> groups = t->tree[randomLeaf[k]].getGroup();
505                                         
506                                         for (int j = 0; j < groups.size(); j++) {
507                                                 int numSeqsInGroupJ = 0;
508                                                 map<string, int>::iterator it;
509                                                 it = t->tree[randomLeaf[k]].pcount.find(groups[j]);
510                                                 if (it != t->tree[randomLeaf[k]].pcount.end()) { //this leaf node contains seqs from group j
511                                                         numSeqsInGroupJ = it->second;
512                                                 }
513                                                 
514                                                 if (numSeqsInGroupJ != 0) {     div[groups[j]][(counts[groups[j]]+1)] = div[groups[j]][counts[groups[j]]] + br[j];  }
515                                                 
516                                                 for (int s = (counts[groups[j]]+2); s <= (counts[groups[j]]+numSeqsInGroupJ); s++) {
517                                                         div[groups[j]][s] = div[groups[j]][s-1];  //update counts, but don't add in redundant branch lengths
518                                                 }
519                                                 counts[groups[j]] += numSeqsInGroupJ;
520                                         }
521                                 }
522                                 
523                                 if (rarefy) {
524                                         //add this diversity to the sum
525                                         for (int j = 0; j < m->Groups.size(); j++) {  
526                                                 for (int g = 0; g < div[m->Groups[j]].size(); g++) {
527                                                         sumDiv[m->Groups[j]][g] += div[m->Groups[j]][g];
528                                                 }
529                                         }
530                                 }
531                                 
532                                 if ((collect) && (l == 0) && doSumCollect) {  printData(numSampledList, div, outCollect, 1);  }
533                                 if ((summary) && (l == 0) && doSumCollect) {  printSumData(div, outSum, 1);  }
534                         }
535                         
536                         return 0;
537
538         }
539         catch(exception& e) {
540                 m->errorOut(e, "PhyloDiversityCommand", "driver");
541                 exit(1);
542         }
543 }
544
545 //**********************************************************************************************************************
546
547 void PhyloDiversityCommand::printSumData(map< string, vector<float> >& div, ofstream& out, int numIters){
548         try {
549                 
550                 out << "Groups\tnumSampled\tphyloDiversity" << endl;
551                 
552                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
553                         
554                 for (int j = 0; j < m->Groups.size(); j++) {
555                         int numSampled = (div[m->Groups[j]].size()-1);
556                         out << m->Groups[j] << '\t' << numSampled << '\t';
557                 
558                          
559                         float score;
560                         if (scale)      {  score = (div[m->Groups[j]][numSampled] / (float)numIters) / (float)numSampled;       }
561                         else            {       score = div[m->Groups[j]][numSampled] / (float)numIters;        }
562                                 
563                         out << setprecision(4) << score << endl;
564                 }
565                                         
566                 out.close();
567                 
568         }
569         catch(exception& e) {
570                 m->errorOut(e, "PhyloDiversityCommand", "printSumData");
571                 exit(1);
572         }
573 }
574 //**********************************************************************************************************************
575
576 void PhyloDiversityCommand::printData(set<int>& num, map< string, vector<float> >& div, ofstream& out, int numIters){
577         try {
578                 
579                 out << "numSampled\t";
580                 for (int i = 0; i < m->Groups.size(); i++) { out << m->Groups[i] << '\t';  }
581                 out << endl;
582                 
583                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
584                 
585                 for (set<int>::iterator it = num.begin(); it != num.end(); it++) {  
586                         int numSampled = *it;
587                         
588                         out << numSampled << '\t';  
589                         
590                         for (int j = 0; j < m->Groups.size(); j++) {
591                                 if (numSampled < div[m->Groups[j]].size()) { 
592                                         float score;
593                                         if (scale)      {  score = (div[m->Groups[j]][numSampled] / (float)numIters) / (float)numSampled;       }
594                                         else            {       score = div[m->Groups[j]][numSampled] / (float)numIters;        }
595
596                                         out << setprecision(4) << score << '\t';
597                                 }else { out << "NA" << '\t'; }
598                         }
599                         out << endl;
600                 }
601                 
602                 out.close();
603                 
604         }
605         catch(exception& e) {
606                 m->errorOut(e, "PhyloDiversityCommand", "printData");
607                 exit(1);
608         }
609 }
610 //**********************************************************************************************************************
611 //need a vector of floats one branch length for every group the node represents.
612 vector<float> PhyloDiversityCommand::calcBranchLength(Tree* t, int leaf, map< string, set<int> >& counted){
613         try {
614
615                 //calc the branch length
616                 //while you aren't at root
617                 vector<float> sums; 
618                 int index = leaf;
619                 
620                 vector<string> groups = t->tree[leaf].getGroup();
621                 sums.resize(groups.size(), 0.0);
622                 
623                 map<string, map<int, double> > tempTotals; //maps node to total Branch Length
624                 map< string, set<int> > tempCounted;
625                 set<int>::iterator it;
626         
627                 //you are a leaf
628                 if(t->tree[index].getBranchLength() != -1){     
629                         for (int k = 0; k < groups.size(); k++) { 
630                                 sums[k] += abs(t->tree[index].getBranchLength());       
631                                 counted[groups[k]].insert(index);
632                         }
633                 }
634                 
635                 for (int k = 0; k < groups.size(); k++) { 
636                         tempTotals[groups[k]][index] = 0.0;     
637                 }
638                 
639                 index = t->tree[index].getParent();     
640                         
641                 //while you aren't at root
642                 while(t->tree[index].getParent() != -1){
643
644                         if (m->control_pressed) {  return sums; }
645                         
646                         int pcountSize = 0;     
647                         for (int k = 0; k < groups.size(); k++) {
648                                 map<string, int>::iterator itGroup = t->tree[index].pcount.find(groups[k]);
649                                 if (itGroup != t->tree[index].pcount.end()) { pcountSize++;  } 
650                         
651                                 //do both your chidren have have descendants from the users groups? 
652                                 int lc = t->tree[index].getLChild();
653                                 int rc = t->tree[index].getRChild();
654                         
655                                 int LpcountSize = 0;
656                                 itGroup = t->tree[lc].pcount.find(groups[k]);
657                                 if (itGroup != t->tree[lc].pcount.end()) { LpcountSize++;  } 
658                                                         
659                                 int RpcountSize = 0;
660                                 itGroup = t->tree[rc].pcount.find(groups[k]);
661                                 if (itGroup != t->tree[rc].pcount.end()) { RpcountSize++;  } 
662                                                                 
663                                 //if yes, add your childrens tempTotals
664                                 if ((LpcountSize != 0) && (RpcountSize != 0)) {
665                                         sums[k] += tempTotals[groups[k]][lc] + tempTotals[groups[k]][rc]; 
666                                         
667                                         for (it = tempCounted[groups[k]].begin(); it != tempCounted[groups[k]].end(); it++) { counted[groups[k]].insert(*it); }
668
669                                         //cout << "added to total " << tempTotals[lc] << '\t' << tempTotals[rc] << endl;
670                                         if (t->tree[index].getBranchLength() != -1) {
671                                                 if (counted[groups[k]].count(index) == 0) {
672                                                         tempTotals[groups[k]][index] = abs(t->tree[index].getBranchLength());
673                                                         tempCounted[groups[k]].insert(index);
674                                                 }else{
675                                                         tempTotals[groups[k]][index] = 0.0;
676                                                 }
677                                         }else {
678                                                 tempTotals[groups[k]][index] = 0.0;
679                                         }
680                                 }else { //if no, your tempTotal is your childrens temp totals + your branch length
681                                         tempTotals[groups[k]][index] = tempTotals[groups[k]][lc] + tempTotals[groups[k]][rc]; 
682                                                                         
683                                         if (counted[groups[k]].count(index) == 0) {
684                                                 tempTotals[groups[k]][index] += abs(t->tree[index].getBranchLength());
685                                                 tempCounted[groups[k]].insert(index);
686                                         }
687
688                                 }
689                                 //cout << "temptotal = "<< tempTotals[i] << endl;
690                         }
691                         
692                         index = t->tree[index].getParent();     
693                 }
694
695                 return sums;
696
697         }
698         catch(exception& e) {
699                 m->errorOut(e, "PhyloDiversityCommand", "calcBranchLength");
700                 exit(1);
701         }
702 }
703 /*****************************************************************/
704 int PhyloDiversityCommand::readNamesFile() {
705         try {
706                 m->names.clear();
707                 numUniquesInName = 0;
708                 
709                 ifstream in;
710                 m->openInputFile(namefile, in);
711                 
712                 string first, second;
713                 map<string, string>::iterator itNames;
714                 
715                 while(!in.eof()) {
716                         in >> first >> second; m->gobble(in);
717                         
718                         numUniquesInName++;
719                         
720                         itNames = m->names.find(first);
721                         if (itNames == m->names.end()) {  
722                                 m->names[first] = second; 
723                                 
724                                 //we need a list of names in your namefile to use above when removing extra seqs above so we don't remove them
725                                 vector<string> dupNames;
726                                 m->splitAtComma(second, dupNames);
727                                 
728                                 for (int i = 0; i < dupNames.size(); i++) {     
729                                         nameMap[dupNames[i]] = dupNames[i]; 
730                                         if ((groupfile == "") && (i != 0)) { tmap->addSeq(dupNames[i], "Group1"); } 
731                                 }
732                         }else {  m->mothurOut(first + " has already been seen in namefile, disregarding names file."); m->mothurOutEndLine(); in.close(); m->names.clear(); namefile = ""; return 1; }                  
733                 }
734                 in.close();
735                 
736                 return 0;
737         }
738         catch(exception& e) {
739                 m->errorOut(e, "PhyloDiversityCommand", "readNamesFile");
740                 exit(1);
741         }
742 }
743
744 //**********************************************************************************************************************
745
746
747