]> git.donarmstrong.com Git - mothur.git/blob - phylodiversitycommand.cpp
paralellized phylo.diversity
[mothur.git] / phylodiversitycommand.cpp
1 /*
2  *  phylodiversitycommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 4/30/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "phylodiversitycommand.h"
11
12 //**********************************************************************************************************************
13 PhyloDiversityCommand::PhyloDiversityCommand(string option)  {
14         try {
15                 globaldata = GlobalData::getInstance();
16                 abort = false;
17                 
18                 //allow user to run help
19                 if(option == "help") { help(); abort = true; }
20                 
21                 else {
22                         //valid paramters for this command
23                         string Array[] =  {"freq","rarefy","iters","groups","processors","summary","collect","scale","outputdir","inputdir"};
24                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
25                         
26                         OptionParser parser(option);
27                         map<string,string> parameters = parser.getParameters();
28                         
29                         ValidParameters validParameter;
30                 
31                         //check to make sure all parameters are valid for command
32                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
33                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
34                         }
35                         
36                         //if the user changes the output directory command factory will send this info to us in the output parameter 
37                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(globaldata->getTreeFile());              }
38                         
39                         if (globaldata->gTree.size() == 0) {//no trees were read
40                                 m->mothurOut("You must execute the read.tree command, before you may execute the phylo.diversity command."); m->mothurOutEndLine(); abort = true;  }
41
42                         string temp;
43                         temp = validParameter.validFile(parameters, "freq", false);                     if (temp == "not found") { temp = "100"; }
44                         convert(temp, freq); 
45                         
46                         temp = validParameter.validFile(parameters, "iters", false);                    if (temp == "not found") { temp = "1000"; }
47                         convert(temp, iters); 
48                         
49                         temp = validParameter.validFile(parameters, "rarefy", false);                   if (temp == "not found") { temp = "F"; }
50                         rarefy = m->isTrue(temp);
51                         if (!rarefy) { iters = 1;  }
52                         
53                         temp = validParameter.validFile(parameters, "summary", false);                  if (temp == "not found") { temp = "T"; }
54                         summary = m->isTrue(temp);
55                         
56                         temp = validParameter.validFile(parameters, "scale", false);                    if (temp == "not found") { temp = "F"; }
57                         scale = m->isTrue(temp);
58                         
59                         temp = validParameter.validFile(parameters, "collect", false);                  if (temp == "not found") { temp = "F"; }
60                         collect = m->isTrue(temp);
61                         
62                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
63                         convert(temp, processors); 
64                         
65                         groups = validParameter.validFile(parameters, "groups", false);                 
66                         if (groups == "not found") { groups = ""; Groups = globaldata->gTreemap->namesOfGroups;  globaldata->Groups = Groups;  }
67                         else { 
68                                 m->splitAtDash(groups, Groups);
69                                 globaldata->Groups = Groups;
70                         }
71                         
72                         if ((!collect) && (!rarefy) && (!summary)) { m->mothurOut("No outputs selected. You must set either collect, rarefy or summary to true, summary=T by default."); m->mothurOutEndLine(); abort=true; }
73                 }
74                 
75         }
76         catch(exception& e) {
77                 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
78                 exit(1);
79         }                       
80 }
81 //**********************************************************************************************************************
82
83 void PhyloDiversityCommand::help(){
84         try {
85                 m->mothurOut("The phylo.diversity command can only be executed after a successful read.tree command.\n");
86                 m->mothurOut("The phylo.diversity command parameters are groups, iters, freq, processors, scale, rarefy, collect and summary.  No parameters are required.\n");
87                 m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. The group names are separated by dashes. By default all groups are used.\n");
88                 m->mothurOut("The iters parameter allows you to specify the number of randomizations to preform, by default iters=1000, if you set rarefy to true.\n");
89                 m->mothurOut("The freq parameter is used indicate when to output your data, by default it is set to 100. But you can set it to a percentage of the number of sequence. For example freq=0.10, means 10%. \n");
90                 m->mothurOut("The scale parameter is used indicate that you want your ouptut scaled to the number of sequences sampled, default = false. \n");
91                 m->mothurOut("The rarefy parameter allows you to create a rarefaction curve. The default is false.\n");
92                 m->mothurOut("The collect parameter allows you to create a collectors curve. The default is false.\n");
93                 m->mothurOut("The summary parameter allows you to create a .summary file. The default is true.\n");
94                 m->mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n");
95                 m->mothurOut("The phylo.diversity command should be in the following format: phylo.diversity(groups=yourGroups, rarefy=yourRarefy, iters=yourIters).\n");
96                 m->mothurOut("Example phylo.diversity(groups=A-B-C, rarefy=T, iters=500).\n");
97                 m->mothurOut("The phylo.diversity command output two files: .phylo.diversity and if rarefy=T, .rarefaction.\n");
98                 m->mothurOut("Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n\n");
99
100         }
101         catch(exception& e) {
102                 m->errorOut(e, "PhyloDiversityCommand", "help");
103                 exit(1);
104         }
105 }
106
107 //**********************************************************************************************************************
108
109 PhyloDiversityCommand::~PhyloDiversityCommand(){}
110
111 //**********************************************************************************************************************
112
113 int PhyloDiversityCommand::execute(){
114         try {
115                 
116                 if (abort == true) { return 0; }
117                 
118                 //incase the user had some mismatches between the tree and group files we don't want group xxx to be analyzed
119                 for (int i = 0; i < globaldata->Groups.size(); i++) { if (globaldata->Groups[i] == "xxx") { globaldata->Groups.erase(globaldata->Groups.begin()+i);  break; }  }
120                  
121                 vector<string> outputNames;
122                         
123                 vector<Tree*> trees = globaldata->gTree;
124                 
125                 //for each of the users trees
126                 for(int i = 0; i < trees.size(); i++) {
127                 
128                         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        remove(outputNames[i].c_str());         } return 0; }
129                         
130                         ofstream outSum, outRare, outCollect;
131                         string outSumFile = outputDir + m->getRootName(m->getSimpleName(globaldata->getTreeFile()))  + toString(i+1) + ".phylodiv.summary";
132                         string outRareFile = outputDir + m->getRootName(m->getSimpleName(globaldata->getTreeFile()))  + toString(i+1) + ".phylodiv.rarefaction";
133                         string outCollectFile = outputDir + m->getRootName(m->getSimpleName(globaldata->getTreeFile()))  + toString(i+1) + ".phylodiv";
134                         
135                         if (summary)    { m->openOutputFile(outSumFile, outSum); outputNames.push_back(outSumFile);                             }
136                         if (rarefy)             { m->openOutputFile(outRareFile, outRare); outputNames.push_back(outRareFile);                          }
137                         if (collect)    { m->openOutputFile(outCollectFile, outCollect); outputNames.push_back(outCollectFile); }
138                         
139                         int numLeafNodes = trees[i]->getNumLeaves();
140                         
141                         //create a vector containing indexes of leaf nodes, randomize it, select nodes to send to calculator
142                         vector<int> randomLeaf;
143                         for (int j = 0; j < numLeafNodes; j++) {  
144                                 if (m->inUsersGroups(trees[i]->tree[j].getGroup(), globaldata->Groups) == true) { //is this a node from the group the user selected.
145                                         randomLeaf.push_back(j); 
146                                 }
147                         }
148                         
149                         numLeafNodes = randomLeaf.size();  //reset the number of leaf nodes you are using 
150                         
151                         //each group, each sampling, if no rarefy iters = 1;
152                         map<string, vector<float> > diversity;
153                         
154                         //each group, each sampling, if no rarefy iters = 1;
155                         map<string, vector<float> > sumDiversity;
156                         
157                         //find largest group total 
158                         int largestGroup = 0;
159                         for (int j = 0; j < globaldata->Groups.size(); j++) {  
160                                 if (globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]] > largestGroup) { largestGroup = globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]; }
161                                 
162                                 //initialize diversity
163                                 diversity[globaldata->Groups[j]].resize(globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]+1, 0.0);              //numSampled
164                                                                                                                                                                                                                         //groupA                0.0                     0.0
165                                                                                                                                                                                                                         
166                                 //initialize sumDiversity
167                                 sumDiversity[globaldata->Groups[j]].resize(globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]+1, 0.0);
168                         }       
169
170                         //convert freq percentage to number
171                         int increment = 100;
172                         if (freq < 1.0) {  increment = largestGroup * freq;  
173                         }else { increment = freq;  }
174                         
175                         //initialize sampling spots
176                         set<int> numSampledList;
177                         for(int k = 1; k <= largestGroup; k++){  if((k == 1) || (k % increment == 0)){  numSampledList.insert(k); }   }
178                         if(largestGroup % increment != 0){      numSampledList.insert(largestGroup);   }
179                         
180                         //add other groups ending points
181                         for (int j = 0; j < globaldata->Groups.size(); j++) {  
182                                 if (numSampledList.count(diversity[globaldata->Groups[j]].size()-1) == 0) {  numSampledList.insert(diversity[globaldata->Groups[j]].size()-1); }
183                         }
184                         
185                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
186                                 if(processors == 1){
187                                         driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);      
188                                 }else{
189                                         if (rarefy) {
190                                                 vector<int> procIters;
191                                                 
192                                                 int numItersPerProcessor = iters / processors;
193                                                 
194                                                 //divide iters between processes
195                                                 for (int h = 0; h < processors; h++) {
196                                                         if(h == processors - 1){
197                                                                 numItersPerProcessor = iters - h * numItersPerProcessor;
198                                                         }
199                                                         procIters.push_back(numItersPerProcessor);
200                                                 }
201                                                 
202                                                 createProcesses(procIters, trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum); 
203                                                 
204                                         }else{ //no need to paralellize if you dont want to rarefy
205                                                 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);      
206                                         }
207                                 }
208
209                         #else
210                                 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);      
211                         #endif
212
213                         if (rarefy) {   printData(numSampledList, sumDiversity, outRare, iters);        }
214                 }
215                 
216         
217                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        remove(outputNames[i].c_str());         } return 0; }
218
219                 m->mothurOutEndLine();
220                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
221                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
222                 m->mothurOutEndLine();
223
224                 
225                 return 0;
226         }
227         catch(exception& e) {
228                 m->errorOut(e, "PhyloDiversityCommand", "execute");
229                 exit(1);
230         }
231 }
232 //**********************************************************************************************************************
233 int PhyloDiversityCommand::createProcesses(vector<int>& procIters, Tree* t, map< string, vector<float> >& div, map<string, vector<float> >& sumDiv, int numIters, int increment, vector<int>& randomLeaf, set<int>& numSampledList, ofstream& outCollect, ofstream& outSum){
234         try {
235                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
236                 int process = 1;
237                 int num = 0;
238                 vector<int> processIDS;
239                 map< string, vector<float> >::iterator itSum;
240                 
241                 EstOutput results;
242                 
243                 //loop through and create all the processes you want
244                 while (process != processors) {
245                         int pid = fork();
246                         
247                         if (pid > 0) {
248                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
249                                 process++;
250                         }else if (pid == 0){
251                                 driver(t, div, sumDiv, procIters[process], increment, randomLeaf, numSampledList, outCollect, outSum, false);
252                                 
253                                 string outTemp = outputDir + toString(getpid()) + ".sumDiv.temp";
254                                 ofstream out;
255                                 m->openOutputFile(outTemp, out);
256                                 
257                                 //output the sumDIversity
258                                 for (itSum = sumDiv.begin(); itSum != sumDiv.end(); itSum++) {
259                                         out << itSum->first << '\t' << (itSum->second).size() << '\t';
260                                         for (int k = 0; k < (itSum->second).size(); k++) { 
261                                                 out << (itSum->second)[k] << '\t';
262                                         }
263                                         out << endl;
264                                 }
265                                 
266                                 out.close();
267                                 
268                                 exit(0);
269                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
270                 }
271                 
272                 driver(t, div, sumDiv, procIters[0], increment, randomLeaf, numSampledList, outCollect, outSum, true);
273                 
274                 //force parent to wait until all the processes are done
275                 for (int i=0;i<(processors-1);i++) { 
276                         int temp = processIDS[i];
277                         wait(&temp);
278                 }
279                 
280                 //get data created by processes
281                 for (int i=0;i<(processors-1);i++) { 
282                         
283                         //input the sumDIversity
284                         string inTemp = outputDir + toString(processIDS[i]) + ".sumDiv.temp";
285                         ifstream in;
286                         m->openInputFile(inTemp, in);
287                                 
288                         //output the sumDIversity
289                         for (int j = 0; j < sumDiv.size(); j++) { 
290                                 string group = "";
291                                 int size = 0;
292                                 
293                                 in >> group >> size; m->gobble(in);
294                                 
295                                 for (int k = 0; k < size; k++) { 
296                                         float tempVal;
297                                         in >> tempVal;
298                                         
299                                         sumDiv[group][k] += tempVal;
300                                 }
301                                 m->gobble(in);
302                         }
303                                 
304                         in.close();
305                         remove(inTemp.c_str());
306                 }
307                 
308 #endif
309
310         return 0;               
311         
312         }
313         catch(exception& e) {
314                 m->errorOut(e, "PhyloDiversityCommand", "createProcesses");
315                 exit(1);
316         }
317 }
318 //**********************************************************************************************************************
319 int PhyloDiversityCommand::driver(Tree* t, map< string, vector<float> >& div, map<string, vector<float> >& sumDiv, int numIters, int increment, vector<int>& randomLeaf, set<int>& numSampledList, ofstream& outCollect, ofstream& outSum, bool doSumCollect){
320         try {
321                 int numLeafNodes = randomLeaf.size();
322         
323                 for (int l = 0; l < numIters; l++) {
324                                 random_shuffle(randomLeaf.begin(), randomLeaf.end());
325                 
326                                 //initialize counts
327                                 map<string, int> counts;
328                                 map< string, set<int> > countedBranch;  
329                                 for (int j = 0; j < globaldata->Groups.size(); j++) {  counts[globaldata->Groups[j]] = 0; countedBranch[globaldata->Groups[j]].insert(-2);  }  //add dummy index to initialize countedBranch sets
330                                 
331                                 for(int k = 0; k < numLeafNodes; k++){
332                                                 
333                                         if (m->control_pressed) { return 0; }
334                                         
335                                         //calc branch length of randomLeaf k
336                                         float br = calcBranchLength(t, randomLeaf[k], countedBranch);
337                         
338                                         //for each group in the groups update the total branch length accounting for the names file
339                                         vector<string> groups = t->tree[randomLeaf[k]].getGroup();
340                                         
341                                         for (int j = 0; j < groups.size(); j++) {
342                                                 int numSeqsInGroupJ = 0;
343                                                 map<string, int>::iterator it;
344                                                 it = t->tree[randomLeaf[k]].pcount.find(groups[j]);
345                                                 if (it != t->tree[randomLeaf[k]].pcount.end()) { //this leaf node contains seqs from group j
346                                                         numSeqsInGroupJ = it->second;
347                                                 }
348                                                 
349                                                 if (numSeqsInGroupJ != 0) {     div[groups[j]][(counts[groups[j]]+1)] = div[groups[j]][counts[groups[j]]] + br;  }
350                                                 
351                                                 for (int s = (counts[groups[j]]+2); s <= (counts[groups[j]]+numSeqsInGroupJ); s++) {
352                                                         div[groups[j]][s] = div[groups[j]][s-1];  //update counts, but don't add in redundant branch lengths
353                                                 }
354                                                 counts[groups[j]] += numSeqsInGroupJ;
355                                         }
356                                 }
357                                 
358                                 if (rarefy) {
359                                         //add this diversity to the sum
360                                         for (int j = 0; j < globaldata->Groups.size(); j++) {  
361                                                 for (int g = 0; g < div[globaldata->Groups[j]].size(); g++) {
362                                                         sumDiv[globaldata->Groups[j]][g] += div[globaldata->Groups[j]][g];
363                                                 }
364                                         }
365                                 }
366                                 
367                                 if ((collect) && (l == 0) && doSumCollect) {  printData(numSampledList, div, outCollect, 1);  }
368                                 if ((summary) && (l == 0) && doSumCollect) {  printSumData(div, outSum, 1);  }
369                         }
370                         
371                         return 0;
372
373         }
374         catch(exception& e) {
375                 m->errorOut(e, "PhyloDiversityCommand", "driver");
376                 exit(1);
377         }
378 }
379
380 //**********************************************************************************************************************
381
382 void PhyloDiversityCommand::printSumData(map< string, vector<float> >& div, ofstream& out, int numIters){
383         try {
384                 
385                 out << "Groups\tnumSampled\tphyloDiversity" << endl;
386                 
387                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
388                         
389                 for (int j = 0; j < globaldata->Groups.size(); j++) {
390                         int numSampled = (div[globaldata->Groups[j]].size()-1);
391                         out << globaldata->Groups[j] << '\t' << numSampled << '\t';
392                 
393                          
394                         float score;
395                         if (scale)      {  score = (div[globaldata->Groups[j]][numSampled] / (float)numIters) / (float)numSampled;      }
396                         else            {       score = div[globaldata->Groups[j]][numSampled] / (float)numIters;       }
397                                 
398                         out << setprecision(4) << score << endl;
399                 }
400                                         
401                 out.close();
402                 
403         }
404         catch(exception& e) {
405                 m->errorOut(e, "PhyloDiversityCommand", "printSumData");
406                 exit(1);
407         }
408 }
409 //**********************************************************************************************************************
410
411 void PhyloDiversityCommand::printData(set<int>& num, map< string, vector<float> >& div, ofstream& out, int numIters){
412         try {
413                 
414                 out << "numSampled\t";
415                 for (int i = 0; i < globaldata->Groups.size(); i++) { out << globaldata->Groups[i] << '\t';  }
416                 out << endl;
417                 
418                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
419                 
420                 for (set<int>::iterator it = num.begin(); it != num.end(); it++) {  
421                         int numSampled = *it;
422                         
423                         out << numSampled << '\t';  
424                         
425                         for (int j = 0; j < globaldata->Groups.size(); j++) {
426                                 if (numSampled < div[globaldata->Groups[j]].size()) { 
427                                         float score;
428                                         if (scale)      {  score = (div[globaldata->Groups[j]][numSampled] / (float)numIters) / (float)numSampled;      }
429                                         else            {       score = div[globaldata->Groups[j]][numSampled] / (float)numIters;       }
430
431                                         out << setprecision(4) << score << '\t';
432                                 }else { out << "NA" << '\t'; }
433                         }
434                         out << endl;
435                 }
436                 
437                 out.close();
438                 
439         }
440         catch(exception& e) {
441                 m->errorOut(e, "PhyloDiversityCommand", "printData");
442                 exit(1);
443         }
444 }
445 //**********************************************************************************************************************
446 float PhyloDiversityCommand::calcBranchLength(Tree* t, int leaf, map< string, set<int> >& counted){
447         try {
448
449                 //calc the branch length
450                 //while you aren't at root
451                 float sum = 0.0;
452                 int index = leaf;
453                 
454                 vector<string> groups = t->tree[leaf].getGroup();
455                                         
456                 while(t->tree[index].getParent() != -1){
457                         
458                         //if you have a BL
459                         if(t->tree[index].getBranchLength() != -1){
460                                 if (counted[groups[0]].count(index) == 0) { //you have not already counted this branch
461                                         sum += abs(t->tree[index].getBranchLength());
462                                         for (int j = 0; j < groups.size(); j++) {  counted[groups[j]].insert(index);  }
463                                 }
464                         }
465                         index = t->tree[index].getParent();
466                 }
467                         
468                 //get last breanch length added
469                 if(t->tree[index].getBranchLength() != -1){
470                         if (counted[groups[0]].count(index) == 0) { //you have not already counted this branch
471                                 sum += abs(t->tree[index].getBranchLength());
472                                 for (int j = 0; j < groups.size(); j++) {  counted[groups[j]].insert(index);  }
473                         }
474                 }
475                 
476                 return sum;
477         }
478         catch(exception& e) {
479                 m->errorOut(e, "PhyloDiversityCommand", "calcBranchLength");
480                 exit(1);
481         }
482 }
483 //**********************************************************************************************************************