]> git.donarmstrong.com Git - mothur.git/blob - phylodiversitycommand.cpp
added pipeline commands which involved change to command factory and command class...
[mothur.git] / phylodiversitycommand.cpp
1 /*
2  *  phylodiversitycommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 4/30/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "phylodiversitycommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> PhyloDiversityCommand::getValidParameters(){     
14         try {
15                 string Array[] =  {"freq","rarefy","iters","groups","processors","summary","collect","scale","outputdir","inputdir"};
16                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
17                 return myArray;
18         }
19         catch(exception& e) {
20                 m->errorOut(e, "PhyloDiversityCommand", "getValidParameters");
21                 exit(1);
22         }
23 }
24 //**********************************************************************************************************************
25 PhyloDiversityCommand::PhyloDiversityCommand(){ 
26         try {
27                 //initialize outputTypes
28                 vector<string> tempOutNames;
29                 outputTypes["phylodiv"] = tempOutNames;
30                 outputTypes["rarefy"] = tempOutNames;
31                 outputTypes["summary"] = tempOutNames;
32         }
33         catch(exception& e) {
34                 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
35                 exit(1);
36         }
37 }
38 //**********************************************************************************************************************
39 vector<string> PhyloDiversityCommand::getRequiredParameters(){  
40         try {
41                 vector<string> myArray;
42                 return myArray;
43         }
44         catch(exception& e) {
45                 m->errorOut(e, "PhyloDiversityCommand", "getRequiredParameters");
46                 exit(1);
47         }
48 }
49 //**********************************************************************************************************************
50 vector<string> PhyloDiversityCommand::getRequiredFiles(){       
51         try {
52                 string Array[] =  {"tree","group"};
53                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
54                 return myArray;
55         }
56         catch(exception& e) {
57                 m->errorOut(e, "PhyloDiversityCommand", "getRequiredFiles");
58                 exit(1);
59         }
60 }
61 //**********************************************************************************************************************
62 PhyloDiversityCommand::PhyloDiversityCommand(string option)  {
63         try {
64                 globaldata = GlobalData::getInstance();
65                 abort = false;
66                 
67                 //allow user to run help
68                 if(option == "help") { help(); abort = true; }
69                 
70                 else {
71                         //valid paramters for this command
72                         string Array[] =  {"freq","rarefy","iters","groups","processors","summary","collect","scale","outputdir","inputdir"};
73                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
74                         
75                         OptionParser parser(option);
76                         map<string,string> parameters = parser.getParameters();
77                         
78                         ValidParameters validParameter;
79                 
80                         //check to make sure all parameters are valid for command
81                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
82                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
83                         }
84                         
85                         //initialize outputTypes
86                         vector<string> tempOutNames;
87                         outputTypes["phylodiv"] = tempOutNames;
88                         outputTypes["rarefy"] = tempOutNames;
89                         outputTypes["summary"] = tempOutNames;
90                         
91                         //if the user changes the output directory command factory will send this info to us in the output parameter 
92                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(globaldata->getTreeFile());              }
93                         
94                         if (globaldata->gTree.size() == 0) {//no trees were read
95                                 m->mothurOut("You must execute the read.tree command, before you may execute the phylo.diversity command."); m->mothurOutEndLine(); abort = true;  }
96
97                         string temp;
98                         temp = validParameter.validFile(parameters, "freq", false);                     if (temp == "not found") { temp = "100"; }
99                         convert(temp, freq); 
100                         
101                         temp = validParameter.validFile(parameters, "iters", false);                    if (temp == "not found") { temp = "1000"; }
102                         convert(temp, iters); 
103                         
104                         temp = validParameter.validFile(parameters, "rarefy", false);                   if (temp == "not found") { temp = "F"; }
105                         rarefy = m->isTrue(temp);
106                         if (!rarefy) { iters = 1;  }
107                         
108                         temp = validParameter.validFile(parameters, "summary", false);                  if (temp == "not found") { temp = "T"; }
109                         summary = m->isTrue(temp);
110                         
111                         temp = validParameter.validFile(parameters, "scale", false);                    if (temp == "not found") { temp = "F"; }
112                         scale = m->isTrue(temp);
113                         
114                         temp = validParameter.validFile(parameters, "collect", false);                  if (temp == "not found") { temp = "F"; }
115                         collect = m->isTrue(temp);
116                         
117                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
118                         convert(temp, processors); 
119                         
120                         groups = validParameter.validFile(parameters, "groups", false);                 
121                         if (groups == "not found") { groups = ""; Groups = globaldata->gTreemap->namesOfGroups;  globaldata->Groups = Groups;  }
122                         else { 
123                                 m->splitAtDash(groups, Groups);
124                                 globaldata->Groups = Groups;
125                         }
126                         
127                         if ((!collect) && (!rarefy) && (!summary)) { m->mothurOut("No outputs selected. You must set either collect, rarefy or summary to true, summary=T by default."); m->mothurOutEndLine(); abort=true; }
128                 }
129                 
130         }
131         catch(exception& e) {
132                 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
133                 exit(1);
134         }                       
135 }
136 //**********************************************************************************************************************
137
138 void PhyloDiversityCommand::help(){
139         try {
140                 m->mothurOut("The phylo.diversity command can only be executed after a successful read.tree command.\n");
141                 m->mothurOut("The phylo.diversity command parameters are groups, iters, freq, processors, scale, rarefy, collect and summary.  No parameters are required.\n");
142                 m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. The group names are separated by dashes. By default all groups are used.\n");
143                 m->mothurOut("The iters parameter allows you to specify the number of randomizations to preform, by default iters=1000, if you set rarefy to true.\n");
144                 m->mothurOut("The freq parameter is used indicate when to output your data, by default it is set to 100. But you can set it to a percentage of the number of sequence. For example freq=0.10, means 10%. \n");
145                 m->mothurOut("The scale parameter is used indicate that you want your ouptut scaled to the number of sequences sampled, default = false. \n");
146                 m->mothurOut("The rarefy parameter allows you to create a rarefaction curve. The default is false.\n");
147                 m->mothurOut("The collect parameter allows you to create a collectors curve. The default is false.\n");
148                 m->mothurOut("The summary parameter allows you to create a .summary file. The default is true.\n");
149                 m->mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n");
150                 m->mothurOut("The phylo.diversity command should be in the following format: phylo.diversity(groups=yourGroups, rarefy=yourRarefy, iters=yourIters).\n");
151                 m->mothurOut("Example phylo.diversity(groups=A-B-C, rarefy=T, iters=500).\n");
152                 m->mothurOut("The phylo.diversity command output two files: .phylo.diversity and if rarefy=T, .rarefaction.\n");
153                 m->mothurOut("Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n\n");
154
155         }
156         catch(exception& e) {
157                 m->errorOut(e, "PhyloDiversityCommand", "help");
158                 exit(1);
159         }
160 }
161
162 //**********************************************************************************************************************
163
164 PhyloDiversityCommand::~PhyloDiversityCommand(){}
165
166 //**********************************************************************************************************************
167
168 int PhyloDiversityCommand::execute(){
169         try {
170                 
171                 if (abort == true) { return 0; }
172                 
173                 //incase the user had some mismatches between the tree and group files we don't want group xxx to be analyzed
174                 for (int i = 0; i < globaldata->Groups.size(); i++) { if (globaldata->Groups[i] == "xxx") { globaldata->Groups.erase(globaldata->Groups.begin()+i);  break; }  }
175                  
176                 vector<string> outputNames;
177                         
178                 vector<Tree*> trees = globaldata->gTree;
179                 
180                 //for each of the users trees
181                 for(int i = 0; i < trees.size(); i++) {
182                 
183                         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        remove(outputNames[i].c_str());         } return 0; }
184                         
185                         ofstream outSum, outRare, outCollect;
186                         string outSumFile = outputDir + m->getRootName(m->getSimpleName(globaldata->getTreeFile()))  + toString(i+1) + ".phylodiv.summary";
187                         string outRareFile = outputDir + m->getRootName(m->getSimpleName(globaldata->getTreeFile()))  + toString(i+1) + ".phylodiv.rarefaction";
188                         string outCollectFile = outputDir + m->getRootName(m->getSimpleName(globaldata->getTreeFile()))  + toString(i+1) + ".phylodiv";
189                         
190                         if (summary)    { m->openOutputFile(outSumFile, outSum); outputNames.push_back(outSumFile);             outputTypes["summary"].push_back(outSumFile);                   }
191                         if (rarefy)             { m->openOutputFile(outRareFile, outRare); outputNames.push_back(outRareFile);  outputTypes["rarefy"].push_back(outRareFile);                   }
192                         if (collect)    { m->openOutputFile(outCollectFile, outCollect); outputNames.push_back(outCollectFile);  outputTypes["phylodiv"].push_back(outCollectFile);  }
193                         
194                         int numLeafNodes = trees[i]->getNumLeaves();
195                         
196                         //create a vector containing indexes of leaf nodes, randomize it, select nodes to send to calculator
197                         vector<int> randomLeaf;
198                         for (int j = 0; j < numLeafNodes; j++) {  
199                                 if (m->inUsersGroups(trees[i]->tree[j].getGroup(), globaldata->Groups) == true) { //is this a node from the group the user selected.
200                                         randomLeaf.push_back(j); 
201                                 }
202                         }
203                         
204                         numLeafNodes = randomLeaf.size();  //reset the number of leaf nodes you are using 
205                         
206                         //each group, each sampling, if no rarefy iters = 1;
207                         map<string, vector<float> > diversity;
208                         
209                         //each group, each sampling, if no rarefy iters = 1;
210                         map<string, vector<float> > sumDiversity;
211                         
212                         //find largest group total 
213                         int largestGroup = 0;
214                         for (int j = 0; j < globaldata->Groups.size(); j++) {  
215                                 if (globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]] > largestGroup) { largestGroup = globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]; }
216                                 
217                                 //initialize diversity
218                                 diversity[globaldata->Groups[j]].resize(globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]+1, 0.0);              //numSampled
219                                                                                                                                                                                                                         //groupA                0.0                     0.0
220                                                                                                                                                                                                                         
221                                 //initialize sumDiversity
222                                 sumDiversity[globaldata->Groups[j]].resize(globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]+1, 0.0);
223                         }       
224
225                         //convert freq percentage to number
226                         int increment = 100;
227                         if (freq < 1.0) {  increment = largestGroup * freq;  
228                         }else { increment = freq;  }
229                         
230                         //initialize sampling spots
231                         set<int> numSampledList;
232                         for(int k = 1; k <= largestGroup; k++){  if((k == 1) || (k % increment == 0)){  numSampledList.insert(k); }   }
233                         if(largestGroup % increment != 0){      numSampledList.insert(largestGroup);   }
234                         
235                         //add other groups ending points
236                         for (int j = 0; j < globaldata->Groups.size(); j++) {  
237                                 if (numSampledList.count(diversity[globaldata->Groups[j]].size()-1) == 0) {  numSampledList.insert(diversity[globaldata->Groups[j]].size()-1); }
238                         }
239                         
240                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
241                                 if(processors == 1){
242                                         driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);      
243                                 }else{
244                                         if (rarefy) {
245                                                 vector<int> procIters;
246                                                 
247                                                 int numItersPerProcessor = iters / processors;
248                                                 
249                                                 //divide iters between processes
250                                                 for (int h = 0; h < processors; h++) {
251                                                         if(h == processors - 1){
252                                                                 numItersPerProcessor = iters - h * numItersPerProcessor;
253                                                         }
254                                                         procIters.push_back(numItersPerProcessor);
255                                                 }
256                                                 
257                                                 createProcesses(procIters, trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum); 
258                                                 
259                                         }else{ //no need to paralellize if you dont want to rarefy
260                                                 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);      
261                                         }
262                                 }
263
264                         #else
265                                 driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);      
266                         #endif
267
268                         if (rarefy) {   printData(numSampledList, sumDiversity, outRare, iters);        }
269                 }
270                 
271         
272                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        remove(outputNames[i].c_str());         } return 0; }
273
274                 m->mothurOutEndLine();
275                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
276                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
277                 m->mothurOutEndLine();
278
279                 
280                 return 0;
281         }
282         catch(exception& e) {
283                 m->errorOut(e, "PhyloDiversityCommand", "execute");
284                 exit(1);
285         }
286 }
287 //**********************************************************************************************************************
288 int PhyloDiversityCommand::createProcesses(vector<int>& procIters, Tree* t, map< string, vector<float> >& div, map<string, vector<float> >& sumDiv, int numIters, int increment, vector<int>& randomLeaf, set<int>& numSampledList, ofstream& outCollect, ofstream& outSum){
289         try {
290                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
291                 int process = 1;
292                 int num = 0;
293                 vector<int> processIDS;
294                 map< string, vector<float> >::iterator itSum;
295                 
296                 EstOutput results;
297                 
298                 //loop through and create all the processes you want
299                 while (process != processors) {
300                         int pid = fork();
301                         
302                         if (pid > 0) {
303                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
304                                 process++;
305                         }else if (pid == 0){
306                                 driver(t, div, sumDiv, procIters[process], increment, randomLeaf, numSampledList, outCollect, outSum, false);
307                                 
308                                 string outTemp = outputDir + toString(getpid()) + ".sumDiv.temp";
309                                 ofstream out;
310                                 m->openOutputFile(outTemp, out);
311                                 
312                                 //output the sumDIversity
313                                 for (itSum = sumDiv.begin(); itSum != sumDiv.end(); itSum++) {
314                                         out << itSum->first << '\t' << (itSum->second).size() << '\t';
315                                         for (int k = 0; k < (itSum->second).size(); k++) { 
316                                                 out << (itSum->second)[k] << '\t';
317                                         }
318                                         out << endl;
319                                 }
320                                 
321                                 out.close();
322                                 
323                                 exit(0);
324                         }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
325                 }
326                 
327                 driver(t, div, sumDiv, procIters[0], increment, randomLeaf, numSampledList, outCollect, outSum, true);
328                 
329                 //force parent to wait until all the processes are done
330                 for (int i=0;i<(processors-1);i++) { 
331                         int temp = processIDS[i];
332                         wait(&temp);
333                 }
334                 
335                 //get data created by processes
336                 for (int i=0;i<(processors-1);i++) { 
337                         
338                         //input the sumDIversity
339                         string inTemp = outputDir + toString(processIDS[i]) + ".sumDiv.temp";
340                         ifstream in;
341                         m->openInputFile(inTemp, in);
342                                 
343                         //output the sumDIversity
344                         for (int j = 0; j < sumDiv.size(); j++) { 
345                                 string group = "";
346                                 int size = 0;
347                                 
348                                 in >> group >> size; m->gobble(in);
349                                 
350                                 for (int k = 0; k < size; k++) { 
351                                         float tempVal;
352                                         in >> tempVal;
353                                         
354                                         sumDiv[group][k] += tempVal;
355                                 }
356                                 m->gobble(in);
357                         }
358                                 
359                         in.close();
360                         remove(inTemp.c_str());
361                 }
362                 
363 #endif
364
365         return 0;               
366         
367         }
368         catch(exception& e) {
369                 m->errorOut(e, "PhyloDiversityCommand", "createProcesses");
370                 exit(1);
371         }
372 }
373 //**********************************************************************************************************************
374 int PhyloDiversityCommand::driver(Tree* t, map< string, vector<float> >& div, map<string, vector<float> >& sumDiv, int numIters, int increment, vector<int>& randomLeaf, set<int>& numSampledList, ofstream& outCollect, ofstream& outSum, bool doSumCollect){
375         try {
376                 int numLeafNodes = randomLeaf.size();
377         
378                 for (int l = 0; l < numIters; l++) {
379                                 random_shuffle(randomLeaf.begin(), randomLeaf.end());
380                 
381                                 //initialize counts
382                                 map<string, int> counts;
383                                 map< string, set<int> > countedBranch;  
384                                 for (int j = 0; j < globaldata->Groups.size(); j++) {  counts[globaldata->Groups[j]] = 0; countedBranch[globaldata->Groups[j]].insert(-2);  }  //add dummy index to initialize countedBranch sets
385                                 
386                                 for(int k = 0; k < numLeafNodes; k++){
387                                                 
388                                         if (m->control_pressed) { return 0; }
389                                         
390                                         //calc branch length of randomLeaf k
391                                         float br = calcBranchLength(t, randomLeaf[k], countedBranch);
392                         
393                                         //for each group in the groups update the total branch length accounting for the names file
394                                         vector<string> groups = t->tree[randomLeaf[k]].getGroup();
395                                         
396                                         for (int j = 0; j < groups.size(); j++) {
397                                                 int numSeqsInGroupJ = 0;
398                                                 map<string, int>::iterator it;
399                                                 it = t->tree[randomLeaf[k]].pcount.find(groups[j]);
400                                                 if (it != t->tree[randomLeaf[k]].pcount.end()) { //this leaf node contains seqs from group j
401                                                         numSeqsInGroupJ = it->second;
402                                                 }
403                                                 
404                                                 if (numSeqsInGroupJ != 0) {     div[groups[j]][(counts[groups[j]]+1)] = div[groups[j]][counts[groups[j]]] + br;  }
405                                                 
406                                                 for (int s = (counts[groups[j]]+2); s <= (counts[groups[j]]+numSeqsInGroupJ); s++) {
407                                                         div[groups[j]][s] = div[groups[j]][s-1];  //update counts, but don't add in redundant branch lengths
408                                                 }
409                                                 counts[groups[j]] += numSeqsInGroupJ;
410                                         }
411                                 }
412                                 
413                                 if (rarefy) {
414                                         //add this diversity to the sum
415                                         for (int j = 0; j < globaldata->Groups.size(); j++) {  
416                                                 for (int g = 0; g < div[globaldata->Groups[j]].size(); g++) {
417                                                         sumDiv[globaldata->Groups[j]][g] += div[globaldata->Groups[j]][g];
418                                                 }
419                                         }
420                                 }
421                                 
422                                 if ((collect) && (l == 0) && doSumCollect) {  printData(numSampledList, div, outCollect, 1);  }
423                                 if ((summary) && (l == 0) && doSumCollect) {  printSumData(div, outSum, 1);  }
424                         }
425                         
426                         return 0;
427
428         }
429         catch(exception& e) {
430                 m->errorOut(e, "PhyloDiversityCommand", "driver");
431                 exit(1);
432         }
433 }
434
435 //**********************************************************************************************************************
436
437 void PhyloDiversityCommand::printSumData(map< string, vector<float> >& div, ofstream& out, int numIters){
438         try {
439                 
440                 out << "Groups\tnumSampled\tphyloDiversity" << endl;
441                 
442                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
443                         
444                 for (int j = 0; j < globaldata->Groups.size(); j++) {
445                         int numSampled = (div[globaldata->Groups[j]].size()-1);
446                         out << globaldata->Groups[j] << '\t' << numSampled << '\t';
447                 
448                          
449                         float score;
450                         if (scale)      {  score = (div[globaldata->Groups[j]][numSampled] / (float)numIters) / (float)numSampled;      }
451                         else            {       score = div[globaldata->Groups[j]][numSampled] / (float)numIters;       }
452                                 
453                         out << setprecision(4) << score << endl;
454                 }
455                                         
456                 out.close();
457                 
458         }
459         catch(exception& e) {
460                 m->errorOut(e, "PhyloDiversityCommand", "printSumData");
461                 exit(1);
462         }
463 }
464 //**********************************************************************************************************************
465
466 void PhyloDiversityCommand::printData(set<int>& num, map< string, vector<float> >& div, ofstream& out, int numIters){
467         try {
468                 
469                 out << "numSampled\t";
470                 for (int i = 0; i < globaldata->Groups.size(); i++) { out << globaldata->Groups[i] << '\t';  }
471                 out << endl;
472                 
473                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
474                 
475                 for (set<int>::iterator it = num.begin(); it != num.end(); it++) {  
476                         int numSampled = *it;
477                         
478                         out << numSampled << '\t';  
479                         
480                         for (int j = 0; j < globaldata->Groups.size(); j++) {
481                                 if (numSampled < div[globaldata->Groups[j]].size()) { 
482                                         float score;
483                                         if (scale)      {  score = (div[globaldata->Groups[j]][numSampled] / (float)numIters) / (float)numSampled;      }
484                                         else            {       score = div[globaldata->Groups[j]][numSampled] / (float)numIters;       }
485
486                                         out << setprecision(4) << score << '\t';
487                                 }else { out << "NA" << '\t'; }
488                         }
489                         out << endl;
490                 }
491                 
492                 out.close();
493                 
494         }
495         catch(exception& e) {
496                 m->errorOut(e, "PhyloDiversityCommand", "printData");
497                 exit(1);
498         }
499 }
500 //**********************************************************************************************************************
501 float PhyloDiversityCommand::calcBranchLength(Tree* t, int leaf, map< string, set<int> >& counted){
502         try {
503
504                 //calc the branch length
505                 //while you aren't at root
506                 float sum = 0.0;
507                 int index = leaf;
508                 
509                 vector<string> groups = t->tree[leaf].getGroup();
510                                         
511                 while(t->tree[index].getParent() != -1){
512                         
513                         //if you have a BL
514                         if(t->tree[index].getBranchLength() != -1){
515                                 if (counted[groups[0]].count(index) == 0) { //you have not already counted this branch
516                                         sum += abs(t->tree[index].getBranchLength());
517                                         for (int j = 0; j < groups.size(); j++) {  counted[groups[j]].insert(index);  }
518                                 }
519                         }
520                         index = t->tree[index].getParent();
521                 }
522                         
523                 //get last breanch length added
524                 if(t->tree[index].getBranchLength() != -1){
525                         if (counted[groups[0]].count(index) == 0) { //you have not already counted this branch
526                                 sum += abs(t->tree[index].getBranchLength());
527                                 for (int j = 0; j < groups.size(); j++) {  counted[groups[j]].insert(index);  }
528                         }
529                 }
530                 
531                 return sum;
532         }
533         catch(exception& e) {
534                 m->errorOut(e, "PhyloDiversityCommand", "calcBranchLength");
535                 exit(1);
536         }
537 }
538 //**********************************************************************************************************************