2 * phylodiversitycommand.cpp
5 * Created by westcott on 4/30/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "phylodiversitycommand.h"
12 //**********************************************************************************************************************
13 PhyloDiversityCommand::PhyloDiversityCommand(string option) {
15 globaldata = GlobalData::getInstance();
18 //allow user to run help
19 if(option == "help") { help(); abort = true; }
22 //valid paramters for this command
23 string Array[] = {"freq","rarefy","iters","groups","outputdir","inputdir"};
24 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
26 OptionParser parser(option);
27 map<string,string> parameters = parser.getParameters();
29 ValidParameters validParameter;
31 //check to make sure all parameters are valid for command
32 for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
33 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
36 //if the user changes the output directory command factory will send this info to us in the output parameter
37 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
39 if (globaldata->gTree.size() == 0) {//no trees were read
40 m->mothurOut("You must execute the read.tree command, before you may execute the phylo.diversity command."); m->mothurOutEndLine(); abort = true; }
43 temp = validParameter.validFile(parameters, "freq", false); if (temp == "not found") { temp = "100"; }
46 temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "1000"; }
49 temp = validParameter.validFile(parameters, "rarefy", false); if (temp == "not found") { temp = "F"; }
50 rarefy = isTrue(temp);
51 if (!rarefy) { iters = 1; }
53 groups = validParameter.validFile(parameters, "groups", false);
54 if (groups == "not found") { groups = ""; Groups = globaldata->gTreemap->namesOfGroups; globaldata->Groups = Groups; }
56 splitAtDash(groups, Groups);
57 globaldata->Groups = Groups;
64 m->errorOut(e, "PhyloDiversityCommand", "PhyloDiversityCommand");
68 //**********************************************************************************************************************
70 void PhyloDiversityCommand::help(){
72 m->mothurOut("The phylo.diversity command can only be executed after a successful read.tree command.\n");
73 m->mothurOut("The phylo.diversity command parameters are groups, iters, freq and rarefy. No parameters are required.\n");
74 m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. The group names are separated by dashes. By default all groups are used.\n");
75 m->mothurOut("The iters parameter allows you to specify the number of randomizations to preform, by default iters=1000, if you set rarefy to true.\n");
76 m->mothurOut("The freq parameter is used indicate when to output your data, by default it is set to 100. But you can set it to a percentage of the number of sequence. For example freq=0.10, means 10%. \n");
77 m->mothurOut("The rarefy parameter allows you to create a rarefaction curve. The default is false.\n");
78 m->mothurOut("The phylo.diversity command should be in the following format: phylo.diversity(groups=yourGroups, rarefy=yourRarefy, iters=yourIters).\n");
79 m->mothurOut("Example phylo.diversity(groups=A-B-C, rarefy=T, iters=500).\n");
80 m->mothurOut("The phylo.diversity command output two files: .phylo.diversity and if rarefy=T, .rarefaction.\n");
81 m->mothurOut("Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n\n");
85 m->errorOut(e, "PhyloDiversityCommand", "help");
90 //**********************************************************************************************************************
92 PhyloDiversityCommand::~PhyloDiversityCommand(){}
94 //**********************************************************************************************************************
96 int PhyloDiversityCommand::execute(){
99 if (abort == true) { return 0; }
101 //incase the user had some mismatches between the tree and group files we don't want group xxx to be analyzed
102 for (int i = 0; i < globaldata->Groups.size(); i++) { if (globaldata->Groups[i] == "xxx") { globaldata->Groups.erase(globaldata->Groups.begin()+i); break; } }
104 vector<string> outputNames;
106 vector<Tree*> trees = globaldata->gTree;
108 //for each of the users trees
109 for(int i = 0; i < trees.size(); i++) {
111 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
113 string outFile = outputDir + getRootName(getSimpleName(globaldata->getTreeFile())) + toString(i+1) + ".phylo.diversity";
114 if (rarefy) { outFile += ".rarefaction"; }
115 outputNames.push_back(outFile);
117 int numLeafNodes = trees[i]->getNumLeaves();
119 //create a vector containing indexes of leaf nodes, randomize it, select nodes to send to calculator
120 vector<int> randomLeaf;
121 for (int j = 0; j < numLeafNodes; j++) {
122 if (inUsersGroups(trees[i]->tree[j].getGroup(), globaldata->Groups) == true) { //is this a node from the group the user selected.
123 randomLeaf.push_back(j);
127 numLeafNodes = randomLeaf.size(); //reset the number of leaf nodes you are using
129 //each group, each sampling, if no rarefy iters = 1;
130 map<string, vector<float> > diversity;
132 //each group, each sampling, if no rarefy iters = 1;
133 map<string, vector<float> > sumDiversity;
135 //find largest group total
136 int largestGroup = 0;
137 for (int j = 0; j < globaldata->Groups.size(); j++) {
138 if (globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]] > largestGroup) { largestGroup = globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]; }
140 //initialize diversity
141 diversity[globaldata->Groups[j]].resize(globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]+1, 0.0); //numSampled
144 //initialize sumDiversity
145 sumDiversity[globaldata->Groups[j]].resize(globaldata->gTreemap->seqsPerGroup[globaldata->Groups[j]]+1, 0.0);
148 //convert freq percentage to number
150 if (freq < 1.0) { increment = largestGroup * freq;
151 }else { increment = freq; }
153 //initialize sampling spots
154 set<int> numSampledList;
155 for(int k = 1; k <= largestGroup; k++){ if((k == 1) || (k % increment == 0)){ numSampledList.insert(k); } }
156 if(largestGroup % increment != 0){ numSampledList.insert(largestGroup); }
158 //add other groups ending points
159 for (int j = 0; j < globaldata->Groups.size(); j++) {
160 if (numSampledList.count(diversity[globaldata->Groups[j]].size()-1) == 0) { numSampledList.insert(diversity[globaldata->Groups[j]].size()-1); }
163 for (int l = 0; l < iters; l++) {
164 random_shuffle(randomLeaf.begin(), randomLeaf.end());
167 map<string, int> counts;
168 for (int j = 0; j < globaldata->Groups.size(); j++) { counts[globaldata->Groups[j]] = 0; }
170 for(int k = 0; k < numLeafNodes; k++){
172 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
174 //calc branch length of randomLeaf k
175 float br = calcBranchLength(trees[i], randomLeaf[k]);
177 //for each group in the groups update the total branch length accounting for the names file
178 vector<string> groups = trees[i]->tree[randomLeaf[k]].getGroup();
179 for (int j = 0; j < groups.size(); j++) {
180 int numSeqsInGroupJ = 0;
181 map<string, int>::iterator it;
182 it = trees[i]->tree[randomLeaf[k]].pcount.find(groups[j]);
183 if (it != trees[i]->tree[randomLeaf[k]].pcount.end()) { //this leaf node contains seqs from group j
184 numSeqsInGroupJ = it->second;
187 for (int s = (counts[groups[j]]+1); s <= (counts[groups[j]]+numSeqsInGroupJ); s++) {
188 diversity[groups[j]][s] = diversity[groups[j]][s-1] + (numSeqsInGroupJ * br);
190 counts[groups[j]] += numSeqsInGroupJ;
195 //add this diversity to the sum
196 for (int j = 0; j < globaldata->Groups.size(); j++) {
197 for (int g = 0; g < diversity[globaldata->Groups[j]].size(); g++) {
198 sumDiversity[globaldata->Groups[j]][g] += diversity[globaldata->Groups[j]][g];
205 printData(numSampledList, sumDiversity, outFile);
207 printData(numSampledList, diversity, outFile);
213 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
215 m->mothurOutEndLine();
216 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
217 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
218 m->mothurOutEndLine();
223 catch(exception& e) {
224 m->errorOut(e, "PhyloDiversityCommand", "execute");
228 //**********************************************************************************************************************
230 void PhyloDiversityCommand::printData(set<int>& num, map< string, vector<float> >& div, string file){
233 openOutputFile(file, out);
235 out << "numSampled\t";
236 for (int i = 0; i < globaldata->Groups.size(); i++) { out << globaldata->Groups[i] << '\t'; }
239 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
241 for (set<int>::iterator it = num.begin(); it != num.end(); it++) {
242 int numSampled = *it;
244 out << numSampled << '\t';
246 for (int j = 0; j < globaldata->Groups.size(); j++) {
247 if (numSampled < div[globaldata->Groups[j]].size()) {
248 float score = div[globaldata->Groups[j]][numSampled] / (float)iters;
249 out << setprecision(6) << score << '\t';
250 }else { out << "NA" << '\t'; }
258 catch(exception& e) {
259 m->errorOut(e, "PhyloDiversityCommand", "printData");
264 //**********************************************************************************************************************
265 float PhyloDiversityCommand::calcBranchLength(Tree* t, int leaf){
268 //calc the branch length
269 //while you aren't at root
273 while(t->tree[index].getParent() != -1){
276 if(t->tree[index].getBranchLength() != -1){
277 sum += abs(t->tree[index].getBranchLength());
279 index = t->tree[index].getParent();
282 //get last breanch length added
283 if(t->tree[index].getBranchLength() != -1){
284 sum += abs(t->tree[index].getBranchLength());
289 catch(exception& e) {
290 m->errorOut(e, "PhyloDiversityCommand", "calcBranchLength");
294 //**********************************************************************************************************************