2 * unifracweightedcommand.cpp
5 * Created by Sarah Westcott on 2/9/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "unifracweightedcommand.h"
12 //**********************************************************************************************************************
13 vector<string> UnifracWeightedCommand::getValidParameters(){
15 string Array[] = {"groups","iters","distance","random","processors","root","outputdir","inputdir"};
16 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
20 m->errorOut(e, "UnifracWeightedCommand", "getValidParameters");
24 //**********************************************************************************************************************
25 UnifracWeightedCommand::UnifracWeightedCommand(){
27 abort = true; calledHelp = true;
28 vector<string> tempOutNames;
29 outputTypes["weighted"] = tempOutNames;
30 outputTypes["wsummary"] = tempOutNames;
31 outputTypes["phylip"] = tempOutNames;
32 outputTypes["column"] = tempOutNames;
35 m->errorOut(e, "UnifracWeightedCommand", "UnifracWeightedCommand");
39 //**********************************************************************************************************************
40 vector<string> UnifracWeightedCommand::getRequiredParameters(){
42 vector<string> myArray;
46 m->errorOut(e, "UnifracWeightedCommand", "getRequiredParameters");
50 //**********************************************************************************************************************
51 vector<string> UnifracWeightedCommand::getRequiredFiles(){
53 string Array[] = {"tree","group"};
54 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
59 m->errorOut(e, "UnifracWeightedCommand", "getRequiredFiles");
63 /***********************************************************/
64 UnifracWeightedCommand::UnifracWeightedCommand(string option) {
66 globaldata = GlobalData::getInstance();
67 abort = false; calledHelp = false;
70 //allow user to run help
71 if(option == "help") { help(); abort = true; calledHelp = true; }
74 //valid paramters for this command
75 string Array[] = {"groups","iters","distance","random","processors","root","outputdir","inputdir"};
76 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
78 OptionParser parser(option);
79 map<string,string> parameters=parser.getParameters();
81 ValidParameters validParameter;
83 //check to make sure all parameters are valid for command
84 for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
85 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
88 //initialize outputTypes
89 vector<string> tempOutNames;
90 outputTypes["weighted"] = tempOutNames;
91 outputTypes["wsummary"] = tempOutNames;
92 outputTypes["phylip"] = tempOutNames;
93 outputTypes["column"] = tempOutNames;
95 if (globaldata->gTree.size() == 0) {//no trees were read
96 m->mothurOut("You must execute the read.tree command, before you may execute the unifrac.weighted command."); m->mothurOutEndLine(); abort = true; }
98 //if the user changes the output directory command factory will send this info to us in the output parameter
99 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
101 outputDir += m->hasPath(globaldata->inputFileName); //if user entered a file with a path then preserve it
104 //check for optional parameter and set defaults
105 // ...at some point should added some additional type checking...
106 groups = validParameter.validFile(parameters, "groups", false);
107 if (groups == "not found") { groups = ""; }
109 m->splitAtDash(groups, Groups);
110 globaldata->Groups = Groups;
113 itersString = validParameter.validFile(parameters, "iters", false); if (itersString == "not found") { itersString = "1000"; }
114 convert(itersString, iters);
116 string temp = validParameter.validFile(parameters, "distance", false);
117 if (temp == "not found") { phylip = false; outputForm = ""; }
119 if ((temp == "lt") || (temp == "column") || (temp == "square")) { phylip = true; outputForm = temp; }
120 else { m->mothurOut("Options for distance are: lt, square, or column. Using lt."); m->mothurOutEndLine(); phylip = true; outputForm = "lt"; }
123 temp = validParameter.validFile(parameters, "random", false); if (temp == "not found") { temp = "F"; }
124 random = m->isTrue(temp);
126 temp = validParameter.validFile(parameters, "root", false); if (temp == "not found") { temp = "F"; }
127 includeRoot = m->isTrue(temp);
129 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; }
130 convert(temp, processors);
132 if (!random) { iters = 0; } //turn off random calcs
135 if (abort == false) {
136 T = globaldata->gTree;
137 tmap = globaldata->gTreemap;
138 sumFile = outputDir + m->getSimpleName(globaldata->getTreeFile()) + ".wsummary";
139 m->openOutputFile(sumFile, outSum);
140 outputNames.push_back(sumFile); outputTypes["wsummary"].push_back(sumFile);
142 util = new SharedUtil();
143 string s; //to make work with setgroups
144 util->setGroups(globaldata->Groups, tmap->namesOfGroups, s, numGroups, "weighted"); //sets the groups the user wants to analyze
145 util->getCombos(groupComb, globaldata->Groups, numComp);
147 weighted = new Weighted(tmap, includeRoot);
154 catch(exception& e) {
155 m->errorOut(e, "UnifracWeightedCommand", "UnifracWeightedCommand");
159 //**********************************************************************************************************************
161 void UnifracWeightedCommand::help(){
163 m->mothurOut("The unifrac.weighted command can only be executed after a successful read.tree command.\n");
164 m->mothurOut("The unifrac.weighted command parameters are groups, iters, distance, processors, root and random. No parameters are required.\n");
165 m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 2 valid groups.\n");
166 m->mothurOut("The group names are separated by dashes. The iters parameter allows you to specify how many random trees you would like compared to your tree.\n");
167 m->mothurOut("The distance parameter allows you to create a distance file from the results. The default is false.\n");
168 m->mothurOut("The random parameter allows you to shut off the comparison to random trees. The default is false, meaning don't compare your trees with randomly generated trees.\n");
169 m->mothurOut("The root parameter allows you to include the entire root in your calculations. The default is false, meaning stop at the root for this comparision instead of the root of the entire tree.\n");
170 m->mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n");
171 m->mothurOut("The unifrac.weighted command should be in the following format: unifrac.weighted(groups=yourGroups, iters=yourIters).\n");
172 m->mothurOut("Example unifrac.weighted(groups=A-B-C, iters=500).\n");
173 m->mothurOut("The default value for groups is all the groups in your groupfile, and iters is 1000.\n");
174 m->mothurOut("The unifrac.weighted command output two files: .weighted and .wsummary their descriptions are in the manual.\n");
175 m->mothurOut("Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n\n");
177 catch(exception& e) {
178 m->errorOut(e, "UnifracWeightedCommand", "help");
183 /***********************************************************/
184 int UnifracWeightedCommand::execute() {
187 if (abort == true) { if (calledHelp) { return 0; } return 2; }
189 int start = time(NULL);
191 //get weighted for users tree
192 userData.resize(numComp,0); //data[0] = weightedscore AB, data[1] = weightedscore AC...
193 randomData.resize(numComp,0); //data[0] = weightedscore AB, data[1] = weightedscore AC...
195 if (numComp < processors) { processors = numComp; }
197 //get weighted scores for users trees
198 for (int i = 0; i < T.size(); i++) {
200 if (m->control_pressed) { outSum.close(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
203 rScores.resize(numComp); //data[0] = weightedscore AB, data[1] = weightedscore AC...
204 uScores.resize(numComp); //data[0] = weightedscore AB, data[1] = weightedscore AC...
207 output = new ColumnFile(outputDir + m->getSimpleName(globaldata->getTreeFile()) + toString(i+1) + ".weighted", itersString);
208 outputNames.push_back(outputDir + m->getSimpleName(globaldata->getTreeFile()) + toString(i+1) + ".weighted");
209 outputTypes["weighted"].push_back(outputDir + m->getSimpleName(globaldata->getTreeFile()) + toString(i+1) + ".weighted");
212 userData = weighted->getValues(T[i], processors, outputDir); //userData[0] = weightedscore
214 if (m->control_pressed) { if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
217 for (int s=0; s<numComp; s++) {
218 //add users score to vector of user scores
219 uScores[s].push_back(userData[s]);
221 //save users tree score for summary file
222 utreeScores.push_back(userData[s]);
227 //calculate number of comparisons i.e. with groups A,B,C = AB, AC, BC = 3;
228 vector< vector<string> > namesOfGroupCombos;
229 for (int a=0; a<numGroups; a++) {
230 for (int l = 0; l < a; l++) {
231 vector<string> groups; groups.push_back(globaldata->Groups[a]); groups.push_back(globaldata->Groups[l]);
232 namesOfGroupCombos.push_back(groups);
238 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
240 int numPairs = namesOfGroupCombos.size();
241 int numPairsPerProcessor = numPairs / processors;
243 for (int i = 0; i < processors; i++) {
244 int startPos = i * numPairsPerProcessor;
245 if(i == processors - 1){
246 numPairsPerProcessor = numPairs - i * numPairsPerProcessor;
248 lines.push_back(linePair(startPos, numPairsPerProcessor));
254 //get scores for random trees
255 for (int j = 0; j < iters; j++) {
257 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
259 driver(T[i], namesOfGroupCombos, 0, namesOfGroupCombos.size(), rScores);
261 createProcesses(T[i], namesOfGroupCombos, rScores);
264 driver(T[i], namesOfGroupCombos, 0, namesOfGroupCombos.size(), rScores);
267 if (m->control_pressed) { delete output; outSum.close(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
270 // m->mothurOut("Iter: " + toString(j+1)); m->mothurOutEndLine();
274 //find the signifigance of the score for summary file
275 for (int f = 0; f < numComp; f++) {
277 sort(rScores[f].begin(), rScores[f].end());
279 //the index of the score higher than yours is returned
280 //so if you have 1000 random trees the index returned is 100
281 //then there are 900 trees with a score greater then you.
282 //giving you a signifigance of 0.900
283 int index = findIndex(userData[f], f); if (index == -1) { m->mothurOut("error in UnifracWeightedCommand"); m->mothurOutEndLine(); exit(1); } //error code
285 //the signifigance is the number of trees with the users score or higher
286 WScoreSig.push_back((iters-index)/(float)iters);
289 //out << "Tree# " << i << endl;
290 calculateFreqsCumuls();
304 if (m->control_pressed) { outSum.close(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
308 if (phylip) { createPhylipFile(); }
310 //clear out users groups
311 globaldata->Groups.clear();
314 if (m->control_pressed) {
315 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }
319 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run unifrac.weighted."); m->mothurOutEndLine();
321 //set phylip file as new current phylipfile
323 itTypes = outputTypes.find("phylip");
324 if (itTypes != outputTypes.end()) {
325 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setPhylipFile(current); }
328 //set column file as new current columnfile
329 itTypes = outputTypes.find("column");
330 if (itTypes != outputTypes.end()) {
331 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setColumnFile(current); }
334 m->mothurOutEndLine();
335 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
336 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
337 m->mothurOutEndLine();
342 catch(exception& e) {
343 m->errorOut(e, "UnifracWeightedCommand", "execute");
347 /**************************************************************************************************/
349 int UnifracWeightedCommand::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos, vector< vector<double> >& scores) {
351 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
353 vector<int> processIDS;
357 //loop through and create all the processes you want
358 while (process != processors) {
362 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
365 driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, scores);
367 //pass numSeqs to parent
369 string tempFile = outputDir + toString(getpid()) + ".weightedcommand.results.temp";
370 m->openOutputFile(tempFile, out);
371 for (int i = lines[process].start; i < (lines[process].start + lines[process].num); i++) { out << scores[i][(scores[i].size()-1)] << '\t'; } out << endl;
376 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
377 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
382 driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, scores);
384 //force parent to wait until all the processes are done
385 for (int i=0;i<(processors-1);i++) {
386 int temp = processIDS[i];
390 //get data created by processes
391 for (int i=0;i<(processors-1);i++) {
394 string s = outputDir + toString(processIDS[i]) + ".weightedcommand.results.temp";
395 m->openInputFile(s, in);
398 for (int j = lines[(i+1)].start; j < (lines[(i+1)].start + lines[(i+1)].num); j++) { in >> tempScore; scores[j].push_back(tempScore); }
406 catch(exception& e) {
407 m->errorOut(e, "UnifracWeightedCommand", "createProcesses");
412 /**************************************************************************************************/
413 int UnifracWeightedCommand::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num, vector< vector<double> >& scores) {
415 Tree* randT = new Tree();
417 for (int h = start; h < (start+num); h++) {
419 if (m->control_pressed) { return 0; }
421 //initialize weighted score
422 string groupA = namesOfGroupCombos[h][0];
423 string groupB = namesOfGroupCombos[h][1];
428 //create a random tree with same topology as T[i], but different labels
429 randT->assembleRandomUnifracTree(groupA, groupB);
431 if (m->control_pressed) { delete randT; return 0; }
433 //get wscore of random tree
434 EstOutput randomData = weighted->getValues(randT, groupA, groupB);
436 if (m->control_pressed) { delete randT; return 0; }
439 scores[h].push_back(randomData[0]);
447 catch(exception& e) {
448 m->errorOut(e, "UnifracWeightedCommand", "driver");
452 /***********************************************************/
453 void UnifracWeightedCommand::printWeightedFile() {
457 tags.push_back("Score"); tags.push_back("RandFreq"); tags.push_back("RandCumul");
459 for(int a = 0; a < numComp; a++) {
460 output->initFile(groupComb[a], tags);
462 for (map<float,float>::iterator it = validScores.begin(); it != validScores.end(); it++) {
463 data.push_back(it->first); data.push_back(rScoreFreq[a][it->first]); data.push_back(rCumul[a][it->first]);
464 output->output(data);
470 catch(exception& e) {
471 m->errorOut(e, "UnifracWeightedCommand", "printWeightedFile");
477 /***********************************************************/
478 void UnifracWeightedCommand::printWSummaryFile() {
481 outSum << "Tree#" << '\t' << "Groups" << '\t' << "WScore" << '\t' << "WSig" << endl;
482 m->mothurOut("Tree#\tGroups\tWScore\tWSig"); m->mothurOutEndLine();
485 outSum.setf(ios::fixed, ios::floatfield); outSum.setf(ios::showpoint);
489 for (int i = 0; i < T.size(); i++) {
490 for (int j = 0; j < numComp; j++) {
492 if (WScoreSig[count] > (1/(float)iters)) {
493 outSum << setprecision(6) << i+1 << '\t' << groupComb[j] << '\t' << utreeScores[count] << '\t' << setprecision(itersString.length()) << WScoreSig[count] << endl;
494 cout << setprecision(6) << i+1 << '\t' << groupComb[j] << '\t' << utreeScores[count] << '\t' << setprecision(itersString.length()) << WScoreSig[count] << endl;
495 m->mothurOutJustToLog(toString(i+1) +"\t" + groupComb[j] +"\t" + toString(utreeScores[count]) +"\t" + toString(WScoreSig[count]) + "\n");
497 outSum << setprecision(6) << i+1 << '\t' << groupComb[j] << '\t' << utreeScores[count] << '\t' << setprecision(itersString.length()) << "<" << (1/float(iters)) << endl;
498 cout << setprecision(6) << i+1 << '\t' << groupComb[j] << '\t' << utreeScores[count] << '\t' << setprecision(itersString.length()) << "<" << (1/float(iters)) << endl;
499 m->mothurOutJustToLog(toString(i+1) +"\t" + groupComb[j] +"\t" + toString(utreeScores[count]) +"\t<" + toString((1/float(iters))) + "\n");
502 outSum << setprecision(6) << i+1 << '\t' << groupComb[j] << '\t' << utreeScores[count] << '\t' << "0.00" << endl;
503 cout << setprecision(6) << i+1 << '\t' << groupComb[j] << '\t' << utreeScores[count] << '\t' << "0.00" << endl;
504 m->mothurOutJustToLog(toString(i+1) +"\t" + groupComb[j] +"\t" + toString(utreeScores[count]) +"\t0.00\n");
511 catch(exception& e) {
512 m->errorOut(e, "UnifracWeightedCommand", "printWSummaryFile");
516 /***********************************************************/
517 void UnifracWeightedCommand::createPhylipFile() {
521 for (int i = 0; i < T.size(); i++) {
523 string phylipFileName;
524 if ((outputForm == "lt") || (outputForm == "square")) {
525 phylipFileName = outputDir + m->getSimpleName(globaldata->getTreeFile()) + toString(i+1) + ".weighted.phylip.dist";
526 outputNames.push_back(phylipFileName); outputTypes["phylip"].push_back(phylipFileName);
528 phylipFileName = outputDir + m->getSimpleName(globaldata->getTreeFile()) + toString(i+1) + ".weighted.column.dist";
529 outputNames.push_back(phylipFileName); outputTypes["column"].push_back(phylipFileName);
533 m->openOutputFile(phylipFileName, out);
535 if ((outputForm == "lt") || (outputForm == "square")) {
537 out << globaldata->Groups.size() << endl;
540 //make matrix with scores in it
541 vector< vector<float> > dists; dists.resize(globaldata->Groups.size());
542 for (int i = 0; i < globaldata->Groups.size(); i++) {
543 dists[i].resize(globaldata->Groups.size(), 0.0);
546 //flip it so you can print it
547 for (int r=0; r<globaldata->Groups.size(); r++) {
548 for (int l = 0; l < r; l++) {
549 dists[r][l] = utreeScores[count];
550 dists[l][r] = utreeScores[count];
556 for (int r=0; r<globaldata->Groups.size(); r++) {
558 string name = globaldata->Groups[r];
559 if (name.length() < 10) { //pad with spaces to make compatible
560 while (name.length() < 10) { name += " "; }
563 if (outputForm == "lt") {
567 for (int l = 0; l < r; l++) { out << dists[r][l] << '\t'; }
569 }else if (outputForm == "square") {
573 for (int l = 0; l < globaldata->Groups.size(); l++) { out << dists[r][l] << '\t'; }
577 for (int l = 0; l < r; l++) {
578 string otherName = globaldata->Groups[l];
579 if (otherName.length() < 10) { //pad with spaces to make compatible
580 while (otherName.length() < 10) { otherName += " "; }
583 out << name << '\t' << otherName << dists[r][l] << endl;
590 catch(exception& e) {
591 m->errorOut(e, "UnifracWeightedCommand", "createPhylipFile");
595 /***********************************************************/
596 int UnifracWeightedCommand::findIndex(float score, int index) {
598 for (int i = 0; i < rScores[index].size(); i++) {
599 if (rScores[index][i] >= score) { return i; }
601 return rScores[index].size();
603 catch(exception& e) {
604 m->errorOut(e, "UnifracWeightedCommand", "findIndex");
609 /***********************************************************/
611 void UnifracWeightedCommand::calculateFreqsCumuls() {
613 //clear out old tree values
615 rScoreFreq.resize(numComp);
617 rCumul.resize(numComp);
620 //calculate frequency
621 for (int f = 0; f < numComp; f++) {
622 for (int i = 0; i < rScores[f].size(); i++) { //looks like 0,0,1,1,1,2,4,7... you want to make a map that say rScoreFreq[0] = 2, rScoreFreq[1] = 3...
623 validScores[rScores[f][i]] = rScores[f][i];
624 map<float,float>::iterator it = rScoreFreq[f].find(rScores[f][i]);
625 if (it != rScoreFreq[f].end()) {
626 rScoreFreq[f][rScores[f][i]]++;
628 rScoreFreq[f][rScores[f][i]] = 1;
634 for(int a = 0; a < numComp; a++) {
635 float rcumul = 1.0000;
636 //this loop fills the cumulative maps and put 0.0000 in the score freq map to make it easier to print.
637 for (map<float,float>::iterator it = validScores.begin(); it != validScores.end(); it++) {
638 //make rscoreFreq map and rCumul
639 map<float,float>::iterator it2 = rScoreFreq[a].find(it->first);
640 rCumul[a][it->first] = rcumul;
641 //get percentage of random trees with that info
642 if (it2 != rScoreFreq[a].end()) { rScoreFreq[a][it->first] /= iters; rcumul-= it2->second; }
643 else { rScoreFreq[a][it->first] = 0.0000; } //no random trees with that score
648 catch(exception& e) {
649 m->errorOut(e, "UnifracWeightedCommand", "calculateFreqsCums");
654 /***********************************************************/