2 * unifracweightedcommand.cpp
5 * Created by Sarah Westcott on 2/9/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "unifracweightedcommand.h"
12 //**********************************************************************************************************************
13 vector<string> UnifracWeightedCommand::getValidParameters(){
15 string Array[] = {"groups","iters","distance","random","processors","root","outputdir","inputdir"};
16 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
20 m->errorOut(e, "UnifracWeightedCommand", "getValidParameters");
24 //**********************************************************************************************************************
25 UnifracWeightedCommand::UnifracWeightedCommand(){
27 abort = true; calledHelp = true;
28 vector<string> tempOutNames;
29 outputTypes["weighted"] = tempOutNames;
30 outputTypes["wsummary"] = tempOutNames;
31 outputTypes["phylip"] = tempOutNames;
32 outputTypes["column"] = tempOutNames;
35 m->errorOut(e, "UnifracWeightedCommand", "UnifracWeightedCommand");
39 //**********************************************************************************************************************
40 vector<string> UnifracWeightedCommand::getRequiredParameters(){
42 vector<string> myArray;
46 m->errorOut(e, "UnifracWeightedCommand", "getRequiredParameters");
50 //**********************************************************************************************************************
51 vector<string> UnifracWeightedCommand::getRequiredFiles(){
53 string Array[] = {"tree","group"};
54 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
59 m->errorOut(e, "UnifracWeightedCommand", "getRequiredFiles");
63 /***********************************************************/
64 UnifracWeightedCommand::UnifracWeightedCommand(string option) {
66 globaldata = GlobalData::getInstance();
67 abort = false; calledHelp = false;
70 //allow user to run help
71 if(option == "help") { help(); abort = true; calledHelp = true; }
74 //valid paramters for this command
75 string Array[] = {"groups","iters","distance","random","processors","root","outputdir","inputdir"};
76 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
78 OptionParser parser(option);
79 map<string,string> parameters=parser.getParameters();
81 ValidParameters validParameter;
83 //check to make sure all parameters are valid for command
84 for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
85 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
88 //initialize outputTypes
89 vector<string> tempOutNames;
90 outputTypes["weighted"] = tempOutNames;
91 outputTypes["wsummary"] = tempOutNames;
92 outputTypes["phylip"] = tempOutNames;
93 outputTypes["column"] = tempOutNames;
95 if (globaldata->gTree.size() == 0) {//no trees were read
96 m->mothurOut("You must execute the read.tree command, before you may execute the unifrac.weighted command."); m->mothurOutEndLine(); abort = true; }
98 //if the user changes the output directory command factory will send this info to us in the output parameter
99 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
101 outputDir += m->hasPath(globaldata->inputFileName); //if user entered a file with a path then preserve it
104 //check for optional parameter and set defaults
105 // ...at some point should added some additional type checking...
106 groups = validParameter.validFile(parameters, "groups", false);
107 if (groups == "not found") { groups = ""; }
109 m->splitAtDash(groups, Groups);
110 globaldata->Groups = Groups;
113 itersString = validParameter.validFile(parameters, "iters", false); if (itersString == "not found") { itersString = "1000"; }
114 convert(itersString, iters);
116 string temp = validParameter.validFile(parameters, "distance", false);
117 if (temp == "not found") { phylip = false; outputForm = ""; }
119 if ((temp == "lt") || (temp == "column") || (temp == "square")) { phylip = true; outputForm = temp; }
120 else { m->mothurOut("Options for distance are: lt, square, or column. Using lt."); m->mothurOutEndLine(); phylip = true; outputForm = "lt"; }
123 temp = validParameter.validFile(parameters, "random", false); if (temp == "not found") { temp = "F"; }
124 random = m->isTrue(temp);
126 temp = validParameter.validFile(parameters, "root", false); if (temp == "not found") { temp = "F"; }
127 includeRoot = m->isTrue(temp);
129 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; }
130 convert(temp, processors);
132 if (!random) { iters = 0; } //turn off random calcs
135 if (abort == false) {
136 T = globaldata->gTree;
137 tmap = globaldata->gTreemap;
138 sumFile = outputDir + m->getSimpleName(globaldata->getTreeFile()) + ".wsummary";
139 m->openOutputFile(sumFile, outSum);
140 outputNames.push_back(sumFile); outputTypes["wsummary"].push_back(sumFile);
142 util = new SharedUtil();
143 string s; //to make work with setgroups
144 util->setGroups(globaldata->Groups, tmap->namesOfGroups, s, numGroups, "weighted"); //sets the groups the user wants to analyze
145 util->getCombos(groupComb, globaldata->Groups, numComp);
147 weighted = new Weighted(tmap, includeRoot);
154 catch(exception& e) {
155 m->errorOut(e, "UnifracWeightedCommand", "UnifracWeightedCommand");
159 //**********************************************************************************************************************
161 void UnifracWeightedCommand::help(){
163 m->mothurOut("The unifrac.weighted command can only be executed after a successful read.tree command.\n");
164 m->mothurOut("The unifrac.weighted command parameters are groups, iters, distance, processors, root and random. No parameters are required.\n");
165 m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 2 valid groups.\n");
166 m->mothurOut("The group names are separated by dashes. The iters parameter allows you to specify how many random trees you would like compared to your tree.\n");
167 m->mothurOut("The distance parameter allows you to create a distance file from the results. The default is false.\n");
168 m->mothurOut("The random parameter allows you to shut off the comparison to random trees. The default is false, meaning don't compare your trees with randomly generated trees.\n");
169 m->mothurOut("The root parameter allows you to include the entire root in your calculations. The default is false, meaning stop at the root for this comparision instead of the root of the entire tree.\n");
170 m->mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n");
171 m->mothurOut("The unifrac.weighted command should be in the following format: unifrac.weighted(groups=yourGroups, iters=yourIters).\n");
172 m->mothurOut("Example unifrac.weighted(groups=A-B-C, iters=500).\n");
173 m->mothurOut("The default value for groups is all the groups in your groupfile, and iters is 1000.\n");
174 m->mothurOut("The unifrac.weighted command output two files: .weighted and .wsummary their descriptions are in the manual.\n");
175 m->mothurOut("Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n\n");
177 catch(exception& e) {
178 m->errorOut(e, "UnifracWeightedCommand", "help");
183 /***********************************************************/
184 int UnifracWeightedCommand::execute() {
187 if (abort == true) { if (calledHelp) { return 0; } return 2; }
189 int start = time(NULL);
191 //get weighted for users tree
192 userData.resize(numComp,0); //data[0] = weightedscore AB, data[1] = weightedscore AC...
193 randomData.resize(numComp,0); //data[0] = weightedscore AB, data[1] = weightedscore AC...
195 if (numComp < processors) { processors = numComp; }
197 //get weighted scores for users trees
198 for (int i = 0; i < T.size(); i++) {
200 if (m->control_pressed) { outSum.close(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
203 rScores.resize(numComp); //data[0] = weightedscore AB, data[1] = weightedscore AC...
204 uScores.resize(numComp); //data[0] = weightedscore AB, data[1] = weightedscore AC...
207 output = new ColumnFile(outputDir + m->getSimpleName(globaldata->getTreeFile()) + toString(i+1) + ".weighted", itersString);
208 outputNames.push_back(outputDir + m->getSimpleName(globaldata->getTreeFile()) + toString(i+1) + ".weighted");
209 outputTypes["weighted"].push_back(outputDir + m->getSimpleName(globaldata->getTreeFile()) + toString(i+1) + ".weighted");
212 userData = weighted->getValues(T[i], processors, outputDir); //userData[0] = weightedscore
214 if (m->control_pressed) { if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
217 for (int s=0; s<numComp; s++) {
218 //add users score to vector of user scores
219 uScores[s].push_back(userData[s]);
221 //save users tree score for summary file
222 utreeScores.push_back(userData[s]);
227 //calculate number of comparisons i.e. with groups A,B,C = AB, AC, BC = 3;
228 vector< vector<string> > namesOfGroupCombos;
229 for (int a=0; a<numGroups; a++) {
230 for (int l = 0; l < a; l++) {
231 vector<string> groups; groups.push_back(globaldata->Groups[a]); groups.push_back(globaldata->Groups[l]);
232 namesOfGroupCombos.push_back(groups);
238 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
240 int numPairs = namesOfGroupCombos.size();
241 int numPairsPerProcessor = numPairs / processors;
243 for (int i = 0; i < processors; i++) {
244 int startPos = i * numPairsPerProcessor;
245 if(i == processors - 1){
246 numPairsPerProcessor = numPairs - i * numPairsPerProcessor;
248 lines.push_back(linePair(startPos, numPairsPerProcessor));
254 //get scores for random trees
255 for (int j = 0; j < iters; j++) {
257 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
259 driver(T[i], namesOfGroupCombos, 0, namesOfGroupCombos.size(), rScores);
261 createProcesses(T[i], namesOfGroupCombos, rScores);
264 driver(T[i], namesOfGroupCombos, 0, namesOfGroupCombos.size(), rScores);
267 if (m->control_pressed) { delete output; outSum.close(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
270 // m->mothurOut("Iter: " + toString(j+1)); m->mothurOutEndLine();
274 //find the signifigance of the score for summary file
275 for (int f = 0; f < numComp; f++) {
277 sort(rScores[f].begin(), rScores[f].end());
279 //the index of the score higher than yours is returned
280 //so if you have 1000 random trees the index returned is 100
281 //then there are 900 trees with a score greater then you.
282 //giving you a signifigance of 0.900
283 int index = findIndex(userData[f], f); if (index == -1) { m->mothurOut("error in UnifracWeightedCommand"); m->mothurOutEndLine(); exit(1); } //error code
285 //the signifigance is the number of trees with the users score or higher
286 WScoreSig.push_back((iters-index)/(float)iters);
289 //out << "Tree# " << i << endl;
290 calculateFreqsCumuls();
304 if (m->control_pressed) { outSum.close(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
308 if (phylip) { createPhylipFile(); }
310 //clear out users groups
311 globaldata->Groups.clear();
314 if (m->control_pressed) {
315 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }
319 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run unifrac.weighted."); m->mothurOutEndLine();
321 m->mothurOutEndLine();
322 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
323 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
324 m->mothurOutEndLine();
329 catch(exception& e) {
330 m->errorOut(e, "UnifracWeightedCommand", "execute");
334 /**************************************************************************************************/
336 int UnifracWeightedCommand::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos, vector< vector<double> >& scores) {
338 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
340 vector<int> processIDS;
344 //loop through and create all the processes you want
345 while (process != processors) {
349 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
352 driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, scores);
354 //pass numSeqs to parent
356 string tempFile = outputDir + toString(getpid()) + ".weightedcommand.results.temp";
357 m->openOutputFile(tempFile, out);
358 for (int i = lines[process].start; i < (lines[process].start + lines[process].num); i++) { out << scores[i][(scores[i].size()-1)] << '\t'; } out << endl;
363 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
364 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
369 driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, scores);
371 //force parent to wait until all the processes are done
372 for (int i=0;i<(processors-1);i++) {
373 int temp = processIDS[i];
377 //get data created by processes
378 for (int i=0;i<(processors-1);i++) {
381 string s = outputDir + toString(processIDS[i]) + ".weightedcommand.results.temp";
382 m->openInputFile(s, in);
385 for (int j = lines[(i+1)].start; j < (lines[(i+1)].start + lines[(i+1)].num); j++) { in >> tempScore; scores[j].push_back(tempScore); }
393 catch(exception& e) {
394 m->errorOut(e, "UnifracWeightedCommand", "createProcesses");
399 /**************************************************************************************************/
400 int UnifracWeightedCommand::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num, vector< vector<double> >& scores) {
402 Tree* randT = new Tree();
404 for (int h = start; h < (start+num); h++) {
406 if (m->control_pressed) { return 0; }
408 //initialize weighted score
409 string groupA = namesOfGroupCombos[h][0];
410 string groupB = namesOfGroupCombos[h][1];
415 //create a random tree with same topology as T[i], but different labels
416 randT->assembleRandomUnifracTree(groupA, groupB);
418 if (m->control_pressed) { delete randT; return 0; }
420 //get wscore of random tree
421 EstOutput randomData = weighted->getValues(randT, groupA, groupB);
423 if (m->control_pressed) { delete randT; return 0; }
426 scores[h].push_back(randomData[0]);
434 catch(exception& e) {
435 m->errorOut(e, "UnifracWeightedCommand", "driver");
439 /***********************************************************/
440 void UnifracWeightedCommand::printWeightedFile() {
444 tags.push_back("Score"); tags.push_back("RandFreq"); tags.push_back("RandCumul");
446 for(int a = 0; a < numComp; a++) {
447 output->initFile(groupComb[a], tags);
449 for (map<float,float>::iterator it = validScores.begin(); it != validScores.end(); it++) {
450 data.push_back(it->first); data.push_back(rScoreFreq[a][it->first]); data.push_back(rCumul[a][it->first]);
451 output->output(data);
457 catch(exception& e) {
458 m->errorOut(e, "UnifracWeightedCommand", "printWeightedFile");
464 /***********************************************************/
465 void UnifracWeightedCommand::printWSummaryFile() {
468 outSum << "Tree#" << '\t' << "Groups" << '\t' << "WScore" << '\t' << "WSig" << endl;
469 m->mothurOut("Tree#\tGroups\tWScore\tWSig"); m->mothurOutEndLine();
472 outSum.setf(ios::fixed, ios::floatfield); outSum.setf(ios::showpoint);
476 for (int i = 0; i < T.size(); i++) {
477 for (int j = 0; j < numComp; j++) {
479 if (WScoreSig[count] > (1/(float)iters)) {
480 outSum << setprecision(6) << i+1 << '\t' << groupComb[j] << '\t' << utreeScores[count] << '\t' << setprecision(itersString.length()) << WScoreSig[count] << endl;
481 cout << setprecision(6) << i+1 << '\t' << groupComb[j] << '\t' << utreeScores[count] << '\t' << setprecision(itersString.length()) << WScoreSig[count] << endl;
482 m->mothurOutJustToLog(toString(i+1) +"\t" + groupComb[j] +"\t" + toString(utreeScores[count]) +"\t" + toString(WScoreSig[count]) + "\n");
484 outSum << setprecision(6) << i+1 << '\t' << groupComb[j] << '\t' << utreeScores[count] << '\t' << setprecision(itersString.length()) << "<" << (1/float(iters)) << endl;
485 cout << setprecision(6) << i+1 << '\t' << groupComb[j] << '\t' << utreeScores[count] << '\t' << setprecision(itersString.length()) << "<" << (1/float(iters)) << endl;
486 m->mothurOutJustToLog(toString(i+1) +"\t" + groupComb[j] +"\t" + toString(utreeScores[count]) +"\t<" + toString((1/float(iters))) + "\n");
489 outSum << setprecision(6) << i+1 << '\t' << groupComb[j] << '\t' << utreeScores[count] << '\t' << "0.00" << endl;
490 cout << setprecision(6) << i+1 << '\t' << groupComb[j] << '\t' << utreeScores[count] << '\t' << "0.00" << endl;
491 m->mothurOutJustToLog(toString(i+1) +"\t" + groupComb[j] +"\t" + toString(utreeScores[count]) +"\t0.00\n");
498 catch(exception& e) {
499 m->errorOut(e, "UnifracWeightedCommand", "printWSummaryFile");
503 /***********************************************************/
504 void UnifracWeightedCommand::createPhylipFile() {
508 for (int i = 0; i < T.size(); i++) {
510 string phylipFileName;
511 if ((outputForm == "lt") || (outputForm == "square")) {
512 phylipFileName = outputDir + m->getSimpleName(globaldata->getTreeFile()) + toString(i+1) + ".weighted.phylip.dist";
513 outputNames.push_back(phylipFileName); outputTypes["phylip"].push_back(phylipFileName);
515 phylipFileName = outputDir + m->getSimpleName(globaldata->getTreeFile()) + toString(i+1) + ".weighted.column.dist";
516 outputNames.push_back(phylipFileName); outputTypes["column"].push_back(phylipFileName);
520 m->openOutputFile(phylipFileName, out);
522 if ((outputForm == "lt") || (outputForm == "square")) {
524 out << globaldata->Groups.size() << endl;
527 //make matrix with scores in it
528 vector< vector<float> > dists; dists.resize(globaldata->Groups.size());
529 for (int i = 0; i < globaldata->Groups.size(); i++) {
530 dists[i].resize(globaldata->Groups.size(), 0.0);
533 //flip it so you can print it
534 for (int r=0; r<globaldata->Groups.size(); r++) {
535 for (int l = 0; l < r; l++) {
536 dists[r][l] = utreeScores[count];
537 dists[l][r] = utreeScores[count];
543 for (int r=0; r<globaldata->Groups.size(); r++) {
545 string name = globaldata->Groups[r];
546 if (name.length() < 10) { //pad with spaces to make compatible
547 while (name.length() < 10) { name += " "; }
550 if (outputForm == "lt") {
554 for (int l = 0; l < r; l++) { out << dists[r][l] << '\t'; }
556 }else if (outputForm == "square") {
560 for (int l = 0; l < globaldata->Groups.size(); l++) { out << dists[r][l] << '\t'; }
564 for (int l = 0; l < r; l++) {
565 string otherName = globaldata->Groups[l];
566 if (otherName.length() < 10) { //pad with spaces to make compatible
567 while (otherName.length() < 10) { otherName += " "; }
570 out << name << '\t' << otherName << dists[r][l] << endl;
577 catch(exception& e) {
578 m->errorOut(e, "UnifracWeightedCommand", "createPhylipFile");
582 /***********************************************************/
583 int UnifracWeightedCommand::findIndex(float score, int index) {
585 for (int i = 0; i < rScores[index].size(); i++) {
586 if (rScores[index][i] >= score) { return i; }
588 return rScores[index].size();
590 catch(exception& e) {
591 m->errorOut(e, "UnifracWeightedCommand", "findIndex");
596 /***********************************************************/
598 void UnifracWeightedCommand::calculateFreqsCumuls() {
600 //clear out old tree values
602 rScoreFreq.resize(numComp);
604 rCumul.resize(numComp);
607 //calculate frequency
608 for (int f = 0; f < numComp; f++) {
609 for (int i = 0; i < rScores[f].size(); i++) { //looks like 0,0,1,1,1,2,4,7... you want to make a map that say rScoreFreq[0] = 2, rScoreFreq[1] = 3...
610 validScores[rScores[f][i]] = rScores[f][i];
611 map<float,float>::iterator it = rScoreFreq[f].find(rScores[f][i]);
612 if (it != rScoreFreq[f].end()) {
613 rScoreFreq[f][rScores[f][i]]++;
615 rScoreFreq[f][rScores[f][i]] = 1;
621 for(int a = 0; a < numComp; a++) {
622 float rcumul = 1.0000;
623 //this loop fills the cumulative maps and put 0.0000 in the score freq map to make it easier to print.
624 for (map<float,float>::iterator it = validScores.begin(); it != validScores.end(); it++) {
625 //make rscoreFreq map and rCumul
626 map<float,float>::iterator it2 = rScoreFreq[a].find(it->first);
627 rCumul[a][it->first] = rcumul;
628 //get percentage of random trees with that info
629 if (it2 != rScoreFreq[a].end()) { rScoreFreq[a][it->first] /= iters; rcumul-= it2->second; }
630 else { rScoreFreq[a][it->first] = 0.0000; } //no random trees with that score
635 catch(exception& e) {
636 m->errorOut(e, "UnifracWeightedCommand", "calculateFreqsCums");
641 /***********************************************************/