2 * unifracunweightedcommand.cpp
5 * Created by Sarah Westcott on 2/9/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "unifracunweightedcommand.h"
12 //**********************************************************************************************************************
13 vector<string> UnifracUnweightedCommand::setParameters(){
15 CommandParameter ptree("tree", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptree);
16 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
17 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
18 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
19 CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
20 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
21 CommandParameter prandom("random", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(prandom);
22 CommandParameter pdistance("distance", "Multiple", "column-lt-square", "column", "", "", "",false,false); parameters.push_back(pdistance);
23 CommandParameter proot("root", "Boolean", "F", "", "", "", "",false,false); parameters.push_back(proot);
24 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
25 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
27 vector<string> myArray;
28 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
32 m->errorOut(e, "UnifracUnweightedCommand", "setParameters");
36 //**********************************************************************************************************************
37 string UnifracUnweightedCommand::getHelpString(){
39 string helpString = "";
40 helpString += "The unifrac.unweighted command parameters are tree, group, name, groups, iters, distance, processors, root and random. tree parameter is required unless you have valid current tree file.\n";
41 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 1 valid group.\n";
42 helpString += "The group names are separated by dashes. The iters parameter allows you to specify how many random trees you would like compared to your tree.\n";
43 helpString += "The distance parameter allows you to create a distance file from the results. The default is false. You may set distance to lt, square or column.\n";
44 helpString += "The random parameter allows you to shut off the comparison to random trees. The default is false, meaning compare don't your trees with randomly generated trees.\n";
45 helpString += "The root parameter allows you to include the entire root in your calculations. The default is false, meaning stop at the root for this comparision instead of the root of the entire tree.\n";
46 helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
47 helpString += "The unifrac.unweighted command should be in the following format: unifrac.unweighted(groups=yourGroups, iters=yourIters).\n";
48 helpString += "Example unifrac.unweighted(groups=A-B-C, iters=500).\n";
49 helpString += "The default value for groups is all the groups in your groupfile, and iters is 1000.\n";
50 helpString += "The unifrac.unweighted command output two files: .unweighted and .uwsummary their descriptions are in the manual.\n";
51 helpString += "Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n";
55 m->errorOut(e, "UnifracUnweightedCommand", "getHelpString");
59 //**********************************************************************************************************************
60 UnifracUnweightedCommand::UnifracUnweightedCommand(){
62 abort = true; calledHelp = true;
64 vector<string> tempOutNames;
65 outputTypes["unweighted"] = tempOutNames;
66 outputTypes["uwsummary"] = tempOutNames;
67 outputTypes["phylip"] = tempOutNames;
68 outputTypes["column"] = tempOutNames;
71 m->errorOut(e, "UnifracUnweightedCommand", "UnifracUnweightedCommand");
75 /***********************************************************/
76 UnifracUnweightedCommand::UnifracUnweightedCommand(string option) {
78 abort = false; calledHelp = false;
81 //allow user to run help
82 if(option == "help") { help(); abort = true; calledHelp = true; }
83 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
86 vector<string> myArray = setParameters();
88 OptionParser parser(option);
89 map<string,string> parameters = parser.getParameters();
90 map<string,string>::iterator it;
92 ValidParameters validParameter;
94 //check to make sure all parameters are valid for command
95 for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
96 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
99 //initialize outputTypes
100 vector<string> tempOutNames;
101 outputTypes["unweighted"] = tempOutNames;
102 outputTypes["uwsummary"] = tempOutNames;
103 outputTypes["phylip"] = tempOutNames;
104 outputTypes["column"] = tempOutNames;
106 //if the user changes the input directory command factory will send this info to us in the output parameter
107 string inputDir = validParameter.validFile(parameters, "inputdir", false);
108 if (inputDir == "not found"){ inputDir = ""; }
111 it = parameters.find("tree");
112 //user has given a template file
113 if(it != parameters.end()){
114 path = m->hasPath(it->second);
115 //if the user has not given a path then, add inputdir. else leave path alone.
116 if (path == "") { parameters["tree"] = inputDir + it->second; }
119 it = parameters.find("group");
120 //user has given a template file
121 if(it != parameters.end()){
122 path = m->hasPath(it->second);
123 //if the user has not given a path then, add inputdir. else leave path alone.
124 if (path == "") { parameters["group"] = inputDir + it->second; }
127 it = parameters.find("name");
128 //user has given a template file
129 if(it != parameters.end()){
130 path = m->hasPath(it->second);
131 //if the user has not given a path then, add inputdir. else leave path alone.
132 if (path == "") { parameters["name"] = inputDir + it->second; }
139 m->Treenames.clear();
142 //check for required parameters
143 treefile = validParameter.validFile(parameters, "tree", true);
144 if (treefile == "not open") { abort = true; }
145 else if (treefile == "not found") { //if there is a current design file, use it
146 treefile = m->getTreeFile();
147 if (treefile != "") { m->mothurOut("Using " + treefile + " as input file for the tree parameter."); m->mothurOutEndLine(); }
148 else { m->mothurOut("You have no current tree file and the tree parameter is required."); m->mothurOutEndLine(); abort = true; }
149 }else { m->setTreeFile(treefile); }
151 //check for required parameters
152 groupfile = validParameter.validFile(parameters, "group", true);
153 if (groupfile == "not open") { abort = true; }
154 else if (groupfile == "not found") { groupfile = ""; }
155 else { m->setGroupFile(groupfile); }
157 namefile = validParameter.validFile(parameters, "name", true);
158 if (namefile == "not open") { abort = true; }
159 else if (namefile == "not found") { namefile = ""; }
160 else { m->setNameFile(namefile); }
162 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
164 //check for optional parameter and set defaults
165 // ...at some point should added some additional type checking...
166 groups = validParameter.validFile(parameters, "groups", false);
167 if (groups == "not found") { groups = ""; }
169 m->splitAtDash(groups, Groups);
170 m->setGroups(Groups);
173 itersString = validParameter.validFile(parameters, "iters", false); if (itersString == "not found") { itersString = "1000"; }
174 m->mothurConvert(itersString, iters);
176 string temp = validParameter.validFile(parameters, "distance", false);
177 if (temp == "not found") { phylip = false; outputForm = ""; }
179 if ((temp == "lt") || (temp == "column") || (temp == "square")) { phylip = true; outputForm = temp; }
180 else { m->mothurOut("Options for distance are: lt, square, or column. Using lt."); m->mothurOutEndLine(); phylip = true; outputForm = "lt"; }
183 temp = validParameter.validFile(parameters, "random", false); if (temp == "not found") { temp = "f"; }
184 random = m->isTrue(temp);
186 temp = validParameter.validFile(parameters, "root", false); if (temp == "not found") { temp = "F"; }
187 includeRoot = m->isTrue(temp);
189 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
190 m->setProcessors(temp);
191 m->mothurConvert(temp, processors);
193 if (!random) { iters = 0; } //turn off random calcs
195 //if user selects distance = true and no groups it won't calc the pairwise
196 if ((phylip) && (Groups.size() == 0)) {
198 m->splitAtDash(groups, Groups);
199 m->setGroups(Groups);
204 catch(exception& e) {
205 m->errorOut(e, "UnifracUnweightedCommand", "UnifracUnweightedCommand");
210 /***********************************************************/
211 int UnifracUnweightedCommand::execute() {
214 if (abort == true) { if (calledHelp) { return 0; } return 2; }
216 m->setTreeFile(treefile);
218 if (groupfile != "") {
219 //read in group map info.
220 tmap = new TreeMap(groupfile);
222 }else{ //fake out by putting everyone in one group
223 Tree* tree = new Tree(treefile); delete tree; //extracts names from tree to make faked out groupmap
224 tmap = new TreeMap();
226 for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); }
229 if (namefile != "") { readNamesFile(); }
231 read = new ReadNewickTree(treefile);
232 int readOk = read->read(tmap);
234 if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete tmap; delete read; return 0; }
236 read->AssembleTrees();
237 T = read->getTrees();
240 //make sure all files match
241 //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size.
243 if (namefile != "") {
244 if (numUniquesInName == m->Treenames.size()) { numNamesInTree = nameMap.size(); }
245 else { numNamesInTree = m->Treenames.size(); }
246 }else { numNamesInTree = m->Treenames.size(); }
249 //output any names that are in group file but not in tree
250 if (numNamesInTree < tmap->getNumSeqs()) {
251 for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
252 //is that name in the tree?
254 for (int j = 0; j < m->Treenames.size(); j++) {
255 if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
259 if (m->control_pressed) {
260 delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
261 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
266 //then you did not find it so report it
267 if (count == m->Treenames.size()) {
268 //if it is in your namefile then don't remove
269 map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
271 if (it == nameMap.end()) {
272 m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
273 tmap->removeSeq(tmap->namesOfSeqs[i]);
274 i--; //need this because removeSeq removes name from namesOfSeqs
280 sumFile = outputDir + m->getSimpleName(treefile) + ".uwsummary";
281 outputNames.push_back(sumFile); outputTypes["uwsummary"].push_back(sumFile);
282 m->openOutputFile(sumFile, outSum);
284 util = new SharedUtil();
285 Groups = m->getGroups();
286 vector<string> namesGroups = tmap->getNamesOfGroups();
287 util->setGroups(Groups, namesGroups, allGroups, numGroups, "unweighted"); //sets the groups the user wants to analyze
288 util->getCombos(groupComb, Groups, numComp);
289 m->setGroups(Groups);
292 if (numGroups == 1) { numComp++; groupComb.push_back(allGroups); }
294 unweighted = new Unweighted(tmap, includeRoot);
296 int start = time(NULL);
298 userData.resize(numComp,0); //data[0] = unweightedscore
299 randomData.resize(numComp,0); //data[0] = unweightedscore
300 //create new tree with same num nodes and leaves as users
302 if (numComp < processors) { processors = numComp; }
304 outSum << "Tree#" << '\t' << "Groups" << '\t' << "UWScore" <<'\t';
305 m->mothurOut("Tree#\tGroups\tUWScore\t");
306 if (random) { outSum << "UWSig"; m->mothurOut("UWSig"); }
307 outSum << endl; m->mothurOutEndLine();
309 //get pscores for users trees
310 for (int i = 0; i < T.size(); i++) {
311 if (m->control_pressed) {
312 delete tmap; delete unweighted;
313 for (int i = 0; i < T.size(); i++) { delete T[i]; }
315 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); }
322 output = new ColumnFile(outputDir + m->getSimpleName(treefile) + toString(i+1) + ".unweighted", itersString);
323 outputNames.push_back(outputDir + m->getSimpleName(treefile) + toString(i+1) + ".unweighted");
324 outputTypes["unweighted"].push_back(outputDir + m->getSimpleName(treefile) + toString(i+1) + ".unweighted");
328 //get unweighted for users tree
329 rscoreFreq.resize(numComp);
330 rCumul.resize(numComp);
331 utreeScores.resize(numComp);
332 UWScoreSig.resize(numComp);
334 userData = unweighted->getValues(T[i], processors, outputDir); //userData[0] = unweightedscore
336 if (m->control_pressed) { delete tmap; delete unweighted;
337 for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); }return 0; }
339 //output scores for each combination
340 for(int k = 0; k < numComp; k++) {
342 utreeScores[k].push_back(userData[k]);
344 //add users score to validscores
345 validScores[userData[k]] = userData[k];
348 //get unweighted scores for random trees - if random is false iters = 0
349 for (int j = 0; j < iters; j++) {
351 //we need a different getValues because when we swap the labels we only want to swap those in each pairwise comparison
352 randomData = unweighted->getValues(T[i], "", "", processors, outputDir);
354 if (m->control_pressed) { delete tmap; delete unweighted;
355 for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
357 for(int k = 0; k < numComp; k++) {
358 //add trees unweighted score to map of scores
359 map<float,float>::iterator it = rscoreFreq[k].find(randomData[k]);
360 if (it != rscoreFreq[k].end()) {//already have that score
361 rscoreFreq[k][randomData[k]]++;
362 }else{//first time we have seen this score
363 rscoreFreq[k][randomData[k]] = 1;
366 //add randoms score to validscores
367 validScores[randomData[k]] = randomData[k];
371 // m->mothurOut("Iter: " + toString(j+1)); m->mothurOutEndLine();
374 for(int a = 0; a < numComp; a++) {
375 float rcumul = 1.0000;
378 //this loop fills the cumulative maps and put 0.0000 in the score freq map to make it easier to print.
379 for (map<float,float>::iterator it = validScores.begin(); it != validScores.end(); it++) {
380 //make rscoreFreq map and rCumul
381 map<float,float>::iterator it2 = rscoreFreq[a].find(it->first);
382 rCumul[a][it->first] = rcumul;
383 //get percentage of random trees with that info
384 if (it2 != rscoreFreq[a].end()) { rscoreFreq[a][it->first] /= iters; rcumul-= it2->second; }
385 else { rscoreFreq[a][it->first] = 0.0000; } //no random trees with that score
387 UWScoreSig[a].push_back(rCumul[a][userData[a]]);
388 }else { UWScoreSig[a].push_back(0.0); }
392 if (m->control_pressed) { delete tmap; delete unweighted;
393 for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
396 printUWSummaryFile(i);
397 if (random) { printUnweightedFile(); delete output; }
398 if (phylip) { createPhylipFile(i); }
410 delete tmap; delete unweighted;
411 for (int i = 0; i < T.size(); i++) { delete T[i]; }
413 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
415 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run unifrac.unweighted."); m->mothurOutEndLine();
417 //set phylip file as new current phylipfile
419 itTypes = outputTypes.find("phylip");
420 if (itTypes != outputTypes.end()) {
421 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setPhylipFile(current); }
424 //set column file as new current columnfile
425 itTypes = outputTypes.find("column");
426 if (itTypes != outputTypes.end()) {
427 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setColumnFile(current); }
430 m->mothurOutEndLine();
431 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
432 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
433 m->mothurOutEndLine();
438 catch(exception& e) {
439 m->errorOut(e, "UnifracUnweightedCommand", "execute");
443 /***********************************************************/
444 void UnifracUnweightedCommand::printUnweightedFile() {
449 tags.push_back("Score");
450 tags.push_back("RandFreq"); tags.push_back("RandCumul");
452 for(int a = 0; a < numComp; a++) {
453 output->initFile(groupComb[a], tags);
455 for (map<float,float>::iterator it = validScores.begin(); it != validScores.end(); it++) {
456 data.push_back(it->first); data.push_back(rscoreFreq[a][it->first]); data.push_back(rCumul[a][it->first]);
457 output->output(data);
463 catch(exception& e) {
464 m->errorOut(e, "UnifracUnweightedCommand", "printUnweightedFile");
469 /***********************************************************/
470 void UnifracUnweightedCommand::printUWSummaryFile(int i) {
474 outSum.setf(ios::fixed, ios::floatfield); outSum.setf(ios::showpoint);
478 for(int a = 0; a < numComp; a++) {
479 outSum << i+1 << '\t';
480 m->mothurOut(toString(i+1) + "\t");
483 if (UWScoreSig[a][0] > (1/(float)iters)) {
484 outSum << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << setprecision(itersString.length()) << UWScoreSig[a][0] << endl;
485 cout << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << setprecision(itersString.length()) << UWScoreSig[a][0] << endl;
486 m->mothurOutJustToLog(groupComb[a] + "\t" + toString(utreeScores[a][0]) + "\t" + toString(UWScoreSig[a][0])+ "\n");
488 outSum << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << setprecision(itersString.length()) << "<" << (1/float(iters)) << endl;
489 cout << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << setprecision(itersString.length()) << "<" << (1/float(iters)) << endl;
490 m->mothurOutJustToLog(groupComb[a] + "\t" + toString(utreeScores[a][0]) + "\t<" + toString((1/float(iters))) + "\n");
493 outSum << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << endl;
494 cout << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << endl;
495 m->mothurOutJustToLog(groupComb[a] + "\t" + toString(utreeScores[a][0]) + "\n");
500 catch(exception& e) {
501 m->errorOut(e, "UnifracUnweightedCommand", "printUWSummaryFile");
505 /***********************************************************/
506 void UnifracUnweightedCommand::createPhylipFile(int i) {
508 string phylipFileName;
509 if ((outputForm == "lt") || (outputForm == "square")) {
510 phylipFileName = outputDir + m->getSimpleName(treefile) + toString(i+1) + ".unweighted.phylip.dist";
511 outputNames.push_back(phylipFileName); outputTypes["phylip"].push_back(phylipFileName);
513 phylipFileName = outputDir + m->getSimpleName(treefile) + toString(i+1) + ".unweighted.column.dist";
514 outputNames.push_back(phylipFileName); outputTypes["column"].push_back(phylipFileName);
518 m->openOutputFile(phylipFileName, out);
520 if ((outputForm == "lt") || (outputForm == "square")) {
522 out << m->getNumGroups() << endl;
525 //make matrix with scores in it
526 vector< vector<float> > dists; dists.resize(m->getNumGroups());
527 for (int i = 0; i < m->getNumGroups(); i++) {
528 dists[i].resize(m->getNumGroups(), 0.0);
531 //flip it so you can print it
533 for (int r=0; r<m->getNumGroups(); r++) {
534 for (int l = 0; l < r; l++) {
535 dists[r][l] = utreeScores[count][0];
536 dists[l][r] = utreeScores[count][0];
542 for (int r=0; r<m->getNumGroups(); r++) {
544 string name = (m->getGroups())[r];
545 if (name.length() < 10) { //pad with spaces to make compatible
546 while (name.length() < 10) { name += " "; }
549 if (outputForm == "lt") {
553 for (int l = 0; l < r; l++) { out << dists[r][l] << '\t'; }
555 }else if (outputForm == "square") {
559 for (int l = 0; l < m->getNumGroups(); l++) { out << dists[r][l] << '\t'; }
563 for (int l = 0; l < r; l++) {
564 string otherName = (m->getGroups())[l];
565 if (otherName.length() < 10) { //pad with spaces to make compatible
566 while (otherName.length() < 10) { otherName += " "; }
569 out << name << '\t' << otherName << dists[r][l] << endl;
575 catch(exception& e) {
576 m->errorOut(e, "UnifracUnweightedCommand", "createPhylipFile");
579 }/*****************************************************************/
580 int UnifracUnweightedCommand::readNamesFile() {
583 numUniquesInName = 0;
586 m->openInputFile(namefile, in);
588 string first, second;
589 map<string, string>::iterator itNames;
592 in >> first >> second; m->gobble(in);
596 itNames = m->names.find(first);
597 if (itNames == m->names.end()) {
598 m->names[first] = second;
600 //we need a list of names in your namefile to use above when removing extra seqs above so we don't remove them
601 vector<string> dupNames;
602 m->splitAtComma(second, dupNames);
604 for (int i = 0; i < dupNames.size(); i++) {
605 nameMap[dupNames[i]] = dupNames[i];
606 if ((groupfile == "") && (i != 0)) { tmap->addSeq(dupNames[i], "Group1"); }
608 }else { m->mothurOut(first + " has already been seen in namefile, disregarding names file."); m->mothurOutEndLine(); in.close(); m->names.clear(); namefile = ""; return 1; }
614 catch(exception& e) {
615 m->errorOut(e, "UnifracUnweightedCommand", "readNamesFile");
619 /***********************************************************/