2 * unifracunweightedcommand.cpp
5 * Created by Sarah Westcott on 2/9/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "unifracunweightedcommand.h"
12 //**********************************************************************************************************************
13 vector<string> UnifracUnweightedCommand::setParameters(){
15 CommandParameter ptree("tree", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptree);
16 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
17 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
18 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
19 CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
20 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
21 CommandParameter prandom("random", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(prandom);
22 CommandParameter pdistance("distance", "Multiple", "column-lt-square", "column", "", "", "",false,false); parameters.push_back(pdistance);
23 CommandParameter proot("root", "Boolean", "F", "", "", "", "",false,false); parameters.push_back(proot);
24 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
25 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
27 vector<string> myArray;
28 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
32 m->errorOut(e, "UnifracUnweightedCommand", "setParameters");
36 //**********************************************************************************************************************
37 string UnifracUnweightedCommand::getHelpString(){
39 string helpString = "";
40 helpString += "The unifrac.unweighted command parameters are tree, group, name, groups, iters, distance, processors, root and random. tree parameter is required unless you have valid current tree file.\n";
41 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 1 valid group.\n";
42 helpString += "The group names are separated by dashes. The iters parameter allows you to specify how many random trees you would like compared to your tree.\n";
43 helpString += "The distance parameter allows you to create a distance file from the results. The default is false. You may set distance to lt, square or column.\n";
44 helpString += "The random parameter allows you to shut off the comparison to random trees. The default is false, meaning compare don't your trees with randomly generated trees.\n";
45 helpString += "The root parameter allows you to include the entire root in your calculations. The default is false, meaning stop at the root for this comparision instead of the root of the entire tree.\n";
46 helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
47 helpString += "The unifrac.unweighted command should be in the following format: unifrac.unweighted(groups=yourGroups, iters=yourIters).\n";
48 helpString += "Example unifrac.unweighted(groups=A-B-C, iters=500).\n";
49 helpString += "The default value for groups is all the groups in your groupfile, and iters is 1000.\n";
50 helpString += "The unifrac.unweighted command output two files: .unweighted and .uwsummary their descriptions are in the manual.\n";
51 helpString += "Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n";
55 m->errorOut(e, "UnifracUnweightedCommand", "getHelpString");
59 //**********************************************************************************************************************
60 UnifracUnweightedCommand::UnifracUnweightedCommand(){
62 abort = true; calledHelp = true;
64 vector<string> tempOutNames;
65 outputTypes["unweighted"] = tempOutNames;
66 outputTypes["uwsummary"] = tempOutNames;
67 outputTypes["phylip"] = tempOutNames;
68 outputTypes["column"] = tempOutNames;
71 m->errorOut(e, "UnifracUnweightedCommand", "UnifracUnweightedCommand");
75 /***********************************************************/
76 UnifracUnweightedCommand::UnifracUnweightedCommand(string option) {
78 abort = false; calledHelp = false;
81 //allow user to run help
82 if(option == "help") { help(); abort = true; calledHelp = true; }
83 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
86 vector<string> myArray = setParameters();
88 OptionParser parser(option);
89 map<string,string> parameters = parser.getParameters();
90 map<string,string>::iterator it;
92 ValidParameters validParameter;
94 //check to make sure all parameters are valid for command
95 for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
96 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
99 //initialize outputTypes
100 vector<string> tempOutNames;
101 outputTypes["unweighted"] = tempOutNames;
102 outputTypes["uwsummary"] = tempOutNames;
103 outputTypes["phylip"] = tempOutNames;
104 outputTypes["column"] = tempOutNames;
106 //if the user changes the input directory command factory will send this info to us in the output parameter
107 string inputDir = validParameter.validFile(parameters, "inputdir", false);
108 if (inputDir == "not found"){ inputDir = ""; }
111 it = parameters.find("tree");
112 //user has given a template file
113 if(it != parameters.end()){
114 path = m->hasPath(it->second);
115 //if the user has not given a path then, add inputdir. else leave path alone.
116 if (path == "") { parameters["tree"] = inputDir + it->second; }
119 it = parameters.find("group");
120 //user has given a template file
121 if(it != parameters.end()){
122 path = m->hasPath(it->second);
123 //if the user has not given a path then, add inputdir. else leave path alone.
124 if (path == "") { parameters["group"] = inputDir + it->second; }
127 it = parameters.find("name");
128 //user has given a template file
129 if(it != parameters.end()){
130 path = m->hasPath(it->second);
131 //if the user has not given a path then, add inputdir. else leave path alone.
132 if (path == "") { parameters["name"] = inputDir + it->second; }
139 m->Treenames.clear();
142 //check for required parameters
143 treefile = validParameter.validFile(parameters, "tree", true);
144 if (treefile == "not open") { abort = true; }
145 else if (treefile == "not found") { //if there is a current design file, use it
146 treefile = m->getTreeFile();
147 if (treefile != "") { m->mothurOut("Using " + treefile + " as input file for the tree parameter."); m->mothurOutEndLine(); }
148 else { m->mothurOut("You have no current tree file and the tree parameter is required."); m->mothurOutEndLine(); abort = true; }
149 }else { m->setTreeFile(treefile); }
151 //check for required parameters
152 groupfile = validParameter.validFile(parameters, "group", true);
153 if (groupfile == "not open") { abort = true; }
154 else if (groupfile == "not found") { groupfile = ""; }
155 else { m->setGroupFile(groupfile); }
157 namefile = validParameter.validFile(parameters, "name", true);
158 if (namefile == "not open") { namefile = ""; abort = true; }
159 else if (namefile == "not found") { namefile = ""; }
160 else { m->setNameFile(namefile); }
162 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
164 //check for optional parameter and set defaults
165 // ...at some point should added some additional type checking...
166 groups = validParameter.validFile(parameters, "groups", false);
167 if (groups == "not found") { groups = ""; }
169 m->splitAtDash(groups, Groups);
170 m->setGroups(Groups);
173 itersString = validParameter.validFile(parameters, "iters", false); if (itersString == "not found") { itersString = "1000"; }
174 m->mothurConvert(itersString, iters);
176 string temp = validParameter.validFile(parameters, "distance", false);
177 if (temp == "not found") { phylip = false; outputForm = ""; }
179 if ((temp == "lt") || (temp == "column") || (temp == "square")) { phylip = true; outputForm = temp; }
180 else { m->mothurOut("Options for distance are: lt, square, or column. Using lt."); m->mothurOutEndLine(); phylip = true; outputForm = "lt"; }
183 temp = validParameter.validFile(parameters, "random", false); if (temp == "not found") { temp = "f"; }
184 random = m->isTrue(temp);
186 temp = validParameter.validFile(parameters, "root", false); if (temp == "not found") { temp = "F"; }
187 includeRoot = m->isTrue(temp);
189 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
190 m->setProcessors(temp);
191 m->mothurConvert(temp, processors);
193 if (!random) { iters = 0; } //turn off random calcs
195 //if user selects distance = true and no groups it won't calc the pairwise
196 if ((phylip) && (Groups.size() == 0)) {
198 m->splitAtDash(groups, Groups);
199 m->setGroups(Groups);
202 if (namefile == "") {
203 vector<string> files; files.push_back(treefile);
204 parser.getNameFile(files);
209 catch(exception& e) {
210 m->errorOut(e, "UnifracUnweightedCommand", "UnifracUnweightedCommand");
215 /***********************************************************/
216 int UnifracUnweightedCommand::execute() {
219 if (abort == true) { if (calledHelp) { return 0; } return 2; }
221 m->setTreeFile(treefile);
223 if (groupfile != "") {
224 //read in group map info.
225 tmap = new TreeMap(groupfile);
227 }else{ //fake out by putting everyone in one group
228 Tree* tree = new Tree(treefile); delete tree; //extracts names from tree to make faked out groupmap
229 tmap = new TreeMap();
231 for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); }
234 if (namefile != "") { readNamesFile(); }
236 read = new ReadNewickTree(treefile);
237 int readOk = read->read(tmap);
239 if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete tmap; delete read; return 0; }
241 read->AssembleTrees();
242 T = read->getTrees();
245 //make sure all files match
246 //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size.
248 if (namefile != "") {
249 if (numUniquesInName == m->Treenames.size()) { numNamesInTree = nameMap.size(); }
250 else { numNamesInTree = m->Treenames.size(); }
251 }else { numNamesInTree = m->Treenames.size(); }
254 //output any names that are in group file but not in tree
255 if (numNamesInTree < tmap->getNumSeqs()) {
256 for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
257 //is that name in the tree?
259 for (int j = 0; j < m->Treenames.size(); j++) {
260 if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
264 if (m->control_pressed) {
265 delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
266 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
271 //then you did not find it so report it
272 if (count == m->Treenames.size()) {
273 //if it is in your namefile then don't remove
274 map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
276 if (it == nameMap.end()) {
277 m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
278 tmap->removeSeq(tmap->namesOfSeqs[i]);
279 i--; //need this because removeSeq removes name from namesOfSeqs
285 sumFile = outputDir + m->getSimpleName(treefile) + ".uwsummary";
286 outputNames.push_back(sumFile); outputTypes["uwsummary"].push_back(sumFile);
287 m->openOutputFile(sumFile, outSum);
289 util = new SharedUtil();
290 Groups = m->getGroups();
291 vector<string> namesGroups = tmap->getNamesOfGroups();
292 util->setGroups(Groups, namesGroups, allGroups, numGroups, "unweighted"); //sets the groups the user wants to analyze
293 util->getCombos(groupComb, Groups, numComp);
294 m->setGroups(Groups);
297 if (numGroups == 1) { numComp++; groupComb.push_back(allGroups); }
299 unweighted = new Unweighted(tmap, includeRoot);
301 int start = time(NULL);
303 userData.resize(numComp,0); //data[0] = unweightedscore
304 randomData.resize(numComp,0); //data[0] = unweightedscore
305 //create new tree with same num nodes and leaves as users
307 if (numComp < processors) { processors = numComp; }
309 outSum << "Tree#" << '\t' << "Groups" << '\t' << "UWScore" <<'\t';
310 m->mothurOut("Tree#\tGroups\tUWScore\t");
311 if (random) { outSum << "UWSig"; m->mothurOut("UWSig"); }
312 outSum << endl; m->mothurOutEndLine();
314 //get pscores for users trees
315 for (int i = 0; i < T.size(); i++) {
316 if (m->control_pressed) {
317 delete tmap; delete unweighted;
318 for (int i = 0; i < T.size(); i++) { delete T[i]; }
320 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); }
327 output = new ColumnFile(outputDir + m->getSimpleName(treefile) + toString(i+1) + ".unweighted", itersString);
328 outputNames.push_back(outputDir + m->getSimpleName(treefile) + toString(i+1) + ".unweighted");
329 outputTypes["unweighted"].push_back(outputDir + m->getSimpleName(treefile) + toString(i+1) + ".unweighted");
333 //get unweighted for users tree
334 rscoreFreq.resize(numComp);
335 rCumul.resize(numComp);
336 utreeScores.resize(numComp);
337 UWScoreSig.resize(numComp);
339 userData = unweighted->getValues(T[i], processors, outputDir); //userData[0] = unweightedscore
341 if (m->control_pressed) { delete tmap; delete unweighted;
342 for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); }return 0; }
344 //output scores for each combination
345 for(int k = 0; k < numComp; k++) {
347 utreeScores[k].push_back(userData[k]);
349 //add users score to validscores
350 validScores[userData[k]] = userData[k];
353 //get unweighted scores for random trees - if random is false iters = 0
354 for (int j = 0; j < iters; j++) {
356 //we need a different getValues because when we swap the labels we only want to swap those in each pairwise comparison
357 randomData = unweighted->getValues(T[i], "", "", processors, outputDir);
359 if (m->control_pressed) { delete tmap; delete unweighted;
360 for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
362 for(int k = 0; k < numComp; k++) {
363 //add trees unweighted score to map of scores
364 map<float,float>::iterator it = rscoreFreq[k].find(randomData[k]);
365 if (it != rscoreFreq[k].end()) {//already have that score
366 rscoreFreq[k][randomData[k]]++;
367 }else{//first time we have seen this score
368 rscoreFreq[k][randomData[k]] = 1;
371 //add randoms score to validscores
372 validScores[randomData[k]] = randomData[k];
376 // m->mothurOut("Iter: " + toString(j+1)); m->mothurOutEndLine();
379 for(int a = 0; a < numComp; a++) {
380 float rcumul = 1.0000;
383 //this loop fills the cumulative maps and put 0.0000 in the score freq map to make it easier to print.
384 for (map<float,float>::iterator it = validScores.begin(); it != validScores.end(); it++) {
385 //make rscoreFreq map and rCumul
386 map<float,float>::iterator it2 = rscoreFreq[a].find(it->first);
387 rCumul[a][it->first] = rcumul;
388 //get percentage of random trees with that info
389 if (it2 != rscoreFreq[a].end()) { rscoreFreq[a][it->first] /= iters; rcumul-= it2->second; }
390 else { rscoreFreq[a][it->first] = 0.0000; } //no random trees with that score
392 UWScoreSig[a].push_back(rCumul[a][userData[a]]);
393 }else { UWScoreSig[a].push_back(0.0); }
397 if (m->control_pressed) { delete tmap; delete unweighted;
398 for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
401 printUWSummaryFile(i);
402 if (random) { printUnweightedFile(); delete output; }
403 if (phylip) { createPhylipFile(i); }
415 delete tmap; delete unweighted;
416 for (int i = 0; i < T.size(); i++) { delete T[i]; }
418 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
420 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run unifrac.unweighted."); m->mothurOutEndLine();
422 //set phylip file as new current phylipfile
424 itTypes = outputTypes.find("phylip");
425 if (itTypes != outputTypes.end()) {
426 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setPhylipFile(current); }
429 //set column file as new current columnfile
430 itTypes = outputTypes.find("column");
431 if (itTypes != outputTypes.end()) {
432 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setColumnFile(current); }
435 m->mothurOutEndLine();
436 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
437 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
438 m->mothurOutEndLine();
443 catch(exception& e) {
444 m->errorOut(e, "UnifracUnweightedCommand", "execute");
448 /***********************************************************/
449 void UnifracUnweightedCommand::printUnweightedFile() {
454 tags.push_back("Score");
455 tags.push_back("RandFreq"); tags.push_back("RandCumul");
457 for(int a = 0; a < numComp; a++) {
458 output->initFile(groupComb[a], tags);
460 for (map<float,float>::iterator it = validScores.begin(); it != validScores.end(); it++) {
461 data.push_back(it->first); data.push_back(rscoreFreq[a][it->first]); data.push_back(rCumul[a][it->first]);
462 output->output(data);
468 catch(exception& e) {
469 m->errorOut(e, "UnifracUnweightedCommand", "printUnweightedFile");
474 /***********************************************************/
475 void UnifracUnweightedCommand::printUWSummaryFile(int i) {
479 outSum.setf(ios::fixed, ios::floatfield); outSum.setf(ios::showpoint);
483 for(int a = 0; a < numComp; a++) {
484 outSum << i+1 << '\t';
485 m->mothurOut(toString(i+1) + "\t");
488 if (UWScoreSig[a][0] > (1/(float)iters)) {
489 outSum << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << setprecision(itersString.length()) << UWScoreSig[a][0] << endl;
490 cout << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << setprecision(itersString.length()) << UWScoreSig[a][0] << endl;
491 m->mothurOutJustToLog(groupComb[a] + "\t" + toString(utreeScores[a][0]) + "\t" + toString(UWScoreSig[a][0])+ "\n");
493 outSum << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << setprecision(itersString.length()) << "<" << (1/float(iters)) << endl;
494 cout << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << setprecision(itersString.length()) << "<" << (1/float(iters)) << endl;
495 m->mothurOutJustToLog(groupComb[a] + "\t" + toString(utreeScores[a][0]) + "\t<" + toString((1/float(iters))) + "\n");
498 outSum << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << endl;
499 cout << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << endl;
500 m->mothurOutJustToLog(groupComb[a] + "\t" + toString(utreeScores[a][0]) + "\n");
505 catch(exception& e) {
506 m->errorOut(e, "UnifracUnweightedCommand", "printUWSummaryFile");
510 /***********************************************************/
511 void UnifracUnweightedCommand::createPhylipFile(int i) {
513 string phylipFileName;
514 if ((outputForm == "lt") || (outputForm == "square")) {
515 phylipFileName = outputDir + m->getSimpleName(treefile) + toString(i+1) + ".unweighted.phylip.dist";
516 outputNames.push_back(phylipFileName); outputTypes["phylip"].push_back(phylipFileName);
518 phylipFileName = outputDir + m->getSimpleName(treefile) + toString(i+1) + ".unweighted.column.dist";
519 outputNames.push_back(phylipFileName); outputTypes["column"].push_back(phylipFileName);
523 m->openOutputFile(phylipFileName, out);
525 if ((outputForm == "lt") || (outputForm == "square")) {
527 out << m->getNumGroups() << endl;
530 //make matrix with scores in it
531 vector< vector<float> > dists; dists.resize(m->getNumGroups());
532 for (int i = 0; i < m->getNumGroups(); i++) {
533 dists[i].resize(m->getNumGroups(), 0.0);
536 //flip it so you can print it
538 for (int r=0; r<m->getNumGroups(); r++) {
539 for (int l = 0; l < r; l++) {
540 dists[r][l] = utreeScores[count][0];
541 dists[l][r] = utreeScores[count][0];
547 for (int r=0; r<m->getNumGroups(); r++) {
549 string name = (m->getGroups())[r];
550 if (name.length() < 10) { //pad with spaces to make compatible
551 while (name.length() < 10) { name += " "; }
554 if (outputForm == "lt") {
558 for (int l = 0; l < r; l++) { out << dists[r][l] << '\t'; }
560 }else if (outputForm == "square") {
564 for (int l = 0; l < m->getNumGroups(); l++) { out << dists[r][l] << '\t'; }
568 for (int l = 0; l < r; l++) {
569 string otherName = (m->getGroups())[l];
570 if (otherName.length() < 10) { //pad with spaces to make compatible
571 while (otherName.length() < 10) { otherName += " "; }
574 out << name << '\t' << otherName << dists[r][l] << endl;
580 catch(exception& e) {
581 m->errorOut(e, "UnifracUnweightedCommand", "createPhylipFile");
584 }/*****************************************************************/
585 int UnifracUnweightedCommand::readNamesFile() {
588 numUniquesInName = 0;
591 m->openInputFile(namefile, in);
593 string first, second;
594 map<string, string>::iterator itNames;
597 in >> first >> second; m->gobble(in);
601 itNames = m->names.find(first);
602 if (itNames == m->names.end()) {
603 m->names[first] = second;
605 //we need a list of names in your namefile to use above when removing extra seqs above so we don't remove them
606 vector<string> dupNames;
607 m->splitAtComma(second, dupNames);
609 for (int i = 0; i < dupNames.size(); i++) {
610 nameMap[dupNames[i]] = dupNames[i];
611 if ((groupfile == "") && (i != 0)) { tmap->addSeq(dupNames[i], "Group1"); }
613 }else { m->mothurOut(first + " has already been seen in namefile, disregarding names file."); m->mothurOutEndLine(); in.close(); m->names.clear(); namefile = ""; return 1; }
619 catch(exception& e) {
620 m->errorOut(e, "UnifracUnweightedCommand", "readNamesFile");
624 /***********************************************************/