2 * unifracunweightedcommand.cpp
5 * Created by Sarah Westcott on 2/9/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "unifracunweightedcommand.h"
12 //**********************************************************************************************************************
13 vector<string> UnifracUnweightedCommand::setParameters(){
15 CommandParameter ptree("tree", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptree);
16 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
17 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
18 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
19 CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
20 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
21 CommandParameter prandom("random", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(prandom);
22 CommandParameter pdistance("distance", "Multiple", "column-lt-square", "column", "", "", "",false,false); parameters.push_back(pdistance);
23 CommandParameter proot("root", "Boolean", "F", "", "", "", "",false,false); parameters.push_back(proot);
24 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
25 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
27 vector<string> myArray;
28 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
32 m->errorOut(e, "UnifracUnweightedCommand", "setParameters");
36 //**********************************************************************************************************************
37 string UnifracUnweightedCommand::getHelpString(){
39 string helpString = "";
40 helpString += "The unifrac.unweighted command parameters are tree, group, name, groups, iters, distance, processors, root and random. tree parameter is required unless you have valid current tree file.\n";
41 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 1 valid group.\n";
42 helpString += "The group names are separated by dashes. The iters parameter allows you to specify how many random trees you would like compared to your tree.\n";
43 helpString += "The distance parameter allows you to create a distance file from the results. The default is false. You may set distance to lt, square or column.\n";
44 helpString += "The random parameter allows you to shut off the comparison to random trees. The default is false, meaning compare don't your trees with randomly generated trees.\n";
45 helpString += "The root parameter allows you to include the entire root in your calculations. The default is false, meaning stop at the root for this comparision instead of the root of the entire tree.\n";
46 helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
47 helpString += "The unifrac.unweighted command should be in the following format: unifrac.unweighted(groups=yourGroups, iters=yourIters).\n";
48 helpString += "Example unifrac.unweighted(groups=A-B-C, iters=500).\n";
49 helpString += "The default value for groups is all the groups in your groupfile, and iters is 1000.\n";
50 helpString += "The unifrac.unweighted command output two files: .unweighted and .uwsummary their descriptions are in the manual.\n";
51 helpString += "Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n";
55 m->errorOut(e, "UnifracUnweightedCommand", "getHelpString");
59 //**********************************************************************************************************************
60 UnifracUnweightedCommand::UnifracUnweightedCommand(){
62 abort = true; calledHelp = true;
64 vector<string> tempOutNames;
65 outputTypes["unweighted"] = tempOutNames;
66 outputTypes["uwsummary"] = tempOutNames;
67 outputTypes["phylip"] = tempOutNames;
68 outputTypes["column"] = tempOutNames;
71 m->errorOut(e, "UnifracUnweightedCommand", "UnifracUnweightedCommand");
75 /***********************************************************/
76 UnifracUnweightedCommand::UnifracUnweightedCommand(string option) {
78 abort = false; calledHelp = false;
81 //allow user to run help
82 if(option == "help") { help(); abort = true; calledHelp = true; }
83 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
86 vector<string> myArray = setParameters();
88 OptionParser parser(option);
89 map<string,string> parameters = parser.getParameters();
90 map<string,string>::iterator it;
92 ValidParameters validParameter;
94 //check to make sure all parameters are valid for command
95 for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
96 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
99 //initialize outputTypes
100 vector<string> tempOutNames;
101 outputTypes["unweighted"] = tempOutNames;
102 outputTypes["uwsummary"] = tempOutNames;
103 outputTypes["phylip"] = tempOutNames;
104 outputTypes["column"] = tempOutNames;
106 //if the user changes the input directory command factory will send this info to us in the output parameter
107 string inputDir = validParameter.validFile(parameters, "inputdir", false);
108 if (inputDir == "not found"){ inputDir = ""; }
111 it = parameters.find("tree");
112 //user has given a template file
113 if(it != parameters.end()){
114 path = m->hasPath(it->second);
115 //if the user has not given a path then, add inputdir. else leave path alone.
116 if (path == "") { parameters["tree"] = inputDir + it->second; }
119 it = parameters.find("group");
120 //user has given a template file
121 if(it != parameters.end()){
122 path = m->hasPath(it->second);
123 //if the user has not given a path then, add inputdir. else leave path alone.
124 if (path == "") { parameters["group"] = inputDir + it->second; }
127 it = parameters.find("name");
128 //user has given a template file
129 if(it != parameters.end()){
130 path = m->hasPath(it->second);
131 //if the user has not given a path then, add inputdir. else leave path alone.
132 if (path == "") { parameters["name"] = inputDir + it->second; }
138 m->namesOfGroups.clear();
139 m->Treenames.clear();
142 //check for required parameters
143 treefile = validParameter.validFile(parameters, "tree", true);
144 if (treefile == "not open") { abort = true; }
145 else if (treefile == "not found") { //if there is a current design file, use it
146 treefile = m->getTreeFile();
147 if (treefile != "") { m->mothurOut("Using " + treefile + " as input file for the tree parameter."); m->mothurOutEndLine(); }
148 else { m->mothurOut("You have no current tree file and the tree parameter is required."); m->mothurOutEndLine(); abort = true; }
151 //check for required parameters
152 groupfile = validParameter.validFile(parameters, "group", true);
153 if (groupfile == "not open") { abort = true; }
154 else if (groupfile == "not found") { groupfile = ""; }
156 namefile = validParameter.validFile(parameters, "name", true);
157 if (namefile == "not open") { abort = true; }
158 else if (namefile == "not found") { namefile = ""; }
160 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
162 //check for optional parameter and set defaults
163 // ...at some point should added some additional type checking...
164 groups = validParameter.validFile(parameters, "groups", false);
165 if (groups == "not found") { groups = ""; }
167 m->splitAtDash(groups, Groups);
171 itersString = validParameter.validFile(parameters, "iters", false); if (itersString == "not found") { itersString = "1000"; }
172 convert(itersString, iters);
174 string temp = validParameter.validFile(parameters, "distance", false);
175 if (temp == "not found") { phylip = false; outputForm = ""; }
177 if ((temp == "lt") || (temp == "column") || (temp == "square")) { phylip = true; outputForm = temp; }
178 else { m->mothurOut("Options for distance are: lt, square, or column. Using lt."); m->mothurOutEndLine(); phylip = true; outputForm = "lt"; }
181 temp = validParameter.validFile(parameters, "random", false); if (temp == "not found") { temp = "f"; }
182 random = m->isTrue(temp);
184 temp = validParameter.validFile(parameters, "root", false); if (temp == "not found") { temp = "F"; }
185 includeRoot = m->isTrue(temp);
187 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
188 m->setProcessors(temp);
189 convert(temp, processors);
191 if (!random) { iters = 0; } //turn off random calcs
193 //if user selects distance = true and no groups it won't calc the pairwise
194 if ((phylip) && (Groups.size() == 0)) {
196 m->splitAtDash(groups, Groups);
202 catch(exception& e) {
203 m->errorOut(e, "UnifracUnweightedCommand", "UnifracUnweightedCommand");
208 /***********************************************************/
209 int UnifracUnweightedCommand::execute() {
212 if (abort == true) { if (calledHelp) { return 0; } return 2; }
214 m->setTreeFile(treefile);
216 if (groupfile != "") {
217 //read in group map info.
218 tmap = new TreeMap(groupfile);
220 }else{ //fake out by putting everyone in one group
221 Tree* tree = new Tree(treefile); delete tree; //extracts names from tree to make faked out groupmap
222 tmap = new TreeMap();
224 for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); }
227 if (namefile != "") { readNamesFile(); }
229 read = new ReadNewickTree(treefile);
230 int readOk = read->read(tmap);
232 if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete tmap; delete read; return 0; }
234 read->AssembleTrees();
235 T = read->getTrees();
238 //make sure all files match
239 //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size.
241 if (namefile != "") {
242 if (numUniquesInName == m->Treenames.size()) { numNamesInTree = nameMap.size(); }
243 else { numNamesInTree = m->Treenames.size(); }
244 }else { numNamesInTree = m->Treenames.size(); }
247 //output any names that are in group file but not in tree
248 if (numNamesInTree < tmap->getNumSeqs()) {
249 for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
250 //is that name in the tree?
252 for (int j = 0; j < m->Treenames.size(); j++) {
253 if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
257 if (m->control_pressed) {
258 delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
259 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } outputTypes.clear();
264 //then you did not find it so report it
265 if (count == m->Treenames.size()) {
266 //if it is in your namefile then don't remove
267 map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
269 if (it == nameMap.end()) {
270 m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
271 tmap->removeSeq(tmap->namesOfSeqs[i]);
272 i--; //need this because removeSeq removes name from namesOfSeqs
278 sumFile = outputDir + m->getSimpleName(treefile) + ".uwsummary";
279 outputNames.push_back(sumFile); outputTypes["uwsummary"].push_back(sumFile);
280 m->openOutputFile(sumFile, outSum);
282 util = new SharedUtil();
283 util->setGroups(m->Groups, tmap->namesOfGroups, allGroups, numGroups, "unweighted"); //sets the groups the user wants to analyze
284 util->getCombos(groupComb, m->Groups, numComp);
287 if (numGroups == 1) { numComp++; groupComb.push_back(allGroups); }
289 unweighted = new Unweighted(tmap, includeRoot);
291 int start = time(NULL);
293 userData.resize(numComp,0); //data[0] = unweightedscore
294 randomData.resize(numComp,0); //data[0] = unweightedscore
295 //create new tree with same num nodes and leaves as users
297 if (numComp < processors) { processors = numComp; }
299 outSum << "Tree#" << '\t' << "Groups" << '\t' << "UWScore" <<'\t' << "UWSig" << endl;
300 m->mothurOut("Tree#\tGroups\tUWScore\tUWSig"); m->mothurOutEndLine();
302 //get pscores for users trees
303 for (int i = 0; i < T.size(); i++) {
304 if (m->control_pressed) {
305 delete tmap; delete unweighted;
306 for (int i = 0; i < T.size(); i++) { delete T[i]; }
308 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }
315 output = new ColumnFile(outputDir + m->getSimpleName(treefile) + toString(i+1) + ".unweighted", itersString);
316 outputNames.push_back(outputDir + m->getSimpleName(treefile) + toString(i+1) + ".unweighted");
317 outputTypes["unweighted"].push_back(outputDir + m->getSimpleName(treefile) + toString(i+1) + ".unweighted");
321 //get unweighted for users tree
322 rscoreFreq.resize(numComp);
323 rCumul.resize(numComp);
324 utreeScores.resize(numComp);
325 UWScoreSig.resize(numComp);
327 userData = unweighted->getValues(T[i], processors, outputDir); //userData[0] = unweightedscore
329 if (m->control_pressed) { delete tmap; delete unweighted;
330 for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }return 0; }
332 //output scores for each combination
333 for(int k = 0; k < numComp; k++) {
335 utreeScores[k].push_back(userData[k]);
337 //add users score to validscores
338 validScores[userData[k]] = userData[k];
341 //get unweighted scores for random trees - if random is false iters = 0
342 for (int j = 0; j < iters; j++) {
344 //we need a different getValues because when we swap the labels we only want to swap those in each pairwise comparison
345 randomData = unweighted->getValues(T[i], "", "", processors, outputDir);
347 if (m->control_pressed) { delete tmap; delete unweighted;
348 for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
350 for(int k = 0; k < numComp; k++) {
351 //add trees unweighted score to map of scores
352 map<float,float>::iterator it = rscoreFreq[k].find(randomData[k]);
353 if (it != rscoreFreq[k].end()) {//already have that score
354 rscoreFreq[k][randomData[k]]++;
355 }else{//first time we have seen this score
356 rscoreFreq[k][randomData[k]] = 1;
359 //add randoms score to validscores
360 validScores[randomData[k]] = randomData[k];
364 // m->mothurOut("Iter: " + toString(j+1)); m->mothurOutEndLine();
367 for(int a = 0; a < numComp; a++) {
368 float rcumul = 1.0000;
371 //this loop fills the cumulative maps and put 0.0000 in the score freq map to make it easier to print.
372 for (map<float,float>::iterator it = validScores.begin(); it != validScores.end(); it++) {
373 //make rscoreFreq map and rCumul
374 map<float,float>::iterator it2 = rscoreFreq[a].find(it->first);
375 rCumul[a][it->first] = rcumul;
376 //get percentage of random trees with that info
377 if (it2 != rscoreFreq[a].end()) { rscoreFreq[a][it->first] /= iters; rcumul-= it2->second; }
378 else { rscoreFreq[a][it->first] = 0.0000; } //no random trees with that score
380 UWScoreSig[a].push_back(rCumul[a][userData[a]]);
381 }else { UWScoreSig[a].push_back(0.0); }
385 if (m->control_pressed) { delete tmap; delete unweighted;
386 for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
389 printUWSummaryFile(i);
390 if (random) { printUnweightedFile(); delete output; }
391 if (phylip) { createPhylipFile(i); }
403 delete tmap; delete unweighted;
404 for (int i = 0; i < T.size(); i++) { delete T[i]; }
406 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
408 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run unifrac.unweighted."); m->mothurOutEndLine();
410 //set phylip file as new current phylipfile
412 itTypes = outputTypes.find("phylip");
413 if (itTypes != outputTypes.end()) {
414 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setPhylipFile(current); }
417 //set column file as new current columnfile
418 itTypes = outputTypes.find("column");
419 if (itTypes != outputTypes.end()) {
420 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setColumnFile(current); }
423 m->mothurOutEndLine();
424 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
425 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
426 m->mothurOutEndLine();
431 catch(exception& e) {
432 m->errorOut(e, "UnifracUnweightedCommand", "execute");
436 /***********************************************************/
437 void UnifracUnweightedCommand::printUnweightedFile() {
442 tags.push_back("Score");
443 tags.push_back("RandFreq"); tags.push_back("RandCumul");
445 for(int a = 0; a < numComp; a++) {
446 output->initFile(groupComb[a], tags);
448 for (map<float,float>::iterator it = validScores.begin(); it != validScores.end(); it++) {
449 data.push_back(it->first); data.push_back(rscoreFreq[a][it->first]); data.push_back(rCumul[a][it->first]);
450 output->output(data);
456 catch(exception& e) {
457 m->errorOut(e, "UnifracUnweightedCommand", "printUnweightedFile");
462 /***********************************************************/
463 void UnifracUnweightedCommand::printUWSummaryFile(int i) {
467 outSum.setf(ios::fixed, ios::floatfield); outSum.setf(ios::showpoint);
471 for(int a = 0; a < numComp; a++) {
472 outSum << i+1 << '\t';
473 m->mothurOut(toString(i+1) + "\t");
476 if (UWScoreSig[a][0] > (1/(float)iters)) {
477 outSum << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << setprecision(itersString.length()) << UWScoreSig[a][0] << endl;
478 cout << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << setprecision(itersString.length()) << UWScoreSig[a][0] << endl;
479 m->mothurOutJustToLog(groupComb[a] + "\t" + toString(utreeScores[a][0]) + "\t" + toString(UWScoreSig[a][0])+ "\n");
481 outSum << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << setprecision(itersString.length()) << "<" << (1/float(iters)) << endl;
482 cout << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << setprecision(itersString.length()) << "<" << (1/float(iters)) << endl;
483 m->mothurOutJustToLog(groupComb[a] + "\t" + toString(utreeScores[a][0]) + "\t<" + toString((1/float(iters))) + "\n");
486 outSum << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << "0.00" << endl;
487 cout << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << "0.00" << endl;
488 m->mothurOutJustToLog(groupComb[a] + "\t" + toString(utreeScores[a][0]) + "\t0.00\n");
493 catch(exception& e) {
494 m->errorOut(e, "UnifracUnweightedCommand", "printUWSummaryFile");
498 /***********************************************************/
499 void UnifracUnweightedCommand::createPhylipFile(int i) {
501 string phylipFileName;
502 if ((outputForm == "lt") || (outputForm == "square")) {
503 phylipFileName = outputDir + m->getSimpleName(treefile) + toString(i+1) + ".unweighted.phylip.dist";
504 outputNames.push_back(phylipFileName); outputTypes["phylip"].push_back(phylipFileName);
506 phylipFileName = outputDir + m->getSimpleName(treefile) + toString(i+1) + ".unweighted.column.dist";
507 outputNames.push_back(phylipFileName); outputTypes["column"].push_back(phylipFileName);
511 m->openOutputFile(phylipFileName, out);
513 if ((outputForm == "lt") || (outputForm == "square")) {
515 out << m->Groups.size() << endl;
518 //make matrix with scores in it
519 vector< vector<float> > dists; dists.resize(m->Groups.size());
520 for (int i = 0; i < m->Groups.size(); i++) {
521 dists[i].resize(m->Groups.size(), 0.0);
524 //flip it so you can print it
526 for (int r=0; r<m->Groups.size(); r++) {
527 for (int l = 0; l < r; l++) {
528 dists[r][l] = utreeScores[count][0];
529 dists[l][r] = utreeScores[count][0];
535 for (int r=0; r<m->Groups.size(); r++) {
537 string name = m->Groups[r];
538 if (name.length() < 10) { //pad with spaces to make compatible
539 while (name.length() < 10) { name += " "; }
542 if (outputForm == "lt") {
546 for (int l = 0; l < r; l++) { out << dists[r][l] << '\t'; }
548 }else if (outputForm == "square") {
552 for (int l = 0; l < m->Groups.size(); l++) { out << dists[r][l] << '\t'; }
556 for (int l = 0; l < r; l++) {
557 string otherName = m->Groups[l];
558 if (otherName.length() < 10) { //pad with spaces to make compatible
559 while (otherName.length() < 10) { otherName += " "; }
562 out << name << '\t' << otherName << dists[r][l] << endl;
568 catch(exception& e) {
569 m->errorOut(e, "UnifracUnweightedCommand", "createPhylipFile");
572 }/*****************************************************************/
573 int UnifracUnweightedCommand::readNamesFile() {
576 numUniquesInName = 0;
579 m->openInputFile(namefile, in);
581 string first, second;
582 map<string, string>::iterator itNames;
585 in >> first >> second; m->gobble(in);
589 itNames = m->names.find(first);
590 if (itNames == m->names.end()) {
591 m->names[first] = second;
593 //we need a list of names in your namefile to use above when removing extra seqs above so we don't remove them
594 vector<string> dupNames;
595 m->splitAtComma(second, dupNames);
597 for (int i = 0; i < dupNames.size(); i++) {
598 nameMap[dupNames[i]] = dupNames[i];
599 if ((groupfile == "") && (i != 0)) { tmap->addSeq(dupNames[i], "Group1"); }
601 }else { m->mothurOut(first + " has already been seen in namefile, disregarding names file."); m->mothurOutEndLine(); in.close(); m->names.clear(); namefile = ""; return 1; }
607 catch(exception& e) {
608 m->errorOut(e, "UnifracUnweightedCommand", "readNamesFile");
612 /***********************************************************/