2 * unifracunweightedcommand.cpp
5 * Created by Sarah Westcott on 2/9/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "unifracunweightedcommand.h"
12 //**********************************************************************************************************************
13 vector<string> UnifracUnweightedCommand::setParameters(){
15 CommandParameter ptree("tree", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptree);
16 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
17 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
18 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
19 CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
20 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
21 CommandParameter prandom("random", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(prandom);
22 CommandParameter pdistance("distance", "Multiple", "column-lt-square", "column", "", "", "",false,false); parameters.push_back(pdistance);
23 CommandParameter proot("root", "Boolean", "F", "", "", "", "",false,false); parameters.push_back(proot);
24 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
25 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
27 vector<string> myArray;
28 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
32 m->errorOut(e, "UnifracUnweightedCommand", "setParameters");
36 //**********************************************************************************************************************
37 string UnifracUnweightedCommand::getHelpString(){
39 string helpString = "";
40 helpString += "The unifrac.unweighted command parameters are tree, group, name, groups, iters, distance, processors, root and random. tree parameter is required unless you have valid current tree file.\n";
41 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 1 valid group.\n";
42 helpString += "The group names are separated by dashes. The iters parameter allows you to specify how many random trees you would like compared to your tree.\n";
43 helpString += "The distance parameter allows you to create a distance file from the results. The default is false. You may set distance to lt, square or column.\n";
44 helpString += "The random parameter allows you to shut off the comparison to random trees. The default is false, meaning compare don't your trees with randomly generated trees.\n";
45 helpString += "The root parameter allows you to include the entire root in your calculations. The default is false, meaning stop at the root for this comparision instead of the root of the entire tree.\n";
46 helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
47 helpString += "The unifrac.unweighted command should be in the following format: unifrac.unweighted(groups=yourGroups, iters=yourIters).\n";
48 helpString += "Example unifrac.unweighted(groups=A-B-C, iters=500).\n";
49 helpString += "The default value for groups is all the groups in your groupfile, and iters is 1000.\n";
50 helpString += "The unifrac.unweighted command output two files: .unweighted and .uwsummary their descriptions are in the manual.\n";
51 helpString += "Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n";
55 m->errorOut(e, "UnifracUnweightedCommand", "getHelpString");
59 //**********************************************************************************************************************
60 UnifracUnweightedCommand::UnifracUnweightedCommand(){
62 abort = true; calledHelp = true;
64 vector<string> tempOutNames;
65 outputTypes["unweighted"] = tempOutNames;
66 outputTypes["uwsummary"] = tempOutNames;
67 outputTypes["phylip"] = tempOutNames;
68 outputTypes["column"] = tempOutNames;
71 m->errorOut(e, "UnifracUnweightedCommand", "UnifracUnweightedCommand");
75 /***********************************************************/
76 UnifracUnweightedCommand::UnifracUnweightedCommand(string option) {
78 abort = false; calledHelp = false;
81 //allow user to run help
82 if(option == "help") { help(); abort = true; calledHelp = true; }
85 vector<string> myArray = setParameters();
87 OptionParser parser(option);
88 map<string,string> parameters = parser.getParameters();
89 map<string,string>::iterator it;
91 ValidParameters validParameter;
93 //check to make sure all parameters are valid for command
94 for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
95 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
98 //initialize outputTypes
99 vector<string> tempOutNames;
100 outputTypes["unweighted"] = tempOutNames;
101 outputTypes["uwsummary"] = tempOutNames;
102 outputTypes["phylip"] = tempOutNames;
103 outputTypes["column"] = tempOutNames;
105 //if the user changes the input directory command factory will send this info to us in the output parameter
106 string inputDir = validParameter.validFile(parameters, "inputdir", false);
107 if (inputDir == "not found"){ inputDir = ""; }
110 it = parameters.find("tree");
111 //user has given a template file
112 if(it != parameters.end()){
113 path = m->hasPath(it->second);
114 //if the user has not given a path then, add inputdir. else leave path alone.
115 if (path == "") { parameters["tree"] = inputDir + it->second; }
118 it = parameters.find("group");
119 //user has given a template file
120 if(it != parameters.end()){
121 path = m->hasPath(it->second);
122 //if the user has not given a path then, add inputdir. else leave path alone.
123 if (path == "") { parameters["group"] = inputDir + it->second; }
126 it = parameters.find("name");
127 //user has given a template file
128 if(it != parameters.end()){
129 path = m->hasPath(it->second);
130 //if the user has not given a path then, add inputdir. else leave path alone.
131 if (path == "") { parameters["name"] = inputDir + it->second; }
137 m->namesOfGroups.clear();
138 m->Treenames.clear();
141 //check for required parameters
142 treefile = validParameter.validFile(parameters, "tree", true);
143 if (treefile == "not open") { abort = true; }
144 else if (treefile == "not found") { //if there is a current design file, use it
145 treefile = m->getTreeFile();
146 if (treefile != "") { m->mothurOut("Using " + treefile + " as input file for the tree parameter."); m->mothurOutEndLine(); }
147 else { m->mothurOut("You have no current tree file and the tree parameter is required."); m->mothurOutEndLine(); abort = true; }
150 //check for required parameters
151 groupfile = validParameter.validFile(parameters, "group", true);
152 if (groupfile == "not open") { abort = true; }
153 else if (groupfile == "not found") { groupfile = ""; }
155 namefile = validParameter.validFile(parameters, "name", true);
156 if (namefile == "not open") { abort = true; }
157 else if (namefile == "not found") { namefile = ""; }
159 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
161 //check for optional parameter and set defaults
162 // ...at some point should added some additional type checking...
163 groups = validParameter.validFile(parameters, "groups", false);
164 if (groups == "not found") { groups = ""; }
166 m->splitAtDash(groups, Groups);
170 itersString = validParameter.validFile(parameters, "iters", false); if (itersString == "not found") { itersString = "1000"; }
171 convert(itersString, iters);
173 string temp = validParameter.validFile(parameters, "distance", false);
174 if (temp == "not found") { phylip = false; outputForm = ""; }
176 if ((temp == "lt") || (temp == "column") || (temp == "square")) { phylip = true; outputForm = temp; }
177 else { m->mothurOut("Options for distance are: lt, square, or column. Using lt."); m->mothurOutEndLine(); phylip = true; outputForm = "lt"; }
180 temp = validParameter.validFile(parameters, "random", false); if (temp == "not found") { temp = "f"; }
181 random = m->isTrue(temp);
183 temp = validParameter.validFile(parameters, "root", false); if (temp == "not found") { temp = "F"; }
184 includeRoot = m->isTrue(temp);
186 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
187 m->setProcessors(temp);
188 convert(temp, processors);
190 if (!random) { iters = 0; } //turn off random calcs
192 //if user selects distance = true and no groups it won't calc the pairwise
193 if ((phylip) && (Groups.size() == 0)) {
195 m->splitAtDash(groups, Groups);
201 catch(exception& e) {
202 m->errorOut(e, "UnifracUnweightedCommand", "UnifracUnweightedCommand");
207 /***********************************************************/
208 int UnifracUnweightedCommand::execute() {
211 if (abort == true) { if (calledHelp) { return 0; } return 2; }
213 m->setTreeFile(treefile);
215 if (groupfile != "") {
216 //read in group map info.
217 tmap = new TreeMap(groupfile);
219 }else{ //fake out by putting everyone in one group
220 Tree* tree = new Tree(treefile); delete tree; //extracts names from tree to make faked out groupmap
221 tmap = new TreeMap();
223 for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); }
226 if (namefile != "") { readNamesFile(); }
228 read = new ReadNewickTree(treefile);
229 int readOk = read->read(tmap);
231 if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete tmap; delete read; return 0; }
233 read->AssembleTrees();
234 T = read->getTrees();
237 //make sure all files match
238 //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size.
240 if (namefile != "") {
241 if (numUniquesInName == m->Treenames.size()) { numNamesInTree = nameMap.size(); }
242 else { numNamesInTree = m->Treenames.size(); }
243 }else { numNamesInTree = m->Treenames.size(); }
246 //output any names that are in group file but not in tree
247 if (numNamesInTree < tmap->getNumSeqs()) {
248 for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
249 //is that name in the tree?
251 for (int j = 0; j < m->Treenames.size(); j++) {
252 if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
256 if (m->control_pressed) {
257 delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
258 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } outputTypes.clear();
263 //then you did not find it so report it
264 if (count == m->Treenames.size()) {
265 //if it is in your namefile then don't remove
266 map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
268 if (it == nameMap.end()) {
269 m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
270 tmap->removeSeq(tmap->namesOfSeqs[i]);
271 i--; //need this because removeSeq removes name from namesOfSeqs
277 sumFile = outputDir + m->getSimpleName(treefile) + ".uwsummary";
278 outputNames.push_back(sumFile); outputTypes["uwsummary"].push_back(sumFile);
279 m->openOutputFile(sumFile, outSum);
281 util = new SharedUtil();
282 util->setGroups(m->Groups, tmap->namesOfGroups, allGroups, numGroups, "unweighted"); //sets the groups the user wants to analyze
283 util->getCombos(groupComb, m->Groups, numComp);
286 if (numGroups == 1) { numComp++; groupComb.push_back(allGroups); }
288 unweighted = new Unweighted(tmap, includeRoot);
290 int start = time(NULL);
292 userData.resize(numComp,0); //data[0] = unweightedscore
293 randomData.resize(numComp,0); //data[0] = unweightedscore
294 //create new tree with same num nodes and leaves as users
296 if (numComp < processors) { processors = numComp; }
298 outSum << "Tree#" << '\t' << "Groups" << '\t' << "UWScore" <<'\t' << "UWSig" << endl;
299 m->mothurOut("Tree#\tGroups\tUWScore\tUWSig"); m->mothurOutEndLine();
301 //get pscores for users trees
302 for (int i = 0; i < T.size(); i++) {
303 if (m->control_pressed) {
304 delete tmap; delete unweighted;
305 for (int i = 0; i < T.size(); i++) { delete T[i]; }
307 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }
314 output = new ColumnFile(outputDir + m->getSimpleName(treefile) + toString(i+1) + ".unweighted", itersString);
315 outputNames.push_back(outputDir + m->getSimpleName(treefile) + toString(i+1) + ".unweighted");
316 outputTypes["unweighted"].push_back(outputDir + m->getSimpleName(treefile) + toString(i+1) + ".unweighted");
320 //get unweighted for users tree
321 rscoreFreq.resize(numComp);
322 rCumul.resize(numComp);
323 utreeScores.resize(numComp);
324 UWScoreSig.resize(numComp);
326 userData = unweighted->getValues(T[i], processors, outputDir); //userData[0] = unweightedscore
328 if (m->control_pressed) { delete tmap; delete unweighted;
329 for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }return 0; }
331 //output scores for each combination
332 for(int k = 0; k < numComp; k++) {
334 utreeScores[k].push_back(userData[k]);
336 //add users score to validscores
337 validScores[userData[k]] = userData[k];
340 //get unweighted scores for random trees - if random is false iters = 0
341 for (int j = 0; j < iters; j++) {
343 //we need a different getValues because when we swap the labels we only want to swap those in each pairwise comparison
344 randomData = unweighted->getValues(T[i], "", "", processors, outputDir);
346 if (m->control_pressed) { delete tmap; delete unweighted;
347 for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
349 for(int k = 0; k < numComp; k++) {
350 //add trees unweighted score to map of scores
351 map<float,float>::iterator it = rscoreFreq[k].find(randomData[k]);
352 if (it != rscoreFreq[k].end()) {//already have that score
353 rscoreFreq[k][randomData[k]]++;
354 }else{//first time we have seen this score
355 rscoreFreq[k][randomData[k]] = 1;
358 //add randoms score to validscores
359 validScores[randomData[k]] = randomData[k];
363 // m->mothurOut("Iter: " + toString(j+1)); m->mothurOutEndLine();
366 for(int a = 0; a < numComp; a++) {
367 float rcumul = 1.0000;
370 //this loop fills the cumulative maps and put 0.0000 in the score freq map to make it easier to print.
371 for (map<float,float>::iterator it = validScores.begin(); it != validScores.end(); it++) {
372 //make rscoreFreq map and rCumul
373 map<float,float>::iterator it2 = rscoreFreq[a].find(it->first);
374 rCumul[a][it->first] = rcumul;
375 //get percentage of random trees with that info
376 if (it2 != rscoreFreq[a].end()) { rscoreFreq[a][it->first] /= iters; rcumul-= it2->second; }
377 else { rscoreFreq[a][it->first] = 0.0000; } //no random trees with that score
379 UWScoreSig[a].push_back(rCumul[a][userData[a]]);
380 }else { UWScoreSig[a].push_back(0.0); }
384 if (m->control_pressed) { delete tmap; delete unweighted;
385 for (int i = 0; i < T.size(); i++) { delete T[i]; }if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
388 printUWSummaryFile(i);
389 if (random) { printUnweightedFile(); delete output; }
390 if (phylip) { createPhylipFile(i); }
402 delete tmap; delete unweighted;
403 for (int i = 0; i < T.size(); i++) { delete T[i]; }
405 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
407 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run unifrac.unweighted."); m->mothurOutEndLine();
409 //set phylip file as new current phylipfile
411 itTypes = outputTypes.find("phylip");
412 if (itTypes != outputTypes.end()) {
413 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setPhylipFile(current); }
416 //set column file as new current columnfile
417 itTypes = outputTypes.find("column");
418 if (itTypes != outputTypes.end()) {
419 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setColumnFile(current); }
422 m->mothurOutEndLine();
423 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
424 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
425 m->mothurOutEndLine();
430 catch(exception& e) {
431 m->errorOut(e, "UnifracUnweightedCommand", "execute");
435 /***********************************************************/
436 void UnifracUnweightedCommand::printUnweightedFile() {
441 tags.push_back("Score");
442 tags.push_back("RandFreq"); tags.push_back("RandCumul");
444 for(int a = 0; a < numComp; a++) {
445 output->initFile(groupComb[a], tags);
447 for (map<float,float>::iterator it = validScores.begin(); it != validScores.end(); it++) {
448 data.push_back(it->first); data.push_back(rscoreFreq[a][it->first]); data.push_back(rCumul[a][it->first]);
449 output->output(data);
455 catch(exception& e) {
456 m->errorOut(e, "UnifracUnweightedCommand", "printUnweightedFile");
461 /***********************************************************/
462 void UnifracUnweightedCommand::printUWSummaryFile(int i) {
466 outSum.setf(ios::fixed, ios::floatfield); outSum.setf(ios::showpoint);
470 for(int a = 0; a < numComp; a++) {
471 outSum << i+1 << '\t';
472 m->mothurOut(toString(i+1) + "\t");
475 if (UWScoreSig[a][0] > (1/(float)iters)) {
476 outSum << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << setprecision(itersString.length()) << UWScoreSig[a][0] << endl;
477 cout << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << setprecision(itersString.length()) << UWScoreSig[a][0] << endl;
478 m->mothurOutJustToLog(groupComb[a] + "\t" + toString(utreeScores[a][0]) + "\t" + toString(UWScoreSig[a][0])+ "\n");
480 outSum << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << setprecision(itersString.length()) << "<" << (1/float(iters)) << endl;
481 cout << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << setprecision(itersString.length()) << "<" << (1/float(iters)) << endl;
482 m->mothurOutJustToLog(groupComb[a] + "\t" + toString(utreeScores[a][0]) + "\t<" + toString((1/float(iters))) + "\n");
485 outSum << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << "0.00" << endl;
486 cout << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << "0.00" << endl;
487 m->mothurOutJustToLog(groupComb[a] + "\t" + toString(utreeScores[a][0]) + "\t0.00\n");
492 catch(exception& e) {
493 m->errorOut(e, "UnifracUnweightedCommand", "printUWSummaryFile");
497 /***********************************************************/
498 void UnifracUnweightedCommand::createPhylipFile(int i) {
500 string phylipFileName;
501 if ((outputForm == "lt") || (outputForm == "square")) {
502 phylipFileName = outputDir + m->getSimpleName(treefile) + toString(i+1) + ".unweighted.phylip.dist";
503 outputNames.push_back(phylipFileName); outputTypes["phylip"].push_back(phylipFileName);
505 phylipFileName = outputDir + m->getSimpleName(treefile) + toString(i+1) + ".unweighted.column.dist";
506 outputNames.push_back(phylipFileName); outputTypes["column"].push_back(phylipFileName);
510 m->openOutputFile(phylipFileName, out);
512 if ((outputForm == "lt") || (outputForm == "square")) {
514 out << m->Groups.size() << endl;
517 //make matrix with scores in it
518 vector< vector<float> > dists; dists.resize(m->Groups.size());
519 for (int i = 0; i < m->Groups.size(); i++) {
520 dists[i].resize(m->Groups.size(), 0.0);
523 //flip it so you can print it
525 for (int r=0; r<m->Groups.size(); r++) {
526 for (int l = 0; l < r; l++) {
527 dists[r][l] = utreeScores[count][0];
528 dists[l][r] = utreeScores[count][0];
534 for (int r=0; r<m->Groups.size(); r++) {
536 string name = m->Groups[r];
537 if (name.length() < 10) { //pad with spaces to make compatible
538 while (name.length() < 10) { name += " "; }
541 if (outputForm == "lt") {
545 for (int l = 0; l < r; l++) { out << dists[r][l] << '\t'; }
547 }else if (outputForm == "square") {
551 for (int l = 0; l < m->Groups.size(); l++) { out << dists[r][l] << '\t'; }
555 for (int l = 0; l < r; l++) {
556 string otherName = m->Groups[l];
557 if (otherName.length() < 10) { //pad with spaces to make compatible
558 while (otherName.length() < 10) { otherName += " "; }
561 out << name << '\t' << otherName << dists[r][l] << endl;
567 catch(exception& e) {
568 m->errorOut(e, "UnifracUnweightedCommand", "createPhylipFile");
571 }/*****************************************************************/
572 int UnifracUnweightedCommand::readNamesFile() {
575 numUniquesInName = 0;
578 m->openInputFile(namefile, in);
580 string first, second;
581 map<string, string>::iterator itNames;
584 in >> first >> second; m->gobble(in);
588 itNames = m->names.find(first);
589 if (itNames == m->names.end()) {
590 m->names[first] = second;
592 //we need a list of names in your namefile to use above when removing extra seqs above so we don't remove them
593 vector<string> dupNames;
594 m->splitAtComma(second, dupNames);
596 for (int i = 0; i < dupNames.size(); i++) {
597 nameMap[dupNames[i]] = dupNames[i];
598 if ((groupfile == "") && (i != 0)) { tmap->addSeq(dupNames[i], "Group1"); }
600 }else { m->mothurOut(first + " has already been seen in namefile, disregarding names file."); m->mothurOutEndLine(); in.close(); m->names.clear(); namefile = ""; return 1; }
606 catch(exception& e) {
607 m->errorOut(e, "UnifracUnweightedCommand", "readNamesFile");
611 /***********************************************************/