2 * unifracweightedcommand.cpp
5 * Created by Sarah Westcott on 2/9/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "unifracweightedcommand.h"
11 #include "consensus.h"
12 #include "subsample.h"
14 //**********************************************************************************************************************
15 vector<string> UnifracWeightedCommand::setParameters(){
17 CommandParameter ptree("tree", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptree);
18 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
19 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
20 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
21 CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
22 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
23 CommandParameter psubsample("subsample", "String", "", "", "", "", "",false,false); parameters.push_back(psubsample);
24 CommandParameter pconsensus("consensus", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pconsensus);
25 CommandParameter prandom("random", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(prandom);
26 CommandParameter pdistance("distance", "Multiple", "column-lt-square", "column", "", "", "",false,false); parameters.push_back(pdistance);
27 CommandParameter proot("root", "Boolean", "F", "", "", "", "",false,false); parameters.push_back(proot);
28 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
29 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
31 vector<string> myArray;
32 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
36 m->errorOut(e, "UnifracWeightedCommand", "setParameters");
40 //**********************************************************************************************************************
41 string UnifracWeightedCommand::getHelpString(){
43 string helpString = "";
44 helpString += "The unifrac.weighted command parameters are tree, group, name, groups, iters, distance, processors, root, subsample, consensus and random. tree parameter is required unless you have valid current tree file.\n";
45 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 2 valid groups.\n";
46 helpString += "The group names are separated by dashes. The iters parameter allows you to specify how many random trees you would like compared to your tree.\n";
47 helpString += "The distance parameter allows you to create a distance file from the results. The default is false.\n";
48 helpString += "The random parameter allows you to shut off the comparison to random trees. The default is false, meaning don't compare your trees with randomly generated trees.\n";
49 helpString += "The root parameter allows you to include the entire root in your calculations. The default is false, meaning stop at the root for this comparision instead of the root of the entire tree.\n";
50 helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
51 helpString += "The subsample parameter allows you to enter the size pergroup of the sample or you can set subsample=T and mothur will use the size of your smallest group. The subsample parameter may only be used with a group file.\n";
52 helpString += "The consensus parameter allows you to indicate you would like trees built from distance matrices created with the results, as well as a consensus tree built from these trees. Default=F.\n";
53 helpString += "The unifrac.weighted command should be in the following format: unifrac.weighted(groups=yourGroups, iters=yourIters).\n";
54 helpString += "Example unifrac.weighted(groups=A-B-C, iters=500).\n";
55 helpString += "The default value for groups is all the groups in your groupfile, and iters is 1000.\n";
56 helpString += "The unifrac.weighted command output two files: .weighted and .wsummary their descriptions are in the manual.\n";
57 helpString += "Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n";
61 m->errorOut(e, "UnifracWeightedCommand", "getHelpString");
65 //**********************************************************************************************************************
66 UnifracWeightedCommand::UnifracWeightedCommand(){
68 abort = true; calledHelp = true;
70 vector<string> tempOutNames;
71 outputTypes["weighted"] = tempOutNames;
72 outputTypes["wsummary"] = tempOutNames;
73 outputTypes["phylip"] = tempOutNames;
74 outputTypes["column"] = tempOutNames;
75 outputTypes["tree"] = tempOutNames;
78 m->errorOut(e, "UnifracWeightedCommand", "UnifracWeightedCommand");
83 /***********************************************************/
84 UnifracWeightedCommand::UnifracWeightedCommand(string option) {
86 abort = false; calledHelp = false;
88 //allow user to run help
89 if(option == "help") { help(); abort = true; calledHelp = true; }
90 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
93 vector<string> myArray = setParameters();
95 OptionParser parser(option);
96 map<string,string> parameters=parser.getParameters();
97 map<string,string>::iterator it;
99 ValidParameters validParameter;
101 //check to make sure all parameters are valid for command
102 for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
103 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
106 //initialize outputTypes
107 vector<string> tempOutNames;
108 outputTypes["weighted"] = tempOutNames;
109 outputTypes["wsummary"] = tempOutNames;
110 outputTypes["phylip"] = tempOutNames;
111 outputTypes["column"] = tempOutNames;
112 outputTypes["tree"] = tempOutNames;
114 //if the user changes the input directory command factory will send this info to us in the output parameter
115 string inputDir = validParameter.validFile(parameters, "inputdir", false);
116 if (inputDir == "not found"){ inputDir = ""; }
119 it = parameters.find("tree");
120 //user has given a template file
121 if(it != parameters.end()){
122 path = m->hasPath(it->second);
123 //if the user has not given a path then, add inputdir. else leave path alone.
124 if (path == "") { parameters["tree"] = inputDir + it->second; }
127 it = parameters.find("group");
128 //user has given a template file
129 if(it != parameters.end()){
130 path = m->hasPath(it->second);
131 //if the user has not given a path then, add inputdir. else leave path alone.
132 if (path == "") { parameters["group"] = inputDir + it->second; }
135 it = parameters.find("name");
136 //user has given a template file
137 if(it != parameters.end()){
138 path = m->hasPath(it->second);
139 //if the user has not given a path then, add inputdir. else leave path alone.
140 if (path == "") { parameters["name"] = inputDir + it->second; }
147 m->Treenames.clear();
150 //check for required parameters
151 treefile = validParameter.validFile(parameters, "tree", true);
152 if (treefile == "not open") { treefile = ""; abort = true; }
153 else if (treefile == "not found") { //if there is a current design file, use it
154 treefile = m->getTreeFile();
155 if (treefile != "") { m->mothurOut("Using " + treefile + " as input file for the tree parameter."); m->mothurOutEndLine(); }
156 else { m->mothurOut("You have no current tree file and the tree parameter is required."); m->mothurOutEndLine(); abort = true; }
157 }else { m->setTreeFile(treefile); }
159 //check for required parameters
160 groupfile = validParameter.validFile(parameters, "group", true);
161 if (groupfile == "not open") { abort = true; }
162 else if (groupfile == "not found") { groupfile = ""; }
163 else { m->setGroupFile(groupfile); }
165 namefile = validParameter.validFile(parameters, "name", true);
166 if (namefile == "not open") { namefile = ""; abort = true; }
167 else if (namefile == "not found") { namefile = ""; }
168 else { m->setNameFile(namefile); }
170 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(treefile); }
173 //check for optional parameter and set defaults
174 // ...at some point should added some additional type checking...
175 groups = validParameter.validFile(parameters, "groups", false);
176 if (groups == "not found") { groups = ""; }
178 m->splitAtDash(groups, Groups);
179 m->setGroups(Groups);
182 itersString = validParameter.validFile(parameters, "iters", false); if (itersString == "not found") { itersString = "1000"; }
183 m->mothurConvert(itersString, iters);
185 string temp = validParameter.validFile(parameters, "distance", false);
186 if (temp == "not found") { phylip = false; outputForm = ""; }
188 if ((temp == "lt") || (temp == "column") || (temp == "square")) { phylip = true; outputForm = temp; }
189 else { m->mothurOut("Options for distance are: lt, square, or column. Using lt."); m->mothurOutEndLine(); phylip = true; outputForm = "lt"; }
192 temp = validParameter.validFile(parameters, "random", false); if (temp == "not found") { temp = "F"; }
193 random = m->isTrue(temp);
195 temp = validParameter.validFile(parameters, "root", false); if (temp == "not found") { temp = "F"; }
196 includeRoot = m->isTrue(temp);
198 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
199 m->setProcessors(temp);
200 m->mothurConvert(temp, processors);
202 temp = validParameter.validFile(parameters, "subsample", false); if (temp == "not found") { temp = "F"; }
203 if (m->isNumeric1(temp)) { m->mothurConvert(temp, subsampleSize); subsample = true; }
205 if (m->isTrue(temp)) { subsample = true; subsampleSize = -1; } //we will set it to smallest group later
206 else { subsample = false; }
209 if (!subsample) { subsampleIters = 0; }
210 else { subsampleIters = iters; }
212 temp = validParameter.validFile(parameters, "consensus", false); if (temp == "not found") { temp = "F"; }
213 consensus = m->isTrue(temp);
215 if (subsample && random) { m->mothurOut("[ERROR]: random must be false, if subsample=t.\n"); abort=true; }
216 if (subsample && (groupfile == "")) { m->mothurOut("[ERROR]: if subsample=t, a group file must be provided.\n"); abort=true; }
217 if (subsample && (!phylip)) { phylip=true; outputForm = "lt"; }
218 if (consensus && (!subsample)) { m->mothurOut("[ERROR]: you cannot use consensus without subsample.\n"); abort=true; }
220 if (namefile == "") {
221 vector<string> files; files.push_back(treefile);
222 parser.getNameFile(files);
228 catch(exception& e) {
229 m->errorOut(e, "UnifracWeightedCommand", "UnifracWeightedCommand");
233 /***********************************************************/
234 int UnifracWeightedCommand::execute() {
237 if (abort == true) { if (calledHelp) { return 0; } return 2; }
239 m->setTreeFile(treefile);
241 readTrees(); if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } return 0; }
243 sumFile = outputDir + m->getSimpleName(treefile) + ".wsummary";
244 m->openOutputFile(sumFile, outSum);
245 outputNames.push_back(sumFile); outputTypes["wsummary"].push_back(sumFile);
248 string s; //to make work with setgroups
249 Groups = m->getGroups();
250 vector<string> nameGroups = tmap->getNamesOfGroups();
251 util.setGroups(Groups, nameGroups, s, numGroups, "weighted"); //sets the groups the user wants to analyze
252 m->setGroups(Groups);
254 if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } return 0; }
256 Weighted weighted(tmap, includeRoot);
258 int start = time(NULL);
262 //user has not set size, set size = smallest samples size
263 if (subsampleSize == -1) {
264 vector<string> temp; temp.push_back(Groups[0]);
265 subsampleSize = (tmap->getNamesSeqs(temp)).size(); //num in first group
266 for (int i = 1; i < Groups.size(); i++) {
267 temp.clear(); temp.push_back(Groups[i]);
268 int thisSize = (tmap->getNamesSeqs(temp)).size();
269 if (thisSize < subsampleSize) { subsampleSize = thisSize; }
271 m->mothurOut("\nSetting subsample size to " + toString(subsampleSize) + ".\n\n");
272 }else { //eliminate any too small groups
273 vector<string> newGroups = Groups;
275 for (int i = 0; i < newGroups.size(); i++) {
276 vector<string> thisGroup; thisGroup.push_back(newGroups[i]);
277 vector<string> thisGroupsSeqs = tmap->getNamesSeqs(thisGroup);
278 int thisSize = thisGroupsSeqs.size();
280 if (thisSize >= subsampleSize) { Groups.push_back(newGroups[i]); }
281 else { m->mothurOut("You have selected a size that is larger than "+newGroups[i]+" number of sequences, removing "+newGroups[i]+".\n"); }
283 m->setGroups(Groups);
287 //here in case some groups are removed by subsample
288 util.getCombos(groupComb, Groups, numComp);
290 if (numComp < processors) { processors = numComp; }
292 if (consensus && (numComp < 2)) { m->mothurOut("consensus can only be used with numComparisions greater than 1, setting consensus=f.\n"); consensus=false; }
294 //get weighted scores for users trees
295 for (int i = 0; i < T.size(); i++) {
297 if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
300 rScores.resize(numComp); //data[0] = weightedscore AB, data[1] = weightedscore AC...
301 uScores.resize(numComp); //data[0] = weightedscore AB, data[1] = weightedscore AC...
303 vector<double> userData; userData.resize(numComp,0); //weighted score info for user tree. data[0] = weightedscore AB, data[1] = weightedscore AC...
304 vector<double> randomData; randomData.resize(numComp,0); //weighted score info for random trees. data[0] = weightedscore AB, data[1] = weightedscore AC...
307 output = new ColumnFile(outputDir + m->getSimpleName(treefile) + toString(i+1) + ".weighted", itersString);
308 outputNames.push_back(outputDir + m->getSimpleName(treefile) + toString(i+1) + ".weighted");
309 outputTypes["weighted"].push_back(outputDir + m->getSimpleName(treefile) + toString(i+1) + ".weighted");
312 userData = weighted.getValues(T[i], processors, outputDir); //userData[0] = weightedscore
313 if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
316 for (int s=0; s<numComp; s++) {
317 //add users score to vector of user scores
318 uScores[s].push_back(userData[s]);
319 //save users tree score for summary file
320 utreeScores.push_back(userData[s]);
323 if (random) { runRandomCalcs(T[i], userData); }
331 vector< vector<double> > calcDistsTotals; //each iter, each groupCombos dists. this will be used to make .dist files
332 for (int thisIter = 0; thisIter < subsampleIters; thisIter++) { //subsampleIters=0, if subsample=f.
334 if (m->control_pressed) { break; }
336 //copy to preserve old one - would do this in subsample but tree needs it and memory cleanup becomes messy.
337 TreeMap* newTmap = new TreeMap();
338 newTmap->getCopy(tmap);
341 Tree* subSampleTree = sample.getSample(T[i], newTmap, nameMap, subsampleSize);
343 //call new weighted function
344 vector<double> iterData; iterData.resize(numComp,0);
345 Weighted thisWeighted(newTmap, includeRoot);
346 iterData = thisWeighted.getValues(subSampleTree, processors, outputDir); //userData[0] = weightedscore
348 //save data to make ave dist, std dist
349 calcDistsTotals.push_back(iterData);
352 delete subSampleTree;
355 if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } if (random) { delete output; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
357 if (subsample) { getAverageSTDMatrices(calcDistsTotals, i); }
358 if (consensus) { getConsensusTrees(calcDistsTotals, i); }
362 if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
364 if (phylip) { createPhylipFile(); }
368 //clear out users groups
371 for (int i = 0; i < T.size(); i++) { delete T[i]; }
373 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
375 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run unifrac.weighted."); m->mothurOutEndLine();
377 //set phylip file as new current phylipfile
379 itTypes = outputTypes.find("phylip");
380 if (itTypes != outputTypes.end()) {
381 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setPhylipFile(current); }
384 //set column file as new current columnfile
385 itTypes = outputTypes.find("column");
386 if (itTypes != outputTypes.end()) {
387 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setColumnFile(current); }
390 m->mothurOutEndLine();
391 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
392 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
393 m->mothurOutEndLine();
398 catch(exception& e) {
399 m->errorOut(e, "UnifracWeightedCommand", "execute");
403 /**************************************************************************************************/
404 int UnifracWeightedCommand::getAverageSTDMatrices(vector< vector<double> >& dists, int treeNum) {
406 //we need to find the average distance and standard deviation for each groups distance
409 vector<double> averages; averages.resize(numComp, 0);
410 for (int thisIter = 0; thisIter < subsampleIters; thisIter++) {
411 for (int i = 0; i < dists[thisIter].size(); i++) {
412 averages[i] += dists[thisIter][i];
417 for (int i = 0; i < averages.size(); i++) { averages[i] /= (float) subsampleIters; }
419 //find standard deviation
420 vector<double> stdDev; stdDev.resize(numComp, 0);
422 for (int thisIter = 0; thisIter < iters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each
423 for (int j = 0; j < dists[thisIter].size(); j++) {
424 stdDev[j] += ((dists[thisIter][j] - averages[j]) * (dists[thisIter][j] - averages[j]));
427 for (int i = 0; i < stdDev.size(); i++) {
428 stdDev[i] /= (float) subsampleIters;
429 stdDev[i] = sqrt(stdDev[i]);
432 //make matrix with scores in it
433 vector< vector<double> > avedists; avedists.resize(m->getNumGroups());
434 for (int i = 0; i < m->getNumGroups(); i++) {
435 avedists[i].resize(m->getNumGroups(), 0.0);
438 //make matrix with scores in it
439 vector< vector<double> > stddists; stddists.resize(m->getNumGroups());
440 for (int i = 0; i < m->getNumGroups(); i++) {
441 stddists[i].resize(m->getNumGroups(), 0.0);
444 //flip it so you can print it
446 for (int r=0; r<m->getNumGroups(); r++) {
447 for (int l = 0; l < r; l++) {
448 avedists[r][l] = averages[count];
449 avedists[l][r] = averages[count];
450 stddists[r][l] = stdDev[count];
451 stddists[l][r] = stdDev[count];
456 string aveFileName = outputDir + m->getSimpleName(treefile) + toString(treeNum+1) + ".weighted.ave.dist";
457 outputNames.push_back(aveFileName); outputTypes["phylip"].push_back(aveFileName);
460 m->openOutputFile(aveFileName, out);
462 string stdFileName = outputDir + m->getSimpleName(treefile) + toString(treeNum+1) + ".weighted.std.dist";
463 outputNames.push_back(stdFileName); outputTypes["phylip"].push_back(stdFileName);
466 m->openOutputFile(stdFileName, outStd);
468 if ((outputForm == "lt") || (outputForm == "square")) {
470 out << m->getNumGroups() << endl;
471 outStd << m->getNumGroups() << endl;
475 for (int r=0; r<m->getNumGroups(); r++) {
477 string name = (m->getGroups())[r];
478 if (name.length() < 10) { //pad with spaces to make compatible
479 while (name.length() < 10) { name += " "; }
482 if (outputForm == "lt") {
484 outStd << name << '\t';
487 for (int l = 0; l < r; l++) { out << avedists[r][l] << '\t'; outStd << stddists[r][l] << '\t';}
488 out << endl; outStd << endl;
489 }else if (outputForm == "square") {
491 outStd << name << '\t';
494 for (int l = 0; l < m->getNumGroups(); l++) { out << avedists[r][l] << '\t'; outStd << stddists[r][l] << '\t'; }
495 out << endl; outStd << endl;
498 for (int l = 0; l < r; l++) {
499 string otherName = (m->getGroups())[l];
500 if (otherName.length() < 10) { //pad with spaces to make compatible
501 while (otherName.length() < 10) { otherName += " "; }
504 out << name << '\t' << otherName << avedists[r][l] << endl;
505 outStd << name << '\t' << otherName << stddists[r][l] << endl;
514 catch(exception& e) {
515 m->errorOut(e, "UnifracWeightedCommand", "getAverageSTDMatrices");
520 /**************************************************************************************************/
521 int UnifracWeightedCommand::getConsensusTrees(vector< vector<double> >& dists, int treeNum) {
524 //used in tree constructor
527 //create treemap class from groupmap for tree class to use
528 TreeMap* newTmap = new TreeMap();
529 newTmap->makeSim(m->getGroups());
531 //clear old tree names if any
532 m->Treenames.clear();
534 //fills globaldatas tree names
535 m->Treenames = m->getGroups();
537 vector<Tree*> newTrees = buildTrees(dists, treeNum, newTmap); //also creates .all.tre file containing the trees created
539 if (m->control_pressed) { delete newTmap; return 0; }
542 Tree* conTree = con.getTree(newTrees, newTmap);
544 //create a new filename
545 string conFile = outputDir + m->getRootName(m->getSimpleName(treefile)) + toString(treeNum+1) + ".weighted.cons.tre";
546 outputNames.push_back(conFile); outputTypes["tree"].push_back(conFile);
548 m->openOutputFile(conFile, outTree);
550 if (conTree != NULL) { conTree->print(outTree, "boot"); delete conTree; }
557 catch(exception& e) {
558 m->errorOut(e, "UnifracWeightedCommand", "getConsensusTrees");
562 /**************************************************************************************************/
564 vector<Tree*> UnifracWeightedCommand::buildTrees(vector< vector<double> >& dists, int treeNum, TreeMap* mytmap) {
569 //create a new filename
570 string outputFile = outputDir + m->getRootName(m->getSimpleName(treefile)) + toString(treeNum+1) + ".weighted.all.tre";
571 outputNames.push_back(outputFile); outputTypes["tree"].push_back(outputFile);
574 m->openOutputFile(outputFile, outAll);
577 for (int i = 0; i < dists.size(); i++) { //dists[0] are the dists for the first subsampled tree.
579 if (m->control_pressed) { break; }
581 //make matrix with scores in it
582 vector< vector<double> > sims; sims.resize(m->getNumGroups());
583 for (int j = 0; j < m->getNumGroups(); j++) {
584 sims[j].resize(m->getNumGroups(), 0.0);
588 for (int r=0; r<m->getNumGroups(); r++) {
589 for (int l = 0; l < r; l++) {
590 double sim = -(dists[i][count]-1.0);
598 Tree* tempTree = new Tree(mytmap, sims);
599 tempTree->assembleTree();
601 trees.push_back(tempTree);
604 tempTree->print(outAll);
609 if (m->control_pressed) { for (int i = 0; i < trees.size(); i++) { delete trees[i]; trees[i] = NULL; } m->mothurRemove(outputFile); }
613 catch(exception& e) {
614 m->errorOut(e, "UnifracWeightedCommand", "buildTrees");
618 /**************************************************************************************************/
620 int UnifracWeightedCommand::readTrees() {
623 if (groupfile != "") {
624 //read in group map info.
625 tmap = new TreeMap(groupfile);
627 }else{ //fake out by putting everyone in one group
628 Tree* tree = new Tree(treefile); delete tree; //extracts names from tree to make faked out groupmap
629 tmap = new TreeMap();
631 for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); }
634 if (namefile != "") { readNamesFile(); }
636 read = new ReadNewickTree(treefile);
637 int readOk = read->read(tmap);
639 if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete tmap; delete read; return 0; }
641 read->AssembleTrees();
642 T = read->getTrees();
645 //make sure all files match
646 //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size.
648 if (namefile != "") {
649 if (numUniquesInName == m->Treenames.size()) { numNamesInTree = nameMap.size(); }
650 else { numNamesInTree = m->Treenames.size(); }
651 }else { numNamesInTree = m->Treenames.size(); }
654 //output any names that are in group file but not in tree
655 if (numNamesInTree < tmap->getNumSeqs()) {
656 for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
657 //is that name in the tree?
659 for (int j = 0; j < m->Treenames.size(); j++) {
660 if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
664 if (m->control_pressed) {
665 delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
666 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
671 //then you did not find it so report it
672 if (count == m->Treenames.size()) {
673 //if it is in your namefile then don't remove
674 map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
676 if (it == nameMap.end()) {
677 m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
678 tmap->removeSeq(tmap->namesOfSeqs[i]);
679 i--; //need this because removeSeq removes name from namesOfSeqs
687 catch(exception& e) {
688 m->errorOut(e, "UnifracWeightedCommand", "readTrees");
692 /**************************************************************************************************/
694 int UnifracWeightedCommand::runRandomCalcs(Tree* thisTree, vector<double> usersScores) {
697 //calculate number of comparisons i.e. with groups A,B,C = AB, AC, BC = 3;
698 vector< vector<string> > namesOfGroupCombos;
699 for (int a=0; a<numGroups; a++) {
700 for (int l = 0; l < a; l++) {
701 vector<string> groups; groups.push_back((m->getGroups())[a]); groups.push_back((m->getGroups())[l]);
702 namesOfGroupCombos.push_back(groups);
708 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
710 int numPairs = namesOfGroupCombos.size();
711 int numPairsPerProcessor = numPairs / processors;
713 for (int i = 0; i < processors; i++) {
714 int startPos = i * numPairsPerProcessor;
715 if(i == processors - 1){
716 numPairsPerProcessor = numPairs - i * numPairsPerProcessor;
718 lines.push_back(linePair(startPos, numPairsPerProcessor));
724 //get scores for random trees
725 for (int j = 0; j < iters; j++) {
727 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
729 driver(thisTree, namesOfGroupCombos, 0, namesOfGroupCombos.size(), rScores);
731 createProcesses(thisTree, namesOfGroupCombos, rScores);
734 driver(T[i], namesOfGroupCombos, 0, namesOfGroupCombos.size(), rScores);
737 if (m->control_pressed) { delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } delete output; outSum.close(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
740 // m->mothurOut("Iter: " + toString(j+1)); m->mothurOutEndLine();
744 //find the signifigance of the score for summary file
745 for (int f = 0; f < numComp; f++) {
747 sort(rScores[f].begin(), rScores[f].end());
749 //the index of the score higher than yours is returned
750 //so if you have 1000 random trees the index returned is 100
751 //then there are 900 trees with a score greater then you.
752 //giving you a signifigance of 0.900
753 int index = findIndex(usersScores[f], f); if (index == -1) { m->mothurOut("error in UnifracWeightedCommand"); m->mothurOutEndLine(); exit(1); } //error code
755 //the signifigance is the number of trees with the users score or higher
756 WScoreSig.push_back((iters-index)/(float)iters);
759 //out << "Tree# " << i << endl;
760 calculateFreqsCumuls();
767 catch(exception& e) {
768 m->errorOut(e, "UnifracWeightedCommand", "runRandomCalcs");
772 /**************************************************************************************************/
774 int UnifracWeightedCommand::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos, vector< vector<double> >& scores) {
776 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
778 vector<int> processIDS;
782 //loop through and create all the processes you want
783 while (process != processors) {
787 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
790 driver(t, namesOfGroupCombos, lines[process].start, lines[process].num, scores);
792 //pass numSeqs to parent
794 string tempFile = outputDir + toString(getpid()) + ".weightedcommand.results.temp";
795 m->openOutputFile(tempFile, out);
796 for (int i = lines[process].start; i < (lines[process].start + lines[process].num); i++) { out << scores[i][(scores[i].size()-1)] << '\t'; } out << endl;
801 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
802 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
807 driver(t, namesOfGroupCombos, lines[0].start, lines[0].num, scores);
809 //force parent to wait until all the processes are done
810 for (int i=0;i<(processors-1);i++) {
811 int temp = processIDS[i];
815 //get data created by processes
816 for (int i=0;i<(processors-1);i++) {
819 string s = outputDir + toString(processIDS[i]) + ".weightedcommand.results.temp";
820 m->openInputFile(s, in);
823 for (int j = lines[(i+1)].start; j < (lines[(i+1)].start + lines[(i+1)].num); j++) { in >> tempScore; scores[j].push_back(tempScore); }
831 catch(exception& e) {
832 m->errorOut(e, "UnifracWeightedCommand", "createProcesses");
837 /**************************************************************************************************/
838 int UnifracWeightedCommand::driver(Tree* t, vector< vector<string> > namesOfGroupCombos, int start, int num, vector< vector<double> >& scores) {
840 Tree* randT = new Tree(tmap);
842 Weighted weighted(tmap, includeRoot);
844 for (int h = start; h < (start+num); h++) {
846 if (m->control_pressed) { return 0; }
848 //initialize weighted score
849 string groupA = namesOfGroupCombos[h][0];
850 string groupB = namesOfGroupCombos[h][1];
855 //create a random tree with same topology as T[i], but different labels
856 randT->assembleRandomUnifracTree(groupA, groupB);
858 if (m->control_pressed) { delete randT; return 0; }
860 //get wscore of random tree
861 EstOutput randomData = weighted.getValues(randT, groupA, groupB);
863 if (m->control_pressed) { delete randT; return 0; }
866 scores[h].push_back(randomData[0]);
874 catch(exception& e) {
875 m->errorOut(e, "UnifracWeightedCommand", "driver");
879 /***********************************************************/
880 void UnifracWeightedCommand::printWeightedFile() {
884 tags.push_back("Score"); tags.push_back("RandFreq"); tags.push_back("RandCumul");
886 for(int a = 0; a < numComp; a++) {
887 output->initFile(groupComb[a], tags);
889 for (map<float,float>::iterator it = validScores.begin(); it != validScores.end(); it++) {
890 data.push_back(it->first); data.push_back(rScoreFreq[a][it->first]); data.push_back(rCumul[a][it->first]);
891 output->output(data);
897 catch(exception& e) {
898 m->errorOut(e, "UnifracWeightedCommand", "printWeightedFile");
904 /***********************************************************/
905 void UnifracWeightedCommand::printWSummaryFile() {
908 outSum << "Tree#" << '\t' << "Groups" << '\t' << "WScore" << '\t';
909 m->mothurOut("Tree#\tGroups\tWScore\t");
910 if (random) { outSum << "WSig"; m->mothurOut("WSig"); }
911 outSum << endl; m->mothurOutEndLine();
914 outSum.setf(ios::fixed, ios::floatfield); outSum.setf(ios::showpoint);
918 for (int i = 0; i < T.size(); i++) {
919 for (int j = 0; j < numComp; j++) {
921 if (WScoreSig[count] > (1/(float)iters)) {
922 outSum << setprecision(6) << i+1 << '\t' << groupComb[j] << '\t' << utreeScores[count] << '\t' << setprecision(itersString.length()) << WScoreSig[count] << endl;
923 cout << setprecision(6) << i+1 << '\t' << groupComb[j] << '\t' << utreeScores[count] << '\t' << setprecision(itersString.length()) << WScoreSig[count] << endl;
924 m->mothurOutJustToLog(toString(i+1) +"\t" + groupComb[j] +"\t" + toString(utreeScores[count]) +"\t" + toString(WScoreSig[count]) + "\n");
926 outSum << setprecision(6) << i+1 << '\t' << groupComb[j] << '\t' << utreeScores[count] << '\t' << setprecision(itersString.length()) << "<" << (1/float(iters)) << endl;
927 cout << setprecision(6) << i+1 << '\t' << groupComb[j] << '\t' << utreeScores[count] << '\t' << setprecision(itersString.length()) << "<" << (1/float(iters)) << endl;
928 m->mothurOutJustToLog(toString(i+1) +"\t" + groupComb[j] +"\t" + toString(utreeScores[count]) +"\t<" + toString((1/float(iters))) + "\n");
931 outSum << setprecision(6) << i+1 << '\t' << groupComb[j] << '\t' << utreeScores[count] << endl;
932 cout << setprecision(6) << i+1 << '\t' << groupComb[j] << '\t' << utreeScores[count] << endl;
933 m->mothurOutJustToLog(toString(i+1) +"\t" + groupComb[j] +"\t" + toString(utreeScores[count]) +"\n");
940 catch(exception& e) {
941 m->errorOut(e, "UnifracWeightedCommand", "printWSummaryFile");
945 /***********************************************************/
946 void UnifracWeightedCommand::createPhylipFile() {
950 for (int i = 0; i < T.size(); i++) {
952 string phylipFileName;
953 if ((outputForm == "lt") || (outputForm == "square")) {
954 phylipFileName = outputDir + m->getSimpleName(treefile) + toString(i+1) + ".weighted.phylip.dist";
955 outputNames.push_back(phylipFileName); outputTypes["phylip"].push_back(phylipFileName);
957 phylipFileName = outputDir + m->getSimpleName(treefile) + toString(i+1) + ".weighted.column.dist";
958 outputNames.push_back(phylipFileName); outputTypes["column"].push_back(phylipFileName);
962 m->openOutputFile(phylipFileName, out);
964 if ((outputForm == "lt") || (outputForm == "square")) {
966 out << m->getNumGroups() << endl;
969 //make matrix with scores in it
970 vector< vector<float> > dists; dists.resize(m->getNumGroups());
971 for (int i = 0; i < m->getNumGroups(); i++) {
972 dists[i].resize(m->getNumGroups(), 0.0);
975 //flip it so you can print it
976 for (int r=0; r<m->getNumGroups(); r++) {
977 for (int l = 0; l < r; l++) {
978 dists[r][l] = utreeScores[count];
979 dists[l][r] = utreeScores[count];
985 for (int r=0; r<m->getNumGroups(); r++) {
987 string name = (m->getGroups())[r];
988 if (name.length() < 10) { //pad with spaces to make compatible
989 while (name.length() < 10) { name += " "; }
992 if (outputForm == "lt") {
996 for (int l = 0; l < r; l++) { out << dists[r][l] << '\t'; }
998 }else if (outputForm == "square") {
1002 for (int l = 0; l < m->getNumGroups(); l++) { out << dists[r][l] << '\t'; }
1006 for (int l = 0; l < r; l++) {
1007 string otherName = (m->getGroups())[l];
1008 if (otherName.length() < 10) { //pad with spaces to make compatible
1009 while (otherName.length() < 10) { otherName += " "; }
1012 out << name << '\t' << otherName << dists[r][l] << endl;
1019 catch(exception& e) {
1020 m->errorOut(e, "UnifracWeightedCommand", "createPhylipFile");
1024 /***********************************************************/
1025 int UnifracWeightedCommand::findIndex(float score, int index) {
1027 for (int i = 0; i < rScores[index].size(); i++) {
1028 if (rScores[index][i] >= score) { return i; }
1030 return rScores[index].size();
1032 catch(exception& e) {
1033 m->errorOut(e, "UnifracWeightedCommand", "findIndex");
1038 /***********************************************************/
1040 void UnifracWeightedCommand::calculateFreqsCumuls() {
1042 //clear out old tree values
1044 rScoreFreq.resize(numComp);
1046 rCumul.resize(numComp);
1047 validScores.clear();
1049 //calculate frequency
1050 for (int f = 0; f < numComp; f++) {
1051 for (int i = 0; i < rScores[f].size(); i++) { //looks like 0,0,1,1,1,2,4,7... you want to make a map that say rScoreFreq[0] = 2, rScoreFreq[1] = 3...
1052 validScores[rScores[f][i]] = rScores[f][i];
1053 map<float,float>::iterator it = rScoreFreq[f].find(rScores[f][i]);
1054 if (it != rScoreFreq[f].end()) {
1055 rScoreFreq[f][rScores[f][i]]++;
1057 rScoreFreq[f][rScores[f][i]] = 1;
1063 for(int a = 0; a < numComp; a++) {
1064 float rcumul = 1.0000;
1065 //this loop fills the cumulative maps and put 0.0000 in the score freq map to make it easier to print.
1066 for (map<float,float>::iterator it = validScores.begin(); it != validScores.end(); it++) {
1067 //make rscoreFreq map and rCumul
1068 map<float,float>::iterator it2 = rScoreFreq[a].find(it->first);
1069 rCumul[a][it->first] = rcumul;
1070 //get percentage of random trees with that info
1071 if (it2 != rScoreFreq[a].end()) { rScoreFreq[a][it->first] /= iters; rcumul-= it2->second; }
1072 else { rScoreFreq[a][it->first] = 0.0000; } //no random trees with that score
1077 catch(exception& e) {
1078 m->errorOut(e, "UnifracWeightedCommand", "calculateFreqsCums");
1082 /*****************************************************************/
1083 int UnifracWeightedCommand::readNamesFile() {
1086 numUniquesInName = 0;
1089 m->openInputFile(namefile, in);
1091 string first, second;
1092 map<string, string>::iterator itNames;
1095 in >> first >> second; m->gobble(in);
1099 itNames = m->names.find(first);
1100 if (itNames == m->names.end()) {
1101 m->names[first] = second;
1103 //we need a list of names in your namefile to use above when removing extra seqs above so we don't remove them
1104 vector<string> dupNames;
1105 m->splitAtComma(second, dupNames);
1107 for (int i = 0; i < dupNames.size(); i++) {
1108 nameMap[dupNames[i]] = first;
1109 if ((groupfile == "") && (i != 0)) { tmap->addSeq(dupNames[i], "Group1"); }
1111 }else { m->mothurOut(first + " has already been seen in namefile, disregarding names file."); m->mothurOutEndLine(); in.close(); m->names.clear(); namefile = ""; return 1; }
1117 catch(exception& e) {
1118 m->errorOut(e, "UnifracWeightedCommand", "readNamesFile");
1122 /***********************************************************/