2 * treegroupscommand.cpp
5 * Created by Sarah Westcott on 4/8/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "treegroupscommand.h"
11 #include "sharedjabund.h"
12 #include "sharedsorabund.h"
13 #include "sharedjclass.h"
14 #include "sharedsorclass.h"
15 #include "sharedjest.h"
16 #include "sharedsorest.h"
17 #include "sharedthetayc.h"
18 #include "sharedthetan.h"
19 #include "sharedmorisitahorn.h"
20 #include "sharedbraycurtis.h"
23 //**********************************************************************************************************************
// Builds the tree.shared command from its option string.
// Checks every supplied parameter name against the accepted list, resolves the
// optional phylip / column / name input files, reads the line, label, groups,
// calc, precision and cutoff settings (applying defaults when they are absent),
// and, for "sharedfile" input, creates one tree calculator per valid estimator.
// abort is set to true when help is requested or when any of the checks below fails.
25 TreeGroupCommand::TreeGroupCommand(string option){
27 globaldata = GlobalData::getInstance();
35 //allow user to run help
// validCalculator is allocated first because help() prints the calculator list through it
36 if(option == "help") { validCalculator = new ValidCalculators(); help(); abort = true; }
39 //valid parameters for this command
40 string Array[] = {"line","label","calc","groups", "phylip", "column", "name", "precision","cutoff"};
41 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
43 OptionParser parser(option);
44 map<string, string> parameters = parser. getParameters();
46 ValidParameters validParameter;
48 //check to make sure all parameters are valid for command
49 for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
50 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
// distance-matrix inputs: "not open" aborts, "not found" leaves the filename empty,
// anything else records which matrix format was supplied
54 phylipfile = validParameter.validFile(parameters, "phylip", true);
55 if (phylipfile == "not open") { abort = true; }
56 else if (phylipfile == "not found") { phylipfile = ""; }
57 else { format = "phylip"; }
59 columnfile = validParameter.validFile(parameters, "column", true);
60 if (columnfile == "not open") { abort = true; }
61 else if (columnfile == "not found") { columnfile = ""; }
62 else { format = "column"; }
// a names file, when given, is also registered with globaldata
64 namefile = validParameter.validFile(parameters, "name", true);
65 if (namefile == "not open") { abort = true; }
66 else if (namefile == "not found") { namefile = ""; }
67 else { globaldata->setNameFile(namefile); }
// NOTE(review): this assignment comes after format was set to "phylip"/"column" above —
// confirm globaldata's format agrees with those values or that this is meant to be conditional
69 format = globaldata->getFormat();
71 //error checking on files
// either a shared file (from read.otu) or exactly one distance file must be available
72 if ((globaldata->getSharedFile() == "") && ((phylipfile == "") && (columnfile == ""))) { cout << "You must run the read.otu command or provide a distance file before running the tree.shared command." << endl; abort = true; }
73 else if ((phylipfile != "") && (columnfile != "")) { cout << "When running the tree.shared command with a distance file you may not use both the column and the phylip parameters." << endl; abort = true; }
// column-formatted matrices additionally require a names file
75 if (columnfile != "") {
76 if (namefile == "") { cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; }
79 //check for optional parameter and set defaults
80 // ...at some point we should add some additional type checking...
81 line = validParameter.validFile(parameters, "line", false);
82 if (line == "not found") { line = ""; }
// "all" selects every line; otherwise the dash-separated list is split into lines
84 if(line != "all") { splitAtDash(line, lines); allLines = 0; }
85 else { allLines = 1; }
88 label = validParameter.validFile(parameters, "label", false);
89 if (label == "not found") { label = ""; }
// same handling for labels: "all" or a dash-separated list split into labels
91 if(label != "all") { splitAtDash(label, labels); allLines = 0; }
92 else { allLines = 1; }
95 //make sure user did not use both the line and label parameters
96 if ((line != "") && (label != "")) { cout << "You cannot use both the line and label parameters at the same time. " << endl; abort = true; }
97 //if the user has not specified any line or labels use the ones from read.otu
98 else if((line == "") && (label == "")) {
99 allLines = globaldata->allLines;
100 labels = globaldata->labels;
101 lines = globaldata->lines;
// groups: dash-separated list, stored locally and published to globaldata
104 groups = validParameter.validFile(parameters, "groups", false);
105 if (groups == "not found") { groups = ""; }
107 splitAtDash(groups, Groups);
108 globaldata->Groups = Groups;
// calc: both a missing value and the literal "default" map to jclass-thetayc
111 calc = validParameter.validFile(parameters, "calc", false);
112 if (calc == "not found") { calc = "jclass-thetayc"; }
114 if (calc == "default") { calc = "jclass-thetayc"; }
116 splitAtDash(calc, Estimators);
// precision defaults to 100 and cutoff to 10 when not supplied
119 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
120 convert(temp, precision);
122 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10"; }
123 convert(temp, cutoff);
// widens the cutoff by 5 / (precision * 10) — presumably a rounding margin at the chosen precision; TODO confirm
124 cutoff += (5 / (precision * 10.0));
127 if (abort == false) {
129 validCalculator = new ValidCalculators();
// calculators are only instantiated for shared-file input
131 if (format == "sharedfile") {
133 for (i=0; i<Estimators.size(); i++) {
// only estimators accepted by validCalculator for "treegroup" are mapped to a calculator object
134 if (validCalculator->isValidCalculator("treegroup", Estimators[i]) == true) {
135 if (Estimators[i] == "jabund") {
136 treeCalculators.push_back(new JAbund());
137 }else if (Estimators[i] == "sorabund") {
138 treeCalculators.push_back(new SorAbund());
139 }else if (Estimators[i] == "jclass") {
140 treeCalculators.push_back(new Jclass());
141 }else if (Estimators[i] == "sorclass") {
142 treeCalculators.push_back(new SorClass());
143 }else if (Estimators[i] == "jest") {
144 treeCalculators.push_back(new Jest());
145 }else if (Estimators[i] == "sorest") {
146 treeCalculators.push_back(new SorEst());
147 }else if (Estimators[i] == "thetayc") {
148 treeCalculators.push_back(new ThetaYC());
149 }else if (Estimators[i] == "thetan") {
150 treeCalculators.push_back(new ThetaN());
151 }else if (Estimators[i] == "morisitahorn") {
152 treeCalculators.push_back(new MorHorn());
153 }else if (Estimators[i] == "braycurtis") {
154 treeCalculators.push_back(new BrayCurtis());
// failures are reported on cout; std::exception carries what(), anything else gets the generic message
163 catch(exception& e) {
164 cout << "Standard Error: " << e.what() << " has occurred in the TreeGroupCommand class Function TreeGroupCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
168 cout << "An unknown error has occurred in the TreeGroupCommand class function TreeGroupCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
173 //**********************************************************************************************************************
// Prints the usage text for the tree.shared command to cout: its parameters,
// defaults, example invocations, and the list of calculators available for
// "treegroup" as printed by validCalculator.
// Requires validCalculator to be allocated before the call, because it is dereferenced below.
175 void TreeGroupCommand::help(){
177 cout << "The tree.shared command creates a .tre to represent the similiarity between groups or sequences." << "\n";
178 cout << "The tree.shared command can only be executed after a successful read.otu command or by providing a distance file." << "\n";
179 cout << "The tree.shared command parameters are groups, calc, phylip, column, name, cutoff, precision, line and label. You may not use line and label at the same time." << "\n";
180 cout << "The groups parameter allows you to specify which of the groups in your groupfile you would like included used." << "\n";
181 cout << "The group names are separated by dashes. The line and label allow you to select what distance levels you would like trees created for, and are also separated by dashes." << "\n";
182 cout << "The phylip or column parameter are required if you do not run the read.otu command first, and only one may be used. If you use a column file the name filename is required. " << "\n";
183 cout << "If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed." << "\n";
184 cout << "The tree.shared command should be in the following format: tree.shared(groups=yourGroups, calc=yourCalcs, line=yourLines, label=yourLabels)." << "\n";
185 cout << "Example tree.shared(groups=A-B-C, line=1-3-5, calc=jabund-sorabund)." << "\n";
186 cout << "The default value for groups is all the groups in your groupfile." << "\n";
187 cout << "The default value for calc is jclass-thetayc." << "\n";
188 cout << "The tree.shared command outputs a .tre file for each calculator you specify at each distance you choose." << "\n";
// list of calculators valid for this command, written between the two usage forms
189 validCalculator->printCalc("treegroup", cout);
190 cout << "Or the tree.shared command can be in the following format: tree.shared(phylip=yourPhylipFile)." << "\n";
191 cout << "Example tree.shared(phylip=abrecovery.dist)." << "\n";
192 cout << "Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups)." << "\n" << "\n";
// failures are reported on cout; std::exception carries what(), anything else gets the generic message
194 catch(exception& e) {
195 cout << "Standard Error: " << e.what() << " has occurred in the TreeGroupCommand class Function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
199 cout << "An unknown error has occurred in the TreeGroupCommand class function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
205 //**********************************************************************************************************************
// Releases the objects owned by the command: the OTU file reader for "sharedfile"
// input, otherwise the distance-matrix reader together with its matrix and list,
// and in both cases the calculator validator.
// NOTE(review): read, readMatrix, matrix and list are assigned in execute(), and
// validCalculator only on some constructor paths — confirm these pointers are
// initialised (e.g. to NULL) when the command aborts or execute() is never run.
207 TreeGroupCommand::~TreeGroupCommand(){
209 if (format == "sharedfile") {delete read;}
210 else { delete readMatrix; delete matrix; delete list; }
212 delete validCalculator;
216 //**********************************************************************************************************************
// Runs the tree.shared command.
// Returns 0 straight away when abort is set. For "sharedfile" input it requires at
// least one calculator, reads the OTU file and fetches the first set of shared
// rabund vectors (at least two groups are needed). Otherwise it reads a column- or
// phylip-formatted distance matrix with the configured cutoff, sets up the TreeMap
// plus the group and tree names held in globaldata, and derives the output
// filename. globaldata->Groups is cleared before finishing.
218 int TreeGroupCommand::execute(){
221 if (abort == true) { return 0; }
223 if (format == "sharedfile") {
224 //if the users entered no valid calculators don't execute command
225 if (treeCalculators.size() == 0) { cout << "You have given no valid calculators." << endl; return 0; }
// read the OTU file into globaldata, then pull the first set of shared vectors from its input
228 read = new ReadOTUFile(globaldata->inputFileName);
229 read->read(&*globaldata);
231 input = globaldata->ginput;
232 lookup = input->getSharedRAbundVectors();
// a tree needs at least two groups
235 if (lookup.size() < 2) { cout << "You have not provided enough valid groups. I cannot run the command." << endl; return 0; }
237 globaldata->runParse = false;
// distance-matrix input: choose the reader that matches the recorded format
243 filename = globaldata->inputFileName;
245 if (format == "column") { readMatrix = new ReadColumnMatrix(filename); }
246 else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(filename); }
248 readMatrix->setCutoff(cutoff);
// load the names file into nameMap before the matrix is read
251 nameMap = new NameAssignment(namefile);
252 nameMap->readMap(1,2);
// read the matrix, then keep its list vector and matrix for later use
258 readMatrix->read(nameMap);
259 list = readMatrix->getListVector();
260 matrix = readMatrix->getMatrix();
// publish a fresh TreeMap and its group names through globaldata
263 tmap = new TreeMap();
265 globaldata->gTreemap = tmap;
267 globaldata->Groups = tmap->namesOfGroups;
269 //clear globaldatas old tree names if any
270 globaldata->Treenames.clear();
272 //fills globaldatas tree names
273 globaldata->Treenames = globaldata->Groups;
275 globaldata->runParse = false;
279 //create a new filename
// output name is the input file's root name with "tre" appended
280 outputFile = getRootName(globaldata->inputFileName) + "tre";
283 cout << "Tree complete. " << endl;
286 //reset groups parameter
287 globaldata->Groups.clear();
// failures are reported on cout; std::exception carries what(), anything else gets the generic message
291 catch(exception& e) {
292 cout << "Standard Error: " << e.what() << " has occurred in the TreeGroupCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
296 cout << "An unknown error has occurred in the TreeGroupCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
300 //**********************************************************************************************************************
// Builds the tree in t from simMatrix by repeated pairwise merging and writes it
// to outputFile in Newick form.
// Each of the numGroups - 1 passes picks the largest remaining similarity in the
// lower triangle, creates internal node numGroups + i with the two current
// clusters (looked up through index) as children, assigns branch lengths from
// (1 - similarity) / 2, and folds the two rows of simMatrix into one.
// The value -1000.0 marks entries that must not be picked again.
302 void TreeGroupCommand::createTree(){
307 //do merges and create tree structure by setting parents and children
308 //there are numGroups - 1 merges to do
309 for (int i = 0; i < (numGroups - 1); i++) {
// starts at the same -1000.0 sentinel used below for removed entries, so those are never selected
310 float largest = -1000.0;
313 //find largest value in sims matrix by searching lower triangle
314 for (int j = 1; j < simMatrix.size(); j++) {
315 for (int k = 0; k < j; k++) {
316 if (simMatrix[j][k] > largest) { largest = simMatrix[j][k]; row = j; column = k; }
320 //set non-leaf node info and update leaves to know their parents
// index[] maps a matrix row/column to the tree node currently representing that cluster
322 t->tree[numGroups + i].setChildren(index[row], index[column]);
325 t->tree[index[row]].setParent(numGroups + i);
326 t->tree[index[column]].setParent(numGroups + i);
328 //blength = distance / 2;
// distance is taken as 1 - similarity
329 float blength = ((1.0 - largest) / 2);
// each child's branch covers whatever part of blength lies above its own length to the leaves
332 t->tree[index[row]].setBranchLength(blength - t->tree[index[row]].getLengthToLeaves());
333 t->tree[index[column]].setBranchLength(blength - t->tree[index[column]].getLengthToLeaves());
335 //set your length to leaves to your child's length plus branchlength
336 t->tree[numGroups + i].setLengthToLeaves(t->tree[index[row]].getLengthToLeaves() + t->tree[index[row]].getBranchLength());
// both matrix positions now refer to the newly created internal node
340 index[row] = numGroups+i;
341 index[column] = numGroups+i;
343 //remove highest value that caused the merge.
344 simMatrix[row][column] = -1000.0;
345 simMatrix[column][row] = -1000.0;
347 //merge values in simsMatrix
348 for (int n = 0; n < simMatrix.size(); n++) {
349 //row becomes merge of 2 groups
// plain mean of the two rows (cluster sizes are not taken into account), mirrored to keep the matrix symmetric
350 simMatrix[row][n] = (simMatrix[row][n] + simMatrix[column][n]) / 2;
351 simMatrix[n][row] = simMatrix[row][n];
// the merged-away column is retired with the sentinel value
353 simMatrix[column][n] = -1000.0;
354 simMatrix[n][column] = -1000.0;
358 //adjust tree to make sure root to tip length is .5
359 int root = t->findRoot();
360 t->tree[root].setBranchLength((0.5 - t->tree[root].getLengthToLeaves()));
// write the finished tree to outputFile
366 t->createNewickFile(outputFile);
// failures are reported on cout; std::exception carries what(), anything else gets the generic message
372 catch(exception& e) {
373 cout << "Standard Error: " << e.what() << " has occurred in the TreeGroupCommand class Function createTree. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
377 cout << "An unknown error has occurred in the TreeGroupCommand class function createTree. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
381 /***********************************************************/
// Writes the contents of simMatrix to the given stream, one entry after another
// with a tab following each value. The group-name headers are currently
// commented out, so only the numeric values are emitted.
// out: destination stream for the matrix dump.
382 void TreeGroupCommand::printSims(ostream& out) {
385 //output column headers
387 //for (int i = 0; i < lookup.size(); i++) { out << lookup[i]->getGroup() << '\t'; }
// simMatrix is treated as square: both loops run to simMatrix.size()
391 for (int m = 0; m < simMatrix.size(); m++) {
392 //out << lookup[m]->getGroup() << '\t';
393 for (int n = 0; n < simMatrix.size(); n++) {
394 out << simMatrix[m][n] << '\t';
// failures are reported on cout; std::exception carries what(), anything else gets the generic message
400 catch(exception& e) {
401 cout << "Standard Error: " << e.what() << " has occurred in the TreeGroupCommand class Function printSims. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
405 cout << "An unknown error has occurred in the TreeGroupCommand class function printSims. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
409 /***********************************************************/
// Prepares the similarity matrix for distance-file input.
// numGroups is taken from the list vector, index is reset so that every position
// maps to itself, simMatrix is sized numGroups x numGroups and zero-filled, and
// each cell of the sparse distance matrix is converted to a similarity (1 - dist)
// stored at both symmetric positions.
410 void TreeGroupCommand::makeSimsDist() {
412 numGroups = list->size();
// initially every matrix position represents its own leaf
416 for (int g = 0; g < numGroups; g++) { index[g] = g; }
418 //initialize simMatrix
// NOTE(review): rows are filled with push_back after resize, which only yields a
// numGroups-wide row if every row is empty on entry — confirm simMatrix is cleared beforehand
420 simMatrix.resize(numGroups);
421 for (int m = 0; m < simMatrix.size(); m++) {
422 for (int j = 0; j < simMatrix.size(); j++) {
423 simMatrix[m].push_back(0.0);
427 //go through sparse matrix and fill sims
428 //go through each cell in the sparsematrix
// pairs absent from the sparse matrix keep the 0.0 written above
429 for(MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++){
430 //similarity = -(distance-1)
431 simMatrix[currentCell->row][currentCell->column] = -(currentCell->dist -1.0);
432 simMatrix[currentCell->column][currentCell->row] = -(currentCell->dist -1.0);
// failures are reported on cout; std::exception carries what(), anything else gets the generic message
437 catch(exception& e) {
438 cout << "Standard Error: " << e.what() << " has occurred in the TreeGroupCommand class Function makeSimsDist. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
442 cout << "An unknown error has occurred in the TreeGroupCommand class function makeSimsDist. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
447 /***********************************************************/
// Drives tree creation for shared-file input.
// Publishes the tree names and a TreeMap built from the group map through
// globaldata, then steps through the shared file one set of vectors at a time.
// Lines selected by allLines, lines or labels are echoed (label and count) and
// recorded as processed; requested labels that never appear are reported at the
// end together with the label of the last set read. The remaining lastLookup
// vectors and all tree calculators are deleted before returning.
448 void TreeGroupCommand::makeSimsShared() {
452 //clear globaldatas old tree names if any
453 globaldata->Treenames.clear();
455 //fills globaldatas tree names
456 globaldata->Treenames = globaldata->Groups;
458 //create treemap class from groupmap for tree class to use
459 tmap = new TreeMap();
460 tmap->makeSim(globaldata->gGroupmap);
461 globaldata->gTreemap = tmap;
// working copies: entries are erased as they are satisfied, which lets the loop stop early
463 set<string> processedLabels;
464 set<string> userLabels = labels;
465 set<int> userLines = lines;
467 //as long as you are not at the end of the file or done with the lines you want
468 while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0) || (userLines.size() != 0))) {
// selection is checked against the original lines/labels sets, not the shrinking copies
470 if(allLines == 1 || lines.count(count) == 1 || labels.count(lookup[0]->getLabel()) == 1){
471 cout << lookup[0]->getLabel() << '\t' << count << endl;
474 processedLabels.insert(lookup[0]->getLabel());
475 userLabels.erase(lookup[0]->getLabel());
476 userLines.erase(count);
// fall back to the previous set when anyLabelsToProcess reports a pending label and the
// previous label has not been handled yet — presumably a requested label was passed over; TODO confirm
479 if ((anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLookup[0]->getLabel()) != 1)) {
480 cout << lastLookup[0]->getLabel() << '\t' << count << endl;
483 processedLabels.insert(lastLookup[0]->getLabel());
484 userLabels.erase(lastLookup[0]->getLabel());
487 //prevent memory leak
// NOTE(review): the first pass (count == 1) is skipped — presumably lastLookup still
// shares its pointers with lookup at that point; confirm
488 if (count != 1) { for (int i = 0; i < lastLookup.size(); i++) { delete lastLookup[i]; } }
491 //get next line to process
492 lookup = input->getSharedRAbundVectors();
496 //output error messages about any remaining user labels
497 set<string>::iterator it;
498 bool needToRun = false;
499 for (it = userLabels.begin(); it != userLabels.end(); it++) {
500 cout << "Your file does not include the label "<< *it;
// the message depends on whether the last set read has already been processed
501 if (processedLabels.count(lastLookup[0]->getLabel()) != 1) {
502 cout << ". I will use " << lastLookup[0]->getLabel() << "." << endl;
505 cout << ". Please refer to " << lastLookup[0]->getLabel() << "." << endl;
509 //run last line if you need to
510 if (needToRun == true) {
511 cout << lastLookup[0]->getLabel() << '\t' << count << endl;
// final cleanup of the last vectors and of the calculators
// NOTE(review): treeCalculators keeps the deleted pointers in the visible code — confirm it is not used afterwards
515 for (int i = 0; i < lastLookup.size(); i++) { delete lastLookup[i]; }
516 for(int i = 0 ; i < treeCalculators.size(); i++) { delete treeCalculators[i]; }
// failures are reported on cout; std::exception carries what(), anything else gets the generic message
518 catch(exception& e) {
519 cout << "Standard Error: " << e.what() << " has occurred in the TreeGroupCommand class Function makeSimsShared. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
523 cout << "An unknown error has occurred in the TreeGroupCommand class function makeSimsShared. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
528 /***********************************************************/
// Computes a similarity matrix per tree calculator for one set of shared vectors.
// For every calculator the matrix and index are reset for thisLookup.size() groups,
// the output filename is built from the input root name, the calculator name and
// the label of the first vector, and every unordered pair of groups is scored with
// the calculator; the first returned value is stored at both symmetric positions.
// thisLookup: the shared rabund vectors (one per group) for the label being processed.
529 void TreeGroupCommand::process(vector<SharedRAbundVector*> thisLookup) {
532 vector<SharedRAbundVector*> subset;
533 numGroups = thisLookup.size();
535 //for each calculator
536 for(int i = 0 ; i < treeCalculators.size(); i++) {
537 //initialize simMatrix
// NOTE(review): rows are filled with push_back after resize, which only yields a
// numGroups-wide row if every row is empty at this point — confirm simMatrix is cleared per calculator
539 simMatrix.resize(numGroups);
540 for (int m = 0; m < simMatrix.size(); m++) {
541 for (int j = 0; j < simMatrix.size(); j++) {
542 simMatrix[m].push_back(0.0);
// initially every matrix position represents its own leaf
548 for (int g = 0; g < numGroups; g++) { index[g] = g; }
550 //create a new filename
551 outputFile = getRootName(globaldata->inputFileName) + treeCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + ".tre";
// l starts at k, so each unordered pair is visited once; the diagonal is skipped below
553 for (int k = 0; k < thisLookup.size(); k++) {
554 for (int l = k; l < thisLookup.size(); l++) {
555 if (k != l) { //we don't need the similarity of a group to itself
556 //get estimated similarity between 2 groups
558 subset.clear(); //clear out old pair of sharedrabunds
559 //add new pair of sharedrabunds
560 subset.push_back(thisLookup[k]); subset.push_back(thisLookup[l]);
562 data = treeCalculators[i]->getValues(subset); //saves the calculator outputs
563 //save values in similarity matrix
// only the first value returned by the calculator is used
564 simMatrix[k][l] = data[0];
565 simMatrix[l][k] = data[0];
570 //creates tree from similarity matrix and write out file
// failures are reported on cout; std::exception carries what(), anything else gets the generic message
575 catch(exception& e) {
576 cout << "Standard Error: " << e.what() << " has occurred in the TreeGroupCommand class Function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
580 cout << "An unknown error has occurred in the TreeGroupCommand class function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
584 /***********************************************************/