2 * treegroupscommand.cpp
5 * Created by Sarah Westcott on 4/8/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "treegroupscommand.h"
11 #include "sharedjabund.h"
12 #include "sharedsorabund.h"
13 #include "sharedjclass.h"
14 #include "sharedsorclass.h"
15 #include "sharedjest.h"
16 #include "sharedsorest.h"
17 #include "sharedthetayc.h"
18 #include "sharedthetan.h"
19 #include "sharedmorisitahorn.h"
20 #include "sharedbraycurtis.h"
23 //**********************************************************************************************************************
// Constructor for the tree.shared command.
// Parses the option string, validates each parameter name, records any distance/name
// files in GlobalData, fills in defaults for the optional parameters, and (when the
// command has not been aborted and the input format is sharedfile) builds the list of
// tree calculators requested through the calc parameter.
// option: the raw parameter text given to the command, or help to print the usage text.
// On any validation failure the abort flag is set so that execute() returns early.
25 TreeGroupCommand::TreeGroupCommand(string option){
27 globaldata = GlobalData::getInstance();
35 //allow user to run help
// validCalculator is allocated here first because help() dereferences it
36 if(option == "help") { validCalculator = new ValidCalculators(); help(); abort = true; }
39 //valid parameters for this command
40 string Array[] = {"line","label","calc","groups", "phylip", "column", "name", "precision","cutoff"};
41 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
// parse the option text into the parameters map; the parser is not needed afterwards
43 parser = new OptionParser();
44 parser->parse(option, parameters); delete parser;
46 ValidParameters* validParameter = new ValidParameters();
48 //check to make sure all parameters are valid for command
49 for (it = parameters.begin(); it != parameters.end(); it++) {
50 if (validParameter->isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
// Input files. For each one validFile yields the sentinel text not open (abort),
// the sentinel text not found (treated as not supplied), or a usable value that is
// stored in GlobalData. A phylip or column file also sets the global input format.
54 phylipfile = validParameter->validFile(parameters, "phylip", true);
55 if (phylipfile == "not open") { abort = true; }
56 else if (phylipfile == "not found") { phylipfile = ""; }
57 else { globaldata->setPhylipFile(phylipfile); globaldata->setFormat("phylip"); }
59 columnfile = validParameter->validFile(parameters, "column", true);
60 if (columnfile == "not open") { abort = true; }
61 else if (columnfile == "not found") { columnfile = ""; }
62 else { globaldata->setColumnFile(columnfile); globaldata->setFormat("column"); }
64 namefile = validParameter->validFile(parameters, "name", true);
65 if (namefile == "not open") { abort = true; }
66 else if (namefile == "not found") { namefile = ""; }
67 else { globaldata->setNameFile(namefile); }
// read the format back from GlobalData; it may have been set above or by an earlier command
69 format = globaldata->getFormat();
71 //error checking on files
// need either a shared file already known to GlobalData or exactly one distance file
72 if ((globaldata->getSharedFile() == "") && ((phylipfile == "") && (columnfile == ""))) { cout << "You must run the read.otu command or provide a distance file before running the tree.shared command." << endl; abort = true; }
73 else if ((phylipfile != "") && (columnfile != "")) { cout << "When running the tree.shared command with a distance file you may not use both the column and the phylip parameters." << endl; abort = true; }
// a column-format distance file is only accepted together with a name file
75 if (columnfile != "") {
76 if (namefile == "") { cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; }
79 //check for optional parameter and set defaults
80 // ...at some point we should add some additional type checking...
81 line = validParameter->validFile(parameters, "line", false);
82 if (line == "not found") { line = ""; }
// the value all selects every line; anything else is handed to splitAtDash to fill lines
84 if(line != "all") { splitAtDash(line, lines); allLines = 0; }
85 else { allLines = 1; }
88 label = validParameter->validFile(parameters, "label", false);
89 if (label == "not found") { label = ""; }
// NOTE(review): this assigns allLines again after the line handling above already set it,
// so the value chosen here is the one that survives unless the block below replaces it.
// Confirm that a line value of all combined with no label is meant to end with allLines = 0.
91 if(label != "all") { splitAtDash(label, labels); allLines = 0; }
92 else { allLines = 1; }
95 //make sure user did not use both the line and label parameters
96 if ((line != "") && (label != "")) { cout << "You cannot use both the line and label parameters at the same time. " << endl; abort = true; }
97 //if the user has not specified any line or labels use the ones from read.otu
98 else if((line == "") && (label == "")) {
99 allLines = globaldata->allLines;
100 labels = globaldata->labels;
101 lines = globaldata->lines;
// groups: not supplied becomes an empty string; the split result is published to GlobalData
104 groups = validParameter->validFile(parameters, "groups", false);
105 if (groups == "not found") { groups = ""; }
107 splitAtDash(groups, Groups);
108 globaldata->Groups = Groups;
// calc: both a missing value and the literal default map to jclass-thetayc
111 calc = validParameter->validFile(parameters, "calc", false);
112 if (calc == "not found") { calc = "jclass-thetayc"; }
114 if (calc == "default") { calc = "jclass-thetayc"; }
116 splitAtDash(calc, Estimators);
// precision defaults to 100 and cutoff to 10 when not supplied
119 temp = validParameter->validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
120 convert(temp, precision);
122 temp = validParameter->validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10"; }
123 convert(temp, cutoff);
// widen the cutoff by 5 / (precision * 10.0); presumably a rounding allowance at the
// requested precision - TODO confirm
124 cutoff += (5 / (precision * 10.0));
127 delete validParameter;
129 if (abort == false) {
131 validCalculator = new ValidCalculators();
// calculators are only instantiated for shared-file input
133 if (format == "sharedfile") {
// one calculator object per requested estimator name that is valid for treegroup;
// names that fail the validity check add nothing to treeCalculators
135 for (i=0; i<Estimators.size(); i++) {
136 if (validCalculator->isValidCalculator("treegroup", Estimators[i]) == true) {
137 if (Estimators[i] == "jabund") {
138 treeCalculators.push_back(new JAbund());
139 }else if (Estimators[i] == "sorabund") {
140 treeCalculators.push_back(new SorAbund());
141 }else if (Estimators[i] == "jclass") {
142 treeCalculators.push_back(new Jclass());
143 }else if (Estimators[i] == "sorclass") {
144 treeCalculators.push_back(new SorClass());
145 }else if (Estimators[i] == "jest") {
146 treeCalculators.push_back(new Jest());
147 }else if (Estimators[i] == "sorest") {
148 treeCalculators.push_back(new SorEst());
149 }else if (Estimators[i] == "thetayc") {
150 treeCalculators.push_back(new ThetaYC());
151 }else if (Estimators[i] == "thetan") {
152 treeCalculators.push_back(new ThetaN());
153 }else if (Estimators[i] == "morisitahorn") {
154 treeCalculators.push_back(new MorHorn());
155 }else if (Estimators[i] == "braycurtis") {
156 treeCalculators.push_back(new BrayCurtis());
// standard exceptions are reported on cout together with the class and function name
165 catch(exception& e) {
166 cout << "Standard Error: " << e.what() << " has occurred in the TreeGroupCommand class Function TreeGroupCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
// message reported for an error of unknown type
170 cout << "An unknown error has occurred in the TreeGroupCommand class function TreeGroupCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
175 //**********************************************************************************************************************
// Prints the usage text for the tree.shared command to cout: the accepted parameters,
// their defaults, example invocations, and the list of calculators available for
// treegroup (printed by validCalculator->printCalc).
// Requires validCalculator to be allocated already; the constructor does this right
// before calling help() when the option is help.
177 void TreeGroupCommand::help(){
179 cout << "The tree.shared command creates a .tre to represent the similiarity between groups or sequences." << "\n";
180 cout << "The tree.shared command can only be executed after a successful read.otu command or by providing a distance file." << "\n";
181 cout << "The tree.shared command parameters are groups, calc, phylip, column, name, cutoff, precision, line and label. You may not use line and label at the same time." << "\n";
182 cout << "The groups parameter allows you to specify which of the groups in your groupfile you would like included used." << "\n";
183 cout << "The group names are separated by dashes. The line and label allow you to select what distance levels you would like trees created for, and are also separated by dashes." << "\n";
184 cout << "The phylip or column parameter are required if you do not run the read.otu command first, and only one may be used. If you use a column file the name filename is required. " << "\n";
185 cout << "If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed." << "\n";
186 cout << "The tree.shared command should be in the following format: tree.shared(groups=yourGroups, calc=yourCalcs, line=yourLines, label=yourLabels)." << "\n";
187 cout << "Example tree.shared(groups=A-B-C, line=1-3-5, calc=jabund-sorabund)." << "\n";
188 cout << "The default value for groups is all the groups in your groupfile." << "\n";
189 cout << "The default value for calc is jclass-thetayc." << "\n";
190 cout << "The tree.shared command outputs a .tre file for each calculator you specify at each distance you choose." << "\n";
// list the calculators that are valid for the treegroup command
191 validCalculator->printCalc("treegroup", cout);
192 cout << "Or the tree.shared command can be in the following format: tree.shared(phylip=yourPhylipFile)." << "\n";
193 cout << "Example tree.shared(phylip=abrecovery.dist)." << "\n";
194 cout << "Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups)." << "\n" << "\n";
// standard exceptions are reported on cout together with the class and function name
196 catch(exception& e) {
197 cout << "Standard Error: " << e.what() << " has occurred in the TreeGroupCommand class Function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
// message reported for an error of unknown type
201 cout << "An unknown error has occurred in the TreeGroupCommand class function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
207 //**********************************************************************************************************************
// Destructor: releases the input reader that matches the format in use, then the
// calculator validator.
// NOTE(review): read, readMatrix, matrix and list are only assigned in execute(), and
// validCalculator is only allocated in the constructor on the help path or when abort
// is false. Confirm these pointers are initialised on every other path before they are
// deleted here.
209 TreeGroupCommand::~TreeGroupCommand(){
// shared-file input owns a ReadOTUFile; distance-file input owns the matrix reader and its products
211 if (format == "sharedfile") {delete read;}
212 else { delete readMatrix; delete matrix; delete list; }
214 delete validCalculator;
218 //**********************************************************************************************************************
// Runs the tree.shared command.
// Returns 0 immediately when the constructor flagged abort, when no valid calculator
// was supplied, or when fewer than two groups are available.
// For shared-file input the OTU file is read and the first set of shared rabund vectors
// is fetched. For distance-file input the matrix is read with the configured cutoff and
// the tree map, group names and output file name are set up.
220 int TreeGroupCommand::execute(){
223 if (abort == true) { return 0; }
225 if (format == "sharedfile") {
226 //if the users entered no valid calculators don't execute command
227 if (treeCalculators.size() == 0) { cout << "You have given no valid calculators." << endl; return 0; }
// read the OTU input named in GlobalData; the reader stores its results back into globaldata
230 read = new ReadOTUFile(globaldata->inputFileName);
231 read->read(&*globaldata);
// fetch the first set of shared rabund vectors from the input object left in globaldata
233 input = globaldata->ginput;
234 lookup = input->getSharedRAbundVectors();
// a tree needs at least two groups
237 if (lookup.size() < 2) { cout << "You have not provided enough valid groups. I cannot run the command." << endl; return 0; }
// distance-file input: choose the matrix reader that matches the format
243 filename = globaldata->inputFileName;
245 if (format == "column") { readMatrix = new ReadColumnMatrix(filename); }
246 else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(filename); }
248 readMatrix->setCutoff(cutoff);
// build the name map from the name file (the constructor requires one for column format)
251 nameMap = new NameAssignment(namefile);
252 nameMap->readMap(1,2);
// read the matrix and keep the list vector and sparse matrix it produces
258 readMatrix->read(nameMap);
259 list = readMatrix->getListVector();
260 matrix = readMatrix->getMatrix();
// publish a new tree map and its group names through GlobalData
263 tmap = new TreeMap();
265 globaldata->gTreemap = tmap;
267 globaldata->Groups = tmap->namesOfGroups;
269 //clear globaldatas old tree names if any
270 globaldata->Treenames.clear();
272 //fills globaldatas tree names
273 globaldata->Treenames = globaldata->Groups;
277 //create a new filename
278 outputFile = getRootName(globaldata->inputFileName) + "tre";
281 cout << "Tree complete. " << endl;
284 //reset groups parameter
285 globaldata->Groups.clear();
// standard exceptions are reported on cout together with the class and function name
289 catch(exception& e) {
290 cout << "Standard Error: " << e.what() << " has occurred in the TreeGroupCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
// message reported for an error of unknown type
294 cout << "An unknown error has occurred in the TreeGroupCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
298 //**********************************************************************************************************************
// Builds the tree held in t from the similarity matrix simMatrix and writes it to
// outputFile in Newick form.
// Each pass picks the most similar remaining pair, joins the two nodes under a new
// internal node, and replaces the pair in simMatrix by the average of their rows.
// The index map translates a simMatrix row or column number into the tree node that
// currently represents it. simMatrix is modified in place.
300 void TreeGroupCommand::createTree(){
305 //do merges and create tree structure by setting parents and children
306 //there are numGroups - 1 merges to do
307 for (int i = 0; i < (numGroups - 1); i++) {
// -1000.0 is the marker for retired cells, so any live value compares greater
308 float largest = -1000.0;
311 //find largest value in sims matrix by searching lower triangle
312 for (int j = 1; j < simMatrix.size(); j++) {
313 for (int k = 0; k < j; k++) {
314 if (simMatrix[j][k] > largest) { largest = simMatrix[j][k]; row = j; column = k; }
318 //set non-leaf node info and update leaves to know their parents
// the new internal node is stored at position numGroups + i
320 t->tree[numGroups + i].setChildren(index[row], index[column]);
323 t->tree[index[row]].setParent(numGroups + i);
324 t->tree[index[column]].setParent(numGroups + i);
326 //blength = distance / 2;
// distance is taken as 1 - similarity
327 float blength = ((1.0 - largest) / 2);
// each child's branch covers what is left after the length already below it
330 t->tree[index[row]].setBranchLength(blength - t->tree[index[row]].getLengthToLeaves());
331 t->tree[index[column]].setBranchLength(blength - t->tree[index[column]].getLengthToLeaves());
333 //set your length to leaves to your childs length plus branchlength
// computed from the row child only
334 t->tree[numGroups + i].setLengthToLeaves(t->tree[index[row]].getLengthToLeaves() + t->tree[index[row]].getBranchLength());
// both matrix positions now refer to the newly created node
338 index[row] = numGroups+i;
339 index[column] = numGroups+i;
341 //remove highest value that caused the merge.
342 simMatrix[row][column] = -1000.0;
343 simMatrix[column][row] = -1000.0;
345 //merge values in simsMatrix
346 for (int n = 0; n < simMatrix.size(); n++) {
347 //row becomes merge of 2 groups
348 simMatrix[row][n] = (simMatrix[row][n] + simMatrix[column][n]) / 2;
349 simMatrix[n][row] = simMatrix[row][n];
// retire the column entry so it is never selected again
351 simMatrix[column][n] = -1000.0;
352 simMatrix[n][column] = -1000.0;
356 //adjust tree to make sure root to tip length is .5
357 int root = t->findRoot();
358 t->tree[root].setBranchLength((0.5 - t->tree[root].getLengthToLeaves()));
// write the finished tree to the output file
364 t->createNewickFile(outputFile);
// standard exceptions are reported on cout together with the class and function name
370 catch(exception& e) {
371 cout << "Standard Error: " << e.what() << " has occurred in the TreeGroupCommand class Function createTree. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
// message reported for an error of unknown type
375 cout << "An unknown error has occurred in the TreeGroupCommand class function createTree. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
379 /***********************************************************/
// Writes the contents of simMatrix to the given stream, each value followed by a tab.
// out: destination stream for the matrix values.
// The group-name headers are currently disabled (see the commented-out lines below).
380 void TreeGroupCommand::printSims(ostream& out) {
383 //output column headers
385 //for (int i = 0; i < lookup.size(); i++) { out << lookup[i]->getGroup() << '\t'; }
// the inner bound is also simMatrix.size(), so the matrix is treated as square
389 for (int m = 0; m < simMatrix.size(); m++) {
390 //out << lookup[m]->getGroup() << '\t';
391 for (int n = 0; n < simMatrix.size(); n++) {
392 out << simMatrix[m][n] << '\t';
// standard exceptions are reported on cout together with the class and function name
398 catch(exception& e) {
399 cout << "Standard Error: " << e.what() << " has occurred in the TreeGroupCommand class Function printSims. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
// message reported for an error of unknown type
403 cout << "An unknown error has occurred in the TreeGroupCommand class function printSims. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
407 /***********************************************************/
// Builds simMatrix from the sparse distance matrix read by execute().
// numGroups is taken from the size of list, index is reset to the identity mapping,
// and every stored distance d is written symmetrically as the similarity 1 - d.
// Cells with no entry in the sparse matrix keep the initial value 0.0.
408 void TreeGroupCommand::makeSimsDist() {
410 numGroups = list->size();
// start with every matrix position mapped to the node of the same number
414 for (int g = 0; g < numGroups; g++) { index[g] = g; }
416 //initialize simMatrix
// appends numGroups zeros to every row; assumes the rows are empty at this point - TODO confirm
418 simMatrix.resize(numGroups);
419 for (int m = 0; m < simMatrix.size(); m++) {
420 for (int j = 0; j < simMatrix.size(); j++) {
421 simMatrix[m].push_back(0.0);
425 //go through sparse matrix and fill sims
426 //go through each cell in the sparsematrix
427 for(MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++){
428 //similarity = -(distance-1)
429 simMatrix[currentCell->row][currentCell->column] = -(currentCell->dist -1.0);
430 simMatrix[currentCell->column][currentCell->row] = -(currentCell->dist -1.0);
// standard exceptions are reported on cout together with the class and function name
435 catch(exception& e) {
436 cout << "Standard Error: " << e.what() << " has occurred in the TreeGroupCommand class Function makeSimsDist. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
// message reported for an error of unknown type
440 cout << "An unknown error has occurred in the TreeGroupCommand class function makeSimsDist. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
445 /***********************************************************/
// Drives tree creation for shared-file input.
// Publishes the tree names and a tree map built from the group map through GlobalData,
// then walks the shared rabund vectors set by set, handling the lines and labels the
// user asked for (or all of them when allLines is 1). Labels that were requested but
// never seen are reported at the end, and the calculators are freed before returning.
446 void TreeGroupCommand::makeSimsShared() {
450 //clear globaldatas old tree names if any
451 globaldata->Treenames.clear();
453 //fills globaldatas tree names
454 globaldata->Treenames = globaldata->Groups;
456 //create treemap class from groupmap for tree class to use
457 tmap = new TreeMap();
458 tmap->makeSim(globaldata->gGroupmap);
459 globaldata->gTreemap = tmap;
// processedLabels records what has been handled; userLabels and userLines are working
// copies that shrink as requested items are found
461 set<string> processedLabels;
462 set<string> userLabels = labels;
463 set<int> userLines = lines;
465 //as long as you are not at the end of the file or done with the lines you want
466 while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0) || (userLines.size() != 0))) {
// count presumably holds the current line number (it is maintained outside the lines shown here)
468 if(allLines == 1 || lines.count(count) == 1 || labels.count(lookup[0]->getLabel()) == 1){
469 cout << lookup[0]->getLabel() << '\t' << count << endl;
// mark this label and line as done
472 processedLabels.insert(lookup[0]->getLabel());
473 userLabels.erase(lookup[0]->getLabel());
474 userLines.erase(count);
// presumably: a requested label was passed over without an exact match, so the
// previous set is used instead unless it was already processed - confirm against anyLabelsToProcess
477 if ((anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLookup[0]->getLabel()) != 1)) {
478 cout << lastLookup[0]->getLabel() << '\t' << count << endl;
481 processedLabels.insert(lastLookup[0]->getLabel());
482 userLabels.erase(lastLookup[0]->getLabel());
485 //prevent memory leak
// skipped when count is 1
486 if (count != 1) { for (int i = 0; i < lastLookup.size(); i++) { delete lastLookup[i]; } }
489 //get next line to process
490 lookup = input->getSharedRAbundVectors();
494 //output error messages about any remaining user labels
495 set<string>::iterator it;
496 bool needToRun = false;
497 for (it = userLabels.begin(); it != userLabels.end(); it++) {
498 cout << "Your file does not include the label "<< *it;
// tell the user whether the last set will be used or has already been processed
499 if (processedLabels.count(lastLookup[0]->getLabel()) != 1) {
500 cout << ". I will use " << lastLookup[0]->getLabel() << "." << endl;
503 cout << ". Please refer to " << lastLookup[0]->getLabel() << "." << endl;
507 //run last line if you need to
508 if (needToRun == true) {
509 cout << lastLookup[0]->getLabel() << '\t' << count << endl;
// free the last set of vectors and the calculators (the calculators are released here,
// not in the destructor)
513 for (int i = 0; i < lastLookup.size(); i++) { delete lastLookup[i]; }
514 for(int i = 0 ; i < treeCalculators.size(); i++) { delete treeCalculators[i]; }
// standard exceptions are reported on cout together with the class and function name
516 catch(exception& e) {
517 cout << "Standard Error: " << e.what() << " has occurred in the TreeGroupCommand class Function makeSimsShared. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
// message reported for an error of unknown type
521 cout << "An unknown error has occurred in the TreeGroupCommand class function makeSimsShared. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
526 /***********************************************************/
// Fills simMatrix for one set of shared rabund vectors, once per tree calculator.
// thisLookup: the vectors to compare, one per group; numGroups is set to their count.
// For every calculator the matrix and index map are prepared, an output file name is
// derived from the input file name, the calculator name and the label of the first
// vector, and the first value returned by the calculator for each pair of groups is
// stored symmetrically in simMatrix.
527 void TreeGroupCommand::process(vector<SharedRAbundVector*> thisLookup) {
// scratch vector that holds the pair of groups currently being compared
530 vector<SharedRAbundVector*> subset;
531 numGroups = thisLookup.size();
533 //for each calculator
534 for(int i = 0 ; i < treeCalculators.size(); i++) {
535 //initialize simMatrix
// appends numGroups zeros to every row; assumes the rows are empty at this point - TODO confirm
537 simMatrix.resize(numGroups);
538 for (int m = 0; m < simMatrix.size(); m++) {
539 for (int j = 0; j < simMatrix.size(); j++) {
540 simMatrix[m].push_back(0.0);
// start with every matrix position mapped to the node of the same number
546 for (int g = 0; g < numGroups; g++) { index[g] = g; }
548 //create a new filename
549 outputFile = getRootName(globaldata->inputFileName) + treeCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + ".tre";
// visit each unordered pair of groups once (l starts at k and equal indices are skipped)
551 for (int k = 0; k < thisLookup.size(); k++) {
552 for (int l = k; l < thisLookup.size(); l++) {
553 if (k != l) { //we do not need the similarity of a group to itself
554 //get estimated similarity between 2 groups
556 subset.clear(); //clear out old pair of sharedrabunds
557 //add new pair of sharedrabunds
558 subset.push_back(thisLookup[k]); subset.push_back(thisLookup[l]);
560 data = treeCalculators[i]->getValues(subset); //saves the calculator outputs
561 //save values in similarity matrix
// only the first returned value is used, mirrored across the diagonal
562 simMatrix[k][l] = data[0];
563 simMatrix[l][k] = data[0];
568 //creates tree from similarity matrix and write out file
// standard exceptions are reported on cout together with the class and function name
573 catch(exception& e) {
574 cout << "Standard Error: " << e.what() << " has occurred in the TreeGroupCommand class Function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
// message reported for an error of unknown type
578 cout << "An unknown error has occurred in the TreeGroupCommand class function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
582 /***********************************************************/