]> git.donarmstrong.com Git - mothur.git/blob - treegroupscommand.cpp
merged pat's trim seqs edits with sarah's major overhaul of global data; also added...
[mothur.git] / treegroupscommand.cpp
1 /*
2  *  treegroupscommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/8/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "treegroupscommand.h"
11 #include "sharedjabund.h"
12 #include "sharedsorabund.h"
13 #include "sharedjclass.h"
14 #include "sharedsorclass.h"
15 #include "sharedjest.h"
16 #include "sharedsorest.h"
17 #include "sharedthetayc.h"
18 #include "sharedthetan.h"
19 #include "sharedmorisitahorn.h"
20 #include "sharedbraycurtis.h"
21
22
23 //**********************************************************************************************************************
24
25 TreeGroupCommand::TreeGroupCommand(string option){
26         try {
27                 globaldata = GlobalData::getInstance();
28                 abort = false;
29                 allLines = 1;
30                 lines.clear();
31                 labels.clear();
32                 Groups.clear();
33                 Estimators.clear();
34                 
35                 //allow user to run help
36                 if(option == "help") { validCalculator = new ValidCalculators(); help(); abort = true; }
37                 
38                 else {
39                         //valid paramters for this command
40                         string Array[] =  {"line","label","calc","groups", "phylip", "column", "name", "precision","cutoff"};
41                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
42                         
43                         OptionParser parser(option);
44                         map<string, string> parameters = parser. getParameters();
45                         
46                         ValidParameters validParameter;
47                 
48                         //check to make sure all parameters are valid for command
49                         for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
50                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
51                         }
52                         
53                         //required parameters
54                         phylipfile = validParameter.validFile(parameters, "phylip", true);
55                         if (phylipfile == "not open") { abort = true; }
56                         else if (phylipfile == "not found") { phylipfile = ""; }        
57                         else {  format = "phylip";      }
58                         
59                         columnfile = validParameter.validFile(parameters, "column", true);
60                         if (columnfile == "not open") { abort = true; } 
61                         else if (columnfile == "not found") { columnfile = ""; }
62                         else {  format = "column";      }
63                         
64                         namefile = validParameter.validFile(parameters, "name", true);
65                         if (namefile == "not open") { abort = true; }   
66                         else if (namefile == "not found") { namefile = ""; }
67                         else {  globaldata->setNameFile(namefile);      }
68                         
69 //                      format = globaldata->getFormat();
70                         
71                         //error checking on files                       
72                         if ((globaldata->getSharedFile() == "") && ((phylipfile == "") && (columnfile == "")))  { cout << "You must run the read.otu command or provide a distance file before running the tree.shared command." << endl; abort = true; }
73                         else if ((phylipfile != "") && (columnfile != "")) { cout << "When running the tree.shared command with a distance file you may not use both the column and the phylip parameters." << endl; abort = true; }
74                         
75                         if (columnfile != "") {
76                                 if (namefile == "") {  cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; }
77                         }
78
79                         //check for optional parameter and set defaults
80                         // ...at some point should added some additional type checking...
81                         line = validParameter.validFile(parameters, "line", false);                             
82                         if (line == "not found") { line = "";  }
83                         else { 
84                                 if(line != "all") {  splitAtDash(line, lines);  allLines = 0;  }
85                                 else { allLines = 1;  }
86                         }
87                         
88                         label = validParameter.validFile(parameters, "label", false);                   
89                         if (label == "not found") { label = ""; }
90                         else { 
91                                 if(label != "all") {  splitAtDash(label, labels);  allLines = 0;  }
92                                 else { allLines = 1;  }
93                         }
94                         
95                         //make sure user did not use both the line and label parameters
96                         if ((line != "") && (label != "")) { cout << "You cannot use both the line and label parameters at the same time. " << endl; abort = true; }
97                         //if the user has not specified any line or labels use the ones from read.otu
98                         else if((line == "") && (label == "")) {  
99                                 allLines = globaldata->allLines; 
100                                 labels = globaldata->labels; 
101                                 lines = globaldata->lines;
102                         }
103                                 
104                         groups = validParameter.validFile(parameters, "groups", false);                 
105                         if (groups == "not found") { groups = ""; }
106                         else { 
107                                 splitAtDash(groups, Groups);
108                                 globaldata->Groups = Groups;
109                         }
110                                 
111                         calc = validParameter.validFile(parameters, "calc", false);                     
112                         if (calc == "not found") { calc = "jclass-thetayc";  }
113                         else { 
114                                  if (calc == "default")  {  calc = "jclass-thetayc";  }
115                         }
116                         splitAtDash(calc, Estimators);
117
118                         string temp;
119                         temp = validParameter.validFile(parameters, "precision", false);                        if (temp == "not found") { temp = "100"; }
120                         convert(temp, precision); 
121                         
122                         temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "10"; }
123                         convert(temp, cutoff); 
124                         cutoff += (5 / (precision * 10.0));
125
126                                 
127                         if (abort == false) {
128                         
129                                 validCalculator = new ValidCalculators();
130                                 
131                                 if (format == "sharedfile") {
132                                         int i;
133                                         for (i=0; i<Estimators.size(); i++) {
134                                                 if (validCalculator->isValidCalculator("treegroup", Estimators[i]) == true) { 
135                                                         if (Estimators[i] == "jabund") {        
136                                                                 treeCalculators.push_back(new JAbund());
137                                                         }else if (Estimators[i] == "sorabund") { 
138                                                                 treeCalculators.push_back(new SorAbund());
139                                                         }else if (Estimators[i] == "jclass") { 
140                                                                 treeCalculators.push_back(new Jclass());
141                                                         }else if (Estimators[i] == "sorclass") { 
142                                                                 treeCalculators.push_back(new SorClass());
143                                                         }else if (Estimators[i] == "jest") { 
144                                                                 treeCalculators.push_back(new Jest());
145                                                         }else if (Estimators[i] == "sorest") { 
146                                                                 treeCalculators.push_back(new SorEst());
147                                                         }else if (Estimators[i] == "thetayc") { 
148                                                                 treeCalculators.push_back(new ThetaYC());
149                                                         }else if (Estimators[i] == "thetan") { 
150                                                                 treeCalculators.push_back(new ThetaN());
151                                                         }else if (Estimators[i] == "morisitahorn") { 
152                                                                 treeCalculators.push_back(new MorHorn());
153                                                         }else if (Estimators[i] == "braycurtis") { 
154                                                                 treeCalculators.push_back(new BrayCurtis());
155                                                         }
156                                                 }
157                                         }
158                                 }
159                         }       
160                 }
161
162         }
163         catch(exception& e) {
164                 cout << "Standard Error: " << e.what() << " has occurred in the TreeGroupCommand class Function TreeGroupCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
165                 exit(1);
166         }
167         catch(...) {
168                 cout << "An unknown error has occurred in the TreeGroupCommand class function TreeGroupCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
169                 exit(1);
170         }       
171 }
172
173 //**********************************************************************************************************************
174
175 void TreeGroupCommand::help(){
176         try {
177                 cout << "The tree.shared command creates a .tre to represent the similiarity between groups or sequences." << "\n";
178                 cout << "The tree.shared command can only be executed after a successful read.otu command or by providing a distance file." << "\n";
179                 cout << "The tree.shared command parameters are groups, calc, phylip, column, name, cutoff, precision, line and label.  You may not use line and label at the same time." << "\n";
180                 cout << "The groups parameter allows you to specify which of the groups in your groupfile you would like included used." << "\n";
181                 cout << "The group names are separated by dashes. The line and label allow you to select what distance levels you would like trees created for, and are also separated by dashes." << "\n";
182                 cout << "The phylip or column parameter are required if you do not run the read.otu command first, and only one may be used.  If you use a column file the name filename is required. " << "\n";
183                 cout << "If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed." << "\n";
184                 cout << "The tree.shared command should be in the following format: tree.shared(groups=yourGroups, calc=yourCalcs, line=yourLines, label=yourLabels)." << "\n";
185                 cout << "Example tree.shared(groups=A-B-C, line=1-3-5, calc=jabund-sorabund)." << "\n";
186                 cout << "The default value for groups is all the groups in your groupfile." << "\n";
187                 cout << "The default value for calc is jclass-thetayc." << "\n";
188                 cout << "The tree.shared command outputs a .tre file for each calculator you specify at each distance you choose." << "\n";
189                 validCalculator->printCalc("treegroup", cout);
190                 cout << "Or the tree.shared command can be in the following format: tree.shared(phylip=yourPhylipFile)." << "\n";
191                 cout << "Example tree.shared(phylip=abrecovery.dist)." << "\n";
192                 cout << "Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups)." << "\n" << "\n";
193         }
194         catch(exception& e) {
195                 cout << "Standard Error: " << e.what() << " has occurred in the TreeGroupCommand class Function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
196                 exit(1);
197         }
198         catch(...) {
199                 cout << "An unknown error has occurred in the TreeGroupCommand class function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
200                 exit(1);
201         }       
202 }
203
204
205 //**********************************************************************************************************************
206
207 TreeGroupCommand::~TreeGroupCommand(){
208         delete input;
209         if (format == "sharedfile") {delete read;}
210         else { delete readMatrix;  delete matrix; delete list; }
211         delete tmap;
212         delete validCalculator;
213         
214 }
215
216 //**********************************************************************************************************************
217
218 int TreeGroupCommand::execute(){
219         try {
220         
221                 if (abort == true) { return 0; }
222                 
223                 if (format == "sharedfile") {
224                         //if the users entered no valid calculators don't execute command
225                         if (treeCalculators.size() == 0) { cout << "You have given no valid calculators." << endl; return 0; }
226
227                         //you have groups
228                         read = new ReadOTUFile(globaldata->inputFileName);      
229                         read->read(&*globaldata); 
230                         
231                         input = globaldata->ginput;
232                         lookup = input->getSharedRAbundVectors();
233                         lastLookup = lookup;
234                         
235                         if (lookup.size() < 2) { cout << "You have not provided enough valid groups.  I cannot run the command." << endl; return 0; }
236                 
237                         //create tree file
238                         makeSimsShared();
239                 }else{
240                         //read in dist file
241                         filename = globaldata->inputFileName;
242                 
243                         if (format == "column") { readMatrix = new ReadColumnMatrix(filename); }        
244                         else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(filename); }
245                                 
246                         readMatrix->setCutoff(cutoff);
247         
248                         if(namefile != ""){     
249                                 nameMap = new NameAssignment(namefile);
250                                 nameMap->readMap(1,2);
251                         }
252                         else{
253                                 nameMap = NULL;
254                         }
255         
256                         readMatrix->read(nameMap);
257                         list = readMatrix->getListVector();
258                         matrix = readMatrix->getMatrix();
259
260                         //make treemap
261                         tmap = new TreeMap();
262                         tmap->makeSim(list);
263                         globaldata->gTreemap = tmap;
264                         
265                         globaldata->Groups = tmap->namesOfGroups;
266                 
267                         //clear globaldatas old tree names if any
268                         globaldata->Treenames.clear();
269                 
270                         //fills globaldatas tree names
271                         globaldata->Treenames = globaldata->Groups;
272
273                         makeSimsDist();
274
275                         //create a new filename
276                         outputFile = getRootName(globaldata->inputFileName) + "tre";    
277                                 
278                         createTree();
279                         cout << "Tree complete. " << endl;
280                 }
281                                 
282                 //reset groups parameter
283                 globaldata->Groups.clear();  
284
285                 return 0;
286         }
287         catch(exception& e) {
288                 cout << "Standard Error: " << e.what() << " has occurred in the TreeGroupCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
289                 exit(1);
290         }
291         catch(...) {
292                 cout << "An unknown error has occurred in the TreeGroupCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
293                 exit(1);
294         }               
295 }
296 //**********************************************************************************************************************
297
298 void TreeGroupCommand::createTree(){
299         try {
300                 //create tree
301                 t = new Tree();
302                 
303                 //do merges and create tree structure by setting parents and children
304                 //there are numGroups - 1 merges to do
305                 for (int i = 0; i < (numGroups - 1); i++) {
306                         float largest = -1000.0;
307
308                         int row, column;
309                         //find largest value in sims matrix by searching lower triangle
310                         for (int j = 1; j < simMatrix.size(); j++) {
311                                 for (int k = 0; k < j; k++) {
312                                         if (simMatrix[j][k] > largest) {  largest = simMatrix[j][k]; row = j; column = k;  }
313                                 }
314                         }
315
316                         //set non-leaf node info and update leaves to know their parents
317                         //non-leaf
318                         t->tree[numGroups + i].setChildren(index[row], index[column]);
319                         
320                         //parents
321                         t->tree[index[row]].setParent(numGroups + i);
322                         t->tree[index[column]].setParent(numGroups + i);
323                         
324                         //blength = distance / 2;
325                         float blength = ((1.0 - largest) / 2);
326                         
327                         //branchlengths
328                         t->tree[index[row]].setBranchLength(blength - t->tree[index[row]].getLengthToLeaves());
329                         t->tree[index[column]].setBranchLength(blength - t->tree[index[column]].getLengthToLeaves());
330                         
331                         //set your length to leaves to your childs length plus branchlength
332                         t->tree[numGroups + i].setLengthToLeaves(t->tree[index[row]].getLengthToLeaves() + t->tree[index[row]].getBranchLength());
333                         
334                         
335                         //update index 
336                         index[row] = numGroups+i;
337                         index[column] = numGroups+i;
338                         
339                         //remove highest value that caused the merge.
340                         simMatrix[row][column] = -1000.0;
341                         simMatrix[column][row] = -1000.0;
342                         
343                         //merge values in simsMatrix
344                         for (int n = 0; n < simMatrix.size(); n++)      {
345                                 //row becomes merge of 2 groups
346                                 simMatrix[row][n] = (simMatrix[row][n] + simMatrix[column][n]) / 2;
347                                 simMatrix[n][row] = simMatrix[row][n];
348                                 //delete column
349                                 simMatrix[column][n] = -1000.0;
350                                 simMatrix[n][column] = -1000.0;
351                         }
352                 }
353                 
354                 //adjust tree to make sure root to tip length is .5
355                 int root = t->findRoot();
356                 t->tree[root].setBranchLength((0.5 - t->tree[root].getLengthToLeaves()));
357                 
358                 //assemble tree
359                 t->assembleTree();
360                 
361                 //print newick file
362                 t->createNewickFile(outputFile);
363                 
364                 //delete tree
365                 delete t;
366         
367         }
368         catch(exception& e) {
369                 cout << "Standard Error: " << e.what() << " has occurred in the TreeGroupCommand class Function createTree. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
370                 exit(1);
371         }
372         catch(...) {
373                 cout << "An unknown error has occurred in the TreeGroupCommand class function createTree. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
374                 exit(1);
375         }
376 }
377 /***********************************************************/
378 void TreeGroupCommand::printSims(ostream& out) {
379         try {
380                 
381                 //output column headers
382                 //out << '\t';
383                 //for (int i = 0; i < lookup.size(); i++) {     out << lookup[i]->getGroup() << '\t';           }
384                 //out << endl;
385                 
386                 
387                 for (int m = 0; m < simMatrix.size(); m++)      {
388                         //out << lookup[m]->getGroup() << '\t';
389                         for (int n = 0; n < simMatrix.size(); n++)      {
390                                 out << simMatrix[m][n] << '\t'; 
391                         }
392                         out << endl;
393                 }
394
395         }
396         catch(exception& e) {
397                 cout << "Standard Error: " << e.what() << " has occurred in the TreeGroupCommand class Function printSims. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
398                 exit(1);
399         }
400         catch(...) {
401                 cout << "An unknown error has occurred in the TreeGroupCommand class function printSims. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
402                 exit(1);
403         }               
404 }
405 /***********************************************************/
406 void TreeGroupCommand::makeSimsDist() {
407         try {
408                 numGroups = list->size();
409                 
410                 //initialize index
411                 index.clear();
412                 for (int g = 0; g < numGroups; g++) {   index[g] = g;   }
413                 
414                 //initialize simMatrix
415                 simMatrix.clear();
416                 simMatrix.resize(numGroups);
417                 for (int m = 0; m < simMatrix.size(); m++)      {
418                         for (int j = 0; j < simMatrix.size(); j++)      {
419                                 simMatrix[m].push_back(0.0);
420                         }
421                 }
422                 
423                 //go through sparse matrix and fill sims
424                 //go through each cell in the sparsematrix
425                 for(MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++){
426                         //similairity = -(distance-1)
427                         simMatrix[currentCell->row][currentCell->column] = -(currentCell->dist -1.0);   
428                         simMatrix[currentCell->column][currentCell->row] = -(currentCell->dist -1.0);                           
429                 }
430
431
432         }
433         catch(exception& e) {
434                 cout << "Standard Error: " << e.what() << " has occurred in the TreeGroupCommand class Function makeSimsDist. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
435                 exit(1);
436         }
437         catch(...) {
438                 cout << "An unknown error has occurred in the TreeGroupCommand class function makeSimsDist. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
439                 exit(1);
440         }               
441 }
442
443 /***********************************************************/
444 void TreeGroupCommand::makeSimsShared() {
445         try {
446                 int count = 1;  
447         
448                 //clear globaldatas old tree names if any
449                 globaldata->Treenames.clear();
450                 
451                 //fills globaldatas tree names
452                 globaldata->Treenames = globaldata->Groups;
453                 
454                 //create treemap class from groupmap for tree class to use
455                 tmap = new TreeMap();
456                 tmap->makeSim(globaldata->gGroupmap);
457                 globaldata->gTreemap = tmap;
458                 
459                 set<string> processedLabels;
460                 set<string> userLabels = labels;
461                 set<int> userLines = lines;
462
463                 //as long as you are not at the end of the file or done wih the lines you want
464                 while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0) || (userLines.size() != 0))) {
465                 
466                         if(allLines == 1 || lines.count(count) == 1 || labels.count(lookup[0]->getLabel()) == 1){                       
467                                 cout << lookup[0]->getLabel() << '\t' << count << endl;
468                                 process(lookup);
469                                 
470                                 processedLabels.insert(lookup[0]->getLabel());
471                                 userLabels.erase(lookup[0]->getLabel());
472                                 userLines.erase(count);
473                         }
474                         
475                         if ((anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLookup[0]->getLabel()) != 1)) {
476                                 cout << lastLookup[0]->getLabel() << '\t' << count << endl;
477                                 process(lastLookup);
478                                         
479                                 processedLabels.insert(lastLookup[0]->getLabel());
480                                 userLabels.erase(lastLookup[0]->getLabel());
481                         }
482
483                         //prevent memory leak
484                         if (count != 1) { for (int i = 0; i < lastLookup.size(); i++) {  delete lastLookup[i];  } }
485                         lastLookup = lookup;                    
486                         
487                         //get next line to process
488                         lookup = input->getSharedRAbundVectors();
489                         count++;
490                 }
491                 
492                 //output error messages about any remaining user labels
493                 set<string>::iterator it;
494                 bool needToRun = false;
495                 for (it = userLabels.begin(); it != userLabels.end(); it++) {  
496                         cout << "Your file does not include the label "<< *it; 
497                         if (processedLabels.count(lastLookup[0]->getLabel()) != 1) {
498                                 cout << ". I will use " << lastLookup[0]->getLabel() << "." << endl;
499                                 needToRun = true;
500                         }else {
501                                 cout << ". Please refer to " << lastLookup[0]->getLabel() << "." << endl;
502                         }
503                 }
504                 
505                 //run last line if you need to
506                 if (needToRun == true)  {
507                         cout << lastLookup[0]->getLabel() << '\t' << count << endl;
508                         process(lastLookup);
509                 }
510                 
511                 for (int i = 0; i < lastLookup.size(); i++) {  delete lastLookup[i];  }
512                 for(int i = 0 ; i < treeCalculators.size(); i++) {  delete treeCalculators[i]; }
513         }
514         catch(exception& e) {
515                 cout << "Standard Error: " << e.what() << " has occurred in the TreeGroupCommand class Function makeSimsShared. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
516                 exit(1);
517         }
518         catch(...) {
519                 cout << "An unknown error has occurred in the TreeGroupCommand class function makeSimsShared. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
520                 exit(1);
521         }               
522 }
523
524 /***********************************************************/
525 void TreeGroupCommand::process(vector<SharedRAbundVector*> thisLookup) {
526         try{
527                                 EstOutput data;
528                                 vector<SharedRAbundVector*> subset;
529                                 numGroups = thisLookup.size();
530                                 
531                                 //for each calculator                                                                                           
532                                 for(int i = 0 ; i < treeCalculators.size(); i++) {
533                                         //initialize simMatrix
534                                         simMatrix.clear();
535                                         simMatrix.resize(numGroups);
536                                         for (int m = 0; m < simMatrix.size(); m++)      {
537                                                 for (int j = 0; j < simMatrix.size(); j++)      {
538                                                         simMatrix[m].push_back(0.0);
539                                                 }
540                                         }
541                 
542                                         //initialize index
543                                         index.clear();
544                                         for (int g = 0; g < numGroups; g++) {   index[g] = g;   }
545                 
546                                         //create a new filename
547                                         outputFile = getRootName(globaldata->inputFileName) + treeCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + ".tre";                         
548                                                                                                 
549                                         for (int k = 0; k < thisLookup.size(); k++) { 
550                                                 for (int l = k; l < thisLookup.size(); l++) {
551                                                         if (k != l) { //we dont need to similiarity of a groups to itself
552                                                                 //get estimated similarity between 2 groups
553                                                                 
554                                                                 subset.clear(); //clear out old pair of sharedrabunds
555                                                                 //add new pair of sharedrabunds
556                                                                 subset.push_back(thisLookup[k]); subset.push_back(thisLookup[l]); 
557                                                                 
558                                                                 data = treeCalculators[i]->getValues(subset); //saves the calculator outputs
559                                                                 //save values in similarity matrix
560                                                                 simMatrix[k][l] = data[0];
561                                                                 simMatrix[l][k] = data[0];
562                                                         }
563                                                 }
564                                         }
565                                         
566                                         //creates tree from similarity matrix and write out file
567                                         createTree();
568                                 }
569
570         }
571         catch(exception& e) {
572                 cout << "Standard Error: " << e.what() << " has occurred in the TreeGroupCommand class Function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
573                 exit(1);
574         }
575         catch(...) {
576                 cout << "An unknown error has occurred in the TreeGroupCommand class function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
577                 exit(1);
578         }               
579 }
580 /***********************************************************/
581
582         
583