]> git.donarmstrong.com Git - mothur.git/blob - classifytreecommand.cpp
added tree reader class to handle reading trees. Reworked the tree map to tree class...
[mothur.git] / classifytreecommand.cpp
1 //
2 //  classifytreecommand.cpp
3 //  Mothur
4 //
5 //  Created by Sarah Westcott on 2/20/12.
6 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
7 //
8
9 #include "classifytreecommand.h"
10 #include "phylotree.h"
11 #include "treereader.h"
12
13 //**********************************************************************************************************************
14 vector<string> ClassifyTreeCommand::setParameters(){    
15         try {
16                 CommandParameter ptree("tree", "InputTypes", "", "", "", "", "none",false,true); parameters.push_back(ptree);
17         CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "", "", "none",false,true); parameters.push_back(ptaxonomy);
18         CommandParameter pname("name", "InputTypes", "", "", "", "", "none",false,false); parameters.push_back(pname);
19         CommandParameter pgroup("group", "InputTypes", "", "", "", "", "none",false,false); parameters.push_back(pgroup);
20         CommandParameter pcutoff("cutoff", "Number", "", "51", "", "", "",false,true); parameters.push_back(pcutoff);
21                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
22                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
23                 
24                 vector<string> myArray;
25                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
26                 return myArray;
27         }
28         catch(exception& e) {
29                 m->errorOut(e, "ClassifyTreeCommand", "setParameters");
30                 exit(1);
31         }
32 }
33 //**********************************************************************************************************************
34 string ClassifyTreeCommand::getHelpString(){    
35         try {
36                 string helpString = "";
37                 helpString += "The classify.tree command reads a tree and taxonomy file and output the consensus taxonomy for each node on the tree. \n";
38                 helpString += "If you provide a group file, the concensus for each group will also be provided. \n";
39                 helpString += "The new tree contains labels at each internal node.  The label is the node number so you can relate the tree to the summary file.\n";
40                 helpString += "The summary file lists the concensus taxonomy for the descendants of each node.\n";
41                 helpString += "The classify.tree command parameters are tree, group, name and taxonomy. The tree and taxonomy files are required.\n";
42         helpString += "The cutoff parameter allows you to specify a consensus confidence threshold for your taxonomy.  The default is 51, meaning 51%. Cutoff cannot be below 51.\n";
43         helpString += "The classify.tree command should be used in the following format: classify.tree(tree=test.tre, group=test.group, taxonomy=test.taxonomy)\n";
44                 helpString += "Note: No spaces between parameter labels (i.e. tree), '=' and parameters (i.e.yourTreefile).\n"; 
45                 return helpString;
46         }
47         catch(exception& e) {
48                 m->errorOut(e, "ClassifyTreeCommand", "getHelpString");
49                 exit(1);
50         }
51 }
52
53 //**********************************************************************************************************************
54 ClassifyTreeCommand::ClassifyTreeCommand(){     
55         try {
56                 abort = true; calledHelp = true; 
57                 setParameters();
58                 vector<string> tempOutNames;
59                 outputTypes["tree"] = tempOutNames;
60                 outputTypes["summary"] = tempOutNames;
61         }
62         catch(exception& e) {
63                 m->errorOut(e, "ClassifyTreeCommand", "ClassifyTreeCommand");
64                 exit(1);
65         }
66 }
67 //**********************************************************************************************************************
68 ClassifyTreeCommand::ClassifyTreeCommand(string option)  {
69         try {
70                 abort = false; calledHelp = false;   
71                 
72                 //allow user to run help
73                 if(option == "help") { help(); abort = true; calledHelp = true; }
74                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
75                 
76                 else {
77                         vector<string> myArray = setParameters();
78                         
79                         OptionParser parser(option);
80                         map<string, string> parameters = parser.getParameters();
81                         
82                         ValidParameters validParameter;
83                         map<string, string>::iterator it;
84                         
85                         //check to make sure all parameters are valid for command
86                         for (it = parameters.begin(); it != parameters.end(); it++) { 
87                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
88                         }
89                         
90                         vector<string> tempOutNames;
91                         outputTypes["tree"] = tempOutNames;
92                         outputTypes["summary"] = tempOutNames;
93                         
94                         //if the user changes the input directory command factory will send this info to us in the output parameter 
95                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
96                         if (inputDir == "not found"){   inputDir = "";          }
97                         else {
98                                 string path;
99                                 it = parameters.find("tree");
100                                 //user has given a template file
101                                 if(it != parameters.end()){ 
102                                         path = m->hasPath(it->second);
103                                         //if the user has not given a path then, add inputdir. else leave path alone.
104                                         if (path == "") {       parameters["tree"] = inputDir + it->second;             }
105                                 }
106                                 
107                                 it = parameters.find("name");
108                                 //user has given a template file
109                                 if(it != parameters.end()){ 
110                                         path = m->hasPath(it->second);
111                                         //if the user has not given a path then, add inputdir. else leave path alone.
112                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
113                                 }
114                                 
115                                 it = parameters.find("group");
116                                 //user has given a template file
117                                 if(it != parameters.end()){ 
118                                         path = m->hasPath(it->second);
119                                         //if the user has not given a path then, add inputdir. else leave path alone.
120                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
121                                 }
122                                 
123                                 it = parameters.find("taxonomy");
124                                 //user has given a template file
125                                 if(it != parameters.end()){ 
126                                         path = m->hasPath(it->second);
127                                         //if the user has not given a path then, add inputdir. else leave path alone.
128                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
129                                 }
130                         }
131                         
132                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
133             
134                         //check for required parameters
135                         treefile = validParameter.validFile(parameters, "tree", true);
136                         if (treefile == "not open") { treefile = ""; abort = true; }
137                         else if (treefile == "not found") { treefile = ""; 
138                 treefile = m->getTreeFile(); 
139                 if (treefile != "") {  m->mothurOut("Using " + treefile + " as input file for the tree parameter."); m->mothurOutEndLine(); }
140                 else { m->mothurOut("No valid current files. You must provide a tree file."); m->mothurOutEndLine(); abort = true; }
141             }else { m->setTreeFile(treefile); } 
142             
143             taxonomyfile = validParameter.validFile(parameters, "taxonomy", true);
144                         if (taxonomyfile == "not open") { taxonomyfile = ""; abort = true; }
145                         else if (taxonomyfile == "not found") { taxonomyfile = ""; 
146                 taxonomyfile = m->getTaxonomyFile(); 
147                 if (taxonomyfile != "") {  m->mothurOut("Using " + taxonomyfile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
148                 else { m->mothurOut("No valid current files. You must provide a taxonomy file."); m->mothurOutEndLine(); abort = true; }
149             }else { m->setTaxonomyFile(taxonomyfile); } 
150                         
151                         namefile = validParameter.validFile(parameters, "name", true);
152                         if (namefile == "not open") { namefile = ""; abort = true; }
153                         else if (namefile == "not found") { namefile = ""; }
154                         else { m->setNameFile(namefile); }
155                         
156                         groupfile = validParameter.validFile(parameters, "group", true);
157                         if (groupfile == "not open") { groupfile = ""; abort = true; }
158                         else if (groupfile == "not found") { groupfile = ""; }
159                         else { m->setGroupFile(groupfile); }
160             
161             string temp = validParameter.validFile(parameters, "cutoff", false);                        if (temp == "not found") { temp = "51"; }
162                         m->mothurConvert(temp, cutoff); 
163                         
164                         if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true;  }
165             
166             if (namefile == "") {
167                                 vector<string> files; files.push_back(treefile);
168                                 parser.getNameFile(files);
169                         }
170                         
171                 }
172         }
173         catch(exception& e) {
174                 m->errorOut(e, "ClassifyTreeCommand", "ClassifyTreeCommand");           
175                 exit(1);
176         }
177 }
178 //**********************************************************************************************************************
179
180 int ClassifyTreeCommand::execute(){
181         try {
182                 
183                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
184                 
185                 cout.setf(ios::fixed, ios::floatfield); cout.setf(ios::showpoint);
186                 
187                 int start = time(NULL);
188         
189                 /***************************************************/
190                 //    reading tree info                                                    //
191                 /***************************************************/
192         m->setTreeFile(treefile);
193         
194         TreeReader* reader = new TreeReader(treefile, groupfile, namefile);
195         vector<Tree*> T = reader->getTrees();
196         TreeMap* tmap = T[0]->getTreeMap();
197         Tree* outputTree = T[0];
198         delete reader;
199
200         if (namefile != "") { readNamesFile(); }
201                         
202         if (m->control_pressed) { delete tmap;  delete outputTree;  return 0; }
203                 
204         readTaxonomyFile();
205         
206         /***************************************************/
207         //              get concensus taxonomies                    //
208         /***************************************************/
209         getClassifications(outputTree);
210         delete outputTree; delete tmap;
211                         
212                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);        } return 0; }
213                 
214                 //set tree file as new current treefile
215                 if (treefile != "") {
216                         string current = "";
217                         itTypes = outputTypes.find("tree");
218                         if (itTypes != outputTypes.end()) {
219                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTreeFile(current); }
220                         }
221                 }
222                 
223                 m->mothurOutEndLine(); m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to find the concensus taxonomies."); m->mothurOutEndLine();
224                 m->mothurOutEndLine();
225                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
226                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
227                 m->mothurOutEndLine();
228         
229                 return 0;
230         }
231         catch(exception& e) {
232                 m->errorOut(e, "ClassifyTreeCommand", "execute");       
233                 exit(1);
234         }
235 }
236 //**********************************************************************************************************************
237 //traverse tree finding concensus taxonomy at each node
238 //label node with a number to relate to output summary file
239 //report all concensus taxonomies to file 
240 int ClassifyTreeCommand::getClassifications(Tree*& T){
241         try {
242                 
243                 string thisOutputDir = outputDir;
244                 if (outputDir == "") {  thisOutputDir += m->hasPath(treefile);  }
245                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(treefile)) + "taxonomy.summary";
246                 outputNames.push_back(outputFileName); outputTypes["summary"].push_back(outputFileName);
247                 
248                 ofstream out;
249                 m->openOutputFile(outputFileName, out);
250                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
251                 
252                 //print headings
253                 out << "TreeNode\t";
254                 if (groupfile != "") { out << "Group\t"; } 
255         out << "NumRep\tTaxonomy" << endl; 
256                 
257                 string treeOutputDir = outputDir;
258                 if (outputDir == "") {  treeOutputDir += m->hasPath(treefile);  }
259                 string outputTreeFileName = treeOutputDir + m->getRootName(m->getSimpleName(treefile)) + "taxonomy.tre";
260                 
261                 //create a map from tree node index to names of descendants, save time later
262                 map<int, map<string, set<string> > > nodeToDescendants; //node# -> (groupName -> groupMembers)
263                 for (int i = 0; i < T->getNumNodes(); i++) {
264                         if (m->control_pressed) { return 0; }
265                         
266                         nodeToDescendants[i] = getDescendantList(T, i, nodeToDescendants);
267                 }
268                 
269                 //for each node
270                 for (int i = T->getNumLeaves(); i < T->getNumNodes(); i++) {
271                         
272                         if (m->control_pressed) { out.close(); return 0; }
273             
274                         string tax = "not classifed";
275             int size;
276             if (groupfile != "") {
277                 for (map<string, set<string> >::iterator itGroups = nodeToDescendants[i].begin(); itGroups != nodeToDescendants[i].end(); itGroups++) {
278                     if (itGroups->first != "AllGroups") {
279                         tax = getTaxonomy(itGroups->second, size);
280                         out << (i+1) << '\t' << itGroups->first << '\t' << size << '\t' << tax << endl;
281                     }
282                 }
283             }else {
284                 string group = "AllGroups";
285                 tax = getTaxonomy(nodeToDescendants[i][group], size);
286                 out << (i+1) << '\t' << size << '\t' << tax << endl;
287             }
288                                 
289                         T->tree[i].setLabel((i+1));
290                 }
291                 out.close();
292         
293                 ofstream outTree;
294                 m->openOutputFile(outputTreeFileName, outTree);
295                 outputNames.push_back(outputTreeFileName); outputTypes["tree"].push_back(outputTreeFileName);
296                 T->print(outTree, "both");
297                 outTree.close();
298         
299                 return 0;
300         }
301         catch(exception& e) {
302                 m->errorOut(e, "ClassifyTreeCommand", "GetConcensusTaxonomies");        
303                 exit(1);
304         }
305 }
306 //**********************************************************************************************************************
307 string ClassifyTreeCommand::getTaxonomy(set<string> names, int& size) {
308         try{
309                 string conTax = "";
310         size = 0;
311                         
312                 //create a tree containing sequences from this bin
313                 PhyloTree* phylo = new PhyloTree();
314                 
315                 for (set<string>::iterator it = names.begin(); it != names.end(); it++) {
316             
317             
318                         //if namesfile include the names
319                         if (namefile != "") {
320                 
321                                 //is this sequence in the name file - namemap maps seqName -> repSeqName
322                                 map<string, string>::iterator it2 = nameMap.find(*it);
323                                 
324                                 if (it2 == nameMap.end()) { //this name is not in name file, skip it
325                                         m->mothurOut((*it) + " is not in your name file.  I will not include it in the consensus."); m->mothurOutEndLine();
326                                 }else{
327                                         
328                                         //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
329                                         map<string, string>::iterator itTax = taxMap.find((it2->second));
330                     
331                                         if (itTax == taxMap.end()) { //this name is not in taxonomy file, skip it
332                         
333                                                 if ((*it) != (it2->second)) { m->mothurOut((*it) + " is represented by " +  it2->second + " and is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine(); }
334                                                 else {  m->mothurOut((*it) + " is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine(); }
335                                         }else{
336                                                 //add seq to tree
337                         int num = nameCount[(*it)]; // we know its there since we found it in nameMap
338                                                 for (int i = 0; i < num; i++) {  phylo->addSeqToTree((*it)+toString(i), it2->second);  }
339                         size += num;
340                                         }
341                                 }
342                                 
343                         }else{
344                                 //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
345                                 map<string, string>::iterator itTax = taxMap.find((*it));
346                 
347                                 if (itTax == taxMap.end()) { //this name is not in taxonomy file, skip it
348                                         m->mothurOut((*it) + " is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine();
349                                 }else{
350                                         //add seq to tree
351                                         phylo->addSeqToTree((*it), itTax->second);
352                     size++;
353                                 }
354                         }
355             
356                         if (m->control_pressed) { delete phylo; return conTax; }
357                         
358                 }
359                 
360                 //build tree
361                 phylo->assignHeirarchyIDs(0);
362                 
363                 TaxNode currentNode = phylo->get(0);
364                 int myLevel = 0;        
365                 //at each level
366                 while (currentNode.children.size() != 0) { //you still have more to explore
367             
368                         TaxNode bestChild;
369                         int bestChildSize = 0;
370                         
371                         //go through children
372                         for (map<string, int>::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) {
373                                 
374                                 TaxNode temp = phylo->get(itChild->second);
375                                 
376                                 //select child with largest accesions - most seqs assigned to it
377                                 if (temp.accessions.size() > bestChildSize) {
378                                         bestChild = phylo->get(itChild->second);
379                                         bestChildSize = temp.accessions.size();
380                                 }
381                                 
382                         }
383             
384                         //is this taxonomy above cutoff
385                         int consensusConfidence = ceil((bestChildSize / (float) size) * 100);
386                         
387                         if (consensusConfidence >= cutoff) { //if yes, add it
388                 conTax += bestChild.name + "(" + toString(consensusConfidence) + ");";
389                                 myLevel++;
390                         }else{ //if no, quit
391                                 break;
392                         }
393                         
394                         //move down a level
395                         currentNode = bestChild;
396                 }
397                 
398                 if (myLevel != phylo->getMaxLevel()) {
399                         while (myLevel != phylo->getMaxLevel()) {
400                                 conTax += "unclassified;";
401                                 myLevel++;
402                         }
403                 }               
404                 if (conTax == "") {  conTax = "no_consensus;";  }
405                 
406                 delete phylo;   
407         
408         return conTax;
409         
410         }
411         catch(exception& e) {
412                 m->errorOut(e, "ClassifyTreeCommand", "getTaxonomy");
413                 exit(1);
414         }
415 }
416
417 //**********************************************************************************************************************
418 map<string, set<string> > ClassifyTreeCommand::getDescendantList(Tree*& T, int i, map<int, map<string, set<string> > > descendants){
419         try {
420                 map<string ,set<string> > names;
421                 
422                 map<string ,set<string> >::iterator it;
423         map<string ,set<string> >::iterator it2;
424                 
425                 int lc = T->tree[i].getLChild();
426                 int rc = T->tree[i].getRChild();
427         TreeMap* tmap = T->getTreeMap();
428                 
429                 if (lc == -1) { //you are a leaf your only descendant is yourself
430             string group = tmap->getGroup(T->tree[i].getName());
431             set<string> mynames; mynames.insert(T->tree[i].getName());
432             names[group] = mynames; //mygroup -> me
433             names["AllGroups"] = mynames;
434                 }else{ //your descedants are the combination of your childrens descendants
435                         names = descendants[lc];
436                         for (it = descendants[rc].begin(); it != descendants[rc].end(); it++) {
437                 it2 = names.find(it->first); //do we already have this group
438                 if (it2 == names.end()) { //nope, so add it
439                     names[it->first] = it->second;
440                 }else {
441                     for (set<string>::iterator it3 = (it->second).begin(); it3 != (it->second).end(); it3++) {
442                         names[it->first].insert(*it3);
443                     }
444                 }
445                                 
446                         }
447                 }
448                 
449                 return names;
450         }
451         catch(exception& e) {
452                 m->errorOut(e, "ClassifyTreeCommand", "getDescendantList");     
453                 exit(1);
454         }
455 }
456 //**********************************************************************************************************************
457 int ClassifyTreeCommand::readTaxonomyFile() {
458         try {
459                 
460                 ifstream in;
461                 m->openInputFile(taxonomyfile, in);
462                 
463                 string name, tax;
464         
465                 while(!in.eof()){
466                         in >> name >> tax;              
467                         m->gobble(in);
468                         
469                         //are there confidence scores, if so remove them
470                         if (tax.find_first_of('(') != -1) {  m->removeConfidences(tax); }
471                         
472                         taxMap[name] = tax;
473                         
474                         if (m->control_pressed) { in.close(); taxMap.clear(); return 0; }
475                 }
476                 in.close();
477                 
478                 return 0;
479         }
480         catch(exception& e) {
481                 m->errorOut(e, "ClassifyTreeCommand", "readTaxonomyFile");
482                 exit(1);
483         }
484 }
485
486 /*****************************************************************/
487 int ClassifyTreeCommand::readNamesFile() {
488         try {
489                 ifstream inNames;
490                 m->openInputFile(namefile, inNames);
491                 
492                 string name, names;
493         
494                 while(!inNames.eof()){
495                         inNames >> name;                        //read from first column  A
496                         inNames >> names;               //read from second column  A,B,C,D
497                         m->gobble(inNames);
498                         
499                         //parse names into vector
500                         vector<string> theseNames;
501                         m->splitAtComma(names, theseNames);
502             
503                         for (int i = 0; i < theseNames.size(); i++) {  nameMap[theseNames[i]] = name;  }
504             nameCount[name] = theseNames.size();
505                         
506                         if (m->control_pressed) { inNames.close(); nameMap.clear(); return 0; }
507                 }
508                 inNames.close();
509                 
510                 return 0;
511         }
512         catch(exception& e) {
513                 m->errorOut(e, "ClassifyTreeCommand", "readNamesFile");
514                 exit(1);
515         }
516 }
517
518 /*****************************************************************/
519
520