2 // classifytreecommand.cpp
5 // Created by Sarah Westcott on 2/20/12.
6 // Copyright (c) 2012 Schloss Lab. All rights reserved.
9 #include "classifytreecommand.h"
10 #include "phylotree.h"
11 #include "treereader.h"
13 //**********************************************************************************************************************
14 vector<string> ClassifyTreeCommand::setParameters(){
16 CommandParameter ptree("tree", "InputTypes", "", "", "", "", "none",false,true); parameters.push_back(ptree);
17 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "", "", "none",false,true); parameters.push_back(ptaxonomy);
18 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname);
19 CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
20 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
21 CommandParameter pcutoff("cutoff", "Number", "", "51", "", "", "",false,true); parameters.push_back(pcutoff);
22 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
23 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
25 vector<string> myArray;
26 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
30 m->errorOut(e, "ClassifyTreeCommand", "setParameters");
34 //**********************************************************************************************************************
35 string ClassifyTreeCommand::getHelpString(){
37 string helpString = "";
38 helpString += "The classify.tree command reads a tree and taxonomy file and output the consensus taxonomy for each node on the tree. \n";
39 helpString += "If you provide a group file, the concensus for each group will also be provided. \n";
40 helpString += "The new tree contains labels at each internal node. The label is the node number so you can relate the tree to the summary file.\n";
41 helpString += "The count parameter allows you add a count file so you can have the summary totals broken up by group.\n";
42 helpString += "The summary file lists the concensus taxonomy for the descendants of each node.\n";
43 helpString += "The classify.tree command parameters are tree, group, name, count and taxonomy. The tree and taxonomy files are required.\n";
44 helpString += "The cutoff parameter allows you to specify a consensus confidence threshold for your taxonomy. The default is 51, meaning 51%. Cutoff cannot be below 51.\n";
45 helpString += "The classify.tree command should be used in the following format: classify.tree(tree=test.tre, group=test.group, taxonomy=test.taxonomy)\n";
46 helpString += "Note: No spaces between parameter labels (i.e. tree), '=' and parameters (i.e.yourTreefile).\n";
50 m->errorOut(e, "ClassifyTreeCommand", "getHelpString");
54 //**********************************************************************************************************************
55 string ClassifyTreeCommand::getOutputFileNameTag(string type, string inputName=""){
57 string outputFileName = "";
58 map<string, vector<string> >::iterator it;
60 //is this a type this command creates
61 it = outputTypes.find(type);
62 if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
64 if (type == "tree") { outputFileName = "taxonomy.tre"; }
65 else if (type == "summary") { outputFileName = "taxonomy.summary"; }
66 else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; }
68 return outputFileName;
71 m->errorOut(e, "ClassifyTreeCommand", "getOutputFileNameTag");
75 //**********************************************************************************************************************
76 ClassifyTreeCommand::ClassifyTreeCommand(){
78 abort = true; calledHelp = true;
80 vector<string> tempOutNames;
81 outputTypes["tree"] = tempOutNames;
82 outputTypes["summary"] = tempOutNames;
85 m->errorOut(e, "ClassifyTreeCommand", "ClassifyTreeCommand");
89 //**********************************************************************************************************************
90 ClassifyTreeCommand::ClassifyTreeCommand(string option) {
92 abort = false; calledHelp = false;
94 //allow user to run help
95 if(option == "help") { help(); abort = true; calledHelp = true; }
96 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
99 vector<string> myArray = setParameters();
101 OptionParser parser(option);
102 map<string, string> parameters = parser.getParameters();
104 ValidParameters validParameter;
105 map<string, string>::iterator it;
107 //check to make sure all parameters are valid for command
108 for (it = parameters.begin(); it != parameters.end(); it++) {
109 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
112 vector<string> tempOutNames;
113 outputTypes["tree"] = tempOutNames;
114 outputTypes["summary"] = tempOutNames;
116 //if the user changes the input directory command factory will send this info to us in the output parameter
117 string inputDir = validParameter.validFile(parameters, "inputdir", false);
118 if (inputDir == "not found"){ inputDir = ""; }
121 it = parameters.find("tree");
122 //user has given a template file
123 if(it != parameters.end()){
124 path = m->hasPath(it->second);
125 //if the user has not given a path then, add inputdir. else leave path alone.
126 if (path == "") { parameters["tree"] = inputDir + it->second; }
129 it = parameters.find("name");
130 //user has given a template file
131 if(it != parameters.end()){
132 path = m->hasPath(it->second);
133 //if the user has not given a path then, add inputdir. else leave path alone.
134 if (path == "") { parameters["name"] = inputDir + it->second; }
137 it = parameters.find("group");
138 //user has given a template file
139 if(it != parameters.end()){
140 path = m->hasPath(it->second);
141 //if the user has not given a path then, add inputdir. else leave path alone.
142 if (path == "") { parameters["group"] = inputDir + it->second; }
145 it = parameters.find("taxonomy");
146 //user has given a template file
147 if(it != parameters.end()){
148 path = m->hasPath(it->second);
149 //if the user has not given a path then, add inputdir. else leave path alone.
150 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
153 it = parameters.find("count");
154 //user has given a template file
155 if(it != parameters.end()){
156 path = m->hasPath(it->second);
157 //if the user has not given a path then, add inputdir. else leave path alone.
158 if (path == "") { parameters["count"] = inputDir + it->second; }
162 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
164 //check for required parameters
165 treefile = validParameter.validFile(parameters, "tree", true);
166 if (treefile == "not open") { treefile = ""; abort = true; }
167 else if (treefile == "not found") { treefile = "";
168 treefile = m->getTreeFile();
169 if (treefile != "") { m->mothurOut("Using " + treefile + " as input file for the tree parameter."); m->mothurOutEndLine(); }
170 else { m->mothurOut("No valid current files. You must provide a tree file."); m->mothurOutEndLine(); abort = true; }
171 }else { m->setTreeFile(treefile); }
173 taxonomyfile = validParameter.validFile(parameters, "taxonomy", true);
174 if (taxonomyfile == "not open") { taxonomyfile = ""; abort = true; }
175 else if (taxonomyfile == "not found") { taxonomyfile = "";
176 taxonomyfile = m->getTaxonomyFile();
177 if (taxonomyfile != "") { m->mothurOut("Using " + taxonomyfile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
178 else { m->mothurOut("No valid current files. You must provide a taxonomy file."); m->mothurOutEndLine(); abort = true; }
179 }else { m->setTaxonomyFile(taxonomyfile); }
181 namefile = validParameter.validFile(parameters, "name", true);
182 if (namefile == "not open") { namefile = ""; abort = true; }
183 else if (namefile == "not found") { namefile = ""; }
184 else { m->setNameFile(namefile); }
186 groupfile = validParameter.validFile(parameters, "group", true);
187 if (groupfile == "not open") { groupfile = ""; abort = true; }
188 else if (groupfile == "not found") { groupfile = ""; }
189 else { m->setGroupFile(groupfile); }
191 countfile = validParameter.validFile(parameters, "count", true);
192 if (countfile == "not open") { countfile = ""; abort = true; }
193 else if (countfile == "not found") { countfile = ""; }
194 else { m->setCountTableFile(countfile); }
196 if ((namefile != "") && (countfile != "")) {
197 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
200 if ((groupfile != "") && (countfile != "")) {
201 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
204 string temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "51"; }
205 m->mothurConvert(temp, cutoff);
207 if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true; }
209 if (countfile == "") {
210 if (namefile == "") {
211 vector<string> files; files.push_back(treefile);
212 parser.getNameFile(files);
217 catch(exception& e) {
218 m->errorOut(e, "ClassifyTreeCommand", "ClassifyTreeCommand");
222 //**********************************************************************************************************************
224 int ClassifyTreeCommand::execute(){
227 if (abort == true) { if (calledHelp) { return 0; } return 2; }
229 cout.setf(ios::fixed, ios::floatfield); cout.setf(ios::showpoint);
231 int start = time(NULL);
233 /***************************************************/
234 // reading tree info //
235 /***************************************************/
236 m->setTreeFile(treefile);
238 TreeReader* reader = new TreeReader(treefile, groupfile, namefile);
239 vector<Tree*> T = reader->getTrees();
240 CountTable* tmap = T[0]->getCountTable();
241 Tree* outputTree = T[0];
244 if (namefile != "") { m->readNames(namefile, nameMap, nameCount); }
246 if (m->control_pressed) { delete tmap; delete outputTree; return 0; }
248 m->readTax(taxonomyfile, taxMap);
250 /***************************************************/
251 // get concensus taxonomies //
252 /***************************************************/
253 getClassifications(outputTree);
254 delete outputTree; delete tmap;
256 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
258 //set tree file as new current treefile
259 if (treefile != "") {
261 itTypes = outputTypes.find("tree");
262 if (itTypes != outputTypes.end()) {
263 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTreeFile(current); }
267 m->mothurOutEndLine(); m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to find the concensus taxonomies."); m->mothurOutEndLine();
268 m->mothurOutEndLine();
269 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
270 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
271 m->mothurOutEndLine();
275 catch(exception& e) {
276 m->errorOut(e, "ClassifyTreeCommand", "execute");
280 //**********************************************************************************************************************
281 //traverse tree finding concensus taxonomy at each node
282 //label node with a number to relate to output summary file
283 //report all concensus taxonomies to file
284 int ClassifyTreeCommand::getClassifications(Tree*& T){
287 string thisOutputDir = outputDir;
288 if (outputDir == "") { thisOutputDir += m->hasPath(treefile); }
289 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(treefile)) + getOutputFileNameTag("summary");
290 outputNames.push_back(outputFileName); outputTypes["summary"].push_back(outputFileName);
293 m->openOutputFile(outputFileName, out);
294 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
298 if (groupfile != "") { out << "Group\t"; }
299 out << "NumRep\tTaxonomy" << endl;
301 string treeOutputDir = outputDir;
302 if (outputDir == "") { treeOutputDir += m->hasPath(treefile); }
303 string outputTreeFileName = treeOutputDir + m->getRootName(m->getSimpleName(treefile)) + getOutputFileNameTag("tree");
305 //create a map from tree node index to names of descendants, save time later
306 map<int, map<string, set<string> > > nodeToDescendants; //node# -> (groupName -> groupMembers)
307 for (int i = 0; i < T->getNumNodes(); i++) {
308 if (m->control_pressed) { return 0; }
310 nodeToDescendants[i] = getDescendantList(T, i, nodeToDescendants);
314 for (int i = T->getNumLeaves(); i < T->getNumNodes(); i++) {
316 if (m->control_pressed) { out.close(); return 0; }
318 string tax = "not classifed";
320 if (groupfile != "") {
321 for (map<string, set<string> >::iterator itGroups = nodeToDescendants[i].begin(); itGroups != nodeToDescendants[i].end(); itGroups++) {
322 if (itGroups->first != "AllGroups") {
323 tax = getTaxonomy(itGroups->second, size);
324 out << (i+1) << '\t' << itGroups->first << '\t' << size << '\t' << tax << endl;
328 string group = "AllGroups";
329 tax = getTaxonomy(nodeToDescendants[i][group], size);
330 out << (i+1) << '\t' << size << '\t' << tax << endl;
333 T->tree[i].setLabel((i+1));
338 m->openOutputFile(outputTreeFileName, outTree);
339 outputNames.push_back(outputTreeFileName); outputTypes["tree"].push_back(outputTreeFileName);
340 T->print(outTree, "both");
345 catch(exception& e) {
346 m->errorOut(e, "ClassifyTreeCommand", "GetConcensusTaxonomies");
350 //**********************************************************************************************************************
351 string ClassifyTreeCommand::getTaxonomy(set<string> names, int& size) {
356 //create a tree containing sequences from this bin
357 PhyloTree* phylo = new PhyloTree();
359 for (set<string>::iterator it = names.begin(); it != names.end(); it++) {
362 //if namesfile include the names
363 if (namefile != "") {
365 //is this sequence in the name file - namemap maps seqName -> repSeqName
366 map<string, string>::iterator it2 = nameMap.find(*it);
368 if (it2 == nameMap.end()) { //this name is not in name file, skip it
369 m->mothurOut((*it) + " is not in your name file. I will not include it in the consensus."); m->mothurOutEndLine();
372 //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
373 map<string, string>::iterator itTax = taxMap.find((it2->second));
375 if (itTax == taxMap.end()) { //this name is not in taxonomy file, skip it
377 if ((*it) != (it2->second)) { m->mothurOut((*it) + " is represented by " + it2->second + " and is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine(); }
378 else { m->mothurOut((*it) + " is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine(); }
381 int num = nameCount[(*it)]; // we know its there since we found it in nameMap
382 for (int i = 0; i < num; i++) { phylo->addSeqToTree((*it)+toString(i), it2->second); }
388 //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
389 map<string, string>::iterator itTax = taxMap.find((*it));
391 if (itTax == taxMap.end()) { //this name is not in taxonomy file, skip it
392 m->mothurOut((*it) + " is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine();
394 if (countfile != "") {
395 int numDups = ct->getNumSeqs((*it));
396 for (int j = 0; j < numDups; j++) { phylo->addSeqToTree((*it), itTax->second); }
400 phylo->addSeqToTree((*it), itTax->second);
405 if (m->control_pressed) { delete phylo; return conTax; }
410 phylo->assignHeirarchyIDs(0);
412 TaxNode currentNode = phylo->get(0);
415 while (currentNode.children.size() != 0) { //you still have more to explore
418 int bestChildSize = 0;
420 //go through children
421 for (map<string, int>::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) {
423 TaxNode temp = phylo->get(itChild->second);
425 //select child with largest accesions - most seqs assigned to it
426 if (temp.accessions.size() > bestChildSize) {
427 bestChild = phylo->get(itChild->second);
428 bestChildSize = temp.accessions.size();
433 //is this taxonomy above cutoff
434 int consensusConfidence = ceil((bestChildSize / (float) size) * 100);
436 if (consensusConfidence >= cutoff) { //if yes, add it
437 conTax += bestChild.name + "(" + toString(consensusConfidence) + ");";
444 currentNode = bestChild;
447 if (myLevel != phylo->getMaxLevel()) {
448 while (myLevel != phylo->getMaxLevel()) {
449 conTax += "unclassified;";
453 if (conTax == "") { conTax = "no_consensus;"; }
460 catch(exception& e) {
461 m->errorOut(e, "ClassifyTreeCommand", "getTaxonomy");
466 //**********************************************************************************************************************
467 map<string, set<string> > ClassifyTreeCommand::getDescendantList(Tree*& T, int i, map<int, map<string, set<string> > > descendants){
469 map<string ,set<string> > names;
471 map<string ,set<string> >::iterator it;
472 map<string ,set<string> >::iterator it2;
474 int lc = T->tree[i].getLChild();
475 int rc = T->tree[i].getRChild();
476 // TreeMap* tmap = T->getTreeMap();
478 if (lc == -1) { //you are a leaf your only descendant is yourself
479 vector<string> groups = T->tree[i].getGroup();
480 set<string> mynames; mynames.insert(T->tree[i].getName());
481 for (int j = 0; j < groups.size(); j++) { names[groups[j]] = mynames; } //mygroup -> me
482 names["AllGroups"] = mynames;
483 }else{ //your descedants are the combination of your childrens descendants
484 names = descendants[lc];
485 for (it = descendants[rc].begin(); it != descendants[rc].end(); it++) {
486 it2 = names.find(it->first); //do we already have this group
487 if (it2 == names.end()) { //nope, so add it
488 names[it->first] = it->second;
490 for (set<string>::iterator it3 = (it->second).begin(); it3 != (it->second).end(); it3++) {
491 names[it->first].insert(*it3);
500 catch(exception& e) {
501 m->errorOut(e, "ClassifyTreeCommand", "getDescendantList");
505 /*****************************************************************/