2 // classifytreecommand.cpp
5 // Created by Sarah Westcott on 2/20/12.
6 // Copyright (c) 2012 Schloss Lab. All rights reserved.
9 #include "classifytreecommand.h"
10 #include "phylotree.h"
12 //**********************************************************************************************************************
13 vector<string> ClassifyTreeCommand::setParameters(){
15 CommandParameter ptree("tree", "InputTypes", "", "", "", "", "none",false,true); parameters.push_back(ptree);
16 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "", "", "none",false,true); parameters.push_back(ptaxonomy);
17 CommandParameter pname("name", "InputTypes", "", "", "", "", "none",false,false); parameters.push_back(pname);
18 CommandParameter pgroup("group", "InputTypes", "", "", "", "", "none",false,false); parameters.push_back(pgroup);
19 CommandParameter pcutoff("cutoff", "Number", "", "51", "", "", "",false,true); parameters.push_back(pcutoff);
20 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
21 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
23 vector<string> myArray;
24 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
28 m->errorOut(e, "ClassifyTreeCommand", "setParameters");
32 //**********************************************************************************************************************
33 string ClassifyTreeCommand::getHelpString(){
35 string helpString = "";
36 helpString += "The classify.tree command reads a tree and taxonomy file and output the consensus taxonomy for each node on the tree. \n";
37 helpString += "If you provide a group file, the concensus for each group will also be provided. \n";
38 helpString += "The new tree contains labels at each internal node. The label is the node number so you can relate the tree to the summary file.\n";
39 helpString += "The summary file lists the concensus taxonomy for the descendants of each node.\n";
40 helpString += "The classify.tree command parameters are tree, group, name and taxonomy. The tree and taxonomy files are required.\n";
41 helpString += "The cutoff parameter allows you to specify a consensus confidence threshold for your taxonomy. The default is 51, meaning 51%. Cutoff cannot be below 51.\n";
42 helpString += "The classify.tree command should be used in the following format: classify.tree(tree=test.tre, group=test.group, taxonomy=test.taxonomy)\n";
43 helpString += "Note: No spaces between parameter labels (i.e. tree), '=' and parameters (i.e.yourTreefile).\n";
47 m->errorOut(e, "ClassifyTreeCommand", "getHelpString");
52 //**********************************************************************************************************************
53 ClassifyTreeCommand::ClassifyTreeCommand(){
55 abort = true; calledHelp = true;
57 vector<string> tempOutNames;
58 outputTypes["tree"] = tempOutNames;
59 outputTypes["summary"] = tempOutNames;
62 m->errorOut(e, "ClassifyTreeCommand", "ClassifyTreeCommand");
66 //**********************************************************************************************************************
67 ClassifyTreeCommand::ClassifyTreeCommand(string option) {
69 abort = false; calledHelp = false;
71 //allow user to run help
72 if(option == "help") { help(); abort = true; calledHelp = true; }
73 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
76 vector<string> myArray = setParameters();
78 OptionParser parser(option);
79 map<string, string> parameters = parser.getParameters();
81 ValidParameters validParameter;
82 map<string, string>::iterator it;
84 //check to make sure all parameters are valid for command
85 for (it = parameters.begin(); it != parameters.end(); it++) {
86 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
95 vector<string> tempOutNames;
96 outputTypes["tree"] = tempOutNames;
97 outputTypes["summary"] = tempOutNames;
99 //if the user changes the input directory command factory will send this info to us in the output parameter
100 string inputDir = validParameter.validFile(parameters, "inputdir", false);
101 if (inputDir == "not found"){ inputDir = ""; }
104 it = parameters.find("tree");
105 //user has given a template file
106 if(it != parameters.end()){
107 path = m->hasPath(it->second);
108 //if the user has not given a path then, add inputdir. else leave path alone.
109 if (path == "") { parameters["tree"] = inputDir + it->second; }
112 it = parameters.find("name");
113 //user has given a template file
114 if(it != parameters.end()){
115 path = m->hasPath(it->second);
116 //if the user has not given a path then, add inputdir. else leave path alone.
117 if (path == "") { parameters["name"] = inputDir + it->second; }
120 it = parameters.find("group");
121 //user has given a template file
122 if(it != parameters.end()){
123 path = m->hasPath(it->second);
124 //if the user has not given a path then, add inputdir. else leave path alone.
125 if (path == "") { parameters["group"] = inputDir + it->second; }
128 it = parameters.find("taxonomy");
129 //user has given a template file
130 if(it != parameters.end()){
131 path = m->hasPath(it->second);
132 //if the user has not given a path then, add inputdir. else leave path alone.
133 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
137 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
139 //check for required parameters
140 treefile = validParameter.validFile(parameters, "tree", true);
141 if (treefile == "not open") { treefile = ""; abort = true; }
142 else if (treefile == "not found") { treefile = "";
143 treefile = m->getTreeFile();
144 if (treefile != "") { m->mothurOut("Using " + treefile + " as input file for the tree parameter."); m->mothurOutEndLine(); }
145 else { m->mothurOut("No valid current files. You must provide a tree file."); m->mothurOutEndLine(); abort = true; }
146 }else { m->setTreeFile(treefile); }
148 taxonomyfile = validParameter.validFile(parameters, "taxonomy", true);
149 if (taxonomyfile == "not open") { taxonomyfile = ""; abort = true; }
150 else if (taxonomyfile == "not found") { taxonomyfile = "";
151 taxonomyfile = m->getTaxonomyFile();
152 if (taxonomyfile != "") { m->mothurOut("Using " + taxonomyfile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
153 else { m->mothurOut("No valid current files. You must provide a taxonomy file."); m->mothurOutEndLine(); abort = true; }
154 }else { m->setTaxonomyFile(taxonomyfile); }
156 namefile = validParameter.validFile(parameters, "name", true);
157 if (namefile == "not open") { namefile = ""; abort = true; }
158 else if (namefile == "not found") { namefile = ""; }
159 else { m->setNameFile(namefile); }
161 groupfile = validParameter.validFile(parameters, "group", true);
162 if (groupfile == "not open") { groupfile = ""; abort = true; }
163 else if (groupfile == "not found") { groupfile = ""; }
164 else { m->setGroupFile(groupfile); }
166 string temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "51"; }
167 m->mothurConvert(temp, cutoff);
169 if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true; }
171 if (namefile == "") {
172 vector<string> files; files.push_back(treefile);
173 parser.getNameFile(files);
178 catch(exception& e) {
179 m->errorOut(e, "ClassifyTreeCommand", "ClassifyTreeCommand");
183 //**********************************************************************************************************************
185 int ClassifyTreeCommand::execute(){
188 if (abort == true) { if (calledHelp) { return 0; } return 2; }
190 cout.setf(ios::fixed, ios::floatfield); cout.setf(ios::showpoint);
192 int start = time(NULL);
194 /***************************************************/
195 // reading tree info //
196 /***************************************************/
197 m->setTreeFile(treefile);
198 if (groupfile != "") {
199 //read in group map info.
200 tmap = new TreeMap(groupfile);
202 }else{ //fake out by putting everyone in one group
203 Tree* tree = new Tree(treefile); delete tree; //extracts names from tree to make faked out groupmap
204 tmap = new TreeMap();
206 for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); }
209 if (namefile != "") { readNamesFile(); }
211 read = new ReadNewickTree(treefile);
212 int readOk = read->read(tmap);
214 if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete tmap; delete read; return 0; }
216 read->AssembleTrees();
217 vector<Tree*> T = read->getTrees();
218 Tree* outputTree = T[0];
221 //make sure all files match
222 //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size.
224 if (namefile != "") {
225 if (numUniquesInName == m->Treenames.size()) { numNamesInTree = nameMap.size(); }
226 else { numNamesInTree = m->Treenames.size(); }
227 }else { numNamesInTree = m->Treenames.size(); }
230 //output any names that are in group file but not in tree
231 if (numNamesInTree < tmap->getNumSeqs()) {
232 for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
233 //is that name in the tree?
235 for (int j = 0; j < m->Treenames.size(); j++) {
236 if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
240 if (m->control_pressed) {
241 delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
242 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
247 //then you did not find it so report it
248 if (count == m->Treenames.size()) {
249 //if it is in your namefile then don't remove
250 map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
252 if (it == nameMap.end()) {
253 m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
254 tmap->removeSeq(tmap->namesOfSeqs[i]);
255 i--; //need this because removeSeq removes name from namesOfSeqs
261 if (m->control_pressed) { delete outputTree; delete tmap; return 0; }
266 /***************************************************/
267 // get concensus taxonomies //
268 /***************************************************/
269 getClassifications(outputTree);
270 delete outputTree; delete tmap;
272 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
274 //set tree file as new current treefile
275 if (treefile != "") {
277 itTypes = outputTypes.find("tree");
278 if (itTypes != outputTypes.end()) {
279 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTreeFile(current); }
283 m->mothurOutEndLine(); m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to find the concensus taxonomies."); m->mothurOutEndLine();
284 m->mothurOutEndLine();
285 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
286 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
287 m->mothurOutEndLine();
291 catch(exception& e) {
292 m->errorOut(e, "ClassifyTreeCommand", "execute");
296 //**********************************************************************************************************************
297 //traverse tree finding concensus taxonomy at each node
298 //label node with a number to relate to output summary file
299 //report all concensus taxonomies to file
300 int ClassifyTreeCommand::getClassifications(Tree*& T){
303 string thisOutputDir = outputDir;
304 if (outputDir == "") { thisOutputDir += m->hasPath(treefile); }
305 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(treefile)) + "taxonomy.summary";
306 outputNames.push_back(outputFileName); outputTypes["summary"].push_back(outputFileName);
309 m->openOutputFile(outputFileName, out);
310 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
314 if (groupfile != "") { out << "Group\t"; }
315 out << "NumRep\tTaxonomy" << endl;
317 string treeOutputDir = outputDir;
318 if (outputDir == "") { treeOutputDir += m->hasPath(treefile); }
319 string outputTreeFileName = treeOutputDir + m->getRootName(m->getSimpleName(treefile)) + "taxonomy.tre";
321 //create a map from tree node index to names of descendants, save time later
322 map<int, map<string, set<string> > > nodeToDescendants; //node# -> (groupName -> groupMembers)
323 for (int i = 0; i < T->getNumNodes(); i++) {
324 if (m->control_pressed) { return 0; }
326 nodeToDescendants[i] = getDescendantList(T, i, nodeToDescendants);
330 for (int i = T->getNumLeaves(); i < T->getNumNodes(); i++) {
332 if (m->control_pressed) { out.close(); return 0; }
334 string tax = "not classifed";
336 if (groupfile != "") {
337 for (map<string, set<string> >::iterator itGroups = nodeToDescendants[i].begin(); itGroups != nodeToDescendants[i].end(); itGroups++) {
338 if (itGroups->first != "AllGroups") {
339 tax = getTaxonomy(itGroups->second, size);
340 out << (i+1) << '\t' << itGroups->first << '\t' << size << '\t' << tax << endl;
344 string group = "AllGroups";
345 tax = getTaxonomy(nodeToDescendants[i][group], size);
346 out << (i+1) << '\t' << size << '\t' << tax << endl;
349 T->tree[i].setLabel((i+1));
354 m->openOutputFile(outputTreeFileName, outTree);
355 outputNames.push_back(outputTreeFileName); outputTypes["tree"].push_back(outputTreeFileName);
356 T->print(outTree, "both");
361 catch(exception& e) {
362 m->errorOut(e, "ClassifyTreeCommand", "GetConcensusTaxonomies");
366 //**********************************************************************************************************************
367 string ClassifyTreeCommand::getTaxonomy(set<string> names, int& size) {
372 //create a tree containing sequences from this bin
373 PhyloTree* phylo = new PhyloTree();
375 for (set<string>::iterator it = names.begin(); it != names.end(); it++) {
378 //if namesfile include the names
379 if (namefile != "") {
381 //is this sequence in the name file - namemap maps seqName -> repSeqName
382 map<string, string>::iterator it2 = nameMap.find(*it);
384 if (it2 == nameMap.end()) { //this name is not in name file, skip it
385 m->mothurOut((*it) + " is not in your name file. I will not include it in the consensus."); m->mothurOutEndLine();
388 //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
389 map<string, string>::iterator itTax = taxMap.find((it2->second));
391 if (itTax == taxMap.end()) { //this name is not in taxonomy file, skip it
393 if ((*it) != (it2->second)) { m->mothurOut((*it) + " is represented by " + it2->second + " and is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine(); }
394 else { m->mothurOut((*it) + " is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine(); }
397 int num = nameCount[(*it)]; // we know its there since we found it in nameMap
398 for (int i = 0; i < num; i++) { phylo->addSeqToTree((*it)+toString(i), it2->second); }
404 //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
405 map<string, string>::iterator itTax = taxMap.find((*it));
407 if (itTax == taxMap.end()) { //this name is not in taxonomy file, skip it
408 m->mothurOut((*it) + " is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine();
411 phylo->addSeqToTree((*it), itTax->second);
416 if (m->control_pressed) { delete phylo; return conTax; }
421 phylo->assignHeirarchyIDs(0);
423 TaxNode currentNode = phylo->get(0);
426 while (currentNode.children.size() != 0) { //you still have more to explore
429 int bestChildSize = 0;
431 //go through children
432 for (map<string, int>::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) {
434 TaxNode temp = phylo->get(itChild->second);
436 //select child with largest accesions - most seqs assigned to it
437 if (temp.accessions.size() > bestChildSize) {
438 bestChild = phylo->get(itChild->second);
439 bestChildSize = temp.accessions.size();
444 //is this taxonomy above cutoff
445 int consensusConfidence = ceil((bestChildSize / (float) size) * 100);
447 if (consensusConfidence >= cutoff) { //if yes, add it
448 conTax += bestChild.name + "(" + toString(consensusConfidence) + ");";
455 currentNode = bestChild;
458 if (myLevel != phylo->getMaxLevel()) {
459 while (myLevel != phylo->getMaxLevel()) {
460 conTax += "unclassified;";
464 if (conTax == "") { conTax = "no_consensus;"; }
471 catch(exception& e) {
472 m->errorOut(e, "ClassifyTreeCommand", "getTaxonomy");
477 //**********************************************************************************************************************
478 map<string, set<string> > ClassifyTreeCommand::getDescendantList(Tree*& T, int i, map<int, map<string, set<string> > > descendants){
480 map<string ,set<string> > names;
482 map<string ,set<string> >::iterator it;
483 map<string ,set<string> >::iterator it2;
485 int lc = T->tree[i].getLChild();
486 int rc = T->tree[i].getRChild();
488 if (lc == -1) { //you are a leaf your only descendant is yourself
489 string group = tmap->getGroup(T->tree[i].getName());
490 set<string> mynames; mynames.insert(T->tree[i].getName());
491 names[group] = mynames; //mygroup -> me
492 names["AllGroups"] = mynames;
493 }else{ //your descedants are the combination of your childrens descendants
494 names = descendants[lc];
495 for (it = descendants[rc].begin(); it != descendants[rc].end(); it++) {
496 it2 = names.find(it->first); //do we already have this group
497 if (it2 == names.end()) { //nope, so add it
498 names[it->first] = it->second;
500 for (set<string>::iterator it3 = (it->second).begin(); it3 != (it->second).end(); it3++) {
501 names[it->first].insert(*it3);
510 catch(exception& e) {
511 m->errorOut(e, "ClassifyTreeCommand", "getDescendantList");
515 //**********************************************************************************************************************
516 int ClassifyTreeCommand::readTaxonomyFile() {
520 m->openInputFile(taxonomyfile, in);
528 //are there confidence scores, if so remove them
529 if (tax.find_first_of('(') != -1) { m->removeConfidences(tax); }
533 if (m->control_pressed) { in.close(); taxMap.clear(); return 0; }
539 catch(exception& e) {
540 m->errorOut(e, "ClassifyTreeCommand", "readTaxonomyFile");
545 /*****************************************************************/
546 int ClassifyTreeCommand::readNamesFile() {
549 m->openInputFile(namefile, inNames);
553 while(!inNames.eof()){
554 inNames >> name; //read from first column A
555 inNames >> names; //read from second column A,B,C,D
558 //parse names into vector
559 vector<string> theseNames;
560 m->splitAtComma(names, theseNames);
562 for (int i = 0; i < theseNames.size(); i++) { nameMap[theseNames[i]] = name; }
563 nameCount[name] = theseNames.size();
565 if (m->control_pressed) { inNames.close(); nameMap.clear(); return 0; }
571 catch(exception& e) {
572 m->errorOut(e, "ClassifyTreeCommand", "readNamesFile");
577 /*****************************************************************/