]> git.donarmstrong.com Git - mothur.git/blob - classifytreecommand.cpp
Merge remote-tracking branch 'mothur/master'
[mothur.git] / classifytreecommand.cpp
1 //
2 //  classifytreecommand.cpp
3 //  Mothur
4 //
5 //  Created by Sarah Westcott on 2/20/12.
6 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
7 //
8
9 #include "classifytreecommand.h"
10 #include "phylotree.h"
11 #include "treereader.h"
12
13 //**********************************************************************************************************************
14 vector<string> ClassifyTreeCommand::setParameters(){    
15         try {
16                 CommandParameter ptree("tree", "InputTypes", "", "", "", "", "none",false,true); parameters.push_back(ptree);
17         CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "", "", "none",false,true); parameters.push_back(ptaxonomy);
18         CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname);
19         CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
20                 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
21         CommandParameter pcutoff("cutoff", "Number", "", "51", "", "", "",false,true); parameters.push_back(pcutoff);
22                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
23                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
24                 
25                 vector<string> myArray;
26                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
27                 return myArray;
28         }
29         catch(exception& e) {
30                 m->errorOut(e, "ClassifyTreeCommand", "setParameters");
31                 exit(1);
32         }
33 }
34 //**********************************************************************************************************************
35 string ClassifyTreeCommand::getHelpString(){    
36         try {
37                 string helpString = "";
38                 helpString += "The classify.tree command reads a tree and taxonomy file and output the consensus taxonomy for each node on the tree. \n";
39                 helpString += "If you provide a group file, the concensus for each group will also be provided. \n";
40                 helpString += "The new tree contains labels at each internal node.  The label is the node number so you can relate the tree to the summary file.\n";
41         helpString += "The count parameter allows you add a count file so you can have the summary totals broken up by group.\n";
42                 helpString += "The summary file lists the concensus taxonomy for the descendants of each node.\n";
43                 helpString += "The classify.tree command parameters are tree, group, name, count and taxonomy. The tree and taxonomy files are required.\n";
44         helpString += "The cutoff parameter allows you to specify a consensus confidence threshold for your taxonomy.  The default is 51, meaning 51%. Cutoff cannot be below 51.\n";
45         helpString += "The classify.tree command should be used in the following format: classify.tree(tree=test.tre, group=test.group, taxonomy=test.taxonomy)\n";
46                 helpString += "Note: No spaces between parameter labels (i.e. tree), '=' and parameters (i.e.yourTreefile).\n"; 
47                 return helpString;
48         }
49         catch(exception& e) {
50                 m->errorOut(e, "ClassifyTreeCommand", "getHelpString");
51                 exit(1);
52         }
53 }
54 //**********************************************************************************************************************
55 string ClassifyTreeCommand::getOutputFileNameTag(string type, string inputName=""){     
56         try {
57         string outputFileName = "";
58                 map<string, vector<string> >::iterator it;
59         
60         //is this a type this command creates
61         it = outputTypes.find(type);
62         if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
63         else {
64             if (type == "tree") {  outputFileName =  "taxonomy.tre"; }
65             else if (type == "summary") {  outputFileName =  "taxonomy.summary"; }
66             else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
67         }
68         return outputFileName;
69         }
70         catch(exception& e) {
71                 m->errorOut(e, "ClassifyTreeCommand", "getOutputFileNameTag");
72                 exit(1);
73         }
74 }
75 //**********************************************************************************************************************
76 ClassifyTreeCommand::ClassifyTreeCommand(){     
77         try {
78                 abort = true; calledHelp = true; 
79                 setParameters();
80                 vector<string> tempOutNames;
81                 outputTypes["tree"] = tempOutNames;
82                 outputTypes["summary"] = tempOutNames;
83         }
84         catch(exception& e) {
85                 m->errorOut(e, "ClassifyTreeCommand", "ClassifyTreeCommand");
86                 exit(1);
87         }
88 }
89 //**********************************************************************************************************************
90 ClassifyTreeCommand::ClassifyTreeCommand(string option)  {
91         try {
92                 abort = false; calledHelp = false;   
93                 
94                 //allow user to run help
95                 if(option == "help") { help(); abort = true; calledHelp = true; }
96                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
97                 
98                 else {
99                         vector<string> myArray = setParameters();
100                         
101                         OptionParser parser(option);
102                         map<string, string> parameters = parser.getParameters();
103                         
104                         ValidParameters validParameter;
105                         map<string, string>::iterator it;
106                         
107                         //check to make sure all parameters are valid for command
108                         for (it = parameters.begin(); it != parameters.end(); it++) { 
109                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
110                         }
111                         
112                         vector<string> tempOutNames;
113                         outputTypes["tree"] = tempOutNames;
114                         outputTypes["summary"] = tempOutNames;
115                         
116                         //if the user changes the input directory command factory will send this info to us in the output parameter 
117                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
118                         if (inputDir == "not found"){   inputDir = "";          }
119                         else {
120                                 string path;
121                                 it = parameters.find("tree");
122                                 //user has given a template file
123                                 if(it != parameters.end()){ 
124                                         path = m->hasPath(it->second);
125                                         //if the user has not given a path then, add inputdir. else leave path alone.
126                                         if (path == "") {       parameters["tree"] = inputDir + it->second;             }
127                                 }
128                                 
129                                 it = parameters.find("name");
130                                 //user has given a template file
131                                 if(it != parameters.end()){ 
132                                         path = m->hasPath(it->second);
133                                         //if the user has not given a path then, add inputdir. else leave path alone.
134                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
135                                 }
136                                 
137                                 it = parameters.find("group");
138                                 //user has given a template file
139                                 if(it != parameters.end()){ 
140                                         path = m->hasPath(it->second);
141                                         //if the user has not given a path then, add inputdir. else leave path alone.
142                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
143                                 }
144                                 
145                                 it = parameters.find("taxonomy");
146                                 //user has given a template file
147                                 if(it != parameters.end()){ 
148                                         path = m->hasPath(it->second);
149                                         //if the user has not given a path then, add inputdir. else leave path alone.
150                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
151                                 }
152                 
153                 it = parameters.find("count");
154                                 //user has given a template file
155                                 if(it != parameters.end()){ 
156                                         path = m->hasPath(it->second);
157                                         //if the user has not given a path then, add inputdir. else leave path alone.
158                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
159                                 }
160                         }
161                         
162                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
163             
164                         //check for required parameters
165                         treefile = validParameter.validFile(parameters, "tree", true);
166                         if (treefile == "not open") { treefile = ""; abort = true; }
167                         else if (treefile == "not found") { treefile = ""; 
168                 treefile = m->getTreeFile(); 
169                 if (treefile != "") {  m->mothurOut("Using " + treefile + " as input file for the tree parameter."); m->mothurOutEndLine(); }
170                 else { m->mothurOut("No valid current files. You must provide a tree file."); m->mothurOutEndLine(); abort = true; }
171             }else { m->setTreeFile(treefile); } 
172             
173             taxonomyfile = validParameter.validFile(parameters, "taxonomy", true);
174                         if (taxonomyfile == "not open") { taxonomyfile = ""; abort = true; }
175                         else if (taxonomyfile == "not found") { taxonomyfile = ""; 
176                 taxonomyfile = m->getTaxonomyFile(); 
177                 if (taxonomyfile != "") {  m->mothurOut("Using " + taxonomyfile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
178                 else { m->mothurOut("No valid current files. You must provide a taxonomy file."); m->mothurOutEndLine(); abort = true; }
179             }else { m->setTaxonomyFile(taxonomyfile); } 
180                         
181                         namefile = validParameter.validFile(parameters, "name", true);
182                         if (namefile == "not open") { namefile = ""; abort = true; }
183                         else if (namefile == "not found") { namefile = ""; }
184                         else { m->setNameFile(namefile); }
185                         
186                         groupfile = validParameter.validFile(parameters, "group", true);
187                         if (groupfile == "not open") { groupfile = ""; abort = true; }
188                         else if (groupfile == "not found") { groupfile = ""; }
189                         else { m->setGroupFile(groupfile); }
190             
191             countfile = validParameter.validFile(parameters, "count", true);
192                         if (countfile == "not open") { countfile = ""; abort = true; }
193                         else if (countfile == "not found") { countfile = "";  } 
194                         else { m->setCountTableFile(countfile); }
195             
196             if ((namefile != "") && (countfile != "")) {
197                 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
198             }
199                         
200             if ((groupfile != "") && (countfile != "")) {
201                 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
202             }
203             
204             string temp = validParameter.validFile(parameters, "cutoff", false);                        if (temp == "not found") { temp = "51"; }
205                         m->mothurConvert(temp, cutoff); 
206                         
207                         if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true;  }
208             
209             if (countfile == "") {
210                 if (namefile == "") {
211                     vector<string> files; files.push_back(treefile);
212                     parser.getNameFile(files);
213                 }
214                         }
215                 }
216         }
217         catch(exception& e) {
218                 m->errorOut(e, "ClassifyTreeCommand", "ClassifyTreeCommand");           
219                 exit(1);
220         }
221 }
222 //**********************************************************************************************************************
223
224 int ClassifyTreeCommand::execute(){
225         try {
226                 
227                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
228                 
229                 cout.setf(ios::fixed, ios::floatfield); cout.setf(ios::showpoint);
230                 
231                 int start = time(NULL);
232         
233                 /***************************************************/
234                 //    reading tree info                                                    //
235                 /***************************************************/
236         m->setTreeFile(treefile);
237         
238         TreeReader* reader = new TreeReader(treefile, groupfile, namefile);
239         vector<Tree*> T = reader->getTrees();
240         CountTable* tmap = T[0]->getCountTable();
241         Tree* outputTree = T[0];
242         delete reader;
243
244         if (namefile != "") { m->readNames(namefile, nameMap, nameCount); }
245                         
246         if (m->control_pressed) { delete tmap;  delete outputTree;  return 0; }
247                 
248         m->readTax(taxonomyfile, taxMap);
249         
250         /***************************************************/
251         //              get concensus taxonomies                    //
252         /***************************************************/
253         getClassifications(outputTree);
254         delete outputTree; delete tmap;
255                         
256                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);        } return 0; }
257                 
258                 //set tree file as new current treefile
259                 if (treefile != "") {
260                         string current = "";
261                         itTypes = outputTypes.find("tree");
262                         if (itTypes != outputTypes.end()) {
263                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTreeFile(current); }
264                         }
265                 }
266                 
267                 m->mothurOutEndLine(); m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to find the concensus taxonomies."); m->mothurOutEndLine();
268                 m->mothurOutEndLine();
269                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
270                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
271                 m->mothurOutEndLine();
272         
273                 return 0;
274         }
275         catch(exception& e) {
276                 m->errorOut(e, "ClassifyTreeCommand", "execute");       
277                 exit(1);
278         }
279 }
280 //**********************************************************************************************************************
//traverse the tree, finding the consensus taxonomy at each node
//label each internal node with a number that relates it to the output summary file
//report all consensus taxonomies to the summary file
284 int ClassifyTreeCommand::getClassifications(Tree*& T){
285         try {
286                 
287                 string thisOutputDir = outputDir;
288                 if (outputDir == "") {  thisOutputDir += m->hasPath(treefile);  }
289                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(treefile)) + getOutputFileNameTag("summary");
290                 outputNames.push_back(outputFileName); outputTypes["summary"].push_back(outputFileName);
291                 
292                 ofstream out;
293                 m->openOutputFile(outputFileName, out);
294                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
295                 
296                 //print headings
297                 out << "TreeNode\t";
298                 if (groupfile != "") { out << "Group\t"; } 
299         out << "NumRep\tTaxonomy" << endl; 
300                 
301                 string treeOutputDir = outputDir;
302                 if (outputDir == "") {  treeOutputDir += m->hasPath(treefile);  }
303                 string outputTreeFileName = treeOutputDir + m->getRootName(m->getSimpleName(treefile)) + getOutputFileNameTag("tree");
304                 
305                 //create a map from tree node index to names of descendants, save time later
306                 map<int, map<string, set<string> > > nodeToDescendants; //node# -> (groupName -> groupMembers)
307                 for (int i = 0; i < T->getNumNodes(); i++) {
308                         if (m->control_pressed) { return 0; }
309                         
310                         nodeToDescendants[i] = getDescendantList(T, i, nodeToDescendants);
311                 }
312                 
313                 //for each node
314                 for (int i = T->getNumLeaves(); i < T->getNumNodes(); i++) {
315                         
316                         if (m->control_pressed) { out.close(); return 0; }
317             
318                         string tax = "not classifed";
319             int size;
320             if (groupfile != "") {
321                 for (map<string, set<string> >::iterator itGroups = nodeToDescendants[i].begin(); itGroups != nodeToDescendants[i].end(); itGroups++) {
322                     if (itGroups->first != "AllGroups") {
323                         tax = getTaxonomy(itGroups->second, size);
324                         out << (i+1) << '\t' << itGroups->first << '\t' << size << '\t' << tax << endl;
325                     }
326                 }
327             }else {
328                 string group = "AllGroups";
329                 tax = getTaxonomy(nodeToDescendants[i][group], size);
330                 out << (i+1) << '\t' << size << '\t' << tax << endl;
331             }
332                                 
333                         T->tree[i].setLabel((i+1));
334                 }
335                 out.close();
336         
337                 ofstream outTree;
338                 m->openOutputFile(outputTreeFileName, outTree);
339                 outputNames.push_back(outputTreeFileName); outputTypes["tree"].push_back(outputTreeFileName);
340                 T->print(outTree, "both");
341                 outTree.close();
342         
343                 return 0;
344         }
345         catch(exception& e) {
346                 m->errorOut(e, "ClassifyTreeCommand", "GetConcensusTaxonomies");        
347                 exit(1);
348         }
349 }
350 //**********************************************************************************************************************
351 string ClassifyTreeCommand::getTaxonomy(set<string> names, int& size) {
352         try{
353                 string conTax = "";
354         size = 0;
355                         
356                 //create a tree containing sequences from this bin
357                 PhyloTree* phylo = new PhyloTree();
358                 
359                 for (set<string>::iterator it = names.begin(); it != names.end(); it++) {
360             
361             
362                         //if namesfile include the names
363                         if (namefile != "") {
364                 
365                                 //is this sequence in the name file - namemap maps seqName -> repSeqName
366                                 map<string, string>::iterator it2 = nameMap.find(*it);
367                                 
368                                 if (it2 == nameMap.end()) { //this name is not in name file, skip it
369                                         m->mothurOut((*it) + " is not in your name file.  I will not include it in the consensus."); m->mothurOutEndLine();
370                                 }else{
371                                         
372                                         //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
373                                         map<string, string>::iterator itTax = taxMap.find((it2->second));
374                     
375                                         if (itTax == taxMap.end()) { //this name is not in taxonomy file, skip it
376                         
377                                                 if ((*it) != (it2->second)) { m->mothurOut((*it) + " is represented by " +  it2->second + " and is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine(); }
378                                                 else {  m->mothurOut((*it) + " is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine(); }
379                                         }else{
380                                                 //add seq to tree
381                         int num = nameCount[(*it)]; // we know its there since we found it in nameMap
382                                                 for (int i = 0; i < num; i++) {  phylo->addSeqToTree((*it)+toString(i), it2->second);  }
383                         size += num;
384                                         }
385                                 }
386                                 
387                         }else{
388                                 //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
389                                 map<string, string>::iterator itTax = taxMap.find((*it));
390                 
391                                 if (itTax == taxMap.end()) { //this name is not in taxonomy file, skip it
392                                         m->mothurOut((*it) + " is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine();
393                                 }else{
394                                         if (countfile != "") {
395                         int numDups = ct->getNumSeqs((*it)); 
396                         for (int j = 0; j < numDups; j++) {  phylo->addSeqToTree((*it), itTax->second);  }
397                         size += numDups;
398                     }else{
399                         //add seq to tree
400                         phylo->addSeqToTree((*it), itTax->second);
401                         size++;  
402                     }                           }
403                         }
404             
405                         if (m->control_pressed) { delete phylo; return conTax; }
406                         
407                 }
408                 
409                 //build tree
410                 phylo->assignHeirarchyIDs(0);
411                 
412                 TaxNode currentNode = phylo->get(0);
413                 int myLevel = 0;        
414                 //at each level
415                 while (currentNode.children.size() != 0) { //you still have more to explore
416             
417                         TaxNode bestChild;
418                         int bestChildSize = 0;
419                         
420                         //go through children
421                         for (map<string, int>::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) {
422                                 
423                                 TaxNode temp = phylo->get(itChild->second);
424                                 
425                                 //select child with largest accesions - most seqs assigned to it
426                                 if (temp.accessions.size() > bestChildSize) {
427                                         bestChild = phylo->get(itChild->second);
428                                         bestChildSize = temp.accessions.size();
429                                 }
430                                 
431                         }
432             
433                         //is this taxonomy above cutoff
434                         int consensusConfidence = ceil((bestChildSize / (float) size) * 100);
435                         
436                         if (consensusConfidence >= cutoff) { //if yes, add it
437                 conTax += bestChild.name + "(" + toString(consensusConfidence) + ");";
438                                 myLevel++;
439                         }else{ //if no, quit
440                                 break;
441                         }
442                         
443                         //move down a level
444                         currentNode = bestChild;
445                 }
446                 
447                 if (myLevel != phylo->getMaxLevel()) {
448                         while (myLevel != phylo->getMaxLevel()) {
449                                 conTax += "unclassified;";
450                                 myLevel++;
451                         }
452                 }               
453                 if (conTax == "") {  conTax = "no_consensus;";  }
454                 
455                 delete phylo;   
456         
457         return conTax;
458         
459         }
460         catch(exception& e) {
461                 m->errorOut(e, "ClassifyTreeCommand", "getTaxonomy");
462                 exit(1);
463         }
464 }
465
466 //**********************************************************************************************************************
467 map<string, set<string> > ClassifyTreeCommand::getDescendantList(Tree*& T, int i, map<int, map<string, set<string> > > descendants){
468         try {
469                 map<string ,set<string> > names;
470                 
471                 map<string ,set<string> >::iterator it;
472         map<string ,set<string> >::iterator it2;
473                 
474                 int lc = T->tree[i].getLChild();
475                 int rc = T->tree[i].getRChild();
476        // TreeMap* tmap = T->getTreeMap();
477                 
478                 if (lc == -1) { //you are a leaf your only descendant is yourself
479             vector<string> groups = T->tree[i].getGroup();
480             set<string> mynames; mynames.insert(T->tree[i].getName());
481             for (int j = 0; j < groups.size(); j++) { names[groups[j]] = mynames;   } //mygroup -> me
482             names["AllGroups"] = mynames;
483                 }else{ //your descedants are the combination of your childrens descendants
484                         names = descendants[lc];
485                         for (it = descendants[rc].begin(); it != descendants[rc].end(); it++) {
486                 it2 = names.find(it->first); //do we already have this group
487                 if (it2 == names.end()) { //nope, so add it
488                     names[it->first] = it->second;
489                 }else {
490                     for (set<string>::iterator it3 = (it->second).begin(); it3 != (it->second).end(); it3++) {
491                         names[it->first].insert(*it3);
492                     }
493                 }
494                                 
495                         }
496                 }
497                 
498                 return names;
499         }
500         catch(exception& e) {
501                 m->errorOut(e, "ClassifyTreeCommand", "getDescendantList");     
502                 exit(1);
503         }
504 }
505 /*****************************************************************/
506
507