]> git.donarmstrong.com Git - mothur.git/blob - classifytreecommand.cpp
Merge remote-tracking branch 'mothur/master'
[mothur.git] / classifytreecommand.cpp
1 //
2 //  classifytreecommand.cpp
3 //  Mothur
4 //
5 //  Created by Sarah Westcott on 2/20/12.
6 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
7 //
8
9 #include "classifytreecommand.h"
10 #include "phylotree.h"
11
12 //**********************************************************************************************************************
13 vector<string> ClassifyTreeCommand::setParameters(){    
14         try {
15                 CommandParameter ptree("tree", "InputTypes", "", "", "", "", "none",false,true); parameters.push_back(ptree);
16         CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "", "", "none",false,true); parameters.push_back(ptaxonomy);
17         CommandParameter pname("name", "InputTypes", "", "", "", "", "none",false,false); parameters.push_back(pname);
18         CommandParameter pgroup("group", "InputTypes", "", "", "", "", "none",false,false); parameters.push_back(pgroup);
19         CommandParameter pcutoff("cutoff", "Number", "", "51", "", "", "",false,true); parameters.push_back(pcutoff);
20                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
21                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
22                 
23                 vector<string> myArray;
24                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
25                 return myArray;
26         }
27         catch(exception& e) {
28                 m->errorOut(e, "ClassifyTreeCommand", "setParameters");
29                 exit(1);
30         }
31 }
32 //**********************************************************************************************************************
33 string ClassifyTreeCommand::getHelpString(){    
34         try {
35                 string helpString = "";
36                 helpString += "The classify.tree command reads a tree and taxonomy file and output the consensus taxonomy for each node on the tree. \n";
37                 helpString += "If you provide a group file, the concensus for each group will also be provided. \n";
38                 helpString += "The new tree contains labels at each internal node.  The label is the node number so you can relate the tree to the summary file.\n";
39                 helpString += "The summary file lists the concensus taxonomy for the descendants of each node.\n";
40                 helpString += "The classify.tree command parameters are tree, group, name and taxonomy. The tree and taxonomy files are required.\n";
41         helpString += "The cutoff parameter allows you to specify a consensus confidence threshold for your taxonomy.  The default is 51, meaning 51%. Cutoff cannot be below 51.\n";
42         helpString += "The classify.tree command should be used in the following format: classify.tree(tree=test.tre, group=test.group, taxonomy=test.taxonomy)\n";
43                 helpString += "Note: No spaces between parameter labels (i.e. tree), '=' and parameters (i.e.yourTreefile).\n"; 
44                 return helpString;
45         }
46         catch(exception& e) {
47                 m->errorOut(e, "ClassifyTreeCommand", "getHelpString");
48                 exit(1);
49         }
50 }
51
52 //**********************************************************************************************************************
53 ClassifyTreeCommand::ClassifyTreeCommand(){     
54         try {
55                 abort = true; calledHelp = true; 
56                 setParameters();
57                 vector<string> tempOutNames;
58                 outputTypes["tree"] = tempOutNames;
59                 outputTypes["summary"] = tempOutNames;
60         }
61         catch(exception& e) {
62                 m->errorOut(e, "ClassifyTreeCommand", "ClassifyTreeCommand");
63                 exit(1);
64         }
65 }
66 //**********************************************************************************************************************
67 ClassifyTreeCommand::ClassifyTreeCommand(string option)  {
68         try {
69                 abort = false; calledHelp = false;   
70                 
71                 //allow user to run help
72                 if(option == "help") { help(); abort = true; calledHelp = true; }
73                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
74                 
75                 else {
76                         vector<string> myArray = setParameters();
77                         
78                         OptionParser parser(option);
79                         map<string, string> parameters = parser.getParameters();
80                         
81                         ValidParameters validParameter;
82                         map<string, string>::iterator it;
83                         
84                         //check to make sure all parameters are valid for command
85                         for (it = parameters.begin(); it != parameters.end(); it++) { 
86                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
87                         }
88                         
89                         m->runParse = true;
90                         m->clearGroups();
91                         m->clearAllGroups();
92                         m->Treenames.clear();
93                         m->names.clear();
94                         
95                         vector<string> tempOutNames;
96                         outputTypes["tree"] = tempOutNames;
97                         outputTypes["summary"] = tempOutNames;
98                         
99                         //if the user changes the input directory command factory will send this info to us in the output parameter 
100                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
101                         if (inputDir == "not found"){   inputDir = "";          }
102                         else {
103                                 string path;
104                                 it = parameters.find("tree");
105                                 //user has given a template file
106                                 if(it != parameters.end()){ 
107                                         path = m->hasPath(it->second);
108                                         //if the user has not given a path then, add inputdir. else leave path alone.
109                                         if (path == "") {       parameters["tree"] = inputDir + it->second;             }
110                                 }
111                                 
112                                 it = parameters.find("name");
113                                 //user has given a template file
114                                 if(it != parameters.end()){ 
115                                         path = m->hasPath(it->second);
116                                         //if the user has not given a path then, add inputdir. else leave path alone.
117                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
118                                 }
119                                 
120                                 it = parameters.find("group");
121                                 //user has given a template file
122                                 if(it != parameters.end()){ 
123                                         path = m->hasPath(it->second);
124                                         //if the user has not given a path then, add inputdir. else leave path alone.
125                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
126                                 }
127                                 
128                                 it = parameters.find("taxonomy");
129                                 //user has given a template file
130                                 if(it != parameters.end()){ 
131                                         path = m->hasPath(it->second);
132                                         //if the user has not given a path then, add inputdir. else leave path alone.
133                                         if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
134                                 }
135                         }
136                         
137                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
138             
139                         //check for required parameters
140                         treefile = validParameter.validFile(parameters, "tree", true);
141                         if (treefile == "not open") { treefile = ""; abort = true; }
142                         else if (treefile == "not found") { treefile = ""; 
143                 treefile = m->getTreeFile(); 
144                 if (treefile != "") {  m->mothurOut("Using " + treefile + " as input file for the tree parameter."); m->mothurOutEndLine(); }
145                 else { m->mothurOut("No valid current files. You must provide a tree file."); m->mothurOutEndLine(); abort = true; }
146             }else { m->setTreeFile(treefile); } 
147             
148             taxonomyfile = validParameter.validFile(parameters, "taxonomy", true);
149                         if (taxonomyfile == "not open") { taxonomyfile = ""; abort = true; }
150                         else if (taxonomyfile == "not found") { taxonomyfile = ""; 
151                 taxonomyfile = m->getTaxonomyFile(); 
152                 if (taxonomyfile != "") {  m->mothurOut("Using " + taxonomyfile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
153                 else { m->mothurOut("No valid current files. You must provide a taxonomy file."); m->mothurOutEndLine(); abort = true; }
154             }else { m->setTaxonomyFile(taxonomyfile); } 
155                         
156                         namefile = validParameter.validFile(parameters, "name", true);
157                         if (namefile == "not open") { namefile = ""; abort = true; }
158                         else if (namefile == "not found") { namefile = ""; }
159                         else { m->setNameFile(namefile); }
160                         
161                         groupfile = validParameter.validFile(parameters, "group", true);
162                         if (groupfile == "not open") { groupfile = ""; abort = true; }
163                         else if (groupfile == "not found") { groupfile = ""; }
164                         else { m->setGroupFile(groupfile); }
165             
166             string temp = validParameter.validFile(parameters, "cutoff", false);                        if (temp == "not found") { temp = "51"; }
167                         m->mothurConvert(temp, cutoff); 
168                         
169                         if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true;  }
170             
171             if (namefile == "") {
172                                 vector<string> files; files.push_back(treefile);
173                                 parser.getNameFile(files);
174                         }
175                         
176                 }
177         }
178         catch(exception& e) {
179                 m->errorOut(e, "ClassifyTreeCommand", "ClassifyTreeCommand");           
180                 exit(1);
181         }
182 }
183 //**********************************************************************************************************************
184
185 int ClassifyTreeCommand::execute(){
186         try {
187                 
188                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
189                 
190                 cout.setf(ios::fixed, ios::floatfield); cout.setf(ios::showpoint);
191                 
192                 int start = time(NULL);
193         
194                 /***************************************************/
195                 //    reading tree info                                                    //
196                 /***************************************************/
197         m->setTreeFile(treefile);
198         if (groupfile != "") {
199                         //read in group map info.
200                         tmap = new TreeMap(groupfile);
201                         tmap->readMap();
202                 }else{ //fake out by putting everyone in one group
203                         Tree* tree = new Tree(treefile); delete tree;  //extracts names from tree to make faked out groupmap
204                         tmap = new TreeMap();
205                         
206                         for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); }
207                 }
208                 
209                 if (namefile != "") { readNamesFile(); }
210                 
211                 read = new ReadNewickTree(treefile);
212                 int readOk = read->read(tmap); 
213                 
214                 if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete tmap; delete read; return 0; }
215                 
216                 read->AssembleTrees();
217                 vector<Tree*> T = read->getTrees();
218         Tree* outputTree = T[0]; 
219                 delete read;
220                 
221                 //make sure all files match
222                 //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size.
223                 int numNamesInTree;
224                 if (namefile != "")  {  
225                         if (numUniquesInName == m->Treenames.size()) {  numNamesInTree = nameMap.size();  }
226                         else {   numNamesInTree = m->Treenames.size();  }
227                 }else {  numNamesInTree = m->Treenames.size();  }
228                 
229                 
230                 //output any names that are in group file but not in tree
231                 if (numNamesInTree < tmap->getNumSeqs()) {
232                         for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
233                                 //is that name in the tree?
234                                 int count = 0;
235                                 for (int j = 0; j < m->Treenames.size(); j++) {
236                                         if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
237                                         count++;
238                                 }
239                                 
240                                 if (m->control_pressed) { 
241                                         delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; }
242                                         for (int i = 0; i < outputNames.size(); i++) {  m->mothurRemove(outputNames[i]); } outputTypes.clear();
243                                         m->clearGroups();
244                                         return 0;
245                                 }
246                                 
247                                 //then you did not find it so report it 
248                                 if (count == m->Treenames.size()) { 
249                                         //if it is in your namefile then don't remove
250                                         map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
251                                         
252                                         if (it == nameMap.end()) {
253                                                 m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
254                                                 tmap->removeSeq(tmap->namesOfSeqs[i]);
255                                                 i--; //need this because removeSeq removes name from namesOfSeqs
256                                         }
257                                 }
258                         }
259                 }
260                         
261         if (m->control_pressed) { delete outputTree; delete tmap;  return 0; }
262                 
263         readTaxonomyFile();
264         
265         
266         /***************************************************/
267         //              get concensus taxonomies                    //
268         /***************************************************/
269         getClassifications(outputTree);
270         delete outputTree; delete tmap;
271                         
272                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);        } return 0; }
273                 
274                 //set tree file as new current treefile
275                 if (treefile != "") {
276                         string current = "";
277                         itTypes = outputTypes.find("tree");
278                         if (itTypes != outputTypes.end()) {
279                                 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTreeFile(current); }
280                         }
281                 }
282                 
283                 m->mothurOutEndLine(); m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to find the concensus taxonomies."); m->mothurOutEndLine();
284                 m->mothurOutEndLine();
285                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
286                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
287                 m->mothurOutEndLine();
288         
289                 return 0;
290         }
291         catch(exception& e) {
292                 m->errorOut(e, "ClassifyTreeCommand", "execute");       
293                 exit(1);
294         }
295 }
296 //**********************************************************************************************************************
297 //traverse tree finding concensus taxonomy at each node
298 //label node with a number to relate to output summary file
299 //report all concensus taxonomies to file 
300 int ClassifyTreeCommand::getClassifications(Tree*& T){
301         try {
302                 
303                 string thisOutputDir = outputDir;
304                 if (outputDir == "") {  thisOutputDir += m->hasPath(treefile);  }
305                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(treefile)) + "taxonomy.summary";
306                 outputNames.push_back(outputFileName); outputTypes["summary"].push_back(outputFileName);
307                 
308                 ofstream out;
309                 m->openOutputFile(outputFileName, out);
310                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
311                 
312                 //print headings
313                 out << "TreeNode\t";
314                 if (groupfile != "") { out << "Group\t"; } 
315         out << "NumRep\tTaxonomy" << endl; 
316                 
317                 string treeOutputDir = outputDir;
318                 if (outputDir == "") {  treeOutputDir += m->hasPath(treefile);  }
319                 string outputTreeFileName = treeOutputDir + m->getRootName(m->getSimpleName(treefile)) + "taxonomy.tre";
320                 
321                 //create a map from tree node index to names of descendants, save time later
322                 map<int, map<string, set<string> > > nodeToDescendants; //node# -> (groupName -> groupMembers)
323                 for (int i = 0; i < T->getNumNodes(); i++) {
324                         if (m->control_pressed) { return 0; }
325                         
326                         nodeToDescendants[i] = getDescendantList(T, i, nodeToDescendants);
327                 }
328                 
329                 //for each node
330                 for (int i = T->getNumLeaves(); i < T->getNumNodes(); i++) {
331                         
332                         if (m->control_pressed) { out.close(); return 0; }
333             
334                         string tax = "not classifed";
335             int size;
336             if (groupfile != "") {
337                 for (map<string, set<string> >::iterator itGroups = nodeToDescendants[i].begin(); itGroups != nodeToDescendants[i].end(); itGroups++) {
338                     if (itGroups->first != "AllGroups") {
339                         tax = getTaxonomy(itGroups->second, size);
340                         out << (i+1) << '\t' << itGroups->first << '\t' << size << '\t' << tax << endl;
341                     }
342                 }
343             }else {
344                 string group = "AllGroups";
345                 tax = getTaxonomy(nodeToDescendants[i][group], size);
346                 out << (i+1) << '\t' << size << '\t' << tax << endl;
347             }
348                                 
349                         T->tree[i].setLabel((i+1));
350                 }
351                 out.close();
352         
353                 ofstream outTree;
354                 m->openOutputFile(outputTreeFileName, outTree);
355                 outputNames.push_back(outputTreeFileName); outputTypes["tree"].push_back(outputTreeFileName);
356                 T->print(outTree, "both");
357                 outTree.close();
358         
359                 return 0;
360         }
361         catch(exception& e) {
362                 m->errorOut(e, "ClassifyTreeCommand", "GetConcensusTaxonomies");        
363                 exit(1);
364         }
365 }
366 //**********************************************************************************************************************
367 string ClassifyTreeCommand::getTaxonomy(set<string> names, int& size) {
368         try{
369                 string conTax = "";
370         size = 0;
371                         
372                 //create a tree containing sequences from this bin
373                 PhyloTree* phylo = new PhyloTree();
374                 
375                 for (set<string>::iterator it = names.begin(); it != names.end(); it++) {
376             
377             
378                         //if namesfile include the names
379                         if (namefile != "") {
380                 
381                                 //is this sequence in the name file - namemap maps seqName -> repSeqName
382                                 map<string, string>::iterator it2 = nameMap.find(*it);
383                                 
384                                 if (it2 == nameMap.end()) { //this name is not in name file, skip it
385                                         m->mothurOut((*it) + " is not in your name file.  I will not include it in the consensus."); m->mothurOutEndLine();
386                                 }else{
387                                         
388                                         //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
389                                         map<string, string>::iterator itTax = taxMap.find((it2->second));
390                     
391                                         if (itTax == taxMap.end()) { //this name is not in taxonomy file, skip it
392                         
393                                                 if ((*it) != (it2->second)) { m->mothurOut((*it) + " is represented by " +  it2->second + " and is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine(); }
394                                                 else {  m->mothurOut((*it) + " is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine(); }
395                                         }else{
396                                                 //add seq to tree
397                         int num = nameCount[(*it)]; // we know its there since we found it in nameMap
398                                                 for (int i = 0; i < num; i++) {  phylo->addSeqToTree((*it)+toString(i), it2->second);  }
399                         size += num;
400                                         }
401                                 }
402                                 
403                         }else{
404                                 //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
405                                 map<string, string>::iterator itTax = taxMap.find((*it));
406                 
407                                 if (itTax == taxMap.end()) { //this name is not in taxonomy file, skip it
408                                         m->mothurOut((*it) + " is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine();
409                                 }else{
410                                         //add seq to tree
411                                         phylo->addSeqToTree((*it), itTax->second);
412                     size++;
413                                 }
414                         }
415             
416                         if (m->control_pressed) { delete phylo; return conTax; }
417                         
418                 }
419                 
420                 //build tree
421                 phylo->assignHeirarchyIDs(0);
422                 
423                 TaxNode currentNode = phylo->get(0);
424                 int myLevel = 0;        
425                 //at each level
426                 while (currentNode.children.size() != 0) { //you still have more to explore
427             
428                         TaxNode bestChild;
429                         int bestChildSize = 0;
430                         
431                         //go through children
432                         for (map<string, int>::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) {
433                                 
434                                 TaxNode temp = phylo->get(itChild->second);
435                                 
436                                 //select child with largest accesions - most seqs assigned to it
437                                 if (temp.accessions.size() > bestChildSize) {
438                                         bestChild = phylo->get(itChild->second);
439                                         bestChildSize = temp.accessions.size();
440                                 }
441                                 
442                         }
443             
444                         //is this taxonomy above cutoff
445                         int consensusConfidence = ceil((bestChildSize / (float) size) * 100);
446                         
447                         if (consensusConfidence >= cutoff) { //if yes, add it
448                 conTax += bestChild.name + "(" + toString(consensusConfidence) + ");";
449                                 myLevel++;
450                         }else{ //if no, quit
451                                 break;
452                         }
453                         
454                         //move down a level
455                         currentNode = bestChild;
456                 }
457                 
458                 if (myLevel != phylo->getMaxLevel()) {
459                         while (myLevel != phylo->getMaxLevel()) {
460                                 conTax += "unclassified;";
461                                 myLevel++;
462                         }
463                 }               
464                 if (conTax == "") {  conTax = "no_consensus;";  }
465                 
466                 delete phylo;   
467         
468         return conTax;
469         
470         }
471         catch(exception& e) {
472                 m->errorOut(e, "ClassifyTreeCommand", "getTaxonomy");
473                 exit(1);
474         }
475 }
476
477 //**********************************************************************************************************************
478 map<string, set<string> > ClassifyTreeCommand::getDescendantList(Tree*& T, int i, map<int, map<string, set<string> > > descendants){
479         try {
480                 map<string ,set<string> > names;
481                 
482                 map<string ,set<string> >::iterator it;
483         map<string ,set<string> >::iterator it2;
484                 
485                 int lc = T->tree[i].getLChild();
486                 int rc = T->tree[i].getRChild();
487                 
488                 if (lc == -1) { //you are a leaf your only descendant is yourself
489             string group = tmap->getGroup(T->tree[i].getName());
490             set<string> mynames; mynames.insert(T->tree[i].getName());
491             names[group] = mynames; //mygroup -> me
492             names["AllGroups"] = mynames;
493                 }else{ //your descedants are the combination of your childrens descendants
494                         names = descendants[lc];
495                         for (it = descendants[rc].begin(); it != descendants[rc].end(); it++) {
496                 it2 = names.find(it->first); //do we already have this group
497                 if (it2 == names.end()) { //nope, so add it
498                     names[it->first] = it->second;
499                 }else {
500                     for (set<string>::iterator it3 = (it->second).begin(); it3 != (it->second).end(); it3++) {
501                         names[it->first].insert(*it3);
502                     }
503                 }
504                                 
505                         }
506                 }
507                 
508                 return names;
509         }
510         catch(exception& e) {
511                 m->errorOut(e, "ClassifyTreeCommand", "getDescendantList");     
512                 exit(1);
513         }
514 }
515 //**********************************************************************************************************************
516 int ClassifyTreeCommand::readTaxonomyFile() {
517         try {
518                 
519                 ifstream in;
520                 m->openInputFile(taxonomyfile, in);
521                 
522                 string name, tax;
523         
524                 while(!in.eof()){
525                         in >> name >> tax;              
526                         m->gobble(in);
527                         
528                         //are there confidence scores, if so remove them
529                         if (tax.find_first_of('(') != -1) {  m->removeConfidences(tax); }
530                         
531                         taxMap[name] = tax;
532                         
533                         if (m->control_pressed) { in.close(); taxMap.clear(); return 0; }
534                 }
535                 in.close();
536                 
537                 return 0;
538         }
539         catch(exception& e) {
540                 m->errorOut(e, "ClassifyTreeCommand", "readTaxonomyFile");
541                 exit(1);
542         }
543 }
544
545 /*****************************************************************/
546 int ClassifyTreeCommand::readNamesFile() {
547         try {
548                 ifstream inNames;
549                 m->openInputFile(namefile, inNames);
550                 
551                 string name, names;
552         
553                 while(!inNames.eof()){
554                         inNames >> name;                        //read from first column  A
555                         inNames >> names;               //read from second column  A,B,C,D
556                         m->gobble(inNames);
557                         
558                         //parse names into vector
559                         vector<string> theseNames;
560                         m->splitAtComma(names, theseNames);
561             
562                         for (int i = 0; i < theseNames.size(); i++) {  nameMap[theseNames[i]] = name;  }
563             nameCount[name] = theseNames.size();
564                         
565                         if (m->control_pressed) { inNames.close(); nameMap.clear(); return 0; }
566                 }
567                 inNames.close();
568                 
569                 return 0;
570         }
571         catch(exception& e) {
572                 m->errorOut(e, "ClassifyTreeCommand", "readNamesFile");
573                 exit(1);
574         }
575 }
576
577 /*****************************************************************/
578
579