]> git.donarmstrong.com Git - mothur.git/commitdiff
Merge remote-tracking branch 'origin'
authorSarah Westcott <mothur.westcott@gmail.com>
Wed, 18 Sep 2013 16:54:25 +0000 (12:54 -0400)
committerSarah Westcott <mothur.westcott@gmail.com>
Wed, 18 Sep 2013 16:54:25 +0000 (12:54 -0400)
17 files changed:
Mothur.xcodeproj/project.pbxproj
ccode.h
chimeraccodecommand.cpp
chimeracheckcommand.cpp
chimeracheckrdp.h
chimerapintailcommand.cpp
classifyrfsharedcommand.cpp [new file with mode: 0755]
classifyrfsharedcommand.h [new file with mode: 0755]
classifysharedcommand.cpp
clustercommand.h
clustersplitcommand.cpp
commandfactory.cpp
decalc.h
hclustercommand.h
pintail.h
randomforest.cpp
randomforest.hpp

index 27607ff59b39c81ca60a3c5b7e081f6de81c73f7..8304f072147391d834a1a82f002ee60a697c04fe 100644 (file)
                A7E9B67212D37EC400DA6239 /* catchallcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = catchallcommand.cpp; sourceTree = "<group>"; };
                A7E9B67312D37EC400DA6239 /* catchallcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = catchallcommand.h; sourceTree = "<group>"; };
                A7E9B67412D37EC400DA6239 /* ccode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ccode.cpp; sourceTree = "<group>"; };
-               A7E9B67512D37EC400DA6239 /* ccode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ccode.h; sourceTree = "<group>"; };
+               A7E9B67512D37EC400DA6239 /* ccode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; lineEnding = 0; path = ccode.h; sourceTree = "<group>"; xcLanguageSpecificationIdentifier = xcode.lang.objcpp; };
                A7E9B67612D37EC400DA6239 /* chao1.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chao1.cpp; sourceTree = "<group>"; };
                A7E9B67712D37EC400DA6239 /* chao1.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chao1.h; sourceTree = "<group>"; };
                A7E9B67812D37EC400DA6239 /* chimera.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimera.cpp; sourceTree = "<group>"; };
                A7E9B67912D37EC400DA6239 /* chimera.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimera.h; sourceTree = "<group>"; };
                A7E9B67A12D37EC400DA6239 /* chimerabellerophoncommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimerabellerophoncommand.cpp; sourceTree = "<group>"; };
                A7E9B67B12D37EC400DA6239 /* chimerabellerophoncommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerabellerophoncommand.h; sourceTree = "<group>"; };
-               A7E9B67C12D37EC400DA6239 /* chimeraccodecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraccodecommand.cpp; sourceTree = "<group>"; };
+               A7E9B67C12D37EC400DA6239 /* chimeraccodecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; lineEnding = 0; path = chimeraccodecommand.cpp; sourceTree = "<group>"; xcLanguageSpecificationIdentifier = xcode.lang.cpp; };
                A7E9B67D12D37EC400DA6239 /* chimeraccodecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraccodecommand.h; sourceTree = "<group>"; };
-               A7E9B67E12D37EC400DA6239 /* chimeracheckcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeracheckcommand.cpp; sourceTree = "<group>"; };
+               A7E9B67E12D37EC400DA6239 /* chimeracheckcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; lineEnding = 0; path = chimeracheckcommand.cpp; sourceTree = "<group>"; xcLanguageSpecificationIdentifier = xcode.lang.cpp; };
                A7E9B67F12D37EC400DA6239 /* chimeracheckcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeracheckcommand.h; sourceTree = "<group>"; };
                A7E9B68012D37EC400DA6239 /* chimeracheckrdp.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeracheckrdp.cpp; sourceTree = "<group>"; };
-               A7E9B68112D37EC400DA6239 /* chimeracheckrdp.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeracheckrdp.h; sourceTree = "<group>"; };
-               A7E9B68212D37EC400DA6239 /* chimerapintailcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimerapintailcommand.cpp; sourceTree = "<group>"; };
+               A7E9B68112D37EC400DA6239 /* chimeracheckrdp.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; lineEnding = 0; path = chimeracheckrdp.h; sourceTree = "<group>"; xcLanguageSpecificationIdentifier = xcode.lang.objcpp; };
+               A7E9B68212D37EC400DA6239 /* chimerapintailcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; lineEnding = 0; path = chimerapintailcommand.cpp; sourceTree = "<group>"; xcLanguageSpecificationIdentifier = xcode.lang.cpp; };
                A7E9B68312D37EC400DA6239 /* chimerapintailcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerapintailcommand.h; sourceTree = "<group>"; };
                A7E9B68412D37EC400DA6239 /* chimerarealigner.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimerarealigner.cpp; sourceTree = "<group>"; };
                A7E9B68512D37EC400DA6239 /* chimerarealigner.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerarealigner.h; sourceTree = "<group>"; };
                A7E9B69A12D37EC400DA6239 /* clusterclassic.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = clusterclassic.cpp; sourceTree = "<group>"; };
                A7E9B69B12D37EC400DA6239 /* clusterclassic.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = clusterclassic.h; sourceTree = "<group>"; };
                A7E9B69C12D37EC400DA6239 /* clustercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = clustercommand.cpp; sourceTree = "<group>"; };
-               A7E9B69D12D37EC400DA6239 /* clustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = clustercommand.h; sourceTree = "<group>"; };
+               A7E9B69D12D37EC400DA6239 /* clustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; lineEnding = 0; path = clustercommand.h; sourceTree = "<group>"; xcLanguageSpecificationIdentifier = xcode.lang.objcpp; };
                A7E9B69E12D37EC400DA6239 /* clusterdoturcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = clusterdoturcommand.cpp; sourceTree = "<group>"; };
                A7E9B69F12D37EC400DA6239 /* clusterdoturcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = clusterdoturcommand.h; sourceTree = "<group>"; };
                A7E9B6A012D37EC400DA6239 /* clusterfragmentscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = clusterfragmentscommand.cpp; sourceTree = "<group>"; };
                A7E9B6A112D37EC400DA6239 /* clusterfragmentscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = clusterfragmentscommand.h; sourceTree = "<group>"; };
-               A7E9B6A212D37EC400DA6239 /* clustersplitcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = clustersplitcommand.cpp; sourceTree = "<group>"; };
+               A7E9B6A212D37EC400DA6239 /* clustersplitcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; lineEnding = 0; path = clustersplitcommand.cpp; sourceTree = "<group>"; xcLanguageSpecificationIdentifier = xcode.lang.cpp; };
                A7E9B6A312D37EC400DA6239 /* clustersplitcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = clustersplitcommand.h; sourceTree = "<group>"; };
                A7E9B6A412D37EC400DA6239 /* cmdargs.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = cmdargs.cpp; sourceTree = "<group>"; };
                A7E9B6A512D37EC400DA6239 /* cmdargs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = cmdargs.h; sourceTree = "<group>"; };
                A7E9B6BF12D37EC400DA6239 /* datavector.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = datavector.hpp; sourceTree = "<group>"; };
                A7E9B6C012D37EC400DA6239 /* dayhoff.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = dayhoff.h; sourceTree = "<group>"; };
                A7E9B6C112D37EC400DA6239 /* decalc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decalc.cpp; sourceTree = "<group>"; };
-               A7E9B6C212D37EC400DA6239 /* decalc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = decalc.h; sourceTree = "<group>"; };
+               A7E9B6C212D37EC400DA6239 /* decalc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; lineEnding = 0; path = decalc.h; sourceTree = "<group>"; xcLanguageSpecificationIdentifier = xcode.lang.objcpp; };
                A7E9B6C312D37EC400DA6239 /* deconvolutecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = deconvolutecommand.cpp; sourceTree = "<group>"; };
                A7E9B6C412D37EC400DA6239 /* deconvolutecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = deconvolutecommand.h; sourceTree = "<group>"; };
                A7E9B6C512D37EC400DA6239 /* degapseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; lineEnding = 0; path = degapseqscommand.cpp; sourceTree = "<group>"; xcLanguageSpecificationIdentifier = xcode.lang.cpp; };
                A7E9B71812D37EC400DA6239 /* hcluster.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = hcluster.cpp; sourceTree = SOURCE_ROOT; };
                A7E9B71912D37EC400DA6239 /* hcluster.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = hcluster.h; sourceTree = SOURCE_ROOT; };
                A7E9B71A12D37EC400DA6239 /* hclustercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = hclustercommand.cpp; sourceTree = "<group>"; };
-               A7E9B71B12D37EC400DA6239 /* hclustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = hclustercommand.h; sourceTree = "<group>"; };
+               A7E9B71B12D37EC400DA6239 /* hclustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; lineEnding = 0; path = hclustercommand.h; sourceTree = "<group>"; xcLanguageSpecificationIdentifier = xcode.lang.objcpp; };
                A7E9B71C12D37EC400DA6239 /* heatmap.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = heatmap.cpp; sourceTree = SOURCE_ROOT; };
                A7E9B71D12D37EC400DA6239 /* heatmap.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = heatmap.h; sourceTree = SOURCE_ROOT; };
                A7E9B71E12D37EC400DA6239 /* heatmapcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = heatmapcommand.cpp; sourceTree = "<group>"; };
diff --git a/ccode.h b/ccode.h
index 456b735df524ff3eed9c2688be95f8537c0d0c86..dcd48797da324699218114a5dfd1b64be4ac8dd3 100644 (file)
--- a/ccode.h
+++ b/ccode.h
@@ -15,7 +15,7 @@
 #include "decalc.h"
 
 /***********************************************************/
-//This class was created using the algorythms described in the 
+//This class was created using the algorithms described in the 
 // "Evaluating putative chimeric sequences from PCR-amplified products" paper 
 //by Juan M. Gonzalez, Johannes Zimmerman and Cesareo Saiz-Jimenez.
 
index 627f3a950f69bec32971b65b96d7eba89942558a..d890db4087813cc3c4c4e453b55013fa555e7689 100644 (file)
@@ -38,7 +38,7 @@ string ChimeraCcodeCommand::getHelpString(){
        try {
                string helpString = "";
                helpString += "The chimera.ccode command reads a fastafile and referencefile and outputs potentially chimeric sequences.\n";
-               helpString += "This command was created using the algorythms described in the 'Evaluating putative chimeric sequences from PCR-amplified products' paper by Juan M. Gonzalez, Johannes Zimmerman and Cesareo Saiz-Jimenez.\n";
+               helpString += "This command was created using the algorithms described in the 'Evaluating putative chimeric sequences from PCR-amplified products' paper by Juan M. Gonzalez, Johannes Zimmerman and Cesareo Saiz-Jimenez.\n";
                helpString += "The chimera.ccode command parameters are fasta, reference, filter, mask, processors, window and numwanted.\n";
                helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required unless you have a valid current fasta file. \n";
                helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n";
index 31a36f73c6d0339d775dd6a1a4efa8e523c0f14f..64ed9faa73e7cb72e5e307cc10e86f1a9bcfc195 100644 (file)
@@ -38,7 +38,7 @@ string ChimeraCheckCommand::getHelpString(){
        try {
                string helpString = "";
                helpString += "The chimera.check command reads a fastafile and referencefile and outputs potentially chimeric sequences.\n";
-               helpString += "This command was created using the algorythms described in CHIMERA_CHECK version 2.7 written by Niels Larsen. \n";
+               helpString += "This command was created using the algorithms described in CHIMERA_CHECK version 2.7 written by Niels Larsen. \n";
                helpString += "The chimera.check command parameters are fasta, reference, processors, ksize, increment, svg and name.\n";
                helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required unless you have a valid current fasta file. \n";
                helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n";
index cc23c2e9e8b0b794339a0dfa8fa91bae50f28c00..25db15b6738f4909a3a02e939d9a114b9d63fa29 100644 (file)
@@ -17,7 +17,7 @@
 #include "alignmentdb.h"
 
 /***********************************************************/
-//This class was created using the algorythms described in 
+//This class was created using the algorithms described in 
 //CHIMERA_CHECK version 2.7 written by Niels Larsen. 
 
 /***********************************************************/
index 9d492afe950713cefd4d1fad7fcd72a6da74640c..fca9f176d9e66c8452a8da5c451eba71ca76d6eb 100644 (file)
@@ -41,7 +41,7 @@ string ChimeraPintailCommand::getHelpString(){
        try {
                string helpString = "";
                helpString += "The chimera.pintail command reads a fastafile and referencefile and outputs potentially chimeric sequences.\n";
-               helpString += "This command was created using the algorythms described in the 'At Least 1 in 20 16S rRNA Sequence Records Currently Held in the Public Repositories is Estimated To Contain Substantial Anomalies' paper by Kevin E. Ashelford 1, Nadia A. Chuzhanova 3, John C. Fry 1, Antonia J. Jones 2 and Andrew J. Weightman 1.\n";
+               helpString += "This command was created using the algorithms described in the 'At Least 1 in 20 16S rRNA Sequence Records Currently Held in the Public Repositories is Estimated To Contain Substantial Anomalies' paper by Kevin E. Ashelford 1, Nadia A. Chuzhanova 3, John C. Fry 1, Antonia J. Jones 2 and Andrew J. Weightman 1.\n";
                helpString += "The chimera.pintail command parameters are fasta, reference, filter, mask, processors, window, increment, conservation and quantile.\n";
                helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required unless you have a valid current fasta file. \n";
                helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n";
diff --git a/classifyrfsharedcommand.cpp b/classifyrfsharedcommand.cpp
new file mode 100755 (executable)
index 0000000..d2cd9f9
--- /dev/null
@@ -0,0 +1,407 @@
+//
+//  classifysharedcommand.cpp
+//  Mothur
+//
+//  Created by Abu Zaher Md. Faridee on 8/13/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "classifyrfsharedcommand.h"
+#include "randomforest.hpp"
+#include "decisiontree.hpp"
+#include "rftreenode.hpp"
+
+//**********************************************************************************************************************
+vector<string> ClassifyRFSharedCommand::setParameters(){       
+       try {
+               //CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);        
+        CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none","summary",false,true,true); parameters.push_back(pshared);             
+        CommandParameter pdesign("design", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(pdesign);    
+        CommandParameter potupersplit("otupersplit", "Multiple", "log2-squareroot", "log2", "", "", "","",false,false); parameters.push_back(potupersplit);
+        CommandParameter psplitcriteria("splitcriteria", "Multiple", "gainratio-infogain", "gainratio", "", "", "","",false,false); parameters.push_back(psplitcriteria);
+               CommandParameter pnumtrees("numtrees", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pnumtrees);
+        
+            // parameters related to pruning
+        CommandParameter pdopruning("prune", "Boolean", "", "T", "", "", "", "", false, false); parameters.push_back(pdopruning);
+        CommandParameter ppruneaggrns("pruneaggressiveness", "Number", "", "0.9", "", "", "", "", false, false); parameters.push_back(ppruneaggrns);
+        CommandParameter pdiscardhetrees("discarderrortrees", "Boolean", "", "T", "", "", "", "", false, false); parameters.push_back(pdiscardhetrees);
+        CommandParameter phetdiscardthreshold("errorthreshold", "Number", "", "0.4", "", "", "", "", false, false); parameters.push_back(phetdiscardthreshold);
+        CommandParameter psdthreshold("stdthreshold", "Number", "", "0.0", "", "", "", "", false, false); parameters.push_back(psdthreshold);
+            // pruning params end
+
+        CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups);
+               CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel);
+               CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
+               CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
+               
+               vector<string> myArray;
+               for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
+               return myArray;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ClassifySharedCommand", "setParameters");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+string ClassifyRFSharedCommand::getHelpString(){       
+       try {
+               string helpString = "";
+               helpString += "The classify.shared command allows you to ....\n";
+               helpString += "The classify.shared command parameters are: shared, design, label, groups, otupersplit.\n";
+        helpString += "The label parameter is used to analyze specific labels in your input.\n";
+               helpString += "The groups parameter allows you to specify which of the groups in your designfile you would like analyzed.\n";
+               helpString += "The classify.shared should be in the following format: \n";
+               helpString += "classify.shared(shared=yourSharedFile, design=yourDesignFile)\n";
+               return helpString;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ClassifySharedCommand", "getHelpString");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+string ClassifyRFSharedCommand::getOutputPattern(string type) {
+    try {
+        string pattern = "";
+        
+        if (type == "summary") {  pattern = "[filename],[distance],summary"; } //makes file like: amazon.0.03.fasta
+        else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
+        
+        return pattern;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ClassifySharedCommand", "getOutputPattern");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+
+ClassifyRFSharedCommand::ClassifyRFSharedCommand() {
+  try {
+    abort = true; calledHelp = true;
+    setParameters();
+    vector<string> tempOutNames;
+    outputTypes["summary"] = tempOutNames; 
+  }
+  catch(exception& e) {
+    m->errorOut(e, "ClassifySharedCommand", "ClassifySharedCommand");
+    exit(1);
+  }
+}
+
+//**********************************************************************************************************************
+ClassifyRFSharedCommand::ClassifyRFSharedCommand(string option) {
+  try {
+    abort = false; calledHelp = false;
+    allLines = 1;
+      
+      //allow user to run help
+    if(option == "help") { help(); abort = true; calledHelp = true; }
+    else if(option == "citation") { citation(); abort = true; calledHelp = true;}
+    
+    else {
+        //valid paramters for this command
+      vector<string> myArray = setParameters();
+      
+      OptionParser parser(option);
+      map<string,string> parameters = parser.getParameters();
+      
+      ValidParameters validParameter;
+      map<string,string>::iterator it;
+        //check to make sure all parameters are valid for command
+      for (it = parameters.begin(); it != parameters.end(); it++) {
+        if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
+      }
+        vector<string> tempOutNames;
+        outputTypes["summary"] = tempOutNames;
+      
+        //if the user changes the input directory command factory will send this info to us in the output parameter
+      string inputDir = validParameter.validFile(parameters, "inputdir", false);
+      if (inputDir == "not found"){    inputDir = "";          }
+      else {
+        string path;
+        it = parameters.find("shared");
+          //user has given a shared file
+        if(it != parameters.end()){
+          path = m->hasPath(it->second);
+            //if the user has not given a path then, add inputdir. else leave path alone.
+          if (path == "") {    parameters["shared"] = inputDir + it->second;           }
+        }
+        
+        it = parameters.find("design");
+          //user has given a design file
+        if(it != parameters.end()){
+          path = m->hasPath(it->second);
+            //if the user has not given a path then, add inputdir. else leave path alone.
+          if (path == "") {    parameters["design"] = inputDir + it->second;           }
+        }
+        
+      }
+        //check for parameters
+        //get shared file, it is required
+      sharedfile = validParameter.validFile(parameters, "shared", true);
+      if (sharedfile == "not open") { sharedfile = ""; abort = true; }
+      else if (sharedfile == "not found") {
+          //if there is a current shared file, use it
+        sharedfile = m->getSharedFile();
+        if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); }
+        else {         m->mothurOut("You have no current sharedfile and the shared parameter is required."); m->mothurOutEndLine(); abort = true; }
+      }else { m->setSharedFile(sharedfile); }
+      
+        //get design file, it is required
+      designfile = validParameter.validFile(parameters, "design", true);
+      if (designfile == "not open") { sharedfile = ""; abort = true; }
+      else if (designfile == "not found") {
+          //if there is a current shared file, use it
+        designfile = m->getDesignFile();
+        if (designfile != "") { m->mothurOut("Using " + designfile + " as input file for the design parameter."); m->mothurOutEndLine(); }
+        else {         m->mothurOut("You have no current designfile and the design parameter is required."); m->mothurOutEndLine(); abort = true; }
+      }else { m->setDesignFile(designfile); }
+
+      
+        //if the user changes the output directory command factory will send this info to us in the output parameter
+      outputDir = validParameter.validFile(parameters, "outputdir", false);            if (outputDir == "not found"){
+        outputDir = m->hasPath(sharedfile); //if user entered a file with a path then preserve it
+      }
+      
+        // NEW CODE for OTU per split selection criteria
+        string temp = validParameter.validFile(parameters, "splitcriteria", false);
+        if (temp == "not found") { temp = "gainratio"; }
+        if ((temp == "gainratio") || (temp == "infogain")) {
+            treeSplitCriterion = temp;
+        } else { m->mothurOut("Not a valid tree splitting criterio. Valid tree splitting criteria are 'gainratio' and 'infogain'.");
+            m->mothurOutEndLine();
+            abort = true;
+        }
+        
+        temp = validParameter.validFile(parameters, "numtrees", false); if (temp == "not found"){      temp = "100";   }
+        m->mothurConvert(temp, numDecisionTrees);
+        
+            // parameters for pruning
+        temp = validParameter.validFile(parameters, "prune", false);
+        if (temp == "not found") { temp = "f"; }
+        doPruning = m->isTrue(temp);
+        
+        temp = validParameter.validFile(parameters, "pruneaggressiveness", false);
+        if (temp == "not found") { temp = "0.9"; }
+        m->mothurConvert(temp, pruneAggressiveness);
+        
+        temp = validParameter.validFile(parameters, "discarderrortrees", false);
+        if (temp == "not found") { temp = "f"; }
+        discardHighErrorTrees = m->isTrue(temp);
+        
+        temp = validParameter.validFile(parameters, "errorthreshold", false);
+        if (temp == "not found") { temp = "0.4"; }
+        m->mothurConvert(temp, highErrorTreeDiscardThreshold);
+        
+        temp = validParameter.validFile(parameters, "otupersplit", false);
+        if (temp == "not found") { temp = "log2"; }
+        if ((temp == "squareroot") || (temp == "log2")) {
+            optimumFeatureSubsetSelectionCriteria = temp;
+        } else { m->mothurOut("Not a valid OTU per split selection method. Valid OTU per split selection methods are 'log2' and 'squareroot'.");
+            m->mothurOutEndLine();
+            abort = true;
+        }
+        
+        temp = validParameter.validFile(parameters, "stdthreshold", false);
+        if (temp == "not found") { temp = "0.0"; }
+        m->mothurConvert(temp, featureStandardDeviationThreshold);
+                        
+            // end of pruning params
+        
+        //Groups must be checked later to make sure they are valid. SharedUtilities has functions of check the validity, just make to so m->setGroups() after the checks.  If you are using these with a shared file no need to check the SharedRAbundVector class will call SharedUtilites for you, kinda nice, huh?
+      string groups = validParameter.validFile(parameters, "groups", false);
+      if (groups == "not found") { groups = ""; }
+      else { m->splitAtDash(groups, Groups); }
+      m->setGroups(Groups);
+      
+        //Commonly used to process list, rabund, sabund, shared and relabund files.  Look at "smart distancing" examples below in the execute function.
+      string label = validParameter.validFile(parameters, "label", false);
+      if (label == "not found") { label = ""; }
+      else {
+        if(label != "all") {  m->splitAtDash(label, labels);  allLines = 0;  }
+        else { allLines = 1;  }
+      }
+    }
+    
+  }
+  catch(exception& e) {
+    m->errorOut(e, "ClassifySharedCommand", "ClassifySharedCommand");
+    exit(1);
+  }
+}
+//**********************************************************************************************************************
+int ClassifyRFSharedCommand::execute() {
+  try {
+    
+    if (abort == true) { if (calledHelp) { return 0; }  return 2;      }
+    
+    InputData input(sharedfile, "sharedfile");
+    vector<SharedRAbundVector*> lookup = input.getSharedRAbundVectors();
+        
+    //read design file
+    designMap.readDesignMap(designfile);
+    
+    string lastLabel = lookup[0]->getLabel();
+    set<string> processedLabels;
+    set<string> userLabels = labels;
+    
+      //as long as you are not at the end of the file or done wih the lines you want
+    while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
+      
+      if (m->control_pressed) { for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }  return 0; }
+      
+      if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){
+        
+        m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
+        
+        processSharedAndDesignData(lookup);  
+          
+        processedLabels.insert(lookup[0]->getLabel());
+        userLabels.erase(lookup[0]->getLabel());
+      }
+      
+      if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+        string saveLabel = lookup[0]->getLabel();
+        
+        for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
+        lookup = input.getSharedRAbundVectors(lastLabel);
+        m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
+        processSharedAndDesignData(lookup);        
+        
+        processedLabels.insert(lookup[0]->getLabel());
+        userLabels.erase(lookup[0]->getLabel());
+        
+          //restore real lastlabel to save below
+        lookup[0]->setLabel(saveLabel);
+      }
+      
+      lastLabel = lookup[0]->getLabel();
+        //prevent memory leak
+      for (int i = 0; i < lookup.size(); i++) {  delete lookup[i]; lookup[i] = NULL; }
+      
+      if (m->control_pressed) { return 0; }
+      
+        //get next line to process
+      lookup = input.getSharedRAbundVectors();
+    }
+    
+    if (m->control_pressed) {  return 0; }
+    
+      //output error messages about any remaining user labels
+    set<string>::iterator it;
+    bool needToRun = false;
+    for (it = userLabels.begin(); it != userLabels.end(); it++) {
+      m->mothurOut("Your file does not include the label " + *it);
+      if (processedLabels.count(lastLabel) != 1) {
+        m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
+        needToRun = true;
+      }else {
+        m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
+      }
+    }
+    
+      //run last label if you need to
+    if (needToRun == true)  {
+      for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }
+      lookup = input.getSharedRAbundVectors(lastLabel);
+      
+      m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
+      
+      processSharedAndDesignData(lookup);  
+        
+      for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
+      
+    }
+
+      m->mothurOutEndLine();
+      m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+      for (int i = 0; i < outputNames.size(); i++) {   m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
+      m->mothurOutEndLine();
+      
+    return 0;
+    
+  }
+  catch(exception& e) {
+    m->errorOut(e, "ClassifySharedCommand", "execute");
+    exit(1);
+  }
+}
+//**********************************************************************************************************************
+
+void ClassifyRFSharedCommand::processSharedAndDesignData(vector<SharedRAbundVector*> lookup){  
+    try {
+//    for (int i = 0; i < designMap->getNamesOfGroups().size(); i++) {
+//      string groupName = designMap->getNamesOfGroups()[i];
+//      cout << groupName << endl;
+//    }
+
+//    for (int i = 0; i < designMap->getNumSeqs(); i++) {
+//      string sharedGroupName = designMap->getNamesSeqs()[i];
+//      string treatmentName = designMap->getGroup(sharedGroupName);
+//      cout << sharedGroupName << " : " << treatmentName <<  endl;
+//    }
+  
+        map<string, int> treatmentToIntMap;
+        map<int, string> intToTreatmentMap;
+        for (int  i = 0; i < designMap.getNumGroups(); i++) {
+            string treatmentName = designMap.getNamesOfGroups()[i];
+            treatmentToIntMap[treatmentName] = i;
+            intToTreatmentMap[i] = treatmentName;
+        }
+        
+        int numSamples = lookup.size();
+        int numFeatures = lookup[0]->getNumBins();
+        
+        int numRows = numSamples;
+        int numColumns = numFeatures + 1;           // extra one space needed for the treatment/outcome
+        
+        vector< vector<int> > dataSet(numRows, vector<int>(numColumns, 0));
+        
+        vector<string> names;
+        
+        for (int i = 0; i < lookup.size(); i++) {
+            string sharedGroupName = lookup[i]->getGroup();
+            names.push_back(sharedGroupName);
+            string treatmentName = designMap.getGroup(sharedGroupName);
+            
+            int j = 0;
+            for (; j < lookup[i]->getNumBins(); j++) {
+                int otuCount = lookup[i]->getAbundance(j);
+                dataSet[i][j] = otuCount;
+            }
+            dataSet[i][j] = treatmentToIntMap[treatmentName];
+        }
+        
+        RandomForest randomForest(dataSet, numDecisionTrees, treeSplitCriterion, doPruning, pruneAggressiveness, discardHighErrorTrees, highErrorTreeDiscardThreshold, optimumFeatureSubsetSelectionCriteria, featureStandardDeviationThreshold);
+        
+        randomForest.populateDecisionTrees();
+        randomForest.calcForrestErrorRate();
+        randomForest.printConfusionMatrix(intToTreatmentMap);
+        
+        map<string, string> variables; 
+        variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "RF.";
+        variables["[distance]"] = lookup[0]->getLabel();
+        string filename = getOutputFileName("summary", variables);
+        outputNames.push_back(filename); outputTypes["summary"].push_back(filename);
+        randomForest.calcForrestVariableImportance(filename);
+        
+        //
+        map<string, string> variable; 
+        variable["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "misclassifications.";
+        variable["[distance]"] = lookup[0]->getLabel();
+        string mc_filename = getOutputFileName("summary", variable);
+        outputNames.push_back(mc_filename); outputTypes["summary"].push_back(mc_filename);
+        randomForest.getMissclassifications(mc_filename, intToTreatmentMap, names);
+        //
+        
+        m->mothurOutEndLine();
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ClassifySharedCommand", "processSharedAndDesignData");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+
diff --git a/classifyrfsharedcommand.h b/classifyrfsharedcommand.h
new file mode 100755 (executable)
index 0000000..6a948b2
--- /dev/null
@@ -0,0 +1,54 @@
+//
+//  classifysharedcommand.h
+//  Mothur
+//
+//  Created by Abu Zaher Md. Faridee on 8/13/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#ifndef __Mothur__classifyrfsharedcommand__
+#define __Mothur__classifyrfsharedcommand__
+
+#include "command.hpp"
+#include "inputdata.h"
+
+class ClassifyRFSharedCommand : public Command {
+public:
+  ClassifyRFSharedCommand();
+  ClassifyRFSharedCommand(string);
+  ~ClassifyRFSharedCommand() {};
+  
+  vector<string> setParameters();
+  string getCommandName()                      { return "classifyrf.shared";     }
+  string getCommandCategory()          { return "OTU-Based Approaches";                }  
+  string getHelpString();      
+  string getOutputPattern(string);
+  string getCitation() { return "http://www.mothur.org/wiki/Classifyrf.shared\n"; }
+  string getDescription()              { return "implements the random forest machine learning algorithm to identify OTUs that can be used to differentiate between various groups of samples"; }
+  int execute();
+  
+  void help() { m->mothurOut(getHelpString()); }
+
+private:
+    bool abort;
+    string outputDir;
+    vector<string> outputNames, Groups;
+  
+    string sharedfile, designfile;
+    set<string> labels;
+    bool allLines;
+  
+    int processors;
+    bool useTiming;
+
+    GroupMap designMap;
+  
+    int numDecisionTrees;
+    string treeSplitCriterion, optimumFeatureSubsetSelectionCriteria;
+    bool doPruning, discardHighErrorTrees;
+    double pruneAggressiveness, highErrorTreeDiscardThreshold, featureStandardDeviationThreshold;
+    
+    void processSharedAndDesignData(vector<SharedRAbundVector*> lookup);
+};
+
+#endif /* defined(__Mothur__classifyrfsharedcommand__) */
index 6e32fd1e800023348e4a58eeaad678f262430d25..c7eb6cd0daa18f5627824a94d297270a1ff32147 100755 (executable)
@@ -359,8 +359,11 @@ void ClassifySharedCommand::processSharedAndDesignData(vector<SharedRAbundVector
         
         vector< vector<int> > dataSet(numRows, vector<int>(numColumns, 0));
         
+        vector<string> names;
+        
         for (int i = 0; i < lookup.size(); i++) {
             string sharedGroupName = lookup[i]->getGroup();
+            names.push_back(sharedGroupName);
             string treatmentName = designMap.getGroup(sharedGroupName);
             
             int j = 0;
@@ -375,15 +378,24 @@ void ClassifySharedCommand::processSharedAndDesignData(vector<SharedRAbundVector
         
         randomForest.populateDecisionTrees();
         randomForest.calcForrestErrorRate();
+        randomForest.printConfusionMatrix(intToTreatmentMap);
         
         map<string, string> variables; 
-        variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile));
+        variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "RF.";
         variables["[distance]"] = lookup[0]->getLabel();
         string filename = getOutputFileName("summary", variables);
         outputNames.push_back(filename); outputTypes["summary"].push_back(filename);
-        
         randomForest.calcForrestVariableImportance(filename);
         
+        //
+        map<string, string> variable; 
+        variable["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "misclassifications.";
+        variable["[distance]"] = lookup[0]->getLabel();
+        string mc_filename = getOutputFileName("summary", variable);
+        outputNames.push_back(mc_filename); outputTypes["summary"].push_back(mc_filename);
+        randomForest.getMissclassifications(mc_filename, intToTreatmentMap, names);
+        //
+        
         m->mothurOutEndLine();
     }
     catch(exception& e) {
index 5786da220046713d4eb18a26f2aba4529011093c..cd9f47b07d9e8af9a6f04c014fabcfaf36a2818e 100644 (file)
@@ -21,7 +21,7 @@
        The cluster command outputs a .list , .rabund and .sabund files.  
        The cluster command parameter options are method, cuttoff and precision. No parameters are required.  
        The cluster command should be in the following format: cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision).  
-       The acceptable methods are furthest, nearest and average.  If you do not provide a method the default algorythm is furthest neighbor.  
+       The acceptable methods are furthest, nearest and average.  If you do not provide a method the default algorithm is furthest neighbor.  
        The cluster() command outputs three files *.list, *.rabund, and *.sabund.   */
 
 
index b02bd20063e9f70bb422fa541403013d6a1976ab..95693cc420a7e55d5c9a96a102e3d9e55ad0221c 100644 (file)
@@ -61,7 +61,7 @@ string ClusterSplitCommand::getHelpString(){
         helpString += "The cluster parameter allows you to indicate whether you want to run the clustering or just split the distance matrix, default=t";
                helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
                helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
-               helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
+               helpString += "The method allows you to specify what clustering algorithm you want to use, default=average, option furthest, nearest, or average. \n";
                helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
                helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
                helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
index 6ceabbfc6eee41a10f70ef8391fc3a12e202696f..03c3e49ccff50f128c657780da2b9d8109289358 100644 (file)
 #include "makecontigscommand.h"
 #include "loadlogfilecommand.h"
 #include "sffmultiplecommand.h"
-#include "classifysharedcommand.h"
+#include "classifyrfsharedcommand.h"
 #include "filtersharedcommand.h"
 #include "primerdesigncommand.h"
 #include "getdistscommand.h"
@@ -306,7 +306,7 @@ CommandFactory::CommandFactory(){
     commands["make.table"]          = "make.table";
     commands["sff.multiple"]        = "sff.multiple";
        commands["quit"]                                = "MPIEnabled"; 
-    commands["classify.shared"]                = "classify.shared"; 
+    commands["classifyrf.shared"]              = "classifyrf.shared"; 
     commands["filter.shared"]          = "filter.shared"; 
     commands["primer.design"]          = "primer.design";
     commands["get.dists"]           = "get.dists";
@@ -533,7 +533,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){
         else if(commandName == "make.contigs")          {      command = new MakeContigsCommand(optionString);             }
         else if(commandName == "load.logfile")          {      command = new LoadLogfileCommand(optionString);             }
         else if(commandName == "sff.multiple")          {      command = new SffMultipleCommand(optionString);             }
-        else if(commandName == "classify.shared")       {      command = new ClassifySharedCommand(optionString);          }
+        else if(commandName == "classifyrf.shared")       {    command = new ClassifyRFSharedCommand(optionString);          }
         else if(commandName == "filter.shared")         {      command = new FilterSharedCommand(optionString);            }
         else if(commandName == "primer.design")         {      command = new PrimerDesignCommand(optionString);            }
         else if(commandName == "get.dists")             {      command = new GetDistsCommand(optionString);                }
@@ -701,7 +701,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString, str
         else if(commandName == "make.contigs")          {      pipecommand = new MakeContigsCommand(optionString);             }
         else if(commandName == "load.logfile")          {      pipecommand = new LoadLogfileCommand(optionString);             }
         else if(commandName == "sff.multiple")          {      pipecommand = new SffMultipleCommand(optionString);             }
-        else if(commandName == "classify.shared")       {      pipecommand = new ClassifySharedCommand(optionString);          }
+        else if(commandName == "classifyrf.shared")       {    pipecommand = new ClassifyRFSharedCommand(optionString);          }
         else if(commandName == "filter.shared")         {      pipecommand = new FilterSharedCommand(optionString);            }
         else if(commandName == "primer.design")         {      pipecommand = new PrimerDesignCommand(optionString);            }
         else if(commandName == "get.dists")             {      pipecommand = new GetDistsCommand(optionString);                }
@@ -855,7 +855,7 @@ Command* CommandFactory::getCommand(string commandName){
         else if(commandName == "make.contigs")          {      shellcommand = new MakeContigsCommand();            }
         else if(commandName == "load.logfile")          {      shellcommand = new LoadLogfileCommand();            }
         else if(commandName == "sff.multiple")          {      shellcommand = new SffMultipleCommand();            }
-        else if(commandName == "classify.shared")       {      shellcommand = new ClassifySharedCommand();         }
+        else if(commandName == "classifyrf.shared")       {    shellcommand = new ClassifyRFSharedCommand();         }
         else if(commandName == "filter.shared")         {      shellcommand = new FilterSharedCommand();           }
         else if(commandName == "primer.design")         {      shellcommand = new PrimerDesignCommand();           }
         else if(commandName == "get.dists")             {      shellcommand = new GetDistsCommand();               }
index d6cca182e81937dd5a494d57c2c8221b6039ad45..d1daf050e004333228e39fb5aaa7621dd1b0bcf6 100644 (file)
--- a/decalc.h
+++ b/decalc.h
@@ -14,7 +14,7 @@
 
 /***********************************************************************/
 
-//This class was created using the algorythms described in the 
+//This class was created using the algorithms described in the 
 // "At Least 1 in 20 16S rRNA Sequence Records Currently Held in the Public Repositories is Estimated To Contain Substantial Anomalies" paper 
 //by Kevin E. Ashelford 1, Nadia A. Chuzhanova 3, John C. Fry 1, Antonia J. Jones 2 and Andrew J. Weightman 1.
 
index d1074070420fc89347a65b8fa8c0448c953c8881..1b8e9b7607df66c0a36621f31ced21772f36a909 100644 (file)
@@ -18,7 +18,7 @@
 #include "readcluster.h"
 
 /******************************************************************/
-//This command is an implementation of the HCluster algorythmn described in 
+//This command is an implementation of the HCluster algorithmn described in 
 //ESPRIT: estimating species richness using large collections of 16S rRNA pyrosequences by
 //Yijun Sun1,2,*, Yunpeng Cai2, Li Liu1, Fahong Yu1, Michael L. Farrell3, William McKendree3 
 //and William Farmerie1 1 
index 92c399889c3be0aa0e552fe296612eb336bc8168..c970fdd87371e66afef3a9e1a40b465857d07b4b 100644 (file)
--- a/pintail.h
+++ b/pintail.h
@@ -15,7 +15,7 @@
 #include "decalc.h"
 
 /***********************************************************/
-//This class was created using the algorythms described in the 
+//This class was created using the algorithms described in the 
 // "At Least 1 in 20 16S rRNA Sequence Records Currently Held in the Public Repositories is Estimated To Contain Substantial Anomalies" paper 
 //by Kevin E. Ashelford 1, Nadia A. Chuzhanova 3, John C. Fry 1, Antonia J. Jones 2 and Andrew J. Weightman 1.
 
index 2ae0eb595c1c8d5f989c44a5e561a0ff3dac2d2b..d998de63ee8f31df493cd8546ecb5fb00b83311c 100644 (file)
@@ -37,7 +37,7 @@ int RandomForest::calcForrestErrorRate() {
             vector<int>::iterator maxPredictedOutComeIterator = max_element(predictedOutComes.begin(), predictedOutComes.end());
             int majorityVotedOutcome = (int)(maxPredictedOutComeIterator - predictedOutComes.begin());
             int realOutcome = dataSet[indexOfSample][numFeatures];
-            
+                                   
             if (majorityVotedOutcome == realOutcome) { numCorrect++; }
         }
         
@@ -46,7 +46,7 @@ int RandomForest::calcForrestErrorRate() {
         
         m->mothurOut("numCorrect = " + toString(numCorrect)+ "\n");
         m->mothurOut("forrestErrorRate = " + toString(forrestErrorRate)+ "\n");
-    
+            
         return 0;
     }
        catch(exception& e) {
@@ -54,6 +54,87 @@ int RandomForest::calcForrestErrorRate() {
                exit(1);
        } 
 }
+/***********************************************************************/
+
+int RandomForest::printConfusionMatrix(map<int, string> intToTreatmentMap) {
+    try {
+        int numGroups = intToTreatmentMap.size();
+        vector<vector<int> > cm(numGroups, vector<int>(numGroups, 0));
+        
+        for (map<int, vector<int> >::iterator it = globalOutOfBagEstimates.begin(); it != globalOutOfBagEstimates.end(); it++) {
+            
+            if (m->control_pressed) { return 0; }
+            
+            int indexOfSample = it->first; //key
+            vector<int> predictedOutComes = it->second; //value, vector of all predicted classes
+            vector<int>::iterator maxPredictedOutComeIterator = max_element(predictedOutComes.begin(), predictedOutComes.end());
+            int majorityVotedOutcome = (int)(maxPredictedOutComeIterator - predictedOutComes.begin());
+            int realOutcome = dataSet[indexOfSample][numFeatures];                       
+            cm[realOutcome][majorityVotedOutcome] = cm[realOutcome][majorityVotedOutcome] + 1;
+        }
+        
+        vector<int> fw;
+        for (int w = 0; w <numGroups; w++) {
+            fw.push_back(intToTreatmentMap[w].length());
+        }
+        
+        m->mothurOut("confusion matrix:\n\t\t");
+        for (int k = 0; k < numGroups; k++) {
+            //m->mothurOut(intToTreatmentMap[k] + "\t");
+            cout << setw(fw[k]) << intToTreatmentMap[k] << "\t";
+        }
+        for (int i = 0; i < numGroups; i++) {
+            cout << "\n" << setw(fw[i]) << intToTreatmentMap[i] << "\t";
+            //m->mothurOut("\n" + intToTreatmentMap[i] + "\t");
+            if (m->control_pressed) { return 0; }
+            for (int j = 0; j < numGroups; j++) {
+                //m->mothurOut(toString(cm[i][j]) + "\t");
+                cout << setw(fw[i]) << cm[i][j] << "\t";
+            }    
+        }
+        //m->mothurOut("\n");
+        cout << "\n";
+
+        return 0;
+    }
+    
+    catch(exception& e) {
+               m->errorOut(e, "RandomForest", "printConfusionMatrix");
+               exit(1);
+       }
+}
+
+/***********************************************************************/
+
+int RandomForest::getMissclassifications(string filename, map<int, string> intToTreatmentMap, vector<string> names) {
+    try {
+        ofstream out;
+        m->openOutputFile(filename, out);
+        out <<"Sample\tRF classification\tActual classification\n";
+        for (map<int, vector<int> >::iterator it = globalOutOfBagEstimates.begin(); it != globalOutOfBagEstimates.end(); it++) {
+            
+            if (m->control_pressed) { return 0; }
+            
+            int indexOfSample = it->first;
+            vector<int> predictedOutComes = it->second;
+            vector<int>::iterator maxPredictedOutComeIterator = max_element(predictedOutComes.begin(), predictedOutComes.end());
+            int majorityVotedOutcome = (int)(maxPredictedOutComeIterator - predictedOutComes.begin());
+            int realOutcome = dataSet[indexOfSample][numFeatures];
+                                   
+            if (majorityVotedOutcome != realOutcome) {             
+                out << names[indexOfSample] << "\t" << intToTreatmentMap[majorityVotedOutcome] << "\t" << intToTreatmentMap[realOutcome] << endl;
+                                
+            }
+        }
+        
+        out.close();    
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "RandomForest", "getMissclassifications");
+               exit(1);
+       } 
+}
 
 /***********************************************************************/
 int RandomForest::calcForrestVariableImportance(string filename) {
@@ -97,7 +178,7 @@ int RandomForest::calcForrestVariableImportance(string filename) {
         
         ofstream out;
         m->openOutputFile(filename, out);
-        out <<"OTU\tRank\n";
+        out <<"OTU\tMean decrease accuracy\n";
         for (int i = 0; i < globalVariableRanks.size(); i++) {
             out << m->currentBinLabels[(int)globalVariableRanks[i].first] << '\t' << globalVariableImportanceList[globalVariableRanks[i].first] << endl;
         }
index d0ac1ec063d46d4ff1367e87cf322870f7e49a9c..67d4372715ac2a6e31ba92b4a1f954910d1197f2 100644 (file)
@@ -43,6 +43,8 @@ public:
     int calcForrestVariableImportance(string);
     int populateDecisionTrees();
     int updateGlobalOutOfBagEstimates(DecisionTree* decisionTree);
+    int printConfusionMatrix(map<int, string> intToTreatmentMap);
+    int getMissclassifications(string, map<int, string> intToTreatmentMap, vector<string> names);
     
 private:
     MothurOut* m;