]> git.donarmstrong.com Git - mothur.git/commitdiff
added new class forest
author Kathryn Iverson <kd.iverson@gmail.com>
Fri, 26 Oct 2012 17:48:30 +0000 (13:48 -0400)
committer Kathryn Iverson <kd.iverson@gmail.com>
Fri, 26 Oct 2012 17:48:30 +0000 (13:48 -0400)
Mothur.xcodeproj/project.pbxproj
forest.cpp [new file with mode: 0644]
forest.h [new file with mode: 0644]

index ecb0619a39d993201f9bd102813c9b10b7cc3f0b..79f445785da964b5ef5b8581b2d57e252ea9c7b3 100644 (file)
@@ -10,6 +10,7 @@
                219C1DE01552C4BD004209F9 /* newcommandtemplate.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 219C1DDF1552C4BD004209F9 /* newcommandtemplate.cpp */; };
                219C1DE41559BCCF004209F9 /* getcoremicrobiomecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 219C1DE31559BCCD004209F9 /* getcoremicrobiomecommand.cpp */; };
                7E6BE10A12F710D8007ADDBE /* refchimeratest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7E6BE10912F710D8007ADDBE /* refchimeratest.cpp */; };
+               83F25B0C163B031200ABE73D /* forest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 83F25B0A163B031200ABE73D /* forest.cpp */; };
                8DD76FB00486AB0100D96B5E /* mothur.1 in CopyFiles */ = {isa = PBXBuildFile; fileRef = C6A0FF2C0290799A04C91782 /* mothur.1 */; };
                A70056E6156A93D000924A2D /* getotulabelscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70056E5156A93D000924A2D /* getotulabelscommand.cpp */; };
                A70056EB156AB6E500924A2D /* removeotulabelscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70056EA156AB6E500924A2D /* removeotulabelscommand.cpp */; };
                7E6BE10812F710D8007ADDBE /* refchimeratest.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = refchimeratest.h; sourceTree = "<group>"; };
                7E6BE10912F710D8007ADDBE /* refchimeratest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = refchimeratest.cpp; sourceTree = "<group>"; };
                7E78911B135F3E8600E725D2 /* eachgapdistignorens.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = eachgapdistignorens.h; sourceTree = "<group>"; };
+               83F25B0A163B031200ABE73D /* forest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = forest.cpp; sourceTree = "<group>"; };
+               83F25B0B163B031200ABE73D /* forest.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = forest.h; sourceTree = "<group>"; };
                8DD76FB20486AB0100D96B5E /* mothur */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = mothur; sourceTree = BUILT_PRODUCTS_DIR; };
                A70056E5156A93D000924A2D /* getotulabelscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getotulabelscommand.cpp; sourceTree = "<group>"; };
                A70056E8156A93E300924A2D /* getotulabelscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getotulabelscommand.h; sourceTree = "<group>"; };
                                A77E1937161B201E00DB1A2A /* randomforest.cpp */,
                                A7386C201619CACB00651424 /* rftreenode.hpp */,
                                A77E193A161B289600DB1A2A /* rftreenode.cpp */,
+                               83F25B0A163B031200ABE73D /* forest.cpp */,
+                               83F25B0B163B031200ABE73D /* forest.h */,
                        );
                        name = randomforest;
                        sourceTree = "<group>";
                                A721AB71161C572A009860A1 /* kmernode.cpp in Sources */,
                                A721AB72161C572A009860A1 /* kmertree.cpp in Sources */,
                                A721AB77161C573B009860A1 /* taxonomynode.cpp in Sources */,
+                               83F25B0C163B031200ABE73D /* forest.cpp in Sources */,
                        );
                        runOnlyForDeploymentPostprocessing = 0;
                };
diff --git a/forest.cpp b/forest.cpp
new file mode 100644 (file)
index 0000000..8ac1b79
--- /dev/null
@@ -0,0 +1,59 @@
+//
+//  forest.cpp
+//  Mothur
+//
+//  Created by Kathryn Iverson on 10/26/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "forest.h"
+
+/***********************************************************************/
+Forest::Forest(const std::vector < std::vector<int> > dataSet,
+                                           const int numDecisionTrees,
+                                           const string treeSplitCriterion = "informationGain")
+: dataSet(dataSet),
+numDecisionTrees(numDecisionTrees),
+numSamples((int)dataSet.size()),
+numFeatures((int)(dataSet[0].size() - 1)),
+globalDiscardedFeatureIndices(getGlobalDiscardedFeatureIndices()),
+globalVariableImportanceList(numFeatures, 0),
+treeSplitCriterion(treeSplitCriterion) {
+    m = MothurOut::getInstance();
+    // TODO: double check if the implemenatation of 'globalOutOfBagEstimates' is correct
+}
+
+/***********************************************************************/
+
+vector<int> Forest::getGlobalDiscardedFeatureIndices() {
+    try {
+        vector<int> globalDiscardedFeatureIndices;
+        
+        // calculate feature vectors
+        vector< vector<int> > featureVectors(numFeatures, vector<int>(numSamples, 0));
+        for (int i = 0; i < numSamples; i++) {
+            if (m->control_pressed) { return globalDiscardedFeatureIndices; }
+            for (int j = 0; j < numFeatures; j++) { featureVectors[j][i] = dataSet[i][j]; }
+        }
+        
+        for (int i = 0; i < featureVectors.size(); i++) {
+            if (m->control_pressed) { return globalDiscardedFeatureIndices; }
+            double standardDeviation = m->getStandardDeviation(featureVectors[i]);
+            if (standardDeviation <= 0){ globalDiscardedFeatureIndices.push_back(i); }
+        }
+        
+        if (m->debug) {
+            m->mothurOut("number of global discarded features:  " + toString(globalDiscardedFeatureIndices.size())+ "\n");
+            m->mothurOut("total features: " + toString(featureVectors.size())+ "\n");
+        }
+        
+        return globalDiscardedFeatureIndices;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "Forest", "getGlobalDiscardedFeatureIndices");
+               exit(1);
+       }
+}
+
+/***********************************************************************/
+
diff --git a/forest.h b/forest.h
new file mode 100644 (file)
index 0000000..c9d29dc
--- /dev/null
+++ b/forest.h
@@ -0,0 +1,67 @@
+//
+//  forest.h
+//  Mothur
+//
+//  Created by Kathryn Iverson on 10/26/12. Modified abstractrandomforest
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#ifndef __Mothur__forest__
+#define __Mothur__forest__
+
+#include <iostream>
+#include "mothurout.h"
+#include "macros.h"
+#include "abstractdecisiontree.hpp"
+/***********************************************************************/
+// Forest: abstract base class for a set of decision trees; a re-implementation of the abstractrandomforest class
+// subclasses must implement populateDecisionTrees(), calcForrestErrorRate() and calcForrestVariableImportance()
+class Forest{
+public:
+    // initialization from a vector-of-vectors data set: numSamples is its row count, numFeatures the width of row 0 minus one
+    Forest(const std::vector < std::vector<int> > dataSet,
+                         const int numDecisionTrees,
+                         const string);
+    virtual ~Forest(){ }
+    virtual int populateDecisionTrees() = 0;
+    virtual int calcForrestErrorRate() = 0;
+    virtual int calcForrestVariableImportance(string) = 0;
+    // NOTE: "Forrest" is spelled with two r's in the two method names above; overrides must match that spelling
+    /***********************************************************************/
+    
+protected:
+    
+    // TODO: create a better way of discarding features
+    // currently we just set FEATURE_DISCARD_SD_THRESHOLD to 0 to solve this
+    // it can be tuned for better selection
+    // also, there might be other factors worth considering, such as the mean
+    // the same would apply to createLocalDiscardedFeatureList in the TreeNode class
+    
+    // TODO: Another idea is to build an aggregated list of discarded feature indices after the run, by combining
+    // the local discarded feature indices
+    // this would penalize a feature, even if in global space the feature looks quite good
+    // the penalization would be averaged, so this would be unlikely to create a local optimum
+    // returns the indices of the features whose standard deviation over all samples is <= 0 (see forest.cpp)
+    vector<int> getGlobalDiscardedFeatureIndices();
+    // data members: C++ initializes them in this declaration order, whatever order a constructor lists them in
+    int numDecisionTrees;
+    int numSamples;
+    int numFeatures;
+    vector< vector<int> > dataSet;
+    vector<int> globalDiscardedFeatureIndices;
+    vector<double> globalVariableImportanceList;
+    string treeSplitCriterion;
+    // This is a map from each feature to the outcome count of each class
+    // e.g. 1 => [2 7] means feature 1 has 2 outcomes of 0 and 7 outcomes of 1
+    map<int, vector<int> > globalOutOfBagEstimates;
+    // NOTE(review): ~Forest() is empty, so this class never frees the pointers below -- confirm who owns them
+    // TODO: fix this, do we use pointers?
+    vector<AbstractDecisionTree*> decisionTrees;
+    // set by the constructor from MothurOut::getInstance()
+    MothurOut* m;
+    
+private:
+    
+};
+
+#endif /* defined(__Mothur__forest__) */