219C1DE01552C4BD004209F9 /* newcommandtemplate.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 219C1DDF1552C4BD004209F9 /* newcommandtemplate.cpp */; };
219C1DE41559BCCF004209F9 /* getcoremicrobiomecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 219C1DE31559BCCD004209F9 /* getcoremicrobiomecommand.cpp */; };
7E6BE10A12F710D8007ADDBE /* refchimeratest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7E6BE10912F710D8007ADDBE /* refchimeratest.cpp */; };
+ 83F25B0C163B031200ABE73D /* forest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 83F25B0A163B031200ABE73D /* forest.cpp */; };
8DD76FB00486AB0100D96B5E /* mothur.1 in CopyFiles */ = {isa = PBXBuildFile; fileRef = C6A0FF2C0290799A04C91782 /* mothur.1 */; };
A70056E6156A93D000924A2D /* getotulabelscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70056E5156A93D000924A2D /* getotulabelscommand.cpp */; };
A70056EB156AB6E500924A2D /* removeotulabelscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70056EA156AB6E500924A2D /* removeotulabelscommand.cpp */; };
7E6BE10812F710D8007ADDBE /* refchimeratest.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = refchimeratest.h; sourceTree = "<group>"; };
7E6BE10912F710D8007ADDBE /* refchimeratest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = refchimeratest.cpp; sourceTree = "<group>"; };
7E78911B135F3E8600E725D2 /* eachgapdistignorens.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = eachgapdistignorens.h; sourceTree = "<group>"; };
+ 83F25B0A163B031200ABE73D /* forest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = forest.cpp; sourceTree = "<group>"; };
+ 83F25B0B163B031200ABE73D /* forest.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = forest.h; sourceTree = "<group>"; };
8DD76FB20486AB0100D96B5E /* mothur */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = mothur; sourceTree = BUILT_PRODUCTS_DIR; };
A70056E5156A93D000924A2D /* getotulabelscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getotulabelscommand.cpp; sourceTree = "<group>"; };
A70056E8156A93E300924A2D /* getotulabelscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getotulabelscommand.h; sourceTree = "<group>"; };
A77E1937161B201E00DB1A2A /* randomforest.cpp */,
A7386C201619CACB00651424 /* rftreenode.hpp */,
A77E193A161B289600DB1A2A /* rftreenode.cpp */,
+ 83F25B0A163B031200ABE73D /* forest.cpp */,
+ 83F25B0B163B031200ABE73D /* forest.h */,
);
name = randomforest;
sourceTree = "<group>";
A721AB71161C572A009860A1 /* kmernode.cpp in Sources */,
A721AB72161C572A009860A1 /* kmertree.cpp in Sources */,
A721AB77161C573B009860A1 /* taxonomynode.cpp in Sources */,
+ 83F25B0C163B031200ABE73D /* forest.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
--- /dev/null
+//
+// forest.cpp
+// Mothur
+//
+// Created by Kathryn Iverson on 10/26/12.
+// Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "forest.h"
+
+/***********************************************************************/
+// Builds the state shared by every forest implementation from a tabular data set.
+//
+//   dataSet            - one row per sample; the feature count is taken from the
+//                        first row as (number of columns - 1). An empty data set,
+//                        or an empty first row, yields zero features.
+//   numDecisionTrees   - stored unchanged for use by derived classes.
+//   treeSplitCriterion - stored unchanged. The default written on this definition
+//                        is only visible to code that follows it in this
+//                        translation unit; the declaration in forest.h has none.
+//
+// Initializers are listed in member declaration order, which is the order C++
+// actually runs them in regardless of how they are written here.
+Forest::Forest(const std::vector < std::vector<int> > dataSet,
+               const int numDecisionTrees,
+               const string treeSplitCriterion = "informationGain")
+: numDecisionTrees(numDecisionTrees),
+numSamples(static_cast<int>(dataSet.size())),
+// guard the first-row lookup so an empty data set does not index out of range
+numFeatures((dataSet.empty() || dataSet[0].empty()) ? 0 : static_cast<int>(dataSet[0].size() - 1)),
+dataSet(dataSet),
+globalDiscardedFeatureIndices(),
+globalVariableImportanceList(numFeatures, 0),
+treeSplitCriterion(treeSplitCriterion),
+m(MothurOut::getInstance()) {
+    // getGlobalDiscardedFeatureIndices() dereferences m, so it must run only
+    // after m has been set; calling it from the initializer list would read an
+    // uninitialized pointer.
+    globalDiscardedFeatureIndices = getGlobalDiscardedFeatureIndices();
+    // TODO: double check if the implementation of 'globalOutOfBagEstimates' is correct
+}
+
+/***********************************************************************/
+
+// Returns the indices of the features whose standard deviation over all
+// samples is <= 0, i.e. features that never vary in dataSet.
+// If m->control_pressed becomes set while working, whatever has been collected
+// so far is returned immediately. Any std::exception is reported through
+// m->errorOut and the process exits with status 1.
+vector<int> Forest::getGlobalDiscardedFeatureIndices() {
+    try {
+        vector<int> constantFeatures;
+
+        // transpose dataSet so that each feature's values sit in one vector
+        vector< vector<int> > valuesByFeature(numFeatures, vector<int>(numSamples, 0));
+        for (int sample = 0; sample < numSamples; sample++) {
+            if (m->control_pressed) { return constantFeatures; }
+            const vector<int>& row = dataSet[sample];
+            for (int feature = 0; feature < numFeatures; feature++) {
+                valuesByFeature[feature][sample] = row[feature];
+            }
+        }
+
+        // a feature with no spread carries no information, so mark it as discarded
+        for (int feature = 0; feature < valuesByFeature.size(); feature++) {
+            if (m->control_pressed) { return constantFeatures; }
+            const double spread = m->getStandardDeviation(valuesByFeature[feature]);
+            if (spread <= 0) {
+                constantFeatures.push_back(feature);
+            }
+        }
+
+        if (m->debug) {
+            m->mothurOut("number of global discarded features: " + toString(constantFeatures.size())+ "\n");
+            m->mothurOut("total features: " + toString(valuesByFeature.size())+ "\n");
+        }
+
+        return constantFeatures;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "Forest", "getGlobalDiscardedFeatureIndices");
+        exit(1);
+    }
+}
+
+/***********************************************************************/
+
--- /dev/null
+//
+// forest.h
+// Mothur
+//
+// Created by Kathryn Iverson on 10/26/12. Modified abstractrandomforest
+// Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#ifndef __Mothur__forest__
+#define __Mothur__forest__
+
+#include <iostream>
+#include "mothurout.h"
+#include "macros.h"
+#include "abstractdecisiontree.hpp"
+/***********************************************************************/
+//this is a re-implementation of the abstractrandomforest class
+
+// Abstract base class for random-forest implementations: it stores the data
+// set, the forest parameters and the bookkeeping containers, and leaves tree
+// construction, error-rate and variable-importance calculation to subclasses
+// through the pure virtual functions below.
+class Forest{
+public:
+ // initialization with vectors
+ // dataSet: one row per sample (taken by value and copied into the member).
+ // numDecisionTrees: number of trees, stored as given.
+ // const string: the tree split criterion, stored in treeSplitCriterion.
+ Forest(const std::vector < std::vector<int> > dataSet,
+ const int numDecisionTrees,
+ const string);
+ // The destructor body is empty: nothing here deletes the pointers held in
+ // decisionTrees.
+ virtual ~Forest(){ }
+ // Hooks every concrete forest must provide. The meaning of the int return
+ // values is not defined in this header.
+ virtual int populateDecisionTrees() = 0;
+ virtual int calcForrestErrorRate() = 0;
+ virtual int calcForrestVariableImportance(string) = 0;
+ 
+ /***********************************************************************/
+ 
+protected:
+ 
+ // TODO: create a better way of discarding feature
+ // currently we just set FEATURE_DISCARD_SD_THRESHOLD to 0 to solve this
+ // it can be tuned for better selection
+ // also, there might be other factors like Mean or other stuffs
+ // same would apply for createLocalDiscardedFeatureList in the TreeNode class
+ 
+ // TODO: Another idea is getting an aggregated discarded feature indices after the run, from combining
+ // the local discarded feature indices
+ // this would penalize a feature, even if in global space the feature looks quite good
+ // the penalization would be averaged, so this would be unlikely to create a local optimum
+ 
+ // Returns the indices of features whose standard deviation across all samples is <= 0.
+ vector<int> getGlobalDiscardedFeatureIndices();
+ 
+ // NOTE: C++ initializes members in the order they are declared here, not in
+ // the order a constructor lists them; take care before reordering.
+ int numDecisionTrees;
+ // number of rows in dataSet
+ int numSamples;
+ // column count of the first row minus one
+ // NOTE(review): presumably the last column holds the class label - confirm
+ int numFeatures;
+ vector< vector<int> > dataSet;
+ // result of getGlobalDiscardedFeatureIndices(), filled in by the constructor
+ vector<int> globalDiscardedFeatureIndices;
+ // one entry per feature, all starting at 0
+ vector<double> globalVariableImportanceList;
+ string treeSplitCriterion;
+ // This is a map of each feature to outcome count of each classes
+ // e.g. 1 => [2 7] means feature 1 has 2 outcome of 0 and 7 outcome of 1
+ map<int, vector<int> > globalOutOfBagEstimates;
+ 
+ // TODO: fix this, do we use pointers?
+ // NOTE(review): ownership of these pointers is not established in this class
+ vector<AbstractDecisionTree*> decisionTrees;
+ 
+ // mothur's shared output/control object, obtained via MothurOut::getInstance()
+ MothurOut* m;
+ 
+private:
+ 
+};
+
+#endif /* defined(__Mothur__forest__) */