From: Kathryn Iverson Date: Fri, 26 Oct 2012 17:48:30 +0000 (-0400) Subject: added new class forest X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=5e1ab7456ec5e9e516cfa0fec6afef2c2a03a257 added new class forest --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index ecb0619..79f4457 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -10,6 +10,7 @@ 219C1DE01552C4BD004209F9 /* newcommandtemplate.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 219C1DDF1552C4BD004209F9 /* newcommandtemplate.cpp */; }; 219C1DE41559BCCF004209F9 /* getcoremicrobiomecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 219C1DE31559BCCD004209F9 /* getcoremicrobiomecommand.cpp */; }; 7E6BE10A12F710D8007ADDBE /* refchimeratest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7E6BE10912F710D8007ADDBE /* refchimeratest.cpp */; }; + 83F25B0C163B031200ABE73D /* forest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 83F25B0A163B031200ABE73D /* forest.cpp */; }; 8DD76FB00486AB0100D96B5E /* mothur.1 in CopyFiles */ = {isa = PBXBuildFile; fileRef = C6A0FF2C0290799A04C91782 /* mothur.1 */; }; A70056E6156A93D000924A2D /* getotulabelscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70056E5156A93D000924A2D /* getotulabelscommand.cpp */; }; A70056EB156AB6E500924A2D /* removeotulabelscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70056EA156AB6E500924A2D /* removeotulabelscommand.cpp */; }; @@ -384,6 +385,8 @@ 7E6BE10812F710D8007ADDBE /* refchimeratest.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = refchimeratest.h; sourceTree = ""; }; 7E6BE10912F710D8007ADDBE /* refchimeratest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = refchimeratest.cpp; sourceTree = ""; }; 7E78911B135F3E8600E725D2 /* eachgapdistignorens.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType 
= sourcecode.c.h; path = eachgapdistignorens.h; sourceTree = ""; }; + 83F25B0A163B031200ABE73D /* forest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = forest.cpp; sourceTree = ""; }; + 83F25B0B163B031200ABE73D /* forest.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = forest.h; sourceTree = ""; }; 8DD76FB20486AB0100D96B5E /* mothur */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = mothur; sourceTree = BUILT_PRODUCTS_DIR; }; A70056E5156A93D000924A2D /* getotulabelscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getotulabelscommand.cpp; sourceTree = ""; }; A70056E8156A93E300924A2D /* getotulabelscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getotulabelscommand.h; sourceTree = ""; }; @@ -1222,6 +1225,8 @@ A77E1937161B201E00DB1A2A /* randomforest.cpp */, A7386C201619CACB00651424 /* rftreenode.hpp */, A77E193A161B289600DB1A2A /* rftreenode.cpp */, + 83F25B0A163B031200ABE73D /* forest.cpp */, + 83F25B0B163B031200ABE73D /* forest.h */, ); name = randomforest; sourceTree = ""; @@ -2280,6 +2285,7 @@ A721AB71161C572A009860A1 /* kmernode.cpp in Sources */, A721AB72161C572A009860A1 /* kmertree.cpp in Sources */, A721AB77161C573B009860A1 /* taxonomynode.cpp in Sources */, + 83F25B0C163B031200ABE73D /* forest.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/forest.cpp b/forest.cpp new file mode 100644 index 0000000..8ac1b79 --- /dev/null +++ b/forest.cpp @@ -0,0 +1,59 @@ +// +// forest.cpp +// Mothur +// +// Created by Kathryn Iverson on 10/26/12. +// Copyright (c) 2012 Schloss Lab. All rights reserved. 
+// + +#include "forest.h" + +/***********************************************************************/ +Forest::Forest(const std::vector < std::vector > dataSet, + const int numDecisionTrees, + const string treeSplitCriterion = "informationGain") +: dataSet(dataSet), +numDecisionTrees(numDecisionTrees), +numSamples((int)dataSet.size()), +numFeatures((int)(dataSet[0].size() - 1)), +globalDiscardedFeatureIndices(getGlobalDiscardedFeatureIndices()), +globalVariableImportanceList(numFeatures, 0), +treeSplitCriterion(treeSplitCriterion) { + m = MothurOut::getInstance(); + // TODO: double check if the implementation of 'globalOutOfBagEstimates' is correct +} + +/***********************************************************************/ + +vector Forest::getGlobalDiscardedFeatureIndices() { + try { + vector globalDiscardedFeatureIndices; + + // calculate feature vectors + vector< vector > featureVectors(numFeatures, vector(numSamples, 0)); + for (int i = 0; i < numSamples; i++) { + if (m->control_pressed) { return globalDiscardedFeatureIndices; } + for (int j = 0; j < numFeatures; j++) { featureVectors[j][i] = dataSet[i][j]; } + } + + for (int i = 0; i < featureVectors.size(); i++) { + if (m->control_pressed) { return globalDiscardedFeatureIndices; } + double standardDeviation = m->getStandardDeviation(featureVectors[i]); + if (standardDeviation <= 0){ globalDiscardedFeatureIndices.push_back(i); } + } + + if (m->debug) { + m->mothurOut("number of global discarded features: " + toString(globalDiscardedFeatureIndices.size())+ "\n"); + m->mothurOut("total features: " + toString(featureVectors.size())+ "\n"); + } + + return globalDiscardedFeatureIndices; + } + catch(exception& e) { + m->errorOut(e, "Forest", "getGlobalDiscardedFeatureIndices"); + exit(1); + } +} + +/***********************************************************************/ + diff --git a/forest.h b/forest.h new file mode 100644 index 0000000..c9d29dc --- /dev/null +++ b/forest.h @@ -0,0 +1,67 @@ +// +// 
forest.h +// Mothur +// +// Created by Kathryn Iverson on 10/26/12. Modified abstractrandomforest +// Copyright (c) 2012 Schloss Lab. All rights reserved. +// + +#ifndef __Mothur__forest__ +#define __Mothur__forest__ + +#include +#include "mothurout.h" +#include "macros.h" +#include "abstractdecisiontree.hpp" +/***********************************************************************/ +//this is a re-implementation of the abstractrandomforest class + +class Forest{ +public: + // initialization with vectors + Forest(const std::vector < std::vector > dataSet, + const int numDecisionTrees, + const string); + virtual ~Forest(){ } + virtual int populateDecisionTrees() = 0; + virtual int calcForrestErrorRate() = 0; + virtual int calcForrestVariableImportance(string) = 0; + + /***********************************************************************/ + +protected: + + // TODO: create a better way of discarding features + // currently we just set FEATURE_DISCARD_SD_THRESHOLD to 0 to solve this + // it can be tuned for better selection + // also, there might be other factors like Mean or other stuff + // same would apply for createLocalDiscardedFeatureList in the TreeNode class + + // TODO: Another idea is getting aggregated discarded feature indices after the run, from combining + // the local discarded feature indices + // this would penalize a feature, even if in global space the feature looks quite good + // the penalization would be averaged, so this would be unlikely to create a local optimum + + vector getGlobalDiscardedFeatureIndices(); + + int numDecisionTrees; + int numSamples; + int numFeatures; + vector< vector > dataSet; + vector globalDiscardedFeatureIndices; + vector globalVariableImportanceList; + string treeSplitCriterion; + // This is a map of each feature to the outcome count of each class + // e.g. 1 => [2 7] means feature 1 has 2 outcomes of 0 and 7 outcomes of 1 + map > globalOutOfBagEstimates; + + // TODO: fix this, do we use pointers? 
+ vector decisionTrees; + + MothurOut* m; + +private: + +}; + +#endif /* defined(__Mothur__forest__) */