]> git.donarmstrong.com Git - mothur.git/commitdiff
Merge remote-tracking branch 'origin/master'
authorSarah Westcott <mothur.westcott@gmail.com>
Tue, 30 Oct 2012 18:07:54 +0000 (14:07 -0400)
committerSarah Westcott <mothur.westcott@gmail.com>
Tue, 30 Oct 2012 18:07:54 +0000 (14:07 -0400)
Mothur.xcodeproj/project.pbxproj
forest.cpp [new file with mode: 0644]
forest.h [new file with mode: 0644]
randomforest.cpp
randomforest.hpp

index df58a36026bed6961481c3e31efeccf6a8086810..1b298dfea5f58b9bd1f4ca1b09194967515a4de8 100644 (file)
@@ -10,6 +10,7 @@
                219C1DE01552C4BD004209F9 /* newcommandtemplate.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 219C1DDF1552C4BD004209F9 /* newcommandtemplate.cpp */; };
                219C1DE41559BCCF004209F9 /* getcoremicrobiomecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 219C1DE31559BCCD004209F9 /* getcoremicrobiomecommand.cpp */; };
                7E6BE10A12F710D8007ADDBE /* refchimeratest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7E6BE10912F710D8007ADDBE /* refchimeratest.cpp */; };
+               83F25B0C163B031200ABE73D /* forest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 83F25B0A163B031200ABE73D /* forest.cpp */; };
                8DD76FB00486AB0100D96B5E /* mothur.1 in CopyFiles */ = {isa = PBXBuildFile; fileRef = C6A0FF2C0290799A04C91782 /* mothur.1 */; };
                A70056E6156A93D000924A2D /* getotulabelscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70056E5156A93D000924A2D /* getotulabelscommand.cpp */; };
                A70056EB156AB6E500924A2D /* removeotulabelscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70056EA156AB6E500924A2D /* removeotulabelscommand.cpp */; };
                7E6BE10812F710D8007ADDBE /* refchimeratest.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = refchimeratest.h; sourceTree = "<group>"; };
                7E6BE10912F710D8007ADDBE /* refchimeratest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = refchimeratest.cpp; sourceTree = "<group>"; };
                7E78911B135F3E8600E725D2 /* eachgapdistignorens.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = eachgapdistignorens.h; sourceTree = "<group>"; };
+               83F25B0A163B031200ABE73D /* forest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = forest.cpp; sourceTree = "<group>"; };
+               83F25B0B163B031200ABE73D /* forest.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = forest.h; sourceTree = "<group>"; };
                8DD76FB20486AB0100D96B5E /* mothur */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = mothur; sourceTree = BUILT_PRODUCTS_DIR; };
                A70056E5156A93D000924A2D /* getotulabelscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getotulabelscommand.cpp; sourceTree = "<group>"; };
                A70056E8156A93E300924A2D /* getotulabelscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getotulabelscommand.h; sourceTree = "<group>"; };
                                A77E1937161B201E00DB1A2A /* randomforest.cpp */,
                                A7386C201619CACB00651424 /* rftreenode.hpp */,
                                A77E193A161B289600DB1A2A /* rftreenode.cpp */,
+                               83F25B0A163B031200ABE73D /* forest.cpp */,
+                               83F25B0B163B031200ABE73D /* forest.h */,
                        );
                        name = randomforest;
                        sourceTree = "<group>";
                                A721AB71161C572A009860A1 /* kmernode.cpp in Sources */,
                                A721AB72161C572A009860A1 /* kmertree.cpp in Sources */,
                                A721AB77161C573B009860A1 /* taxonomynode.cpp in Sources */,
+                               83F25B0C163B031200ABE73D /* forest.cpp in Sources */,
                        );
                        runOnlyForDeploymentPostprocessing = 0;
                };
diff --git a/forest.cpp b/forest.cpp
new file mode 100644 (file)
index 0000000..58c7f7e
--- /dev/null
@@ -0,0 +1,60 @@
+//
+//  forest.cpp
+//  Mothur
+//
+//  Created by Kathryn Iverson on 10/26/12.
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "forest.h"
+
+/***********************************************************************/
+/**
+ * Builds the state shared by every forest implementation from a training matrix.
+ *
+ * @param dataSet            one row per sample. Every column except the last is
+ *                           counted as a feature (numFeatures = columns - 1).
+ * @param numDecisionTrees   copied into the numDecisionTrees member.
+ * @param treeSplitCriterion copied into the treeSplitCriterion member.
+ *
+ * Inside the initializer list the name `dataSet` refers to the parameter, not the member.
+ * The initializers are listed in the same order as the members are declared in forest.h,
+ * because that declaration order is the order in which they actually run.
+ */
+Forest::Forest(const std::vector < std::vector<int> > dataSet,
+                                           const int numDecisionTrees,
+                                           const string treeSplitCriterion = "informationGain")
+: numDecisionTrees(numDecisionTrees),
+numSamples((int)dataSet.size()),
+// an empty data set (or an empty first row) has no features: do not index dataSet[0]
+// and do not let the unsigned size() - 1 wrap around
+numFeatures((dataSet.empty() || dataSet[0].empty()) ? 0 : (int)(dataSet[0].size() - 1)),
+dataSet(dataSet),
+globalVariableImportanceList(numFeatures, 0),
+treeSplitCriterion(treeSplitCriterion) {
+    m = MothurOut::getInstance();
+    // This call has to happen here and not in the initializer list:
+    // getGlobalDiscardedFeatureIndices() dereferences `m` and appends to
+    // globalDiscardedFeatureIndices, so both must be fully set up before it runs.
+    globalDiscardedFeatureIndices = getGlobalDiscardedFeatureIndices();
+    // TODO: double check if the implementation of 'globalOutOfBagEstimates' is correct
+}
+
+/***********************************************************************/
+
+/**
+ * Returns the indices of the features whose standard deviation over all samples
+ * is <= 0, i.e. features that have the same value in every sample.
+ *
+ * Reads numFeatures, numSamples and dataSet; it does not modify any member.
+ * If the user interrupts the run (control_pressed) the indices collected so far
+ * are returned. Any std::exception is reported through errorOut and the
+ * process exits with status 1.
+ *
+ * @return indices in [0, numFeatures) of the features to discard, in ascending order.
+ */
+vector<int> Forest::getGlobalDiscardedFeatureIndices() {
+    // Fetch the logger singleton directly instead of using the member `m`:
+    // this function may run while the object is still being constructed,
+    // before `m` has been assigned.
+    MothurOut* out = MothurOut::getInstance();
+    // Collect into a local vector rather than the globalDiscardedFeatureIndices
+    // member, so the result does not depend on (or append to) the member's state.
+    vector<int> discardedFeatureIndices;
+    try {
+        // transpose the data: featureVectors[j][i] is the value of feature j in sample i
+        vector< vector<int> > featureVectors(numFeatures, vector<int>(numSamples, 0) );
+        for (int i = 0; i < numSamples; i++) {
+            if (out->control_pressed) { return discardedFeatureIndices; }
+            for (int j = 0; j < numFeatures; j++) { featureVectors[j][i] = dataSet[i][j]; }
+        }
+
+        const int featureCount = (int)featureVectors.size();
+        for (int i = 0; i < featureCount; i++) {
+            if (out->control_pressed) { return discardedFeatureIndices; }
+            double standardDeviation = out->getStandardDeviation(featureVectors[i]);
+            // a feature with no spread cannot separate the samples
+            if (standardDeviation <= 0){ discardedFeatureIndices.push_back(i); }
+        }
+
+        if (out->debug) {
+            out->mothurOut("number of global discarded features:  " + toString(discardedFeatureIndices.size())+ "\n");
+            out->mothurOut("total features: " + toString(featureVectors.size())+ "\n");
+        }
+
+        return discardedFeatureIndices;
+    }
+       catch(exception& e) {
+               out->errorOut(e, "Forest", "getGlobalDiscardedFeatureIndices");
+               exit(1);
+       }
+}
+
+/***********************************************************************/
+
diff --git a/forest.h b/forest.h
new file mode 100644 (file)
index 0000000..78f61b3
--- /dev/null
+++ b/forest.h
@@ -0,0 +1,69 @@
+//
+//  forest.h
+//  Mothur
+//
+//  Created by Kathryn Iverson on 10/26/12. Modified abstractrandomforest
+//  Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#ifndef __Mothur__forest__
+#define __Mothur__forest__
+
+#include <iostream>
+#include "mothurout.h"
+#include "macros.h"
+#include "decisiontree.hpp"
+#include "abstractdecisiontree.hpp"
+/***********************************************************************/
+//this is a re-implementation of the abstractrandomforest class
+
+// Abstract base class for forest implementations: it holds the training data and
+// the bookkeeping shared by all trees, and leaves tree construction and the
+// error-rate / variable-importance calculations to subclasses (pure virtuals below).
+//
+// NOTE: data members are initialized in the order they are declared in this class,
+// not in the order of the constructor's initializer list. `m` is declared last and
+// is assigned in the constructor body.
+class Forest{
+public:
+    // initialization with vectors
+    // dataSet: one row per sample; numDecisionTrees and the string (tree split
+    // criterion) are stored in the members of the same name.
+    Forest(const std::vector < std::vector<int> > dataSet,
+                         const int numDecisionTrees,
+                         const string);
+    // Virtual so subclasses can be destroyed through a Forest*. The body is empty:
+    // the pointers stored in decisionTrees are NOT deleted here.
+    virtual ~Forest(){ }
+    // The operations below must be provided by every concrete forest.
+    virtual int populateDecisionTrees() = 0;
+    virtual int calcForrestErrorRate() = 0;
+    virtual int calcForrestVariableImportance(string) = 0;
+    virtual int updateGlobalOutOfBagEstimates(DecisionTree* decisionTree) = 0;
+    
+    /***********************************************************************/
+    
+protected:
+    
+    // TODO: create a better way of discarding feature
+    // currently we just set FEATURE_DISCARD_SD_THRESHOLD to 0 to solve this
+    // it can be tuned for better selection
+    // also, there might be other factors like Mean or other stuffs
+    // same would apply for createLocalDiscardedFeatureList in the TreeNode class
+    
+    // TODO: Another idea is getting an aggregated discarded feature indices after the run, from combining
+    // the local discarded feature indices
+    // this would penalize a feature, even if in global space the feature looks quite good
+    // the penalization would be averaged, so this would be unlikely to create a local optimum
+    
+    // Returns the indices of the features whose standard deviation across all samples is <= 0.
+    vector<int> getGlobalDiscardedFeatureIndices();
+    
+    int numDecisionTrees;
+    // number of rows in dataSet
+    int numSamples;
+    // number of columns in the first row of dataSet, minus one
+    int numFeatures;
+    vector< vector<int> > dataSet;
+    // result of getGlobalDiscardedFeatureIndices(), computed at construction
+    vector<int> globalDiscardedFeatureIndices;
+    // one entry per feature, initialized to 0 by the constructor
+    vector<double> globalVariableImportanceList;
+    string treeSplitCriterion;
+    // This is a map of each feature to outcome count of each classes
+    // e.g. 1 => [2 7] means feature 1 has 2 outcome of 0 and 7 outcome of 1
+    map<int, vector<int> > globalOutOfBagEstimates;
+    
+    // TODO: fix this, do we use pointers?
+    vector<AbstractDecisionTree*> decisionTrees;
+    
+    // MothurOut singleton used for logging and interrupt checks; set in the constructor body
+    MothurOut* m;
+    
+private:
+    
+};
+
+#endif /* defined(__Mothur__forest__) */
index 36a2c1a261f27514c394a370d0d97387ac9cbfac..bd96cd2f7177633e3d21181c95e7ff2c07682eb2 100644 (file)
@@ -11,7 +11,7 @@
 /***********************************************************************/
 
 RandomForest::RandomForest(const vector <vector<int> > dataSet,const int numDecisionTrees,
-             const string treeSplitCriterion = "informationGain") : AbstractRandomForest(dataSet, numDecisionTrees, treeSplitCriterion) {
+             const string treeSplitCriterion = "informationGain") : Forest(dataSet, numDecisionTrees, treeSplitCriterion) {
     m = MothurOut::getInstance();
 }
 
@@ -58,6 +58,7 @@ int RandomForest::calcForrestVariableImportance(string filename) {
         //could cause maintenance issues later if other types of Abstract decison trees are created that cannot be cast as a decision tree.
     for (int i = 0; i < decisionTrees.size(); i++) {
         if (m->control_pressed) { return 0; }
+        
         DecisionTree* decisionTree = dynamic_cast<DecisionTree*>(decisionTrees[i]);
         
         for (int j = 0; j < numFeatures; j++) {
@@ -127,7 +128,7 @@ int RandomForest::populateDecisionTrees() {
 }
 /***********************************************************************/
 // TODO: need to finalize bettween reference and pointer for DecisionTree [partially solved]
-// TODO: make this pure virtual in superclass
+// DONE: make this pure virtual in superclass
 // DONE
 int RandomForest::updateGlobalOutOfBagEstimates(DecisionTree* decisionTree) {
     try {
index 716d1a1ede60c667840172db275814c73867041a..30eb43842f8cb280e1e4e95919909885f2702d28 100755 (executable)
 #define rrf_fs_prototype_randomforest_hpp
 
 #include "macros.h"
-#include "abstractrandomforest.hpp"
+#include "forest.h"
 #include "decisiontree.hpp"
 
-class RandomForest: public AbstractRandomForest {
+class RandomForest: public Forest {
     
 public:
     
@@ -23,14 +23,14 @@ public:
     
     //NOTE:: if you are going to dynamically cast, aren't you undoing the advantage of abstraction. Why abstract at all?
     //could cause maintenance issues later if other types of Abstract decison trees are created that cannot be cast as a decision tree.
-    virtual ~RandomForest() {
-        for (vector<AbstractDecisionTree*>::iterator it = decisionTrees.begin(); it != decisionTrees.end(); it++) {
-            // we know that this is decision tree, so we can do a dynamic_case<DecisionTree*> here
-            DecisionTree* decisionTree = dynamic_cast<DecisionTree*>(*it);
-            // calling the destructor by deleting
-            delete decisionTree;
-        }
-    }
+//    virtual ~RandomForest() {
+//        for (vector<AbstractDecisionTree*>::iterator it = decisionTrees.begin(); it != decisionTrees.end(); it++) {
+//            // we know that this is decision tree, so we can do a dynamic_cast<DecisionTree*> here
+//            DecisionTree* decisionTree = dynamic_cast<DecisionTree*>(*it);
+//            // calling the destructor by deleting
+//            delete decisionTree;
+//        }
+//    }
     
     int calcForrestErrorRate();
     int calcForrestVariableImportance(string);