git.donarmstrong.com Git - mothur.git/blobdiff - randomforest.cpp
adding labels to list file.
[mothur.git] / randomforest.cpp
index bb8e2d565cb6348594b7c3e5cd229dd40ac37d76..acf87dfebcd022d37cf6974f8331c7ad940a017a 100644 (file)
@@ -37,7 +37,7 @@ int RandomForest::calcForrestErrorRate() {
             vector<int>::iterator maxPredictedOutComeIterator = max_element(predictedOutComes.begin(), predictedOutComes.end());
             int majorityVotedOutcome = (int)(maxPredictedOutComeIterator - predictedOutComes.begin());
             int realOutcome = dataSet[indexOfSample][numFeatures];
-            
+                                   
             if (majorityVotedOutcome == realOutcome) { numCorrect++; }
         }
         
@@ -46,7 +46,7 @@ int RandomForest::calcForrestErrorRate() {
         
         m->mothurOut("numCorrect = " + toString(numCorrect)+ "\n");
         m->mothurOut("forrestErrorRate = " + toString(forrestErrorRate)+ "\n");
-    
+            
         return 0;
     }
        catch(exception& e) {
@@ -54,6 +54,87 @@ int RandomForest::calcForrestErrorRate() {
                exit(1);
        } 
 }
+/***********************************************************************/
+// Prints a confusion matrix built from globalOutOfBagEstimates: row = actual class, column = predicted class.
+int RandomForest::printConfusionMatrix(map<int, string> intToTreatmentMap) {
+    try {
+        int numGroups = intToTreatmentMap.size();
+        vector<vector<int> > cm(numGroups, vector<int>(numGroups, 0));
+        // intToTreatmentMap is read with keys 0..numGroups-1 to obtain the label of each class index.
+        for (map<int, vector<int> >::iterator it = globalOutOfBagEstimates.begin(); it != globalOutOfBagEstimates.end(); it++) {
+            // Stop early (still returning 0) once m->control_pressed is set.
+            if (m->control_pressed) { return 0; }
+            // Predicted class = index of the largest entry in this sample's vector (the first one on ties).
+            int indexOfSample = it->first; //key
+            vector<int> predictedOutComes = it->second; //value, vector of all predicted classes
+            vector<int>::iterator maxPredictedOutComeIterator = max_element(predictedOutComes.begin(), predictedOutComes.end());
+            int majorityVotedOutcome = (int)(maxPredictedOutComeIterator - predictedOutComes.begin());
+            int realOutcome = dataSet[indexOfSample][numFeatures]; // actual class is read from column numFeatures
+            cm[realOutcome][majorityVotedOutcome] = cm[realOutcome][majorityVotedOutcome] + 1; // NOTE(review): no range check; assumes both indices are < numGroups
+        }
+        // fw[c] is the length of class c's label and serves as the print width of column c.
+        vector<int> fw;
+        for (int w = 0; w <numGroups; w++) {
+            fw.push_back(intToTreatmentMap[w].length());
+        }
+        // The title goes through m->mothurOut; the table itself is written to cout so that setw can be applied.
+        m->mothurOut("confusion matrix:\n\t\t");
+        for (int k = 0; k < numGroups; k++) {
+            // header row: one label per predicted class
+            cout << setw(fw[k]) << intToTreatmentMap[k] << "\t";
+        }
+        for (int i = 0; i < numGroups; i++) {
+            cout << "\n" << setw(fw[i]) << intToTreatmentMap[i] << "\t";
+            // row i counts the samples whose actual class is i
+            if (m->control_pressed) { return 0; }
+            for (int j = 0; j < numGroups; j++) {
+                // pad with the width of column j (not of row i) so each count lines up under header j
+                cout << setw(fw[j]) << cm[i][j] << "\t";
+            }
+        }
+        // terminate the last row
+        cout << "\n";
+
+        return 0;
+    }
+    
+    catch(exception& e) {
+               m->errorOut(e, "RandomForest", "printConfusionMatrix");
+               exit(1);
+       }
+}
+
+/***********************************************************************/
+// Writes one tab-separated row to `filename` for every sample whose predicted class differs from its actual class.
+int RandomForest::getMissclassifications(string filename, map<int, string> intToTreatmentMap, vector<string> names) {
+    try {
+        ofstream out;
+        m->openOutputFile(filename, out);
+        out <<"Sample\tRF classification\tActual classification\n";
+        // Walk every (sample index, per-sample vector) entry of globalOutOfBagEstimates.
+        map<int, vector<int> >::iterator entry = globalOutOfBagEstimates.begin();
+        while (entry != globalOutOfBagEstimates.end()) {
+            // An interrupt stops the scan early; the function still returns 0.
+            if (m->control_pressed) { return 0; }
+            int sampleIdx = entry->first;
+            vector<int> estimates = entry->second;
+            // Predicted class: position of the largest entry in the sample's vector.
+            int predicted = (int)(max_element(estimates.begin(), estimates.end()) - estimates.begin());
+            int actual = dataSet[sampleIdx][numFeatures];
+            bool agrees = (predicted == actual);
+            if (!agrees) {
+                out << names[sampleIdx] << "\t" << intToTreatmentMap[predicted] << "\t" << intToTreatmentMap[actual] << endl;
+            }
+            entry++;
+        }
+        out.close();
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "RandomForest", "getMissclassifications");
+        exit(1);
+    }
+}
 
 /***********************************************************************/
 int RandomForest::calcForrestVariableImportance(string filename) {
@@ -78,7 +159,7 @@ int RandomForest::calcForrestVariableImportance(string filename) {
         
         vector< pair<int, double> > globalVariableRanks;
         for (int i = 0; i < globalVariableImportanceList.size(); i++) {
-            cout << "[" << i << ',' << globalVariableImportanceList[i] << "], ";
+            //cout << "[" << i << ',' << globalVariableImportanceList[i] << "], ";
             if (globalVariableImportanceList[i] > 0) {
                 pair<int, double> globalVariableRank(0, 0.0);
                 globalVariableRank.first = i;
@@ -97,9 +178,9 @@ int RandomForest::calcForrestVariableImportance(string filename) {
         
         ofstream out;
         m->openOutputFile(filename, out);
-        out <<"OTU\tRank\n";
+        out <<"OTU\tMean decrease accuracy\n";
         for (int i = 0; i < globalVariableRanks.size(); i++) {
-            out << m->currentBinLabels[(int)globalVariableRanks[i].first] << '\t' << globalVariableImportanceList[globalVariableRanks[i].first] << endl;
+            out << m->currentSharedBinLabels[(int)globalVariableRanks[i].first] << '\t' << globalVariableImportanceList[globalVariableRanks[i].first] << endl;
         }
         out.close();
         return 0;
@@ -118,7 +199,7 @@ int RandomForest::populateDecisionTrees() {
         for (int i = 0; i < numDecisionTrees; i++) {
           
             if (m->control_pressed) { return 0; }
-            if (((i+1) % 10) == 0) {  m->mothurOut("Creating " + toString(i+1) + " (th) Decision tree\n");  }
+            if (((i+1) % 100) == 0) {  m->mothurOut("Creating " + toString(i+1) + " (th) Decision tree\n");  }
           
             // TODO: need to first fix if we are going to use pointer based system or anything else
             DecisionTree* decisionTree = new DecisionTree(dataSet, globalDiscardedFeatureIndices, OptimumFeatureSubsetSelector(optimumFeatureSubsetSelectionCriteria), treeSplitCriterion, featureStandardDeviationThreshold);