From: Sarah Westcott Date: Wed, 18 Sep 2013 16:54:25 +0000 (-0400) Subject: Merge remote-tracking branch 'origin' X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=bcb6728939694811bf6a00ae6d568f783840edfd;hp=3a5dd9e428ab93a6dcdce7912e8ebb977be0b893;p=mothur.git Merge remote-tracking branch 'origin' --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 27607ff..8304f07 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -589,20 +589,20 @@ A7E9B67212D37EC400DA6239 /* catchallcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = catchallcommand.cpp; sourceTree = ""; }; A7E9B67312D37EC400DA6239 /* catchallcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = catchallcommand.h; sourceTree = ""; }; A7E9B67412D37EC400DA6239 /* ccode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ccode.cpp; sourceTree = ""; }; - A7E9B67512D37EC400DA6239 /* ccode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ccode.h; sourceTree = ""; }; + A7E9B67512D37EC400DA6239 /* ccode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; lineEnding = 0; path = ccode.h; sourceTree = ""; xcLanguageSpecificationIdentifier = xcode.lang.objcpp; }; A7E9B67612D37EC400DA6239 /* chao1.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chao1.cpp; sourceTree = ""; }; A7E9B67712D37EC400DA6239 /* chao1.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chao1.h; sourceTree = ""; }; A7E9B67812D37EC400DA6239 /* chimera.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimera.cpp; sourceTree = ""; }; A7E9B67912D37EC400DA6239 /* chimera.h */ = {isa = PBXFileReference; 
fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimera.h; sourceTree = ""; }; A7E9B67A12D37EC400DA6239 /* chimerabellerophoncommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimerabellerophoncommand.cpp; sourceTree = ""; }; A7E9B67B12D37EC400DA6239 /* chimerabellerophoncommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerabellerophoncommand.h; sourceTree = ""; }; - A7E9B67C12D37EC400DA6239 /* chimeraccodecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraccodecommand.cpp; sourceTree = ""; }; + A7E9B67C12D37EC400DA6239 /* chimeraccodecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; lineEnding = 0; path = chimeraccodecommand.cpp; sourceTree = ""; xcLanguageSpecificationIdentifier = xcode.lang.cpp; }; A7E9B67D12D37EC400DA6239 /* chimeraccodecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraccodecommand.h; sourceTree = ""; }; - A7E9B67E12D37EC400DA6239 /* chimeracheckcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeracheckcommand.cpp; sourceTree = ""; }; + A7E9B67E12D37EC400DA6239 /* chimeracheckcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; lineEnding = 0; path = chimeracheckcommand.cpp; sourceTree = ""; xcLanguageSpecificationIdentifier = xcode.lang.cpp; }; A7E9B67F12D37EC400DA6239 /* chimeracheckcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeracheckcommand.h; sourceTree = ""; }; A7E9B68012D37EC400DA6239 /* chimeracheckrdp.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeracheckrdp.cpp; sourceTree = ""; }; - A7E9B68112D37EC400DA6239 /* 
chimeracheckrdp.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeracheckrdp.h; sourceTree = ""; }; - A7E9B68212D37EC400DA6239 /* chimerapintailcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimerapintailcommand.cpp; sourceTree = ""; }; + A7E9B68112D37EC400DA6239 /* chimeracheckrdp.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; lineEnding = 0; path = chimeracheckrdp.h; sourceTree = ""; xcLanguageSpecificationIdentifier = xcode.lang.objcpp; }; + A7E9B68212D37EC400DA6239 /* chimerapintailcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; lineEnding = 0; path = chimerapintailcommand.cpp; sourceTree = ""; xcLanguageSpecificationIdentifier = xcode.lang.cpp; }; A7E9B68312D37EC400DA6239 /* chimerapintailcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerapintailcommand.h; sourceTree = ""; }; A7E9B68412D37EC400DA6239 /* chimerarealigner.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimerarealigner.cpp; sourceTree = ""; }; A7E9B68512D37EC400DA6239 /* chimerarealigner.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerarealigner.h; sourceTree = ""; }; @@ -627,12 +627,12 @@ A7E9B69A12D37EC400DA6239 /* clusterclassic.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = clusterclassic.cpp; sourceTree = ""; }; A7E9B69B12D37EC400DA6239 /* clusterclassic.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = clusterclassic.h; sourceTree = ""; }; A7E9B69C12D37EC400DA6239 /* clustercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = clustercommand.cpp; sourceTree = ""; }; - A7E9B69D12D37EC400DA6239 
/* clustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = clustercommand.h; sourceTree = ""; }; + A7E9B69D12D37EC400DA6239 /* clustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; lineEnding = 0; path = clustercommand.h; sourceTree = ""; xcLanguageSpecificationIdentifier = xcode.lang.objcpp; }; A7E9B69E12D37EC400DA6239 /* clusterdoturcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = clusterdoturcommand.cpp; sourceTree = ""; }; A7E9B69F12D37EC400DA6239 /* clusterdoturcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = clusterdoturcommand.h; sourceTree = ""; }; A7E9B6A012D37EC400DA6239 /* clusterfragmentscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = clusterfragmentscommand.cpp; sourceTree = ""; }; A7E9B6A112D37EC400DA6239 /* clusterfragmentscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = clusterfragmentscommand.h; sourceTree = ""; }; - A7E9B6A212D37EC400DA6239 /* clustersplitcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = clustersplitcommand.cpp; sourceTree = ""; }; + A7E9B6A212D37EC400DA6239 /* clustersplitcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; lineEnding = 0; path = clustersplitcommand.cpp; sourceTree = ""; xcLanguageSpecificationIdentifier = xcode.lang.cpp; }; A7E9B6A312D37EC400DA6239 /* clustersplitcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = clustersplitcommand.h; sourceTree = ""; }; A7E9B6A412D37EC400DA6239 /* cmdargs.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = cmdargs.cpp; sourceTree = ""; }; A7E9B6A512D37EC400DA6239 /* 
cmdargs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = cmdargs.h; sourceTree = ""; }; @@ -664,7 +664,7 @@ A7E9B6BF12D37EC400DA6239 /* datavector.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = datavector.hpp; sourceTree = ""; }; A7E9B6C012D37EC400DA6239 /* dayhoff.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = dayhoff.h; sourceTree = ""; }; A7E9B6C112D37EC400DA6239 /* decalc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decalc.cpp; sourceTree = ""; }; - A7E9B6C212D37EC400DA6239 /* decalc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = decalc.h; sourceTree = ""; }; + A7E9B6C212D37EC400DA6239 /* decalc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; lineEnding = 0; path = decalc.h; sourceTree = ""; xcLanguageSpecificationIdentifier = xcode.lang.objcpp; }; A7E9B6C312D37EC400DA6239 /* deconvolutecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = deconvolutecommand.cpp; sourceTree = ""; }; A7E9B6C412D37EC400DA6239 /* deconvolutecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = deconvolutecommand.h; sourceTree = ""; }; A7E9B6C512D37EC400DA6239 /* degapseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; lineEnding = 0; path = degapseqscommand.cpp; sourceTree = ""; xcLanguageSpecificationIdentifier = xcode.lang.cpp; }; @@ -749,7 +749,7 @@ A7E9B71812D37EC400DA6239 /* hcluster.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = hcluster.cpp; sourceTree = SOURCE_ROOT; }; A7E9B71912D37EC400DA6239 /* hcluster.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = 
hcluster.h; sourceTree = SOURCE_ROOT; }; A7E9B71A12D37EC400DA6239 /* hclustercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = hclustercommand.cpp; sourceTree = ""; }; - A7E9B71B12D37EC400DA6239 /* hclustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = hclustercommand.h; sourceTree = ""; }; + A7E9B71B12D37EC400DA6239 /* hclustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; lineEnding = 0; path = hclustercommand.h; sourceTree = ""; xcLanguageSpecificationIdentifier = xcode.lang.objcpp; }; A7E9B71C12D37EC400DA6239 /* heatmap.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = heatmap.cpp; sourceTree = SOURCE_ROOT; }; A7E9B71D12D37EC400DA6239 /* heatmap.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = heatmap.h; sourceTree = SOURCE_ROOT; }; A7E9B71E12D37EC400DA6239 /* heatmapcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = heatmapcommand.cpp; sourceTree = ""; }; diff --git a/ccode.h b/ccode.h index 456b735..dcd4879 100644 --- a/ccode.h +++ b/ccode.h @@ -15,7 +15,7 @@ #include "decalc.h" /***********************************************************/ -//This class was created using the algorythms described in the +//This class was created using the algorithms described in the // "Evaluating putative chimeric sequences from PCR-amplified products" paper //by Juan M. Gonzalez, Johannes Zimmerman and Cesareo Saiz-Jimenez. 
diff --git a/chimeraccodecommand.cpp b/chimeraccodecommand.cpp index 627f3a9..d890db4 100644 --- a/chimeraccodecommand.cpp +++ b/chimeraccodecommand.cpp @@ -38,7 +38,7 @@ string ChimeraCcodeCommand::getHelpString(){ try { string helpString = ""; helpString += "The chimera.ccode command reads a fastafile and referencefile and outputs potentially chimeric sequences.\n"; - helpString += "This command was created using the algorythms described in the 'Evaluating putative chimeric sequences from PCR-amplified products' paper by Juan M. Gonzalez, Johannes Zimmerman and Cesareo Saiz-Jimenez.\n"; + helpString += "This command was created using the algorithms described in the 'Evaluating putative chimeric sequences from PCR-amplified products' paper by Juan M. Gonzalez, Johannes Zimmerman and Cesareo Saiz-Jimenez.\n"; helpString += "The chimera.ccode command parameters are fasta, reference, filter, mask, processors, window and numwanted.\n"; helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required unless you have a valid current fasta file. \n"; helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n"; diff --git a/chimeracheckcommand.cpp b/chimeracheckcommand.cpp index 31a36f7..64ed9fa 100644 --- a/chimeracheckcommand.cpp +++ b/chimeracheckcommand.cpp @@ -38,7 +38,7 @@ string ChimeraCheckCommand::getHelpString(){ try { string helpString = ""; helpString += "The chimera.check command reads a fastafile and referencefile and outputs potentially chimeric sequences.\n"; - helpString += "This command was created using the algorythms described in CHIMERA_CHECK version 2.7 written by Niels Larsen. \n"; + helpString += "This command was created using the algorithms described in CHIMERA_CHECK version 2.7 written by Niels Larsen. 
\n"; helpString += "The chimera.check command parameters are fasta, reference, processors, ksize, increment, svg and name.\n"; helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required unless you have a valid current fasta file. \n"; helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n"; diff --git a/chimeracheckrdp.h b/chimeracheckrdp.h index cc23c2e..25db15b 100644 --- a/chimeracheckrdp.h +++ b/chimeracheckrdp.h @@ -17,7 +17,7 @@ #include "alignmentdb.h" /***********************************************************/ -//This class was created using the algorythms described in +//This class was created using the algorithms described in //CHIMERA_CHECK version 2.7 written by Niels Larsen. /***********************************************************/ diff --git a/chimerapintailcommand.cpp b/chimerapintailcommand.cpp index 9d492af..fca9f17 100644 --- a/chimerapintailcommand.cpp +++ b/chimerapintailcommand.cpp @@ -41,7 +41,7 @@ string ChimeraPintailCommand::getHelpString(){ try { string helpString = ""; helpString += "The chimera.pintail command reads a fastafile and referencefile and outputs potentially chimeric sequences.\n"; - helpString += "This command was created using the algorythms described in the 'At Least 1 in 20 16S rRNA Sequence Records Currently Held in the Public Repositories is Estimated To Contain Substantial Anomalies' paper by Kevin E. Ashelford 1, Nadia A. Chuzhanova 3, John C. Fry 1, Antonia J. Jones 2 and Andrew J. Weightman 1.\n"; + helpString += "This command was created using the algorithms described in the 'At Least 1 in 20 16S rRNA Sequence Records Currently Held in the Public Repositories is Estimated To Contain Substantial Anomalies' paper by Kevin E. Ashelford 1, Nadia A. Chuzhanova 3, John C. Fry 1, Antonia J. Jones 2 and Andrew J. 
Weightman 1.\n"; helpString += "The chimera.pintail command parameters are fasta, reference, filter, mask, processors, window, increment, conservation and quantile.\n"; helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required unless you have a valid current fasta file. \n"; helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n"; diff --git a/classifyrfsharedcommand.cpp b/classifyrfsharedcommand.cpp new file mode 100755 index 0000000..d2cd9f9 --- /dev/null +++ b/classifyrfsharedcommand.cpp @@ -0,0 +1,407 @@ +// +// classifysharedcommand.cpp +// Mothur +// +// Created by Abu Zaher Md. Faridee on 8/13/12. +// Copyright (c) 2012 Schloss Lab. All rights reserved. +// + +#include "classifyrfsharedcommand.h" +#include "randomforest.hpp" +#include "decisiontree.hpp" +#include "rftreenode.hpp" + +//********************************************************************************************************************** +vector ClassifyRFSharedCommand::setParameters(){ + try { + //CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); + CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none","summary",false,true,true); parameters.push_back(pshared); + CommandParameter pdesign("design", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(pdesign); + CommandParameter potupersplit("otupersplit", "Multiple", "log2-squareroot", "log2", "", "", "","",false,false); parameters.push_back(potupersplit); + CommandParameter psplitcriteria("splitcriteria", "Multiple", "gainratio-infogain", "gainratio", "", "", "","",false,false); parameters.push_back(psplitcriteria); + CommandParameter pnumtrees("numtrees", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pnumtrees); + + // parameters related to 
pruning + CommandParameter pdopruning("prune", "Boolean", "", "T", "", "", "", "", false, false); parameters.push_back(pdopruning); + CommandParameter ppruneaggrns("pruneaggressiveness", "Number", "", "0.9", "", "", "", "", false, false); parameters.push_back(ppruneaggrns); + CommandParameter pdiscardhetrees("discarderrortrees", "Boolean", "", "T", "", "", "", "", false, false); parameters.push_back(pdiscardhetrees); + CommandParameter phetdiscardthreshold("errorthreshold", "Number", "", "0.4", "", "", "", "", false, false); parameters.push_back(phetdiscardthreshold); + CommandParameter psdthreshold("stdthreshold", "Number", "", "0.0", "", "", "", "", false, false); parameters.push_back(psdthreshold); + // pruning params end + + CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups); + CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); + + vector myArray; + for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } + return myArray; + } + catch(exception& e) { + m->errorOut(e, "ClassifySharedCommand", "setParameters"); + exit(1); + } +} +//********************************************************************************************************************** +string ClassifyRFSharedCommand::getHelpString(){ + try { + string helpString = ""; + helpString += "The classify.shared command allows you to ....\n"; + helpString += "The classify.shared command parameters are: shared, design, label, groups, otupersplit.\n"; + helpString += "The label parameter is used to analyze specific labels in your input.\n"; + helpString += "The groups parameter allows you to specify which of the groups in your 
designfile you would like analyzed.\n"; + helpString += "The classify.shared should be in the following format: \n"; + helpString += "classify.shared(shared=yourSharedFile, design=yourDesignFile)\n"; + return helpString; + } + catch(exception& e) { + m->errorOut(e, "ClassifySharedCommand", "getHelpString"); + exit(1); + } +} +//********************************************************************************************************************** +string ClassifyRFSharedCommand::getOutputPattern(string type) { + try { + string pattern = ""; + + if (type == "summary") { pattern = "[filename],[distance],summary"; } //makes file like: amazon.0.03.fasta + else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } + + return pattern; + } + catch(exception& e) { + m->errorOut(e, "ClassifySharedCommand", "getOutputPattern"); + exit(1); + } +} +//********************************************************************************************************************** + +ClassifyRFSharedCommand::ClassifyRFSharedCommand() { + try { + abort = true; calledHelp = true; + setParameters(); + vector tempOutNames; + outputTypes["summary"] = tempOutNames; + } + catch(exception& e) { + m->errorOut(e, "ClassifySharedCommand", "ClassifySharedCommand"); + exit(1); + } +} + +//********************************************************************************************************************** +ClassifyRFSharedCommand::ClassifyRFSharedCommand(string option) { + try { + abort = false; calledHelp = false; + allLines = 1; + + //allow user to run help + if(option == "help") { help(); abort = true; calledHelp = true; } + else if(option == "citation") { citation(); abort = true; calledHelp = true;} + + else { + //valid paramters for this command + vector myArray = setParameters(); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + map::iterator it; + //check to make sure all 
parameters are valid for command + for (it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + vector tempOutNames; + outputTypes["summary"] = tempOutNames; + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + else { + string path; + it = parameters.find("shared"); + //user has given a shared file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["shared"] = inputDir + it->second; } + } + + it = parameters.find("design"); + //user has given a design file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. 
+ if (path == "") { parameters["design"] = inputDir + it->second; } + } + + } + //check for parameters + //get shared file, it is required + sharedfile = validParameter.validFile(parameters, "shared", true); + if (sharedfile == "not open") { sharedfile = ""; abort = true; } + else if (sharedfile == "not found") { + //if there is a current shared file, use it + sharedfile = m->getSharedFile(); + if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); } + else { m->mothurOut("You have no current sharedfile and the shared parameter is required."); m->mothurOutEndLine(); abort = true; } + }else { m->setSharedFile(sharedfile); } + + //get design file, it is required; on open failure clear designfile (not sharedfile, which is still needed for outputDir below) + designfile = validParameter.validFile(parameters, "design", true); + if (designfile == "not open") { designfile = ""; abort = true; } + else if (designfile == "not found") { + //if there is a current design file, use it + designfile = m->getDesignFile(); + if (designfile != "") { m->mothurOut("Using " + designfile + " as input file for the design parameter."); m->mothurOutEndLine(); } + else { m->mothurOut("You have no current designfile and the design parameter is required."); m->mothurOutEndLine(); abort = true; } + }else { m->setDesignFile(designfile); } + + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ + outputDir = m->hasPath(sharedfile); //if user entered a file with a path then preserve it + } + + // NEW CODE for OTU per split selection criteria + string temp = validParameter.validFile(parameters, "splitcriteria", false); + if (temp == "not found") { temp = "gainratio"; } + if ((temp == "gainratio") || (temp == "infogain")) { + treeSplitCriterion = temp; + } else { m->mothurOut("Not a valid tree splitting criterio. 
Valid tree splitting criteria are 'gainratio' and 'infogain'."); + m->mothurOutEndLine(); + abort = true; + } + + temp = validParameter.validFile(parameters, "numtrees", false); if (temp == "not found"){ temp = "100"; } + m->mothurConvert(temp, numDecisionTrees); + + // parameters for pruning + temp = validParameter.validFile(parameters, "prune", false); + if (temp == "not found") { temp = "f"; } + doPruning = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "pruneaggressiveness", false); + if (temp == "not found") { temp = "0.9"; } + m->mothurConvert(temp, pruneAggressiveness); + + temp = validParameter.validFile(parameters, "discarderrortrees", false); + if (temp == "not found") { temp = "f"; } + discardHighErrorTrees = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "errorthreshold", false); + if (temp == "not found") { temp = "0.4"; } + m->mothurConvert(temp, highErrorTreeDiscardThreshold); + + temp = validParameter.validFile(parameters, "otupersplit", false); + if (temp == "not found") { temp = "log2"; } + if ((temp == "squareroot") || (temp == "log2")) { + optimumFeatureSubsetSelectionCriteria = temp; + } else { m->mothurOut("Not a valid OTU per split selection method. Valid OTU per split selection methods are 'log2' and 'squareroot'."); + m->mothurOutEndLine(); + abort = true; + } + + temp = validParameter.validFile(parameters, "stdthreshold", false); + if (temp == "not found") { temp = "0.0"; } + m->mothurConvert(temp, featureStandardDeviationThreshold); + + // end of pruning params + + //Groups must be checked later to make sure they are valid. SharedUtilities has functions of check the validity, just make to so m->setGroups() after the checks. If you are using these with a shared file no need to check the SharedRAbundVector class will call SharedUtilites for you, kinda nice, huh? 
+ string groups = validParameter.validFile(parameters, "groups", false); + if (groups == "not found") { groups = ""; } + else { m->splitAtDash(groups, Groups); } + m->setGroups(Groups); + + //Commonly used to process list, rabund, sabund, shared and relabund files. Look at "smart distancing" examples below in the execute function. + string label = validParameter.validFile(parameters, "label", false); + if (label == "not found") { label = ""; } + else { + if(label != "all") { m->splitAtDash(label, labels); allLines = 0; } + else { allLines = 1; } + } + } + + } + catch(exception& e) { + m->errorOut(e, "ClassifySharedCommand", "ClassifySharedCommand"); + exit(1); + } +} +//********************************************************************************************************************** +int ClassifyRFSharedCommand::execute() { + try { + + if (abort == true) { if (calledHelp) { return 0; } return 2; } + + InputData input(sharedfile, "sharedfile"); + vector lookup = input.getSharedRAbundVectors(); + + //read design file + designMap.readDesignMap(designfile); + + string lastLabel = lookup[0]->getLabel(); + set processedLabels; + set userLabels = labels; + + //as long as you are not at the end of the file or done wih the lines you want + while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { + + if (m->control_pressed) { for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } return 0; } + + if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){ + + m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine(); + + processSharedAndDesignData(lookup); + + processedLabels.insert(lookup[0]->getLabel()); + userLabels.erase(lookup[0]->getLabel()); + } + + if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) { + string saveLabel = lookup[0]->getLabel(); + + for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } + lookup = input.getSharedRAbundVectors(lastLabel); + 
m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine(); + processSharedAndDesignData(lookup); + + processedLabels.insert(lookup[0]->getLabel()); + userLabels.erase(lookup[0]->getLabel()); + + //restore real lastlabel to save below + lookup[0]->setLabel(saveLabel); + } + + lastLabel = lookup[0]->getLabel(); + //prevent memory leak + for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; lookup[i] = NULL; } + + if (m->control_pressed) { return 0; } + + //get next line to process + lookup = input.getSharedRAbundVectors(); + } + + if (m->control_pressed) { return 0; } + + //output error messages about any remaining user labels + set::iterator it; + bool needToRun = false; + for (it = userLabels.begin(); it != userLabels.end(); it++) { + m->mothurOut("Your file does not include the label " + *it); + if (processedLabels.count(lastLabel) != 1) { + m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine(); + needToRun = true; + }else { + m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine(); + } + } + + //run last label if you need to + if (needToRun == true) { + for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } } + lookup = input.getSharedRAbundVectors(lastLabel); + + m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine(); + + processSharedAndDesignData(lookup); + + for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } + + } + + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ClassifySharedCommand", "execute"); + exit(1); + } +}
+//**********************************************************************************************************************
+
+// Builds the data matrix for one label of the shared file and hands it to RandomForest.
+//
+// Every entry of lookup becomes one row: the row holds that sample's abundance for each
+// bin, followed by one extra column with the integer code of the sample's treatment.
+// A treatment's code is its index in designMap.getNamesOfGroups().
+// After the forest calls, two files are written and registered under the "summary"
+// output type: the table from calcForrestVariableImportance ("...RF.") and the list
+// from getMissclassifications ("...misclassifications.").
+//
+// lookup - the samples of a single label; lookup[0] supplies the label and the number
+//          of bins. The entries are not deleted here.
+// NOTE(review): the element type of lookup was lost in this rendering of the patch;
+// SharedRAbundVector* is assumed from input.getSharedRAbundVectors() - confirm.
+void ClassifyRFSharedCommand::processSharedAndDesignData(vector<SharedRAbundVector*> lookup){
+    try {
+        // lookup[0] is dereferenced below, so there is nothing to do without samples
+        if (lookup.empty()) { return; }
+
+        // read the treatment names once and give each one an integer code (its index)
+        vector<string> treatmentNames = designMap.getNamesOfGroups();
+        map<string, int> treatmentToIntMap;
+        map<int, string> intToTreatmentMap;
+        for (int i = 0; i < designMap.getNumGroups(); i++) {
+            string treatmentName = treatmentNames[i];
+            treatmentToIntMap[treatmentName] = i;
+            intToTreatmentMap[i] = treatmentName;
+        }
+
+        int numSamples = lookup.size();
+        int numFeatures = lookup[0]->getNumBins();
+
+        int numRows = numSamples;
+        int numColumns = numFeatures + 1; // extra one space needed for the treatment/outcome
+
+        vector< vector<int> > dataSet(numRows, vector<int>(numColumns, 0));
+
+        vector<string> names;
+
+        for (int i = 0; i < lookup.size(); i++) {
+            string sharedGroupName = lookup[i]->getGroup();
+            names.push_back(sharedGroupName);
+            string treatmentName = designMap.getGroup(sharedGroupName);
+
+            // map::operator[] would insert an unknown treatment with value 0, which
+            // silently files the sample under the first treatment; look it up instead
+            map<string, int>::iterator itTreatment = treatmentToIntMap.find(treatmentName);
+            if (itTreatment == treatmentToIntMap.end()) {
+                m->mothurOut("[ERROR]: sample " + sharedGroupName + " has treatment '" + treatmentName + "', which is not one of the treatments in your design file."); m->mothurOutEndLine();
+                // execute() tests this flag after each label and stops
+                m->control_pressed = true;
+                return;
+            }
+
+            int j = 0;
+            for (; j < lookup[i]->getNumBins(); j++) {
+                int otuCount = lookup[i]->getAbundance(j);
+                dataSet[i][j] = otuCount;
+            }
+            // j now indexes the extra column
+            dataSet[i][j] = itTreatment->second;
+        }
+
+        RandomForest randomForest(dataSet, numDecisionTrees, treeSplitCriterion, doPruning, pruneAggressiveness, discardHighErrorTrees, highErrorTreeDiscardThreshold, optimumFeatureSubsetSelectionCriteria, featureStandardDeviationThreshold);
+
+        randomForest.populateDecisionTrees();
+        randomForest.calcForrestErrorRate();
+        randomForest.printConfusionMatrix(intToTreatmentMap);
+
+        // variable importance table
+        map<string, string> variables;
+        variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "RF.";
+        variables["[distance]"] = lookup[0]->getLabel();
+        string filename = getOutputFileName("summary", variables);
+        outputNames.push_back(filename); outputTypes["summary"].push_back(filename);
+        randomForest.calcForrestVariableImportance(filename);
+
+        // misclassified samples; reuses the "summary" pattern with a different root name
+        map<string, string> variable;
+        variable["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "misclassifications.";
+        variable["[distance]"] = lookup[0]->getLabel();
+        string mc_filename = getOutputFileName("summary", variable);
+        outputNames.push_back(mc_filename); outputTypes["summary"].push_back(mc_filename);
+        randomForest.getMissclassifications(mc_filename, intToTreatmentMap, names);
+
+        m->mothurOutEndLine();
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ClassifyRFSharedCommand", "processSharedAndDesignData");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+
diff --git a/classifyrfsharedcommand.h b/classifyrfsharedcommand.h new file mode 100755 index 0000000..6a948b2 --- /dev/null +++ b/classifyrfsharedcommand.h @@ -0,0 +1,54 @@ +// +// classifysharedcommand.h +// Mothur +// +// Created by Abu Zaher Md. Faridee on 8/13/12. +// Copyright (c) 2012 Schloss Lab. All rights reserved. 
+//
+
+#ifndef __Mothur__classifyrfsharedcommand__
+#define __Mothur__classifyrfsharedcommand__
+
+#include "command.hpp"
+#include "inputdata.h"
+
+// The classifyrf.shared command. Per its own description string it "implements the
+// random forest machine learning algorithm to identify OTUs that can be used to
+// differentiate between various groups of samples".
+// NOTE(review): template arguments were lost in this rendering of the patch. They are
+// restored here from how the members are used in classifyrfsharedcommand.cpp; the
+// return type of setParameters() and the element type of lookup are assumptions to
+// confirm against the Command base class and InputData.
+class ClassifyRFSharedCommand : public Command {
+public:
+    ClassifyRFSharedCommand();
+    ClassifyRFSharedCommand(string);
+    ~ClassifyRFSharedCommand() {}
+
+    vector<string> setParameters();
+    string getCommandName()         { return "classifyrf.shared"; }
+    string getCommandCategory()     { return "OTU-Based Approaches"; }
+    string getHelpString();
+    string getOutputPattern(string);
+    string getCitation() { return "http://www.mothur.org/wiki/Classifyrf.shared\n"; }
+    string getDescription()     { return "implements the random forest machine learning algorithm to identify OTUs that can be used to differentiate between various groups of samples"; }
+    int execute();
+
+    // prints the help string through mothurOut
+    void help() { m->mothurOut(getHelpString()); }
+
+private:
+    bool abort;
+    string outputDir;
+    // outputNames collects every file written; execute() lists it at the end
+    vector<string> outputNames, Groups;
+
+    string sharedfile, designfile;
+    set<string> labels;
+    bool allLines;
+
+    int processors;
+    bool useTiming;
+
+    // sample name -> treatment, queried with getGroup() when the data matrix is built
+    GroupMap designMap;
+
+    // settings handed to the RandomForest constructor
+    int numDecisionTrees;
+    string treeSplitCriterion, optimumFeatureSubsetSelectionCriteria;
+    bool doPruning, discardHighErrorTrees;
+    double pruneAggressiveness, highErrorTreeDiscardThreshold, featureStandardDeviationThreshold;
+
+    // builds the samples-by-bins matrix for one label and runs the forest on it
+    void processSharedAndDesignData(vector<SharedRAbundVector*> lookup);
+};
+
+#endif /* defined(__Mothur__classifyrfsharedcommand__) */
diff --git a/classifysharedcommand.cpp b/classifysharedcommand.cpp index 6e32fd1..c7eb6cd 100755 --- a/classifysharedcommand.cpp +++ b/classifysharedcommand.cpp @@ -359,8 +359,11 @@ void ClassifySharedCommand::processSharedAndDesignData(vector > dataSet(numRows, vector(numColumns, 0)); + vector names; + for (int i = 0; i < lookup.size(); i++) { string sharedGroupName = lookup[i]->getGroup(); + names.push_back(sharedGroupName); string treatmentName = designMap.getGroup(sharedGroupName); int j = 0; @@ -375,15 +378,24 @@ void ClassifySharedCommand::processSharedAndDesignData(vector variables; - 
variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile)); + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "RF."; variables["[distance]"] = lookup[0]->getLabel(); string filename = getOutputFileName("summary", variables); outputNames.push_back(filename); outputTypes["summary"].push_back(filename); - randomForest.calcForrestVariableImportance(filename); + // + map variable; + variable["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "misclassifications."; + variable["[distance]"] = lookup[0]->getLabel(); + string mc_filename = getOutputFileName("summary", variable); + outputNames.push_back(mc_filename); outputTypes["summary"].push_back(mc_filename); + randomForest.getMissclassifications(mc_filename, intToTreatmentMap, names); + // + m->mothurOutEndLine(); } catch(exception& e) { diff --git a/clustercommand.h b/clustercommand.h index 5786da2..cd9f47b 100644 --- a/clustercommand.h +++ b/clustercommand.h @@ -21,7 +21,7 @@ The cluster command outputs a .list , .rabund and .sabund files. The cluster command parameter options are method, cuttoff and precision. No parameters are required. The cluster command should be in the following format: cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision). - The acceptable methods are furthest, nearest and average. If you do not provide a method the default algorythm is furthest neighbor. + The acceptable methods are furthest, nearest and average. If you do not provide a method the default algorithm is furthest neighbor. The cluster() command outputs three files *.list, *.rabund, and *.sabund. 
*/ diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp index b02bd20..95693cc 100644 --- a/clustersplitcommand.cpp +++ b/clustersplitcommand.cpp @@ -61,7 +61,7 @@ string ClusterSplitCommand::getHelpString(){ helpString += "The cluster parameter allows you to indicate whether you want to run the clustering or just split the distance matrix, default=t"; helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n"; helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n"; - helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n"; + helpString += "The method allows you to specify what clustering algorithm you want to use, default=average, option furthest, nearest, or average. \n"; helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n"; helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n"; helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. 
\n"; diff --git a/commandfactory.cpp b/commandfactory.cpp index 6ceabbf..03c3e49 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -135,7 +135,7 @@ #include "makecontigscommand.h" #include "loadlogfilecommand.h" #include "sffmultiplecommand.h" -#include "classifysharedcommand.h" +#include "classifyrfsharedcommand.h" #include "filtersharedcommand.h" #include "primerdesigncommand.h" #include "getdistscommand.h" @@ -306,7 +306,7 @@ CommandFactory::CommandFactory(){ commands["make.table"] = "make.table"; commands["sff.multiple"] = "sff.multiple"; commands["quit"] = "MPIEnabled"; - commands["classify.shared"] = "classify.shared"; + commands["classifyrf.shared"] = "classifyrf.shared"; commands["filter.shared"] = "filter.shared"; commands["primer.design"] = "primer.design"; commands["get.dists"] = "get.dists"; @@ -533,7 +533,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "make.contigs") { command = new MakeContigsCommand(optionString); } else if(commandName == "load.logfile") { command = new LoadLogfileCommand(optionString); } else if(commandName == "sff.multiple") { command = new SffMultipleCommand(optionString); } - else if(commandName == "classify.shared") { command = new ClassifySharedCommand(optionString); } + else if(commandName == "classifyrf.shared") { command = new ClassifyRFSharedCommand(optionString); } else if(commandName == "filter.shared") { command = new FilterSharedCommand(optionString); } else if(commandName == "primer.design") { command = new PrimerDesignCommand(optionString); } else if(commandName == "get.dists") { command = new GetDistsCommand(optionString); } @@ -701,7 +701,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString, str else if(commandName == "make.contigs") { pipecommand = new MakeContigsCommand(optionString); } else if(commandName == "load.logfile") { pipecommand = new LoadLogfileCommand(optionString); } else if(commandName == "sff.multiple") 
{ pipecommand = new SffMultipleCommand(optionString); } - else if(commandName == "classify.shared") { pipecommand = new ClassifySharedCommand(optionString); } + else if(commandName == "classifyrf.shared") { pipecommand = new ClassifyRFSharedCommand(optionString); } else if(commandName == "filter.shared") { pipecommand = new FilterSharedCommand(optionString); } else if(commandName == "primer.design") { pipecommand = new PrimerDesignCommand(optionString); } else if(commandName == "get.dists") { pipecommand = new GetDistsCommand(optionString); } @@ -855,7 +855,7 @@ Command* CommandFactory::getCommand(string commandName){ else if(commandName == "make.contigs") { shellcommand = new MakeContigsCommand(); } else if(commandName == "load.logfile") { shellcommand = new LoadLogfileCommand(); } else if(commandName == "sff.multiple") { shellcommand = new SffMultipleCommand(); } - else if(commandName == "classify.shared") { shellcommand = new ClassifySharedCommand(); } + else if(commandName == "classifyrf.shared") { shellcommand = new ClassifyRFSharedCommand(); } else if(commandName == "filter.shared") { shellcommand = new FilterSharedCommand(); } else if(commandName == "primer.design") { shellcommand = new PrimerDesignCommand(); } else if(commandName == "get.dists") { shellcommand = new GetDistsCommand(); } diff --git a/decalc.h b/decalc.h index d6cca18..d1daf05 100644 --- a/decalc.h +++ b/decalc.h @@ -14,7 +14,7 @@ /***********************************************************************/ -//This class was created using the algorythms described in the +//This class was created using the algorithms described in the // "At Least 1 in 20 16S rRNA Sequence Records Currently Held in the Public Repositories is Estimated To Contain Substantial Anomalies" paper //by Kevin E. Ashelford 1, Nadia A. Chuzhanova 3, John C. Fry 1, Antonia J. Jones 2 and Andrew J. Weightman 1. 
diff --git a/hclustercommand.h b/hclustercommand.h index d107407..1b8e9b7 100644 --- a/hclustercommand.h +++ b/hclustercommand.h @@ -18,7 +18,7 @@ #include "readcluster.h" /******************************************************************/ -//This command is an implementation of the HCluster algorythmn described in +//This command is an implementation of the HCluster algorithm described in //ESPRIT: estimating species richness using large collections of 16S rRNA pyrosequences by //Yijun Sun1,2,*, Yunpeng Cai2, Li Liu1, Fahong Yu1, Michael L. Farrell3, William McKendree3 //and William Farmerie1 1 diff --git a/pintail.h b/pintail.h index 92c3998..c970fdd 100644 --- a/pintail.h +++ b/pintail.h @@ -15,7 +15,7 @@ #include "decalc.h" /***********************************************************/ -//This class was created using the algorythms described in the +//This class was created using the algorithms described in the // "At Least 1 in 20 16S rRNA Sequence Records Currently Held in the Public Repositories is Estimated To Contain Substantial Anomalies" paper //by Kevin E. Ashelford 1, Nadia A. Chuzhanova 3, John C. Fry 1, Antonia J. Jones 2 and Andrew J. Weightman 1. 
diff --git a/randomforest.cpp b/randomforest.cpp index 2ae0eb5..d998de6 100644 --- a/randomforest.cpp +++ b/randomforest.cpp @@ -37,7 +37,7 @@ int RandomForest::calcForrestErrorRate() { vector::iterator maxPredictedOutComeIterator = max_element(predictedOutComes.begin(), predictedOutComes.end()); int majorityVotedOutcome = (int)(maxPredictedOutComeIterator - predictedOutComes.begin()); int realOutcome = dataSet[indexOfSample][numFeatures]; - + if (majorityVotedOutcome == realOutcome) { numCorrect++; } } @@ -46,7 +46,7 @@ int RandomForest::calcForrestErrorRate() { m->mothurOut("numCorrect = " + toString(numCorrect)+ "\n"); m->mothurOut("forrestErrorRate = " + toString(forrestErrorRate)+ "\n"); - + return 0; } catch(exception& e) { @@ -54,6 +54,87 @@ int RandomForest::calcForrestErrorRate() { exit(1); } }
+/***********************************************************************/
+
+// Prints a confusion matrix built from globalOutOfBagEstimates.
+// For each sample the predicted class is the index of the largest entry in its vote
+// vector, and the actual class is the last column of its dataSet row. Rows of the
+// matrix are actual classes, columns are predicted classes.
+// intToTreatmentMap supplies the label for each class code. Only the title goes
+// through mothurOut; the table itself is written to cout so setw can be used.
+// Returns 0, also when the run is interrupted through m->control_pressed.
+int RandomForest::printConfusionMatrix(map<int, string> intToTreatmentMap) {
+    try {
+        int numGroups = intToTreatmentMap.size();
+        vector<vector<int> > cm(numGroups, vector<int>(numGroups, 0));
+
+        for (map<int, vector<int> >::iterator it = globalOutOfBagEstimates.begin(); it != globalOutOfBagEstimates.end(); it++) {
+
+            if (m->control_pressed) { return 0; }
+
+            int indexOfSample = it->first; //key
+            const vector<int>& predictedOutComes = it->second; //value, vector of all predicted classes; read in place
+            vector<int>::const_iterator maxPredictedOutComeIterator = max_element(predictedOutComes.begin(), predictedOutComes.end());
+            int majorityVotedOutcome = (int)(maxPredictedOutComeIterator - predictedOutComes.begin());
+            int realOutcome = dataSet[indexOfSample][numFeatures];
+            cm[realOutcome][majorityVotedOutcome] = cm[realOutcome][majorityVotedOutcome] + 1;
+        }
+
+        // field width of each class = length of its label
+        // NOTE(review): this loop was swallowed in the rendering of the patch (the text
+        // between "w <" and "m->" is missing); reconstructed from how fw is used below -
+        // confirm against the repository.
+        vector<int> fw;
+        for (int w = 0; w < numGroups; w++) {
+            fw.push_back(intToTreatmentMap[w].length());
+        }
+
+        m->mothurOut("confusion matrix:\n\t\t");
+        for (int k = 0; k < numGroups; k++) {
+            //m->mothurOut(intToTreatmentMap[k] + "\t");
+            cout << setw(fw[k]) << intToTreatmentMap[k] << "\t";
+        }
+        for (int i = 0; i < numGroups; i++) {
+            cout << "\n" << setw(fw[i]) << intToTreatmentMap[i] << "\t";
+            //m->mothurOut("\n" + intToTreatmentMap[i] + "\t");
+            if (m->control_pressed) { return 0; }
+            for (int j = 0; j < numGroups; j++) {
+                //m->mothurOut(toString(cm[i][j]) + "\t");
+                // width of column j, so each cell lines up with its header label
+                cout << setw(fw[j]) << cm[i][j] << "\t";
+            }
+        }
+        //m->mothurOut("\n");
+        cout << "\n";
+
+        return 0;
+    }
+
+    catch(exception& e) {
+        m->errorOut(e, "RandomForest", "printConfusionMatrix");
+        exit(1);
+    }
+}
+
+/***********************************************************************/
+
+// Writes one tab-separated line to filename for every sample whose majority-vote class
+// differs from its actual class: sample name, predicted label, actual label.
+// names is indexed by the sample index used as key in globalOutOfBagEstimates;
+// intToTreatmentMap turns class codes into labels.
+// Returns 0, also when the run is interrupted through m->control_pressed.
+int RandomForest::getMissclassifications(string filename, map<int, string> intToTreatmentMap, vector<string> names) {
+    try {
+        ofstream out;
+        m->openOutputFile(filename, out);
+        out <<"Sample\tRF classification\tActual classification\n";
+        for (map<int, vector<int> >::iterator it = globalOutOfBagEstimates.begin(); it != globalOutOfBagEstimates.end(); it++) {
+
+            if (m->control_pressed) { return 0; }
+
+            int indexOfSample = it->first;
+            const vector<int>& predictedOutComes = it->second; // read in place
+            vector<int>::const_iterator maxPredictedOutComeIterator = max_element(predictedOutComes.begin(), predictedOutComes.end());
+            int majorityVotedOutcome = (int)(maxPredictedOutComeIterator - predictedOutComes.begin());
+            int realOutcome = dataSet[indexOfSample][numFeatures];
+
+            if (majorityVotedOutcome != realOutcome) {
+                out << names[indexOfSample] << "\t" << intToTreatmentMap[majorityVotedOutcome] << "\t" << intToTreatmentMap[realOutcome] << endl;
+
+            }
+        }
+
+        out.close();
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "RandomForest", "getMissclassifications");
+        exit(1);
+    }
+}
 /***********************************************************************/ int RandomForest::calcForrestVariableImportance(string filename) { @@ -97,7 +178,7 @@ int RandomForest::calcForrestVariableImportance(string filename) { ofstream out; m->openOutputFile(filename, out); - out <<"OTU\tRank\n"; + out <<"OTU\tMean decrease accuracy\n"; for (int i = 0; i < globalVariableRanks.size(); i++) { out << m->currentBinLabels[(int)globalVariableRanks[i].first] << '\t' << 
globalVariableImportanceList[globalVariableRanks[i].first] << endl; } diff --git a/randomforest.hpp b/randomforest.hpp index d0ac1ec..67d4372 100644 --- a/randomforest.hpp +++ b/randomforest.hpp @@ -43,6 +43,8 @@ public: int calcForrestVariableImportance(string); int populateDecisionTrees(); int updateGlobalOutOfBagEstimates(DecisionTree* decisionTree); + int printConfusionMatrix(map intToTreatmentMap); + int getMissclassifications(string, map intToTreatmentMap, vector names); private: MothurOut* m;