2 // classifysharedcommand.cpp
5 // Created by Abu Zaher Md. Faridee on 8/13/12.
6 // Copyright (c) 2012 Schloss Lab. All rights reserved.
9 #include "classifysharedcommand.h"
10 #include "randomforest.hpp"
11 #include "decisiontree.hpp"
12 #include "rftreenode.hpp"
14 //**********************************************************************************************************************
15 vector<string> ClassifySharedCommand::setParameters(){
17 //CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
18 CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pshared);
19 CommandParameter pdesign("design", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pdesign);
20 CommandParameter potupersplit("otupersplit", "Multiple", "log2-squareroot", "log2", "", "", "",false,false); parameters.push_back(potupersplit);
21 CommandParameter psplitcriteria("splitcriteria", "Multiple", "gainratio-infogain", "gainratio", "", "", "",false,false); parameters.push_back(psplitcriteria);
22 CommandParameter pnumtrees("numtrees", "Number", "", "100", "", "", "",false,false); parameters.push_back(pnumtrees);
24 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
25 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
26 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
27 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
29 vector<string> myArray;
30 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
34 m->errorOut(e, "ClassifySharedCommand", "setParameters");
38 //**********************************************************************************************************************
39 string ClassifySharedCommand::getHelpString(){
41 string helpString = "";
42 helpString += "The classify.shared command allows you to ....\n";
43 helpString += "The classify.shared command parameters are: shared, design, label, groups, otupersplit.\n";
44 helpString += "The label parameter is used to analyze specific labels in your input.\n";
45 helpString += "The groups parameter allows you to specify which of the groups in your designfile you would like analyzed.\n";
46 helpString += "The classify.shared should be in the following format: \n";
47 helpString += "classify.shared(shared=yourSharedFile, design=yourDesignFile)\n";
51 m->errorOut(e, "ClassifySharedCommand", "getHelpString");
55 //**********************************************************************************************************************
56 string ClassifySharedCommand::getOutputFileNameTag(string type, string inputName=""){
59 map<string, vector<string> >::iterator it;
61 //is this a type this command creates
62 it = outputTypes.find(type);
63 if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
65 if (type == "summary") { tag = "summary"; }
66 else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; }
71 m->errorOut(e, "ClassifySharedCommand", "getOutputFileName");
75 //**********************************************************************************************************************
77 ClassifySharedCommand::ClassifySharedCommand() {
79 abort = true; calledHelp = true;
81 vector<string> tempOutNames;
82 outputTypes["summary"] = tempOutNames;
85 m->errorOut(e, "ClassifySharedCommand", "ClassifySharedCommand");
89 //**********************************************************************************************************************
90 ClassifySharedCommand::ClassifySharedCommand(string option) {
92 abort = false; calledHelp = false;
95 //allow user to run help
96 if(option == "help") { help(); abort = true; calledHelp = true; }
97 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
100 //valid paramters for this command
101 vector<string> myArray = setParameters();
103 OptionParser parser(option);
104 map<string,string> parameters = parser.getParameters();
106 ValidParameters validParameter;
107 map<string,string>::iterator it;
108 //check to make sure all parameters are valid for command
109 for (it = parameters.begin(); it != parameters.end(); it++) {
110 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
113 vector<string> tempOutNames;
114 outputTypes["summary"] = tempOutNames;
116 //if the user changes the input directory command factory will send this info to us in the output parameter
117 string inputDir = validParameter.validFile(parameters, "inputdir", false);
118 if (inputDir == "not found"){ inputDir = ""; }
121 it = parameters.find("shared");
122 //user has given a shared file
123 if(it != parameters.end()){
124 path = m->hasPath(it->second);
125 //if the user has not given a path then, add inputdir. else leave path alone.
126 if (path == "") { parameters["shared"] = inputDir + it->second; }
129 it = parameters.find("design");
130 //user has given a design file
131 if(it != parameters.end()){
132 path = m->hasPath(it->second);
133 //if the user has not given a path then, add inputdir. else leave path alone.
134 if (path == "") { parameters["design"] = inputDir + it->second; }
139 //check for parameters
140 //get shared file, it is required
141 sharedfile = validParameter.validFile(parameters, "shared", true);
142 if (sharedfile == "not open") { sharedfile = ""; abort = true; }
143 else if (sharedfile == "not found") {
144 //if there is a current shared file, use it
145 sharedfile = m->getSharedFile();
146 if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); }
147 else { m->mothurOut("You have no current sharedfile and the shared parameter is required."); m->mothurOutEndLine(); abort = true; }
148 }else { m->setSharedFile(sharedfile); }
150 //get design file, it is required
151 designfile = validParameter.validFile(parameters, "design", true);
152 if (designfile == "not open") { sharedfile = ""; abort = true; }
153 else if (designfile == "not found") {
154 //if there is a current shared file, use it
155 designfile = m->getDesignFile();
156 if (designfile != "") { m->mothurOut("Using " + designfile + " as input file for the design parameter."); m->mothurOutEndLine(); }
157 else { m->mothurOut("You have no current designfile and the design parameter is required."); m->mothurOutEndLine(); abort = true; }
158 }else { m->setDesignFile(designfile); }
161 //if the user changes the output directory command factory will send this info to us in the output parameter
162 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
163 outputDir = m->hasPath(sharedfile); //if user entered a file with a path then preserve it
167 // NEW CODE for OTU per split selection criteria
168 otupersplit = validParameter.validFile(parameters, "otupersplit", false);
169 if (otupersplit == "not found") { otupersplit = "log2"; }
170 if ((otupersplit == "squareroot") || (otupersplit == "log2")) {
171 optimumFeatureSubsetSelectionCriteria = otupersplit;
172 }else { m->mothurOut("Not a valid OTU per split selection method. Valid OTU per split selection methods are 'log2' and 'squareroot'."); m->mothurOutEndLine(); abort = true; }
175 splitcriteria = validParameter.validFile(parameters, "splitcriteria", false);
176 if (splitcriteria == "not found") { splitcriteria = "gainratio"; }
177 if ((splitcriteria == "gainratio") || (splitcriteria == "infogain")) {
178 treeSplitCriterion = splitcriteria;
179 }else { m->mothurOut("Not a valid tree splitting criterio. Valid tree splitting criteria are 'gainratio' and 'infogain'."); m->mothurOutEndLine(); abort = true; }
182 string temp = validParameter.validFile(parameters, "numtrees", false); if (temp == "not found"){ temp = "100"; }
183 m->mothurConvert(temp, numDecisionTrees);
185 //Groups must be checked later to make sure they are valid. SharedUtilities has functions of check the validity, just make to so m->setGroups() after the checks. If you are using these with a shared file no need to check the SharedRAbundVector class will call SharedUtilites for you, kinda nice, huh?
186 string groups = validParameter.validFile(parameters, "groups", false);
187 if (groups == "not found") { groups = ""; }
188 else { m->splitAtDash(groups, Groups); }
189 m->setGroups(Groups);
191 //Commonly used to process list, rabund, sabund, shared and relabund files. Look at "smart distancing" examples below in the execute function.
192 string label = validParameter.validFile(parameters, "label", false);
193 if (label == "not found") { label = ""; }
195 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
196 else { allLines = 1; }
201 catch(exception& e) {
202 m->errorOut(e, "ClassifySharedCommand", "ClassifySharedCommand");
206 //**********************************************************************************************************************
207 int ClassifySharedCommand::execute() {
210 if (abort == true) { if (calledHelp) { return 0; } return 2; }
212 InputData input(sharedfile, "sharedfile");
213 vector<SharedRAbundVector*> lookup = input.getSharedRAbundVectors();
216 designMap.readDesignMap(designfile);
218 string lastLabel = lookup[0]->getLabel();
219 set<string> processedLabels;
220 set<string> userLabels = labels;
222 //as long as you are not at the end of the file or done wih the lines you want
223 while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
225 if (m->control_pressed) { for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } return 0; }
227 if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){
229 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
231 processSharedAndDesignData(lookup);
233 processedLabels.insert(lookup[0]->getLabel());
234 userLabels.erase(lookup[0]->getLabel());
237 if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
238 string saveLabel = lookup[0]->getLabel();
240 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
241 lookup = input.getSharedRAbundVectors(lastLabel);
242 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
244 processSharedAndDesignData(lookup);
246 processedLabels.insert(lookup[0]->getLabel());
247 userLabels.erase(lookup[0]->getLabel());
249 //restore real lastlabel to save below
250 lookup[0]->setLabel(saveLabel);
253 lastLabel = lookup[0]->getLabel();
254 //prevent memory leak
255 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; lookup[i] = NULL; }
257 if (m->control_pressed) { return 0; }
259 //get next line to process
260 lookup = input.getSharedRAbundVectors();
263 if (m->control_pressed) { return 0; }
265 //output error messages about any remaining user labels
266 set<string>::iterator it;
267 bool needToRun = false;
268 for (it = userLabels.begin(); it != userLabels.end(); it++) {
269 m->mothurOut("Your file does not include the label " + *it);
270 if (processedLabels.count(lastLabel) != 1) {
271 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
274 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
278 //run last label if you need to
279 if (needToRun == true) {
280 for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }
281 lookup = input.getSharedRAbundVectors(lastLabel);
283 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
285 processSharedAndDesignData(lookup);
287 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
291 m->mothurOutEndLine();
292 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
293 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
294 m->mothurOutEndLine();
299 catch(exception& e) {
300 m->errorOut(e, "ClassifySharedCommand", "execute");
304 //**********************************************************************************************************************
306 void ClassifySharedCommand::processSharedAndDesignData(vector<SharedRAbundVector*> lookup){
308 // for (int i = 0; i < designMap->getNamesOfGroups().size(); i++) {
309 // string groupName = designMap->getNamesOfGroups()[i];
310 // cout << groupName << endl;
313 // for (int i = 0; i < designMap->getNumSeqs(); i++) {
314 // string sharedGroupName = designMap->getNamesSeqs()[i];
315 // string treatmentName = designMap->getGroup(sharedGroupName);
316 // cout << sharedGroupName << " : " << treatmentName << endl;
319 map<string, int> treatmentToIntMap;
320 map<int, string> intToTreatmentMap;
321 for (int i = 0; i < designMap.getNumGroups(); i++) {
322 string treatmentName = designMap.getNamesOfGroups()[i];
323 treatmentToIntMap[treatmentName] = i;
324 intToTreatmentMap[i] = treatmentName;
327 int numSamples = lookup.size();
328 int numFeatures = lookup[0]->getNumBins();
330 int numRows = numSamples;
331 int numColumns = numFeatures + 1; // extra one space needed for the treatment/outcome
333 vector< vector<int> > dataSet(numRows, vector<int>(numColumns, 0));
335 for (int i = 0; i < lookup.size(); i++) {
336 string sharedGroupName = lookup[i]->getGroup();
337 string treatmentName = designMap.getGroup(sharedGroupName);
340 for (; j < lookup[i]->getNumBins(); j++) {
341 int otuCount = lookup[i]->getAbundance(j);
342 dataSet[i][j] = otuCount;
344 dataSet[i][j] = treatmentToIntMap[treatmentName];
347 RandomForest randomForest(dataSet, numDecisionTrees, treeSplitCriterion);
348 randomForest.populateDecisionTrees();
349 randomForest.calcForrestErrorRate();
351 string filename = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + lookup[0]->getLabel() + "." + getOutputFileNameTag("summary");
352 outputNames.push_back(filename); outputTypes["summary"].push_back(filename);
354 randomForest.calcForrestVariableImportance(filename);
356 m->mothurOutEndLine();
358 catch(exception& e) {
359 m->errorOut(e, "ClassifySharedCommand", "processSharedAndDesignData");
363 //**********************************************************************************************************************