5 * Created by westcott on 9/16/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "metastatscommand.h"
11 #include "sharedutilities.h"
14 //**********************************************************************************************************************
15 vector<string> MetaStatsCommand::setParameters(){
17 CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pshared);
18 CommandParameter pdesign("design", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pdesign);
19 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
20 CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
21 CommandParameter pthreshold("threshold", "Number", "", "0.05", "", "", "",false,false); parameters.push_back(pthreshold);
22 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
23 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
24 CommandParameter psets("sets", "String", "", "", "", "", "",false,false); parameters.push_back(psets);
25 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
26 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
28 vector<string> myArray;
29 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
33 m->errorOut(e, "MetaStatsCommand", "setParameters");
37 //**********************************************************************************************************************
38 string MetaStatsCommand::getHelpString(){
40 string helpString = "";
41 helpString += "This command is based on the Metastats program, White, J.R., Nagarajan, N. & Pop, M. Statistical methods for detecting differentially abundant features in clinical metagenomic samples. PLoS Comput Biol 5, e1000352 (2009).\n";
42 helpString += "The metastats command outputs a .metastats file. \n";
43 helpString += "The metastats command parameters are shared, iters, threshold, groups, label, design, sets and processors. The shared and design parameters are required, unless you have valid current files.\n";
44 helpString += "The design parameter allows you to assign your groups to sets when you are running metastat. mothur will run all pairwise comparisons of the sets. It is required. \n";
45 helpString += "The design file looks like the group file. It is a 2 column tab delimited file, where the first column is the group name and the second column is the set the group belongs to.\n";
46 helpString += "The sets parameter allows you to specify which of the sets in your designfile you would like to analyze. The set names are separated by dashes. THe default is all sets in the designfile.\n";
47 helpString += "The iters parameter allows you to set number of bootstrap permutations for estimating null distribution of t statistic. The default is 1000. \n";
48 helpString += "The threshold parameter allows you to set the significance level to reject null hypotheses (default 0.05).\n";
49 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like included. The group names are separated by dashes.\n";
50 helpString += "The label parameter allows you to select what distance levels you would like, and are also separated by dashes.\n";
51 helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n";
52 helpString += "The metastats command should be in the following format: metastats(design=yourDesignFile).\n";
53 helpString += "Example metastats(design=temp.design, groups=A-B-C).\n";
54 helpString += "The default value for groups is all the groups in your groupfile, and all labels in your inputfile will be used.\n";
55 helpString += "Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n";
59 m->errorOut(e, "MetaStatsCommand", "getHelpString");
63 //**********************************************************************************************************************
64 MetaStatsCommand::MetaStatsCommand(){
66 abort = true; calledHelp = true;
68 vector<string> tempOutNames;
69 outputTypes["metastats"] = tempOutNames;
72 m->errorOut(e, "MetaStatsCommand", "MetaStatsCommand");
76 //**********************************************************************************************************************
78 MetaStatsCommand::MetaStatsCommand(string option) {
80 abort = false; calledHelp = false;
84 //allow user to run help
85 if(option == "help") { help(); abort = true; calledHelp = true; }
86 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
89 vector<string> myArray = setParameters();
91 OptionParser parser(option);
92 map<string,string> parameters = parser.getParameters();
94 ValidParameters validParameter;
96 //check to make sure all parameters are valid for command
97 map<string,string>::iterator it;
98 for (it = parameters.begin(); it != parameters.end(); it++) {
99 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
102 //initialize outputTypes
103 vector<string> tempOutNames;
104 outputTypes["metastats"] = tempOutNames;
107 //if the user changes the input directory command factory will send this info to us in the output parameter
108 string inputDir = validParameter.validFile(parameters, "inputdir", false);
109 if (inputDir == "not found"){ inputDir = ""; }
112 it = parameters.find("design");
113 //user has given a template file
114 if(it != parameters.end()){
115 path = m->hasPath(it->second);
116 //if the user has not given a path then, add inputdir. else leave path alone.
117 if (path == "") { parameters["design"] = inputDir + it->second; }
120 it = parameters.find("shared");
121 //user has given a template file
122 if(it != parameters.end()){
123 path = m->hasPath(it->second);
124 //if the user has not given a path then, add inputdir. else leave path alone.
125 if (path == "") { parameters["shared"] = inputDir + it->second; }
130 //check for required parameters
131 sharedfile = validParameter.validFile(parameters, "shared", true);
132 if (sharedfile == "not open") { abort = true; }
133 else if (sharedfile == "not found") { //if there is a current shared file, use it
134 sharedfile = m->getSharedFile();
135 if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); }
136 else { m->mothurOut("You have no current sharedfile and the shared parameter is required."); m->mothurOutEndLine(); abort = true; }
137 }else { m->setSharedFile(sharedfile); }
139 //check for required parameters
140 designfile = validParameter.validFile(parameters, "design", true);
141 if (designfile == "not open") { abort = true; }
142 else if (designfile == "not found") {
143 //if there is a current design file, use it
144 designfile = m->getDesignFile();
145 if (designfile != "") { m->mothurOut("Using " + designfile + " as input file for the design parameter."); m->mothurOutEndLine(); }
146 else { m->mothurOut("You have no current designfile and the design parameter is required."); m->mothurOutEndLine(); abort = true; }
147 }else { m->setDesignFile(designfile); }
149 //if the user changes the output directory command factory will send this info to us in the output parameter
150 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
152 outputDir += m->hasPath(sharedfile); //if user entered a file with a path then preserve it
155 //check for optional parameter and set defaults
156 // ...at some point should added some additional type checking...
157 label = validParameter.validFile(parameters, "label", false);
158 if (label == "not found") { label = ""; }
160 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
161 else { allLines = 1; }
164 groups = validParameter.validFile(parameters, "groups", false);
165 if (groups == "not found") { groups = ""; pickedGroups = false; }
168 m->splitAtDash(groups, Groups);
169 m->setGroups(Groups);
172 sets = validParameter.validFile(parameters, "sets", false);
173 if (sets == "not found") { sets = ""; }
175 m->splitAtDash(sets, Sets);
179 string temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "1000"; }
180 m->mothurConvert(temp, iters);
182 temp = validParameter.validFile(parameters, "threshold", false); if (temp == "not found") { temp = "0.05"; }
183 m->mothurConvert(temp, threshold);
185 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
186 m->setProcessors(temp);
187 m->mothurConvert(temp, processors);
191 catch(exception& e) {
192 m->errorOut(e, "MetaStatsCommand", "MetaStatsCommand");
196 //**********************************************************************************************************************
198 int MetaStatsCommand::execute(){
201 if (abort == true) { if (calledHelp) { return 0; } return 2; }
203 designMap = new GroupMap(designfile);
204 designMap->readDesignMap();
206 input = new InputData(sharedfile, "sharedfile");
207 lookup = input->getSharedRAbundVectors();
208 string lastLabel = lookup[0]->getLabel();
210 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
211 set<string> processedLabels;
212 set<string> userLabels = labels;
214 //setup the pairwise comparions of sets for metastats
215 //calculate number of comparisons i.e. with groups A,B,C = AB, AC, BC = 3;
216 //make sure sets are all in designMap
217 SharedUtil* util = new SharedUtil();
218 vector<string> dGroups = designMap->getNamesOfGroups();
219 util->setGroups(Sets, dGroups);
222 int numGroups = Sets.size();
223 for (int a=0; a<numGroups; a++) {
224 for (int l = 0; l < a; l++) {
225 vector<string> groups; groups.push_back(Sets[a]); groups.push_back(Sets[l]);
226 namesOfGroupCombos.push_back(groups);
232 if (numGroups == 2) { processors = 1; }
233 else if (numGroups < 2) { m->mothurOut("Not enough sets, I need at least 2 valid sets. Unable to complete command."); m->mothurOutEndLine(); m->control_pressed = true; }
236 int numPairs = namesOfGroupCombos.size();
237 int numPairsPerProcessor = numPairs / processors;
239 for (int i = 0; i < processors; i++) {
240 int startPos = i * numPairsPerProcessor;
241 if(i == processors - 1){
242 numPairsPerProcessor = numPairs - i * numPairsPerProcessor;
244 lines.push_back(linePair(startPos, numPairsPerProcessor));
248 //as long as you are not at the end of the file or done wih the lines you want
249 while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
251 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } m->clearGroups(); delete input; delete designMap; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
253 if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){
255 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
258 processedLabels.insert(lookup[0]->getLabel());
259 userLabels.erase(lookup[0]->getLabel());
262 if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
263 string saveLabel = lookup[0]->getLabel();
265 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
266 lookup = input->getSharedRAbundVectors(lastLabel);
267 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
271 processedLabels.insert(lookup[0]->getLabel());
272 userLabels.erase(lookup[0]->getLabel());
274 //restore real lastlabel to save below
275 lookup[0]->setLabel(saveLabel);
278 lastLabel = lookup[0]->getLabel();
279 //prevent memory leak
280 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; lookup[i] = NULL; }
282 if (m->control_pressed) { outputTypes.clear(); m->clearGroups(); delete input; delete designMap; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
284 //get next line to process
285 lookup = input->getSharedRAbundVectors();
288 if (m->control_pressed) { outputTypes.clear(); m->clearGroups(); delete input; delete designMap; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
290 //output error messages about any remaining user labels
291 set<string>::iterator it;
292 bool needToRun = false;
293 for (it = userLabels.begin(); it != userLabels.end(); it++) {
294 m->mothurOut("Your file does not include the label " + *it);
295 if (processedLabels.count(lastLabel) != 1) {
296 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
299 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
303 //run last label if you need to
304 if (needToRun == true) {
305 for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }
306 lookup = input->getSharedRAbundVectors(lastLabel);
308 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
312 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
315 //reset groups parameter
320 if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0;}
322 m->mothurOutEndLine();
323 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
324 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
325 m->mothurOutEndLine();
329 catch(exception& e) {
330 m->errorOut(e, "MetaStatsCommand", "execute");
334 //**********************************************************************************************************************
336 int MetaStatsCommand::process(vector<SharedRAbundVector*>& thisLookUp){
341 driver(0, namesOfGroupCombos.size(), thisLookUp);
344 vector<int> processIDS;
345 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
346 //loop through and create all the processes you want
347 while (process != processors) {
351 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
354 driver(lines[process].start, lines[process].num, thisLookUp);
357 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
358 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
364 driver(lines[0].start, lines[0].num, thisLookUp);
366 //force parent to wait until all the processes are done
367 for (int i=0;i<(processors-1);i++) {
368 int temp = processIDS[i];
373 //////////////////////////////////////////////////////////////////////////////////////////////////////
374 //Windows version shared memory, so be careful when passing variables through the summarySharedData struct.
375 //Above fork() will clone, so memory is separate, but that's not the case with windows,
376 //Taking advantage of shared memory to pass results vectors.
377 //////////////////////////////////////////////////////////////////////////////////////////////////////
379 vector<metastatsData*> pDataArray;
380 DWORD dwThreadIdArray[processors-1];
381 HANDLE hThreadArray[processors-1];
383 //Create processor worker threads.
384 for( int i=1; i<processors; i++ ){
386 //make copy of lookup so we don't get access violations
387 vector<SharedRAbundVector*> newLookup;
388 vector<string> designMapGroups;
389 for (int k = 0; k < thisLookUp.size(); k++) {
390 SharedRAbundVector* temp = new SharedRAbundVector();
391 temp->setLabel(thisLookUp[k]->getLabel());
392 temp->setGroup(thisLookUp[k]->getGroup());
393 newLookup.push_back(temp);
394 designMapGroups.push_back(designMap->getGroup(thisLookUp[k]->getGroup()));
398 for (int k = 0; k < thisLookUp[0]->getNumBins(); k++) {
399 if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; }
400 for (int j = 0; j < thisLookUp.size(); j++) { newLookup[j]->push_back(thisLookUp[j]->getAbundance(k), thisLookUp[j]->getGroup()); }
403 // Allocate memory for thread data.
404 metastatsData* tempSum = new metastatsData(sharedfile, outputDir, m, lines[i].start, lines[i].num, namesOfGroupCombos, newLookup, designMapGroups, iters, threshold);
405 pDataArray.push_back(tempSum);
406 processIDS.push_back(i);
408 hThreadArray[i-1] = CreateThread(NULL, 0, MyMetastatsThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
412 driver(lines[0].start, lines[0].num, thisLookUp);
414 //Wait until all threads have terminated.
415 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
417 //Close all thread handles and free memory allocations.
418 for(int i=0; i < pDataArray.size(); i++){
419 for (int j = 0; j < pDataArray[i]->thisLookUp.size(); j++) { delete pDataArray[i]->thisLookUp[j]; }
420 for (int j = 0; j < pDataArray[i]->outputNames.size(); j++) {
421 outputNames.push_back(pDataArray[i]->outputNames[j]);
422 outputTypes["metastats"].push_back(pDataArray[i]->outputNames[j]);
425 CloseHandle(hThreadArray[i]);
426 delete pDataArray[i];
435 catch(exception& e) {
436 m->errorOut(e, "MetaStatsCommand", "process");
440 //**********************************************************************************************************************
441 int MetaStatsCommand::driver(int start, int num, vector<SharedRAbundVector*>& thisLookUp) {
445 for (int c = start; c < (start+num); c++) {
448 string setA = namesOfGroupCombos[c][0];
449 string setB = namesOfGroupCombos[c][1];
452 string outputFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + thisLookUp[0]->getLabel() + "." + setA + "-" + setB + ".metastats";
453 outputNames.push_back(outputFileName); outputTypes["metastats"].push_back(outputFileName);
454 //int nameLength = outputFileName.length();
455 //char * output = new char[nameLength];
456 //strcpy(output, outputFileName.c_str());
458 //build matrix from shared rabunds
460 //data = new double*[thisLookUp[0]->getNumBins()];
462 vector< vector<double> > data2; data2.resize(thisLookUp[0]->getNumBins());
464 vector<SharedRAbundVector*> subset;
467 for (int i = 0; i < thisLookUp.size(); i++) {
468 string thisGroup = thisLookUp[i]->getGroup();
470 //is this group for a set we want to compare??
471 //sorting the sets by putting setB at the back and setA in the front
472 if ((designMap->getGroup(thisGroup) == setB)) {
473 subset.push_back(thisLookUp[i]);
475 }else if ((designMap->getGroup(thisGroup) == setA)) {
476 subset.insert(subset.begin()+setACount, thisLookUp[i]);
481 if ((setACount == 0) || (setBCount == 0)) {
482 m->mothurOut("Missing shared info for " + setA + " or " + setB + ". Skipping comparison."); m->mothurOutEndLine();
483 outputNames.pop_back();
487 for (int j = 0; j < thisLookUp[0]->getNumBins(); j++) {
488 //data[j] = new double[subset.size()];
489 data2[j].resize(subset.size(), 0.0);
491 for (int i = 0; i < subset.size(); i++) {
492 data2[j][i] = (subset[i]->getAbundance(j));
496 m->mothurOut("Comparing " + setA + " and " + setB + "..."); m->mothurOutEndLine();
497 //metastat_main(output, thisLookUp[0]->getNumBins(), subset.size(), threshold, iters, data, setACount);
499 m->mothurOutEndLine();
500 MothurMetastats mothurMeta(threshold, iters);
501 mothurMeta.runMetastats(outputFileName , data2, setACount);
502 m->mothurOutEndLine();
503 m->mothurOutEndLine();
508 //for(int i = 0; i < thisLookUp[0]->getNumBins(); i++) { delete[] data[i]; }
515 catch(exception& e) {
516 m->errorOut(e, "MetaStatsCommand", "driver");
520 //**********************************************************************************************************************