2 // getmetacommunitycommand.cpp
5 // Created by SarahsWork on 4/9/13.
6 // Copyright (c) 2013 Schloss Lab. All rights reserved.
9 #include "getmetacommunitycommand.h"
10 #include "communitytype.h"
12 #include "validcalculator.h"
13 #include "subsample.h"
15 //**********************************************************************************************************************
16 vector<string> GetMetaCommunityCommand::setParameters(){
18 CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none","outputType",false,true); parameters.push_back(pshared);
19 CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups);
20 CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel);
21 CommandParameter pcalc("calc", "Multiple", "sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan-kstest-sharednseqs-ochiai-anderberg-kulczynski-kulczynskicody-lennon-morisitahorn-braycurtis-whittaker-odum-canberra-structeuclidean-structchord-hellinger-manhattan-structpearson-soergel-spearman-structkulczynski-speciesprofile-hamming-structchi2-gower-memchi2-memchord-memeuclidean-mempearson-jsd-rjsd", "rjsd", "", "", "","",false,false,true); parameters.push_back(pcalc);
22 CommandParameter psubsample("subsample", "String", "", "", "", "", "","",false,false); parameters.push_back(psubsample);
23 CommandParameter piters("iters", "Number", "", "1000", "", "", "","",false,false); parameters.push_back(piters);
24 CommandParameter pminpartitions("minpartitions", "Number", "", "5", "", "", "","",false,false,true); parameters.push_back(pminpartitions);
25 CommandParameter pmaxpartitions("maxpartitions", "Number", "", "100", "", "", "","",false,false,true); parameters.push_back(pmaxpartitions);
26 CommandParameter poptimizegap("optimizegap", "Number", "", "3", "", "", "","",false,false,true); parameters.push_back(poptimizegap);
27 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
28 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
29 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
30 CommandParameter pmethod("method", "Multiple", "dmm-kmeans-pam", "dmm", "", "", "","",false,false,true); parameters.push_back(pmethod);
32 vector<string> myArray;
33 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
37 m->errorOut(e, "NewCommand", "setParameters");
41 //**********************************************************************************************************************
42 string GetMetaCommunityCommand::getHelpString(){
44 string helpString = "";
45 helpString += "The get.communitytype command parameters are shared, method, label, groups, minpartitions, maxpartitions, optimizegap and processors. The shared file is required. \n";
46 helpString += "The label parameter is used to analyze specific labels in your input. labels are separated by dashes.\n";
47 helpString += "The groups parameter allows you to specify which of the groups in your shared file you would like analyzed. Group names are separated by dashes.\n";
48 helpString += "The method parameter allows to select the method you would like to use. Options are dmm, kmeans and pam. Default=dmm.\n";
49 helpString += "The calc parameter allows to select the calculator you would like to use to calculate the distance matrix used by the pam method. By default the jsd calculator is used.\n";
50 helpString += "The iters parameter allows you to choose the number of times you would like to run the subsample while calculating the distance matirx for the pam method.\n";
51 helpString += "The subsample parameter allows you to enter the size pergroup of the sample or you can set subsample=T and mothur will use the size of your smallest group while calculating the distance matrix for the pam method.\n";
52 helpString += "The minpartitions parameter is used to .... Default=5.\n";
53 helpString += "The maxpartitions parameter is used to .... Default=10.\n";
54 helpString += "The optimizegap parameter is used to .... Default=3.\n";
55 helpString += "The processors parameter allows you to specify number of processors to use. The default is 1.\n";
56 helpString += "The get.communitytype command should be in the following format: get.communitytype(shared=yourSharedFile).\n";
60 m->errorOut(e, "GetMetaCommunityCommand", "getHelpString");
64 //**********************************************************************************************************************
65 string GetMetaCommunityCommand::getOutputPattern(string type) {
69 if (type == "fit") { pattern = "[filename],[distance],[method],mix.fit"; }
70 else if (type == "relabund") { pattern = "[filename],[distance],[method],[tag],mix.relabund"; }
71 else if (type == "design") { pattern = "[filename],[distance],[method],mix.design"; }
72 else if (type == "matrix") { pattern = "[filename],[distance],[method],[tag],mix.posterior"; }
73 else if (type == "parameters") { pattern = "[filename],[distance],[method],mix.parameters"; }
74 else if (type == "summary") { pattern = "[filename],[distance],[method],mix.summary"; }
75 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
80 m->errorOut(e, "GetMetaCommunityCommand", "getOutputPattern");
84 //**********************************************************************************************************************
85 GetMetaCommunityCommand::GetMetaCommunityCommand(){
87 abort = true; calledHelp = true;
89 vector<string> tempOutNames;
90 outputTypes["fit"] = tempOutNames;
91 outputTypes["relabund"] = tempOutNames;
92 outputTypes["matrix"] = tempOutNames;
93 outputTypes["design"] = tempOutNames;
94 outputTypes["parameters"] = tempOutNames;
95 outputTypes["summary"] = tempOutNames;
98 m->errorOut(e, "GetMetaCommunityCommand", "GetMetaCommunityCommand");
102 //**********************************************************************************************************************
103 GetMetaCommunityCommand::GetMetaCommunityCommand(string option) {
105 abort = false; calledHelp = false;
108 //allow user to run help
109 if(option == "help") { help(); abort = true; calledHelp = true; }
110 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
113 //valid paramters for this command
114 vector<string> myArray = setParameters();
116 OptionParser parser(option);
117 map<string,string> parameters = parser.getParameters();
119 ValidParameters validParameter;
120 map<string,string>::iterator it;
121 //check to make sure all parameters are valid for command
122 for (it = parameters.begin(); it != parameters.end(); it++) {
123 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
126 vector<string> tempOutNames;
127 outputTypes["fit"] = tempOutNames;
128 outputTypes["relabund"] = tempOutNames;
129 outputTypes["matrix"] = tempOutNames;
130 outputTypes["design"] = tempOutNames;
131 outputTypes["parameters"] = tempOutNames;
132 outputTypes["summary"] = tempOutNames;
134 //if the user changes the input directory command factory will send this info to us in the output parameter
135 string inputDir = validParameter.validFile(parameters, "inputdir", false);
136 if (inputDir == "not found"){ inputDir = ""; }
139 it = parameters.find("shared");
140 if(it != parameters.end()){
141 path = m->hasPath(it->second);
142 if (path == "") { parameters["shared"] = inputDir + it->second; }
146 //get shared file, it is required
147 sharedfile = validParameter.validFile(parameters, "shared", true);
148 if (sharedfile == "not open") { sharedfile = ""; abort = true; }
149 else if (sharedfile == "not found") {
150 //if there is a current shared file, use it
151 sharedfile = m->getSharedFile();
152 if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); }
153 else { m->mothurOut("You have no current sharedfile and the shared parameter is required."); m->mothurOutEndLine(); abort = true; }
154 }else { m->setSharedFile(sharedfile); }
156 //if the user changes the output directory command factory will send this info to us in the output parameter
157 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
158 outputDir = m->hasPath(sharedfile); //if user entered a file with a path then preserve it
161 string temp = validParameter.validFile(parameters, "minpartitions", false); if (temp == "not found"){ temp = "5"; }
162 m->mothurConvert(temp, minpartitions);
164 temp = validParameter.validFile(parameters, "maxpartitions", false); if (temp == "not found"){ temp = "10"; }
165 m->mothurConvert(temp, maxpartitions);
167 temp = validParameter.validFile(parameters, "optimizegap", false); if (temp == "not found"){ temp = "3"; }
168 m->mothurConvert(temp, optimizegap);
170 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
171 m->setProcessors(temp);
172 m->mothurConvert(temp, processors);
174 string groups = validParameter.validFile(parameters, "groups", false);
175 if (groups == "not found") { groups = ""; }
176 else { m->splitAtDash(groups, Groups); }
177 m->setGroups(Groups);
179 string label = validParameter.validFile(parameters, "label", false);
180 if (label == "not found") { label = ""; }
182 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
183 else { allLines = 1; }
186 method = validParameter.validFile(parameters, "method", false);
187 if (method == "not found") { method = "dmm"; }
189 if ((method == "dmm") || (method == "kmeans") || (method == "pam")) { }
190 else { m->mothurOut("[ERROR]: " + method + " is not a valid method. Valid algorithms are dmm, kmeans and pam."); m->mothurOutEndLine(); abort = true; }
192 calc = validParameter.validFile(parameters, "calc", false);
193 if (calc == "not found") { calc = "rjsd"; }
195 if (calc == "default") { calc = "rjsd"; }
197 m->splitAtDash(calc, Estimators);
198 if (m->inUsersGroups("citation", Estimators)) {
199 ValidCalculators validCalc; validCalc.printCitations(Estimators);
200 //remove citation from list of calcs
201 for (int i = 0; i < Estimators.size(); i++) { if (Estimators[i] == "citation") { Estimators.erase(Estimators.begin()+i); break; } }
203 if (Estimators.size() != 1) { abort = true; m->mothurOut("[ERROR]: only one calculator is allowed.\n"); }
205 temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "1000"; }
206 m->mothurConvert(temp, iters);
208 temp = validParameter.validFile(parameters, "subsample", false); if (temp == "not found") { temp = "F"; }
209 if (m->isNumeric1(temp)) { m->mothurConvert(temp, subsampleSize); subsample = true; }
211 if (m->isTrue(temp)) { subsample = true; subsampleSize = -1; } //we will set it to smallest group later
212 else { subsample = false; }
215 if (subsample == false) { iters = 0; }
219 catch(exception& e) {
220 m->errorOut(e, "GetMetaCommunityCommand", "GetMetaCommunityCommand");
224 //**********************************************************************************************************************
226 int GetMetaCommunityCommand::execute(){
229 if (abort == true) { if (calledHelp) { return 0; } return 2; }
231 InputData input(sharedfile, "sharedfile");
232 vector<SharedRAbundVector*> lookup = input.getSharedRAbundVectors();
233 string lastLabel = lookup[0]->getLabel();
235 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
236 set<string> processedLabels;
237 set<string> userLabels = labels;
240 if (subsampleSize == -1) { //user has not set size, set size = smallest samples size
241 subsampleSize = lookup[0]->getNumSeqs();
242 for (int i = 1; i < lookup.size(); i++) {
243 int thisSize = lookup[i]->getNumSeqs();
245 if (thisSize < subsampleSize) { subsampleSize = thisSize; }
250 vector<SharedRAbundVector*> temp;
251 for (int i = 0; i < lookup.size(); i++) {
252 if (lookup[i]->getNumSeqs() < subsampleSize) {
253 m->mothurOut(lookup[i]->getGroup() + " contains " + toString(lookup[i]->getNumSeqs()) + ". Eliminating."); m->mothurOutEndLine();
256 Groups.push_back(lookup[i]->getGroup());
257 temp.push_back(lookup[i]);
261 m->setGroups(Groups);
264 if (lookup.size() < 2) { m->mothurOut("You have not provided enough valid groups. I cannot run the command."); m->mothurOutEndLine(); m->control_pressed = true; return 0; }
268 //as long as you are not at the end of the file or done wih the lines you want
269 while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
271 if (m->control_pressed) { for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } return 0; }
273 if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){
275 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
277 createProcesses(lookup);
279 processedLabels.insert(lookup[0]->getLabel());
280 userLabels.erase(lookup[0]->getLabel());
283 if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
284 string saveLabel = lookup[0]->getLabel();
286 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
287 lookup = input.getSharedRAbundVectors(lastLabel);
288 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
290 createProcesses(lookup);
292 processedLabels.insert(lookup[0]->getLabel());
293 userLabels.erase(lookup[0]->getLabel());
295 //restore real lastlabel to save below
296 lookup[0]->setLabel(saveLabel);
299 lastLabel = lookup[0]->getLabel();
300 //prevent memory leak
301 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; lookup[i] = NULL; }
303 if (m->control_pressed) { return 0; }
305 //get next line to process
306 lookup = input.getSharedRAbundVectors();
309 if (m->control_pressed) { return 0; }
311 //output error messages about any remaining user labels
312 set<string>::iterator it;
313 bool needToRun = false;
314 for (it = userLabels.begin(); it != userLabels.end(); it++) {
315 m->mothurOut("Your file does not include the label " + *it);
316 if (processedLabels.count(lastLabel) != 1) {
317 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
320 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
324 //run last label if you need to
325 if (needToRun == true) {
326 for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }
327 lookup = input.getSharedRAbundVectors(lastLabel);
329 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
331 createProcesses(lookup);
333 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
336 //output files created by command
337 m->mothurOutEndLine();
338 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
339 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
340 m->mothurOutEndLine();
344 catch(exception& e) {
345 m->errorOut(e, "GetMetaCommunityCommand", "execute");
349 //**********************************************************************************************************************
350 int GetMetaCommunityCommand::createProcesses(vector<SharedRAbundVector*>& thislookup){
353 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
355 processors=1; //qFinderDMM not thread safe
358 vector<int> processIDS;
361 int minPartition = 0;
364 if (maxpartitions < processors) { processors = maxpartitions; }
366 map<string, string> variables;
367 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile));
368 variables["[distance]"] = thislookup[0]->getLabel();
369 variables["[method]"] = method;
370 string outputFileName = getOutputFileName("fit", variables);
371 outputNames.push_back(outputFileName); outputTypes["fit"].push_back(outputFileName);
373 //divide the partitions between the processors
374 vector< vector<int> > dividedPartitions;
375 vector< vector<string> > rels, matrix;
376 vector<string> doneFlags;
377 dividedPartitions.resize(processors);
378 rels.resize(processors);
379 matrix.resize(processors);
381 //for each file group figure out which process will complete it
382 //want to divide the load intelligently so the big files are spread between processes
383 for (int i=1; i<=maxpartitions; i++) {
385 int processToAssign = (i+1) % processors;
386 if (processToAssign == 0) { processToAssign = processors; }
388 if (m->debug) { m->mothurOut("[DEBUG]: assigning " + toString(i) + " to process " + toString(processToAssign-1) + "\n"); }
389 dividedPartitions[(processToAssign-1)].push_back(i);
391 variables["[tag]"] = toString(i);
392 string relName = getOutputFileName("relabund", variables);
393 string mName = getOutputFileName("matrix", variables);
394 rels[(processToAssign-1)].push_back(relName);
395 matrix[(processToAssign-1)].push_back(mName);
398 for (int i = 0; i < processors; i++) { //read from everyone elses, just write to yours
399 string tempDoneFile = m->getRootName(m->getSimpleName(sharedfile)) + toString(i) + ".done.temp";
400 doneFlags.push_back(tempDoneFile);
402 m->openOutputFile(tempDoneFile, out); //clear out
407 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
409 //loop through and create all the processes you want
410 while (process != processors) {
414 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
418 num = processDriver(thislookup, dividedPartitions[process], (outputFileName + toString(getpid())), rels[process], matrix[process], doneFlags, process);
420 //pass numSeqs to parent
422 string tempFile = toString(getpid()) + ".outputNames.temp";
423 m->openOutputFile(tempFile, out);
425 out << outputNames.size() << endl;
426 for (int i = 0; i < outputNames.size(); i++) { out << outputNames[i] << endl; }
431 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
432 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
438 if (method == "dmm") { m->mothurOut("K\tNLE\t\tlogDet\tBIC\t\tAIC\t\tLaplace\n"); }
440 m->mothurOut("K\tCH\t");
441 for (int i = 0; i < thislookup.size(); i++) { m->mothurOut(thislookup[i]->getGroup() + '\t'); }
444 minPartition = processDriver(thislookup, dividedPartitions[0], outputFileName, rels[0], matrix[0], doneFlags, 0);
446 //force parent to wait until all the processes are done
447 for (int i=0;i<processIDS.size();i++) {
448 int temp = processIDS[i];
452 vector<string> tempOutputNames = outputNames;
453 for (int i=0;i<processIDS.size();i++) {
455 string tempFile = toString(processIDS[i]) + ".outputNames.temp";
456 m->openInputFile(tempFile, in);
459 in >> tempNum; m->gobble(in);
460 if (tempNum < minPartition) { minPartition = tempNum; }
461 in >> tempNum; m->gobble(in);
462 for (int i = 0; i < tempNum; i++) {
463 string tempName = "";
464 in >> tempName; m->gobble(in);
465 tempOutputNames.push_back(tempName);
468 in.close(); m->mothurRemove(tempFile);
470 m->appendFilesWithoutHeaders(outputFileName + toString(processIDS[i]), outputFileName);
471 m->mothurRemove(outputFileName + toString(processIDS[i]));
474 if (processors > 1) {
476 for (int i = 0; i < tempOutputNames.size(); i++) { //remove files if needed
477 string name = tempOutputNames[i];
478 vector<string> parts;
479 m->splitAtChar(name, parts, '.');
481 if (((parts[parts.size()-1] == "relabund") || (parts[parts.size()-1] == "posterior")) && (parts[parts.size()-2] == "mix")) {
482 string tempNum = parts[parts.size()-3];
483 int num; m->mothurConvert(tempNum, num);
484 //if (num > minPartition) {
485 // m->mothurRemove(tempOutputNames[i]);
486 // keep = false; if (m->debug) { m->mothurOut("[DEBUG]: removing " + tempOutputNames[i] + ".\n"); }
489 if (keep) { outputNames.push_back(tempOutputNames[i]); }
494 m->openInputFile(outputFileName, in);
495 string headers = m->getline(in); m->gobble(in);
497 map<int, string> file;
499 string numString, line;
501 in >> numString; line = m->getline(in); m->gobble(in);
502 m->mothurConvert(numString, num);
507 m->openOutputFile(outputFileName, out);
508 out << headers << endl;
509 for (map<int, string>::iterator it = file.begin(); it != file.end(); it++) {
510 out << it->first << '\t' << it->second << endl;
511 if (m->debug) { m->mothurOut("[DEBUG]: printing: " + toString(it->first) + '\t' + it->second + ".\n"); }
517 m->mothurOut("K\tNLE\t\tlogDet\tBIC\t\tAIC\t\tLaplace\n");
518 minPartition = processDriver(thislookup, dividedPartitions[0], outputFileName, rels[0], matrix[0], doneFlags, 0);
520 for (int i = 0; i < processors; i++) { //read from everyone elses, just write to yours
521 string tempDoneFile = m->getRootName(m->getSimpleName(sharedfile)) + toString(i) + ".done.temp";
522 m->mothurRemove(tempDoneFile);
525 if (m->control_pressed) { return 0; }
527 if (m->debug) { m->mothurOut("[DEBUG]: minPartition = " + toString(minPartition) + "\n"); }
529 //run generate Summary function for smallest minPartition
530 variables["[tag]"] = toString(minPartition);
531 vector<double> piValues = generateDesignFile(minPartition, variables);
532 if (method == "dmm") { generateSummaryFile(minPartition, variables, piValues); } //pam doesn't make a relabund file
537 catch(exception& e) {
538 m->errorOut(e, "GetMetaCommunityCommand", "createProcesses");
542 //**********************************************************************************************************************
543 int GetMetaCommunityCommand::processDriver(vector<SharedRAbundVector*>& thislookup, vector<int>& parts, string outputFileName, vector<string> relabunds, vector<string> matrix, vector<string> doneFlags, int processID){
546 double minLaplace = 1e10;
547 int minPartition = 1;
548 vector<double> minSilhouettes; minSilhouettes.resize(thislookup.size(), 0);
550 ofstream fitData, silData;
551 if (method == "dmm") {
552 m->openOutputFile(outputFileName, fitData);
553 fitData.setf(ios::fixed, ios::floatfield);
554 fitData.setf(ios::showpoint);
555 fitData << "K\tNLE\tlogDet\tBIC\tAIC\tLaplace" << endl;
556 }else if((method == "pam") || (method == "kmeans")) { //because ch is looking of maximal value
558 m->openOutputFile(outputFileName, silData);
559 silData.setf(ios::fixed, ios::floatfield);
560 silData.setf(ios::showpoint);
561 silData << "K\tCH\t";
562 for (int i = 0; i < thislookup.size(); i++) { silData << thislookup[i]->getGroup() << '\t'; }
566 cout.setf(ios::fixed, ios::floatfield);
567 cout.setf(ios::showpoint);
569 vector< vector<int> > sharedMatrix;
570 vector<string> thisGroups;
571 for (int i = 0; i < thislookup.size(); i++) { sharedMatrix.push_back(thislookup[i]->getAbundances()); thisGroups.push_back(thislookup[i]->getGroup()); }
573 vector< vector<double> > dists; //do we want to output this matrix??
574 if ((method == "pam") || (method == "kmeans")) { dists = generateDistanceMatrix(thislookup); }
577 m->mothurOut("[DEBUG]: dists = \n");
578 for (int i = 0; i < dists.size(); i++) {
579 if (m->control_pressed) { break; }
580 m->mothurOut("[DEBUG]: i = " + toString(i) + '\t');
581 for (int j = 0; j < i; j++) { m->mothurOut(toString(dists[i][j]) +"\t"); }
586 for(int i=0;i<parts.size();i++){
588 int numPartitions = parts[i];
590 if (m->debug) { m->mothurOut("[DEBUG]: running partition " + toString(numPartitions) + "\n"); }
592 if (m->control_pressed) { break; }
594 //check to see if anyone else is done
595 for (int j = 0; j < doneFlags.size(); j++) {
596 if (!m->isBlank(doneFlags[j])) { //another process has finished
597 //are they done at a lower partition?
599 m->openInputFile(doneFlags[j], in);
601 in >> tempNum; in.close();
602 if (tempNum < numPartitions) { break; } //quit, because someone else has finished
606 CommunityTypeFinder* finder = NULL;
607 if (method == "dmm") { finder = new qFinderDMM(sharedMatrix, numPartitions); }
608 else if (method == "kmeans") { finder = new KMeans(sharedMatrix, numPartitions); }
609 else if (method == "pam") { finder = new Pam(sharedMatrix, dists, numPartitions); }
611 if (i == 0) { m->mothurOut(method + " is not a valid method option. I will run the command using dmm.\n"); }
612 finder = new qFinderDMM(sharedMatrix, numPartitions);
615 string relabund = relabunds[i];
616 string matrixName = matrix[i];
617 outputNames.push_back(matrixName); outputTypes["matrix"].push_back(matrixName);
619 finder->printZMatrix(matrixName, thisGroups);
621 double chi; vector<double> silhouettes;
622 if (method == "dmm") {
623 double laplace = finder->getLaplace();
624 if(laplace < minLaplace){
625 minPartition = numPartitions;
626 minLaplace = laplace;
629 chi = finder->calcCHIndex(dists);
630 silhouettes = finder->calcSilhouettes(dists);
631 if (chi > minLaplace) { //save partition with maximum ch index score
632 minPartition = numPartitions;
634 minSilhouettes = silhouettes;
638 if (method == "dmm") {
639 finder->printFitData(cout, minLaplace);
640 finder->printFitData(fitData);
641 finder->printRelAbund(relabund, m->currentSharedBinLabels);
642 outputNames.push_back(relabund); outputTypes["relabund"].push_back(relabund);
643 }else if ((method == "pam") || (method == "kmeans")) { //print silouettes and ch values
644 finder->printSilData(cout, chi, silhouettes);
645 finder->printSilData(silData, chi, silhouettes);
646 if (method == "kmeans") {
647 finder->printRelAbund(relabund, m->currentSharedBinLabels);
648 outputNames.push_back(relabund); outputTypes["relabund"].push_back(relabund);
653 if(optimizegap != -1 && (numPartitions - minPartition) >= optimizegap && numPartitions >= minpartitions){
654 string tempDoneFile = m->getRootName(m->getSimpleName(sharedfile)) + toString(processID) + ".done.temp";
656 m->openOutputFile(tempDoneFile, outDone);
657 outDone << minPartition << endl;
662 if (method == "dmm") { fitData.close(); }
664 if (m->control_pressed) { return 0; }
668 catch(exception& e) {
669 m->errorOut(e, "GetMetaCommunityCommand", "processDriver");
673 /**************************************************************************************************/
675 vector<double> GetMetaCommunityCommand::generateDesignFile(int numPartitions, map<string,string> variables){
677 vector<double> piValues(numPartitions, 0);
680 variables["[tag]"] = toString(numPartitions);
681 string input = getOutputFileName("matrix", variables);
682 m->openInputFile(input, postFile);//((fileRoot + toString(numPartitions) + "mix.posterior").c_str()); //matrix file
683 variables.erase("[tag]");
684 string outputFileName = getOutputFileName("design", variables);
686 m->openOutputFile(outputFileName, designFile);
687 outputNames.push_back(outputFileName); outputTypes["design"].push_back(outputFileName);
690 vector<string> titles(numPartitions);
692 for(int i=0;i<numPartitions;i++){ postFile >> titles[i]; }
700 if (m->control_pressed) { break; }
702 double maxPosterior = 0.0000;
703 int maxPartition = -1;
705 postFile >> sampleName;
707 for(int i=0;i<numPartitions;i++){
709 postFile >> posterior;
710 if(posterior > maxPosterior){
711 maxPosterior = posterior;
714 piValues[i] += posterior;
718 designFile << sampleName << '\t' << titles[maxPartition] << endl;
723 for(int i=0;i<numPartitions;i++){
724 piValues[i] /= (double)numSamples;
733 catch(exception& e) {
734 m->errorOut(e, "GetMetaCommunityCommand", "generateDesignFile");
739 /**************************************************************************************************/
741 inline bool summaryFunction(summaryData i, summaryData j){ return i.difference > j.difference; }
743 /**************************************************************************************************/
744 int GetMetaCommunityCommand::generateSummaryFile(int numPartitions, map<string,string> v, vector<double> piValues){
746 vector<summaryData> summary;
748 vector<double> pMean(numPartitions, 0);
749 vector<double> pLCI(numPartitions, 0);
750 vector<double> pUCI(numPartitions, 0);
753 double mean, lci, uci;
755 ifstream referenceFile;
756 map<string, string> variables;
757 variables["[filename]"] = v["[filename]"];
758 variables["[distance]"] = v["[distance]"];
759 variables["]method]"] = method;
760 variables["[tag]"] = "1";
761 string reference = getOutputFileName("relabund", variables);
762 m->openInputFile(reference, referenceFile); //((fileRoot + label + ".1mix.relabund").c_str());
763 variables["[tag]"] = toString(numPartitions);
764 string partFile = getOutputFileName("relabund", variables);
765 ifstream partitionFile;
766 m->openInputFile(partFile, partitionFile); //((fileRoot + toString(numPartitions) + "mix.relabund").c_str());
768 header = m->getline(referenceFile);
769 header = m->getline(partitionFile);
770 stringstream head(header);
773 vector<string> thetaValues(numPartitions, "");
774 for(int i=0;i<numPartitions;i++){
775 head >> label >> dummy >> dummy;
776 thetaValues[i] = label.substr(label.find_last_of('_')+1);
780 vector<double> partitionDiff(numPartitions, 0.0000);
782 while(referenceFile){
784 if (m->control_pressed) { break; }
786 referenceFile >> name >> mean >> lci >> uci;
788 summaryData tempData;
789 tempData.name = name;
790 tempData.refMean = mean;
792 double difference = 0.0000;
794 partitionFile >> name;
795 for(int j=0;j<numPartitions;j++){
796 partitionFile >> pMean[j] >> pLCI[j] >> pUCI[j];
797 difference += abs(mean - pMean[j]);
798 partitionDiff[j] += abs(mean - pMean[j]);;
801 tempData.partMean = pMean;
802 tempData.partLCI = pLCI;
803 tempData.partUCI = pUCI;
804 tempData.difference = difference;
805 summary.push_back(tempData);
807 m->gobble(referenceFile);
808 m->gobble(partitionFile);
810 referenceFile.close();
811 partitionFile.close();
813 if (m->control_pressed) { return 0; }
815 int numOTUs = (int)summary.size();
817 sort(summary.begin(), summary.end(), summaryFunction);
819 variables.erase("[tag]");
820 string outputFileName = getOutputFileName("parameters", variables);
821 outputNames.push_back(outputFileName); outputTypes["parameters"].push_back(outputFileName);
823 ofstream parameterFile;
824 m->openOutputFile(outputFileName, parameterFile); //((fileRoot + "mix.parameters").c_str());
825 parameterFile.setf(ios::fixed, ios::floatfield);
826 parameterFile.setf(ios::showpoint);
828 double totalDifference = 0.0000;
829 parameterFile << "Part\tDif2Ref_i\ttheta_i\tpi_i\n";
830 for(int i=0;i<numPartitions;i++){
831 if (m->control_pressed) { break; }
832 parameterFile << i+1 << '\t' << setprecision(2) << partitionDiff[i] << '\t' << thetaValues[i] << '\t' << piValues[i] << endl;
833 totalDifference += partitionDiff[i];
835 parameterFile.close();
837 if (m->control_pressed) { return 0; }
839 string summaryFileName = getOutputFileName("summary", variables);
840 outputNames.push_back(summaryFileName); outputTypes["summary"].push_back(summaryFileName);
842 ofstream summaryFile;
843 m->openOutputFile(summaryFileName, summaryFile); //((fileRoot + "mix.summary").c_str());
844 summaryFile.setf(ios::fixed, ios::floatfield);
845 summaryFile.setf(ios::showpoint);
848 summaryFile << "OTU\tP0.mean";
849 for(int i=0;i<numPartitions;i++){
850 summaryFile << "\tP" << i+1 << ".mean\tP" << i+1 << ".lci\tP" << i+1 << ".uci";
852 summaryFile << "\tDifference\tCumFraction" << endl;
854 double cumDiff = 0.0000;
856 for(int i=0;i<numOTUs;i++){
857 if (m->control_pressed) { break; }
858 summaryFile << summary[i].name << setprecision(2) << '\t' << summary[i].refMean;
859 for(int j=0;j<numPartitions;j++){
860 summaryFile << '\t' << summary[i].partMean[j] << '\t' << summary[i].partLCI[j] << '\t' << summary[i].partUCI[j];
863 cumDiff += summary[i].difference/totalDifference;
864 summaryFile << '\t' << summary[i].difference << '\t' << cumDiff << endl;
871 catch(exception& e) {
872 m->errorOut(e, "GetMetaCommunityCommand", "generateSummaryFile");
877 //**********************************************************************************************************************
878 vector<vector<double> > GetMetaCommunityCommand::generateDistanceMatrix(vector<SharedRAbundVector*>& thisLookup){
880 vector<vector<double> > results;
882 Calculator* matrixCalculator;
883 ValidCalculators validCalculator;
886 if (validCalculator.isValidCalculator("matrix", Estimators[i]) == true) {
887 if (Estimators[i] == "sharedsobs") {
888 matrixCalculator = new SharedSobsCS();
889 }else if (Estimators[i] == "sharedchao") {
890 matrixCalculator = new SharedChao1();
891 }else if (Estimators[i] == "sharedace") {
892 matrixCalculator = new SharedAce();
893 }else if (Estimators[i] == "jabund") {
894 matrixCalculator = new JAbund();
895 }else if (Estimators[i] == "sorabund") {
896 matrixCalculator = new SorAbund();
897 }else if (Estimators[i] == "jclass") {
898 matrixCalculator = new Jclass();
899 }else if (Estimators[i] == "sorclass") {
900 matrixCalculator = new SorClass();
901 }else if (Estimators[i] == "jest") {
902 matrixCalculator = new Jest();
903 }else if (Estimators[i] == "sorest") {
904 matrixCalculator = new SorEst();
905 }else if (Estimators[i] == "thetayc") {
906 matrixCalculator = new ThetaYC();
907 }else if (Estimators[i] == "thetan") {
908 matrixCalculator = new ThetaN();
909 }else if (Estimators[i] == "kstest") {
910 matrixCalculator = new KSTest();
911 }else if (Estimators[i] == "sharednseqs") {
912 matrixCalculator = new SharedNSeqs();
913 }else if (Estimators[i] == "ochiai") {
914 matrixCalculator = new Ochiai();
915 }else if (Estimators[i] == "anderberg") {
916 matrixCalculator = new Anderberg();
917 }else if (Estimators[i] == "kulczynski") {
918 matrixCalculator = new Kulczynski();
919 }else if (Estimators[i] == "kulczynskicody") {
920 matrixCalculator = new KulczynskiCody();
921 }else if (Estimators[i] == "lennon") {
922 matrixCalculator = new Lennon();
923 }else if (Estimators[i] == "morisitahorn") {
924 matrixCalculator = new MorHorn();
925 }else if (Estimators[i] == "braycurtis") {
926 matrixCalculator = new BrayCurtis();
927 }else if (Estimators[i] == "whittaker") {
928 matrixCalculator = new Whittaker();
929 }else if (Estimators[i] == "odum") {
930 matrixCalculator = new Odum();
931 }else if (Estimators[i] == "canberra") {
932 matrixCalculator = new Canberra();
933 }else if (Estimators[i] == "structeuclidean") {
934 matrixCalculator = new StructEuclidean();
935 }else if (Estimators[i] == "structchord") {
936 matrixCalculator = new StructChord();
937 }else if (Estimators[i] == "hellinger") {
938 matrixCalculator = new Hellinger();
939 }else if (Estimators[i] == "manhattan") {
940 matrixCalculator = new Manhattan();
941 }else if (Estimators[i] == "structpearson") {
942 matrixCalculator = new StructPearson();
943 }else if (Estimators[i] == "soergel") {
944 matrixCalculator = new Soergel();
945 }else if (Estimators[i] == "spearman") {
946 matrixCalculator = new Spearman();
947 }else if (Estimators[i] == "structkulczynski") {
948 matrixCalculator = new StructKulczynski();
949 }else if (Estimators[i] == "speciesprofile") {
950 matrixCalculator = new SpeciesProfile();
951 }else if (Estimators[i] == "hamming") {
952 matrixCalculator = new Hamming();
953 }else if (Estimators[i] == "structchi2") {
954 matrixCalculator = new StructChi2();
955 }else if (Estimators[i] == "gower") {
956 matrixCalculator = new Gower();
957 }else if (Estimators[i] == "memchi2") {
958 matrixCalculator = new MemChi2();
959 }else if (Estimators[i] == "memchord") {
960 matrixCalculator = new MemChord();
961 }else if (Estimators[i] == "memeuclidean") {
962 matrixCalculator = new MemEuclidean();
963 }else if (Estimators[i] == "mempearson") {
964 matrixCalculator = new MemPearson();
965 }else if (Estimators[i] == "jsd") {
966 matrixCalculator = new JSD();
967 }else if (Estimators[i] == "rjsd") {
968 matrixCalculator = new RJSD();
970 m->mothurOut("[ERROR]: " + Estimators[i] + " is not a valid calculator, please correct.\n"); m->control_pressed = true; return results;
975 vector< vector< vector<seqDist> > > calcDistsTotals; //each iter, then each groupCombos dists. this will be used to make .dist files
976 vector< vector<seqDist> > calcDists; calcDists.resize(1);
978 for (int thisIter = 0; thisIter < iters+1; thisIter++) {
980 vector<SharedRAbundVector*> thisItersLookup = thisLookup;
982 if (subsample && (thisIter != 0)) {
984 vector<string> tempLabels; //dont need since we arent printing the sampled sharedRabunds
986 //make copy of lookup so we don't get access violations
987 vector<SharedRAbundVector*> newLookup;
988 for (int k = 0; k < thisItersLookup.size(); k++) {
989 SharedRAbundVector* temp = new SharedRAbundVector();
990 temp->setLabel(thisItersLookup[k]->getLabel());
991 temp->setGroup(thisItersLookup[k]->getGroup());
992 newLookup.push_back(temp);
996 for (int k = 0; k < thisItersLookup[0]->getNumBins(); k++) {
997 if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return results; }
998 for (int j = 0; j < thisItersLookup.size(); j++) { newLookup[j]->push_back(thisItersLookup[j]->getAbundance(k), thisItersLookup[j]->getGroup()); }
1001 tempLabels = sample.getSample(newLookup, subsampleSize);
1002 thisItersLookup = newLookup;
1006 driver(thisItersLookup, calcDists, matrixCalculator);
1008 if (subsample && (thisIter != 0)) {
1009 if((thisIter) % 100 == 0){ m->mothurOutJustToScreen(toString(thisIter)+"\n"); }
1010 calcDistsTotals.push_back(calcDists);
1011 for (int i = 0; i < calcDists.size(); i++) {
1012 for (int j = 0; j < calcDists[i].size(); j++) {
1013 if (m->debug) { m->mothurOut("[DEBUG]: Results: iter = " + toString(thisIter) + ", " + thisLookup[calcDists[i][j].seq1]->getGroup() + " - " + thisLookup[calcDists[i][j].seq2]->getGroup() + " distance = " + toString(calcDists[i][j].dist) + ".\n"); }
1017 for (int i = 0; i < thisItersLookup.size(); i++) { delete thisItersLookup[i]; }
1018 thisItersLookup.clear();
1019 }else { //print results for whole dataset
1020 for (int i = 0; i < calcDists.size(); i++) {
1021 if (m->control_pressed) { break; }
1024 results.resize(thisLookup.size());
1025 for (int k = 0; k < thisLookup.size(); k++) { results[k].resize(thisLookup.size(), 0.0); }
1027 for (int j = 0; j < calcDists[i].size(); j++) {
1028 int row = calcDists[i][j].seq1;
1029 int column = calcDists[i][j].seq2;
1030 double dist = calcDists[i][j].dist;
1032 results[row][column] = dist;
1033 results[column][row] = dist;
1037 for (int i = 0; i < calcDists.size(); i++) { calcDists[i].clear(); }
1041 //we need to find the average distance and standard deviation for each groups distance
1042 vector< vector<seqDist> > calcAverages = m->getAverages(calcDistsTotals, "average");
1045 for (int i = 0; i < calcDists.size(); i++) {
1046 results.resize(thisLookup.size());
1047 for (int k = 0; k < thisLookup.size(); k++) { results[k].resize(thisLookup.size(), 0.0); }
1049 for (int j = 0; j < calcAverages[i].size(); j++) {
1050 int row = calcAverages[i][j].seq1;
1051 int column = calcAverages[i][j].seq2;
1052 float dist = calcAverages[i][j].dist;
1054 results[row][column] = dist;
1055 results[column][row] = dist;
1063 catch(exception& e) {
1064 m->errorOut(e, "GetMetaCommunityCommand", "generateDistanceMatrix");
1068 /**************************************************************************************************/
1069 int GetMetaCommunityCommand::driver(vector<SharedRAbundVector*> thisLookup, vector< vector<seqDist> >& calcDists, Calculator* matrixCalculator) {
1071 vector<SharedRAbundVector*> subset;
1073 for (int k = 0; k < thisLookup.size(); k++) { // pass cdd each set of groups to compare
1075 for (int l = 0; l < k; l++) {
1077 if (k != l) { //we dont need to similiarity of a groups to itself
1078 subset.clear(); //clear out old pair of sharedrabunds
1079 //add new pair of sharedrabunds
1080 subset.push_back(thisLookup[k]); subset.push_back(thisLookup[l]);
1084 //if this calc needs all groups to calculate the pair load all groups
1085 if (matrixCalculator->getNeedsAll()) {
1086 //load subset with rest of lookup for those calcs that need everyone to calc for a pair
1087 for (int w = 0; w < thisLookup.size(); w++) {
1088 if ((w != k) && (w != l)) { subset.push_back(thisLookup[w]); }
1092 vector<double> tempdata = matrixCalculator->getValues(subset); //saves the calculator outputs
1094 if (m->control_pressed) { return 1; }
1096 seqDist temp(l, k, tempdata[0]);
1097 //cout << l << '\t' << k << '\t' << tempdata[0] << endl;
1098 calcDists[0].push_back(temp);
1106 catch(exception& e) {
1107 m->errorOut(e, "MatrixOutputCommand", "driver");
1111 //**********************************************************************************************************************