2 * summarysharedcommand.cpp
5 * Created by Sarah Westcott on 1/2/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "summarysharedcommand.h"
11 #include "sharedsobscollectsummary.h"
12 #include "sharedchao1.h"
13 #include "sharedace.h"
14 #include "sharednseqs.h"
15 #include "sharedjabund.h"
16 #include "sharedsorabund.h"
17 #include "sharedjclass.h"
18 #include "sharedsorclass.h"
19 #include "sharedjest.h"
20 #include "sharedsorest.h"
21 #include "sharedthetayc.h"
22 #include "sharedthetan.h"
23 #include "sharedkstest.h"
24 #include "whittaker.h"
25 #include "sharedochiai.h"
26 #include "sharedanderbergs.h"
27 #include "sharedkulczynski.h"
28 #include "sharedkulczynskicody.h"
29 #include "sharedlennon.h"
30 #include "sharedmorisitahorn.h"
31 #include "sharedbraycurtis.h"
32 #include "sharedjackknife.h"
33 #include "whittaker.h"
35 //**********************************************************************************************************************
36 vector<string> SummarySharedCommand::getValidParameters(){
38 string Array[] = {"label","calc","groups","all","outputdir","distance","inputdir", "processors"};
39 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
43 m->errorOut(e, "SummarySharedCommand", "getValidParameters");
47 //**********************************************************************************************************************
48 SummarySharedCommand::SummarySharedCommand(){
51 //initialize outputTypes
52 vector<string> tempOutNames;
53 outputTypes["summary"] = tempOutNames;
56 m->errorOut(e, "SummarySharedCommand", "SummarySharedCommand");
60 //**********************************************************************************************************************
61 vector<string> SummarySharedCommand::getRequiredParameters(){
63 vector<string> myArray;
67 m->errorOut(e, "SummarySharedCommand", "getRequiredParameters");
71 //**********************************************************************************************************************
72 vector<string> SummarySharedCommand::getRequiredFiles(){
74 string Array[] = {"shared"};
75 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
79 m->errorOut(e, "SummarySharedCommand", "getRequiredFiles");
83 //**********************************************************************************************************************
85 SummarySharedCommand::SummarySharedCommand(string option) {
87 globaldata = GlobalData::getInstance();
93 //allow user to run help
94 if(option == "help") { validCalculator = new ValidCalculators(); help(); abort = true; }
97 //valid paramters for this command
98 string Array[] = {"label","calc","groups","all","outputdir","distance","inputdir", "processors"};
99 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
101 OptionParser parser(option);
102 map<string, string> parameters = parser.getParameters();
104 ValidParameters validParameter;
106 //check to make sure all parameters are valid for command
107 for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
108 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
111 //make sure the user has already run the read.otu command
112 if (globaldata->getSharedFile() == "") {
113 m->mothurOut("You must read a list and a group, or a shared before you can use the summary.shared command."); m->mothurOutEndLine(); abort = true;
116 //initialize outputTypes
117 vector<string> tempOutNames;
118 outputTypes["summary"] = tempOutNames;
120 //if the user changes the output directory command factory will send this info to us in the output parameter
121 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
123 outputDir += m->hasPath(globaldata->getSharedFile()); //if user entered a file with a path then preserve it
126 //check for optional parameter and set defaults
127 // ...at some point should added some additional type checking...
128 label = validParameter.validFile(parameters, "label", false);
129 if (label == "not found") { label = ""; }
131 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
132 else { allLines = 1; }
135 //if the user has not specified any labels use the ones from read.otu
137 allLines = globaldata->allLines;
138 labels = globaldata->labels;
141 calc = validParameter.validFile(parameters, "calc", false);
142 if (calc == "not found") { calc = "sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan"; }
144 if (calc == "default") { calc = "sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan"; }
146 m->splitAtDash(calc, Estimators);
148 groups = validParameter.validFile(parameters, "groups", false);
149 if (groups == "not found") { groups = ""; }
151 m->splitAtDash(groups, Groups);
152 globaldata->Groups = Groups;
155 string temp = validParameter.validFile(parameters, "all", false); if (temp == "not found") { temp = "false"; }
156 all = m->isTrue(temp);
158 temp = validParameter.validFile(parameters, "distance", false); if (temp == "not found") { temp = "false"; }
159 createPhylip = m->isTrue(temp);
161 temp = validParameter.validFile(parameters, "processors", false); if(temp == "not found"){ temp = "1"; }
162 convert(temp, processors);
164 if (abort == false) {
166 validCalculator = new ValidCalculators();
169 for (i=0; i<Estimators.size(); i++) {
170 if (validCalculator->isValidCalculator("sharedsummary", Estimators[i]) == true) {
171 if (Estimators[i] == "sharedsobs") {
172 sumCalculators.push_back(new SharedSobsCS());
173 }else if (Estimators[i] == "sharedchao") {
174 sumCalculators.push_back(new SharedChao1());
175 }else if (Estimators[i] == "sharedace") {
176 sumCalculators.push_back(new SharedAce());
177 }else if (Estimators[i] == "jabund") {
178 sumCalculators.push_back(new JAbund());
179 }else if (Estimators[i] == "sorabund") {
180 sumCalculators.push_back(new SorAbund());
181 }else if (Estimators[i] == "jclass") {
182 sumCalculators.push_back(new Jclass());
183 }else if (Estimators[i] == "sorclass") {
184 sumCalculators.push_back(new SorClass());
185 }else if (Estimators[i] == "jest") {
186 sumCalculators.push_back(new Jest());
187 }else if (Estimators[i] == "sorest") {
188 sumCalculators.push_back(new SorEst());
189 }else if (Estimators[i] == "thetayc") {
190 sumCalculators.push_back(new ThetaYC());
191 }else if (Estimators[i] == "thetan") {
192 sumCalculators.push_back(new ThetaN());
193 }else if (Estimators[i] == "kstest") {
194 sumCalculators.push_back(new KSTest());
195 }else if (Estimators[i] == "sharednseqs") {
196 sumCalculators.push_back(new SharedNSeqs());
197 }else if (Estimators[i] == "ochiai") {
198 sumCalculators.push_back(new Ochiai());
199 }else if (Estimators[i] == "anderberg") {
200 sumCalculators.push_back(new Anderberg());
201 }else if (Estimators[i] == "kulczynski") {
202 sumCalculators.push_back(new Kulczynski());
203 }else if (Estimators[i] == "kulczynskicody") {
204 sumCalculators.push_back(new KulczynskiCody());
205 }else if (Estimators[i] == "lennon") {
206 sumCalculators.push_back(new Lennon());
207 }else if (Estimators[i] == "morisitahorn") {
208 sumCalculators.push_back(new MorHorn());
209 }else if (Estimators[i] == "braycurtis") {
210 sumCalculators.push_back(new BrayCurtis());
211 }else if (Estimators[i] == "whittaker") {
212 sumCalculators.push_back(new Whittaker());
221 catch(exception& e) {
222 m->errorOut(e, "SummarySharedCommand", "SummarySharedCommand");
227 //**********************************************************************************************************************
229 void SummarySharedCommand::help(){
231 m->mothurOut("The summary.shared command can only be executed after a successful read.otu command.\n");
232 m->mothurOut("The summary.shared command parameters are label, calc, distance and all. No parameters are required.\n");
233 m->mothurOut("The summary.shared command should be in the following format: \n");
234 m->mothurOut("summary.shared(label=yourLabel, calc=yourEstimators, groups=yourGroups).\n");
235 m->mothurOut("Example summary.shared(label=unique-.01-.03, groups=B-C, calc=sharedchao-sharedace-jabund-sorensonabund-jclass-sorclass-jest-sorest-thetayc-thetan).\n");
236 validCalculator->printCalc("sharedsummary", cout);
237 m->mothurOut("The default value for calc is sharedsobs-sharedchao-sharedace-jabund-sorensonabund-jclass-sorclass-jest-sorest-thetayc-thetan\n");
238 m->mothurOut("The default value for groups is all the groups in your groupfile.\n");
239 m->mothurOut("The distance parameter allows you to indicate you would like a distance file created for each calculator for each label, default=f.\n");
240 m->mothurOut("The label parameter is used to analyze specific labels in your input.\n");
241 m->mothurOut("The all parameter is used to specify if you want the estimate of all your groups together. This estimate can only be made for sharedsobs and sharedchao calculators. The default is false.\n");
242 m->mothurOut("If you use sharedchao and run into memory issues, set all to false. \n");
243 m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 2 valid groups.\n");
244 m->mothurOut("Note: No spaces between parameter labels (i.e. label), '=' and parameters (i.e.yourLabel).\n\n");
246 catch(exception& e) {
247 m->errorOut(e, "SummarySharedCommand", "help");
252 //**********************************************************************************************************************
254 SummarySharedCommand::~SummarySharedCommand(){
255 if (abort == false) {
257 delete validCalculator;
261 //**********************************************************************************************************************
263 int SummarySharedCommand::execute(){
266 if (abort == true) { return 0; }
268 ofstream outputFileHandle, outAll;
269 string outputFileName = outputDir + m->getRootName(m->getSimpleName(globaldata->inputFileName)) + "shared.summary";
271 //if the users entered no valid calculators don't execute command
272 if (sumCalculators.size() == 0) { return 0; }
273 //check if any calcs can do multiples
276 for (int i = 0; i < sumCalculators.size(); i++) {
277 if (sumCalculators[i]->getMultiple() == true) { mult = true; }
283 read = new ReadOTUFile(globaldata->inputFileName);
284 read->read(&*globaldata);
286 input = globaldata->ginput;
287 lookup = input->getSharedRAbundVectors();
288 string lastLabel = lookup[0]->getLabel();
290 /******************************************************/
291 //output headings for files
292 /******************************************************/
293 //output estimator names as column headers
294 m->openOutputFile(outputFileName, outputFileHandle);
295 outputFileHandle << "label" <<'\t' << "comparison" << '\t';
296 for(int i=0;i<sumCalculators.size();i++){
297 outputFileHandle << '\t' << sumCalculators[i]->getName();
298 if (sumCalculators[i]->getCols() == 3) { outputFileHandle << "\t" << sumCalculators[i]->getName() << "_lci\t" << sumCalculators[i]->getName() << "_hci"; }
300 outputFileHandle << endl;
301 outputFileHandle.close();
303 //create file and put column headers for multiple groups file
304 string outAllFileName = ((m->getRootName(globaldata->inputFileName)) + "sharedmultiple.summary");
306 m->openOutputFile(outAllFileName, outAll);
307 outputNames.push_back(outAllFileName);
309 outAll << "label" <<'\t' << "comparison" << '\t';
310 for(int i=0;i<sumCalculators.size();i++){
311 if (sumCalculators[i]->getMultiple() == true) {
312 outAll << '\t' << sumCalculators[i]->getName();
319 if (lookup.size() < 2) {
320 m->mothurOut("I cannot run the command without at least 2 valid groups.");
321 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
323 //close files and clean up
324 remove(outputFileName.c_str());
325 if (mult == true) { remove(outAllFileName.c_str()); }
327 //if you only have 2 groups you don't need a .sharedmultiple file
328 }else if ((lookup.size() == 2) && (mult == true)) {
330 remove(outAllFileName.c_str());
331 outputNames.pop_back();
334 if (m->control_pressed) {
335 if (mult) { remove(outAllFileName.c_str()); }
336 remove(outputFileName.c_str());
337 delete input; globaldata->ginput = NULL;
338 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
339 for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; }
340 globaldata->Groups.clear();
343 /******************************************************/
346 /******************************************************/
347 //comparison breakup to be used by different processes later
348 numGroups = globaldata->Groups.size();
349 lines.resize(processors);
350 for (int i = 0; i < processors; i++) {
351 lines[i].start = int (sqrt(float(i)/float(processors)) * numGroups);
352 lines[i].end = int (sqrt(float(i+1)/float(processors)) * numGroups);
354 /******************************************************/
356 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
357 set<string> processedLabels;
358 set<string> userLabels = labels;
360 //as long as you are not at the end of the file or done wih the lines you want
361 while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
362 if (m->control_pressed) {
363 if (mult) { remove(outAllFileName.c_str()); }
364 remove(outputFileName.c_str());
365 delete input; globaldata->ginput = NULL;
366 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
367 for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; }
368 globaldata->Groups.clear();
373 if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){
374 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
375 process(lookup, outputFileName, outAllFileName);
377 processedLabels.insert(lookup[0]->getLabel());
378 userLabels.erase(lookup[0]->getLabel());
381 if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
382 string saveLabel = lookup[0]->getLabel();
384 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
385 lookup = input->getSharedRAbundVectors(lastLabel);
387 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
388 process(lookup, outputFileName, outAllFileName);
390 processedLabels.insert(lookup[0]->getLabel());
391 userLabels.erase(lookup[0]->getLabel());
393 //restore real lastlabel to save below
394 lookup[0]->setLabel(saveLabel);
397 lastLabel = lookup[0]->getLabel();
399 //get next line to process
400 //prevent memory leak
401 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
402 lookup = input->getSharedRAbundVectors();
405 if (m->control_pressed) {
406 if (mult) { remove(outAllFileName.c_str()); }
407 remove(outputFileName.c_str());
408 delete input; globaldata->ginput = NULL;
409 for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; }
410 globaldata->Groups.clear();
414 //output error messages about any remaining user labels
415 set<string>::iterator it;
416 bool needToRun = false;
417 for (it = userLabels.begin(); it != userLabels.end(); it++) {
418 m->mothurOut("Your file does not include the label " + *it);
419 if (processedLabels.count(lastLabel) != 1) {
420 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
423 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
427 //run last label if you need to
428 if (needToRun == true) {
429 for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }
430 lookup = input->getSharedRAbundVectors(lastLabel);
432 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
433 process(lookup, outputFileName, outAllFileName);
434 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
438 //reset groups parameter
439 globaldata->Groups.clear();
441 for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; }
442 delete input; globaldata->ginput = NULL;
444 if (m->control_pressed) {
445 remove(outAllFileName.c_str());
446 remove(outputFileName.c_str());
450 m->mothurOutEndLine();
451 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
452 m->mothurOut(outputFileName); m->mothurOutEndLine();
453 if (mult) { m->mothurOut(outAllFileName); m->mothurOutEndLine(); outputTypes["summary"].push_back(outAllFileName); }
454 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } outputTypes["summary"].push_back(outputFileName);
455 m->mothurOutEndLine();
459 catch(exception& e) {
460 m->errorOut(e, "SummarySharedCommand", "execute");
465 /***********************************************************/
466 int SummarySharedCommand::process(vector<SharedRAbundVector*> thisLookup, string sumFileName, string sumAllFileName) {
468 vector< vector<seqDist> > calcDists; //vector containing vectors that contains the summary results for each group compare
469 calcDists.resize(sumCalculators.size()); //one for each calc, this will be used to make .dist files
471 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
473 driver(thisLookup, 0, numGroups, sumFileName+".temp", sumAllFileName+".temp", calcDists);
474 m->appendFiles((sumFileName + ".temp"), sumFileName);
475 remove((sumFileName + ".temp").c_str());
477 m->appendFiles((sumAllFileName + ".temp"), sumAllFileName);
478 remove((sumAllFileName + ".temp").c_str());
482 vector<int> processIDS;
484 //loop through and create all the processes you want
485 while (process != processors) {
489 processIDS.push_back(pid);
492 driver(thisLookup, lines[process].start, lines[process].end, sumFileName + toString(getpid()) + ".temp", sumAllFileName + toString(getpid()) + ".temp", calcDists);
494 //only do this if you want a distance file
496 string tempdistFileName = m->getRootName(m->getSimpleName(sumFileName)) + toString(getpid()) + ".dist";
498 m->openOutputFile(tempdistFileName, outtemp);
500 for (int i = 0; i < calcDists.size(); i++) {
501 outtemp << calcDists[i].size() << endl;
503 for (int j = 0; j < calcDists[i].size(); j++) {
504 outtemp << calcDists[i][j].seq1 << '\t' << calcDists[i][j].seq2 << '\t' << calcDists[i][j].dist << endl;
511 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
514 //parent do your part
515 driver(thisLookup, lines[0].start, lines[0].end, sumFileName + toString(getpid()) + ".temp", sumAllFileName + toString(getpid()) + ".temp", calcDists);
516 m->appendFiles((sumFileName + toString(getpid()) + ".temp"), sumFileName);
517 remove((sumFileName + toString(getpid()) + ".temp").c_str());
518 if (mult) { m->appendFiles((sumAllFileName + toString(getpid()) + ".temp"), sumAllFileName); }
520 //force parent to wait until all the processes are done
521 for (int i = 0; i < processIDS.size(); i++) {
522 int temp = processIDS[i];
526 for (int i = 0; i < processIDS.size(); i++) {
527 m->appendFiles((sumFileName + toString(processIDS[i]) + ".temp"), sumFileName);
528 remove((sumFileName + toString(processIDS[i]) + ".temp").c_str());
529 if (mult) { remove((sumAllFileName + toString(processIDS[i]) + ".temp").c_str()); }
532 string tempdistFileName = m->getRootName(m->getSimpleName(sumFileName)) + toString(processIDS[i]) + ".dist";
534 m->openInputFile(tempdistFileName, intemp);
536 for (int i = 0; i < calcDists.size(); i++) {
538 intemp >> size; m->gobble(intemp);
540 for (int j = 0; j < size; j++) {
545 intemp >> seq1 >> seq2 >> dist; m->gobble(intemp);
547 seqDist tempDist(seq1, seq2, dist);
548 calcDists[i].push_back(tempDist);
552 remove(tempdistFileName.c_str());
558 driver(thisLookup, 0, numGroups, (sumFileName + ".temp"), (sumAllFileName + ".temp"), calcDists);
559 m->appendFiles((sumFileName + ".temp"), sumFileName);
560 remove((sumFileName + ".temp").c_str());
562 m->appendFiles((sumAllFileName + ".temp"), sumAllFileName);
563 remove((sumAllFileName + ".temp").c_str());
568 for (int i = 0; i < calcDists.size(); i++) {
569 if (m->control_pressed) { break; }
571 string distFileName = outputDir + m->getRootName(m->getSimpleName(sumFileName)) + sumCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + ".dist";
572 outputNames.push_back(distFileName);
574 m->openOutputFile(distFileName, outDist);
575 outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint);
578 vector< vector<float> > matrix; //square matrix to represent the distance
579 matrix.resize(thisLookup.size());
580 for (int k = 0; k < thisLookup.size(); k++) { matrix[k].resize(thisLookup.size(), 0.0); }
583 for (int j = 0; j < calcDists[i].size(); j++) {
584 int row = calcDists[i][j].seq1;
585 int column = calcDists[i][j].seq2;
586 float dist = calcDists[i][j].dist;
588 matrix[row][column] = dist;
589 matrix[column][row] = dist;
593 outDist << thisLookup.size() << endl;
594 for (int r=0; r<thisLookup.size(); r++) {
596 string name = thisLookup[r]->getGroup();
597 if (name.length() < 10) { //pad with spaces to make compatible
598 while (name.length() < 10) { name += " "; }
600 outDist << name << '\t';
603 for (int l = 0; l < r; l++) { outDist << matrix[r][l] << '\t'; }
611 catch(exception& e) {
612 m->errorOut(e, "SummarySharedCommand", "process");
616 /**************************************************************************************************/
617 int SummarySharedCommand::driver(vector<SharedRAbundVector*> thisLookup, int start, int end, string sumFile, string sumAllFile, vector< vector<seqDist> >& calcDists) {
620 //loop through calculators and add to file all for all calcs that can do mutiple groups
623 m->openOutputFile(sumAllFile, outAll);
626 outAll << thisLookup[0]->getLabel() << '\t';
628 //output groups names
629 string outNames = "";
630 for (int j = 0; j < thisLookup.size(); j++) {
631 outNames += thisLookup[j]->getGroup() + "-";
633 outNames = outNames.substr(0, outNames.length()-1); //rip off extra '-';
634 outAll << outNames << '\t';
636 for(int i=0;i<sumCalculators.size();i++){
637 if (sumCalculators[i]->getMultiple() == true) {
638 sumCalculators[i]->getValues(thisLookup);
640 if (m->control_pressed) { outAll.close(); return 1; }
643 sumCalculators[i]->print(outAll);
650 ofstream outputFileHandle;
651 m->openOutputFile(sumFile, outputFileHandle);
653 vector<SharedRAbundVector*> subset;
654 for (int k = start; k < end; k++) { // pass cdd each set of groups to compare
656 for (int l = 0; l < k; l++) {
658 outputFileHandle << thisLookup[0]->getLabel() << '\t';
660 subset.clear(); //clear out old pair of sharedrabunds
661 //add new pair of sharedrabunds
662 subset.push_back(thisLookup[k]); subset.push_back(thisLookup[l]);
664 //sort groups to be alphanumeric
665 if (thisLookup[k]->getGroup() > thisLookup[l]->getGroup()) {
666 outputFileHandle << (thisLookup[l]->getGroup() +'\t' + thisLookup[k]->getGroup()) << '\t'; //print out groups
668 outputFileHandle << (thisLookup[k]->getGroup() +'\t' + thisLookup[l]->getGroup()) << '\t'; //print out groups
671 for(int i=0;i<sumCalculators.size();i++) {
673 vector<double> tempdata = sumCalculators[i]->getValues(subset); //saves the calculator outputs
675 if (m->control_pressed) { outputFileHandle.close(); return 1; }
677 outputFileHandle << '\t';
678 sumCalculators[i]->print(outputFileHandle);
680 seqDist temp(l, k, tempdata[0]);
681 calcDists[i].push_back(temp);
683 outputFileHandle << endl;
687 outputFileHandle.close();
691 catch(exception& e) {
692 m->errorOut(e, "SummarySharedCommand", "driver");
696 /**************************************************************************************************/