2 * summarysharedcommand.cpp
5 * Created by Sarah Westcott on 1/2/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "summarysharedcommand.h"
11 #include "sharedsobscollectsummary.h"
12 #include "sharedchao1.h"
13 #include "sharedace.h"
14 #include "sharednseqs.h"
15 #include "sharedjabund.h"
16 #include "sharedsorabund.h"
17 #include "sharedjclass.h"
18 #include "sharedsorclass.h"
19 #include "sharedjest.h"
20 #include "sharedsorest.h"
21 #include "sharedthetayc.h"
22 #include "sharedthetan.h"
23 #include "sharedkstest.h"
24 #include "whittaker.h"
25 #include "sharedochiai.h"
26 #include "sharedanderbergs.h"
27 #include "sharedkulczynski.h"
28 #include "sharedkulczynskicody.h"
29 #include "sharedlennon.h"
30 #include "sharedmorisitahorn.h"
31 #include "sharedbraycurtis.h"
32 #include "sharedjackknife.h"
33 #include "whittaker.h"
36 //**********************************************************************************************************************
38 SummarySharedCommand::SummarySharedCommand(string option) {
40 globaldata = GlobalData::getInstance();
46 //allow user to run help
47 if(option == "help") { validCalculator = new ValidCalculators(); help(); abort = true; }
50 //valid paramters for this command
51 string Array[] = {"label","calc","groups","all","outputdir","distance","inputdir", "processors"};
52 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
54 OptionParser parser(option);
55 map<string, string> parameters = parser.getParameters();
57 ValidParameters validParameter;
59 //check to make sure all parameters are valid for command
60 for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
61 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
64 //make sure the user has already run the read.otu command
65 if (globaldata->getSharedFile() == "") {
66 m->mothurOut("You must read a list and a group, or a shared before you can use the summary.shared command."); m->mothurOutEndLine(); abort = true;
69 //if the user changes the output directory command factory will send this info to us in the output parameter
70 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
72 outputDir += m->hasPath(globaldata->getSharedFile()); //if user entered a file with a path then preserve it
75 //check for optional parameter and set defaults
76 // ...at some point should added some additional type checking...
77 label = validParameter.validFile(parameters, "label", false);
78 if (label == "not found") { label = ""; }
80 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
81 else { allLines = 1; }
84 //if the user has not specified any labels use the ones from read.otu
86 allLines = globaldata->allLines;
87 labels = globaldata->labels;
90 calc = validParameter.validFile(parameters, "calc", false);
91 if (calc == "not found") { calc = "sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan"; }
93 if (calc == "default") { calc = "sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan"; }
95 m->splitAtDash(calc, Estimators);
97 groups = validParameter.validFile(parameters, "groups", false);
98 if (groups == "not found") { groups = ""; }
100 m->splitAtDash(groups, Groups);
101 globaldata->Groups = Groups;
104 string temp = validParameter.validFile(parameters, "all", false); if (temp == "not found") { temp = "false"; }
105 all = m->isTrue(temp);
107 temp = validParameter.validFile(parameters, "distance", false); if (temp == "not found") { temp = "false"; }
108 createPhylip = m->isTrue(temp);
110 temp = validParameter.validFile(parameters, "processors", false); if(temp == "not found"){ temp = "1"; }
111 convert(temp, processors);
113 if (abort == false) {
115 validCalculator = new ValidCalculators();
118 for (i=0; i<Estimators.size(); i++) {
119 if (validCalculator->isValidCalculator("sharedsummary", Estimators[i]) == true) {
120 if (Estimators[i] == "sharedsobs") {
121 sumCalculators.push_back(new SharedSobsCS());
122 }else if (Estimators[i] == "sharedchao") {
123 sumCalculators.push_back(new SharedChao1());
124 }else if (Estimators[i] == "sharedace") {
125 sumCalculators.push_back(new SharedAce());
126 }else if (Estimators[i] == "jabund") {
127 sumCalculators.push_back(new JAbund());
128 }else if (Estimators[i] == "sorabund") {
129 sumCalculators.push_back(new SorAbund());
130 }else if (Estimators[i] == "jclass") {
131 sumCalculators.push_back(new Jclass());
132 }else if (Estimators[i] == "sorclass") {
133 sumCalculators.push_back(new SorClass());
134 }else if (Estimators[i] == "jest") {
135 sumCalculators.push_back(new Jest());
136 }else if (Estimators[i] == "sorest") {
137 sumCalculators.push_back(new SorEst());
138 }else if (Estimators[i] == "thetayc") {
139 sumCalculators.push_back(new ThetaYC());
140 }else if (Estimators[i] == "thetan") {
141 sumCalculators.push_back(new ThetaN());
142 }else if (Estimators[i] == "kstest") {
143 sumCalculators.push_back(new KSTest());
144 }else if (Estimators[i] == "sharednseqs") {
145 sumCalculators.push_back(new SharedNSeqs());
146 }else if (Estimators[i] == "ochiai") {
147 sumCalculators.push_back(new Ochiai());
148 }else if (Estimators[i] == "anderberg") {
149 sumCalculators.push_back(new Anderberg());
150 }else if (Estimators[i] == "kulczynski") {
151 sumCalculators.push_back(new Kulczynski());
152 }else if (Estimators[i] == "kulczynskicody") {
153 sumCalculators.push_back(new KulczynskiCody());
154 }else if (Estimators[i] == "lennon") {
155 sumCalculators.push_back(new Lennon());
156 }else if (Estimators[i] == "morisitahorn") {
157 sumCalculators.push_back(new MorHorn());
158 }else if (Estimators[i] == "braycurtis") {
159 sumCalculators.push_back(new BrayCurtis());
160 }else if (Estimators[i] == "whittaker") {
161 sumCalculators.push_back(new Whittaker());
170 catch(exception& e) {
171 m->errorOut(e, "SummarySharedCommand", "SummarySharedCommand");
176 //**********************************************************************************************************************
178 void SummarySharedCommand::help(){
180 m->mothurOut("The summary.shared command can only be executed after a successful read.otu command.\n");
181 m->mothurOut("The summary.shared command parameters are label, calc and all. No parameters are required.\n");
182 m->mothurOut("The summary.shared command should be in the following format: \n");
183 m->mothurOut("summary.shared(label=yourLabel, calc=yourEstimators, groups=yourGroups).\n");
184 m->mothurOut("Example summary.shared(label=unique-.01-.03, groups=B-C, calc=sharedchao-sharedace-jabund-sorensonabund-jclass-sorclass-jest-sorest-thetayc-thetan).\n");
185 validCalculator->printCalc("sharedsummary", cout);
186 m->mothurOut("The default value for calc is sharedsobs-sharedchao-sharedace-jabund-sorensonabund-jclass-sorclass-jest-sorest-thetayc-thetan\n");
187 m->mothurOut("The default value for groups is all the groups in your groupfile.\n");
188 m->mothurOut("The label parameter is used to analyze specific labels in your input.\n");
189 m->mothurOut("The all parameter is used to specify if you want the estimate of all your groups together. This estimate can only be made for sharedsobs and sharedchao calculators. The default is false.\n");
190 m->mothurOut("If you use sharedchao and run into memory issues, set all to false. \n");
191 m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 2 valid groups.\n");
192 m->mothurOut("Note: No spaces between parameter labels (i.e. label), '=' and parameters (i.e.yourLabel).\n\n");
194 catch(exception& e) {
195 m->errorOut(e, "SummarySharedCommand", "help");
200 //**********************************************************************************************************************
202 SummarySharedCommand::~SummarySharedCommand(){
203 if (abort == false) {
205 delete validCalculator;
209 //**********************************************************************************************************************
211 int SummarySharedCommand::execute(){
214 if (abort == true) { return 0; }
216 ofstream outputFileHandle, outAll;
217 string outputFileName = outputDir + m->getRootName(m->getSimpleName(globaldata->inputFileName)) + "shared.summary";
219 //if the users entered no valid calculators don't execute command
220 if (sumCalculators.size() == 0) { return 0; }
221 //check if any calcs can do multiples
224 for (int i = 0; i < sumCalculators.size(); i++) {
225 if (sumCalculators[i]->getMultiple() == true) { mult = true; }
231 read = new ReadOTUFile(globaldata->inputFileName);
232 read->read(&*globaldata);
234 input = globaldata->ginput;
235 lookup = input->getSharedRAbundVectors();
236 string lastLabel = lookup[0]->getLabel();
238 /******************************************************/
239 //output headings for files
240 /******************************************************/
241 //output estimator names as column headers
242 m->openOutputFile(outputFileName, outputFileHandle);
243 outputFileHandle << "label" <<'\t' << "comparison" << '\t';
244 for(int i=0;i<sumCalculators.size();i++){
245 outputFileHandle << '\t' << sumCalculators[i]->getName();
246 if (sumCalculators[i]->getCols() == 3) { outputFileHandle << "\t" << sumCalculators[i]->getName() << "_lci\t" << sumCalculators[i]->getName() << "_hci"; }
248 outputFileHandle << endl;
249 outputFileHandle.close();
251 //create file and put column headers for multiple groups file
252 string outAllFileName = ((m->getRootName(globaldata->inputFileName)) + "sharedmultiple.summary");
254 m->openOutputFile(outAllFileName, outAll);
255 outputNames.push_back(outAllFileName);
257 outAll << "label" <<'\t' << "comparison" << '\t';
258 for(int i=0;i<sumCalculators.size();i++){
259 if (sumCalculators[i]->getMultiple() == true) {
260 outAll << '\t' << sumCalculators[i]->getName();
267 if (lookup.size() < 2) {
268 m->mothurOut("I cannot run the command without at least 2 valid groups.");
269 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
271 //close files and clean up
272 remove(outputFileName.c_str());
273 if (mult == true) { remove(outAllFileName.c_str()); }
275 //if you only have 2 groups you don't need a .sharedmultiple file
276 }else if ((lookup.size() == 2) && (mult == true)) {
278 remove(outAllFileName.c_str());
279 outputNames.pop_back();
282 if (m->control_pressed) {
283 if (mult) { remove(outAllFileName.c_str()); }
284 remove(outputFileName.c_str());
285 delete input; globaldata->ginput = NULL;
286 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
287 for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; }
288 globaldata->Groups.clear();
291 /******************************************************/
294 /******************************************************/
295 //comparison breakup to be used by different processes later
296 numGroups = globaldata->Groups.size();
297 lines.resize(processors);
298 for (int i = 0; i < processors; i++) {
299 lines[i].start = int (sqrt(float(i)/float(processors)) * numGroups);
300 lines[i].end = int (sqrt(float(i+1)/float(processors)) * numGroups);
302 /******************************************************/
304 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
305 set<string> processedLabels;
306 set<string> userLabels = labels;
308 //as long as you are not at the end of the file or done wih the lines you want
309 while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
310 if (m->control_pressed) {
311 if (mult) { remove(outAllFileName.c_str()); }
312 remove(outputFileName.c_str());
313 delete input; globaldata->ginput = NULL;
314 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
315 for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; }
316 globaldata->Groups.clear();
321 if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){
322 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
323 process(lookup, outputFileName, outAllFileName);
325 processedLabels.insert(lookup[0]->getLabel());
326 userLabels.erase(lookup[0]->getLabel());
329 if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
330 string saveLabel = lookup[0]->getLabel();
332 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
333 lookup = input->getSharedRAbundVectors(lastLabel);
335 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
336 process(lookup, outputFileName, outAllFileName);
338 processedLabels.insert(lookup[0]->getLabel());
339 userLabels.erase(lookup[0]->getLabel());
341 //restore real lastlabel to save below
342 lookup[0]->setLabel(saveLabel);
345 lastLabel = lookup[0]->getLabel();
347 //get next line to process
348 //prevent memory leak
349 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
350 lookup = input->getSharedRAbundVectors();
353 if (m->control_pressed) {
354 if (mult) { remove(outAllFileName.c_str()); }
355 remove(outputFileName.c_str());
356 delete input; globaldata->ginput = NULL;
357 for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; }
358 globaldata->Groups.clear();
362 //output error messages about any remaining user labels
363 set<string>::iterator it;
364 bool needToRun = false;
365 for (it = userLabels.begin(); it != userLabels.end(); it++) {
366 m->mothurOut("Your file does not include the label " + *it);
367 if (processedLabels.count(lastLabel) != 1) {
368 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
371 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
375 //run last label if you need to
376 if (needToRun == true) {
377 for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }
378 lookup = input->getSharedRAbundVectors(lastLabel);
380 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
381 process(lookup, outputFileName, outAllFileName);
382 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
386 //reset groups parameter
387 globaldata->Groups.clear();
389 for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; }
390 delete input; globaldata->ginput = NULL;
392 if (m->control_pressed) {
393 remove(outAllFileName.c_str());
394 remove(outputFileName.c_str());
398 m->mothurOutEndLine();
399 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
400 m->mothurOut(outputFileName); m->mothurOutEndLine();
401 if (mult) { m->mothurOut(outAllFileName); m->mothurOutEndLine(); }
402 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
403 m->mothurOutEndLine();
407 catch(exception& e) {
408 m->errorOut(e, "SummarySharedCommand", "execute");
413 /***********************************************************/
414 int SummarySharedCommand::process(vector<SharedRAbundVector*> thisLookup, string sumFileName, string sumAllFileName) {
416 vector< vector<seqDist> > calcDists; //vector containing vectors that contains the summary results for each group compare
417 calcDists.resize(sumCalculators.size()); //one for each calc, this will be used to make .dist files
419 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
421 driver(thisLookup, 0, numGroups, sumFileName+".temp", sumAllFileName+".temp", calcDists);
422 m->appendFiles((sumFileName + ".temp"), sumFileName);
423 remove((sumFileName + ".temp").c_str());
425 m->appendFiles((sumAllFileName + ".temp"), sumAllFileName);
426 remove((sumAllFileName + ".temp").c_str());
430 vector<int> processIDS;
432 //loop through and create all the processes you want
433 while (process != processors) {
437 processIDS.push_back(pid);
440 driver(thisLookup, lines[process].start, lines[process].end, sumFileName + toString(getpid()) + ".temp", sumAllFileName + toString(getpid()) + ".temp", calcDists);
442 //only do this if you want a distance file
444 string tempdistFileName = m->getRootName(m->getSimpleName(sumFileName)) + toString(getpid()) + ".dist";
446 m->openOutputFile(tempdistFileName, outtemp);
448 for (int i = 0; i < calcDists.size(); i++) {
449 outtemp << calcDists[i].size() << endl;
451 for (int j = 0; j < calcDists[i].size(); j++) {
452 outtemp << calcDists[i][j].seq1 << '\t' << calcDists[i][j].seq2 << '\t' << calcDists[i][j].dist << endl;
459 }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
462 //parent do your part
463 driver(thisLookup, lines[0].start, lines[0].end, sumFileName + toString(getpid()) + ".temp", sumAllFileName + toString(getpid()) + ".temp", calcDists);
464 m->appendFiles((sumFileName + toString(getpid()) + ".temp"), sumFileName);
465 remove((sumFileName + toString(getpid()) + ".temp").c_str());
466 if (mult) { m->appendFiles((sumAllFileName + toString(getpid()) + ".temp"), sumAllFileName); }
468 //force parent to wait until all the processes are done
469 for (int i = 0; i < processIDS.size(); i++) {
470 int temp = processIDS[i];
474 for (int i = 0; i < processIDS.size(); i++) {
475 m->appendFiles((sumFileName + toString(processIDS[i]) + ".temp"), sumFileName);
476 remove((sumFileName + toString(processIDS[i]) + ".temp").c_str());
477 if (mult) { remove((sumAllFileName + toString(processIDS[i]) + ".temp").c_str()); }
480 string tempdistFileName = m->getRootName(m->getSimpleName(sumFileName)) + toString(processIDS[i]) + ".dist";
482 m->openInputFile(tempdistFileName, intemp);
484 for (int i = 0; i < calcDists.size(); i++) {
486 intemp >> size; m->gobble(intemp);
488 for (int j = 0; j < size; j++) {
493 intemp >> seq1 >> seq2 >> dist; m->gobble(intemp);
495 seqDist tempDist(seq1, seq2, dist);
496 calcDists[i].push_back(tempDist);
500 remove(tempdistFileName.c_str());
506 driver(thisLookup, 0, numGroups, (sumFileName + ".temp"), (sumAllFileName + ".temp"), calcDists);
507 m->appendFiles((sumFileName + ".temp"), sumFileName);
508 remove((sumFileName + ".temp").c_str());
510 m->appendFiles((sumAllFileName + ".temp"), sumAllFileName);
511 remove((sumAllFileName + ".temp").c_str());
516 for (int i = 0; i < calcDists.size(); i++) {
517 if (m->control_pressed) { break; }
519 string distFileName = outputDir + m->getRootName(m->getSimpleName(sumFileName)) + sumCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + ".dist";
520 outputNames.push_back(distFileName);
522 m->openOutputFile(distFileName, outDist);
523 outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint);
526 vector< vector<float> > matrix; //square matrix to represent the distance
527 matrix.resize(thisLookup.size());
528 for (int k = 0; k < thisLookup.size(); k++) { matrix[k].resize(thisLookup.size(), 0.0); }
531 for (int j = 0; j < calcDists[i].size(); j++) {
532 int row = calcDists[i][j].seq1;
533 int column = calcDists[i][j].seq2;
534 float dist = calcDists[i][j].dist;
536 matrix[row][column] = dist;
537 matrix[column][row] = dist;
541 outDist << thisLookup.size() << endl;
542 for (int r=0; r<thisLookup.size(); r++) {
544 string name = thisLookup[r]->getGroup();
545 if (name.length() < 10) { //pad with spaces to make compatible
546 while (name.length() < 10) { name += " "; }
548 outDist << name << '\t';
551 for (int l = 0; l < r; l++) { outDist << matrix[r][l] << '\t'; }
559 catch(exception& e) {
560 m->errorOut(e, "SummarySharedCommand", "process");
564 /**************************************************************************************************/
565 int SummarySharedCommand::driver(vector<SharedRAbundVector*> thisLookup, int start, int end, string sumFile, string sumAllFile, vector< vector<seqDist> >& calcDists) {
568 //loop through calculators and add to file all for all calcs that can do mutiple groups
571 m->openOutputFile(sumAllFile, outAll);
574 outAll << thisLookup[0]->getLabel() << '\t';
576 //output groups names
577 string outNames = "";
578 for (int j = 0; j < thisLookup.size(); j++) {
579 outNames += thisLookup[j]->getGroup() + "-";
581 outNames = outNames.substr(0, outNames.length()-1); //rip off extra '-';
582 outAll << outNames << '\t';
584 for(int i=0;i<sumCalculators.size();i++){
585 if (sumCalculators[i]->getMultiple() == true) {
586 sumCalculators[i]->getValues(thisLookup);
588 if (m->control_pressed) { outAll.close(); return 1; }
591 sumCalculators[i]->print(outAll);
598 ofstream outputFileHandle;
599 m->openOutputFile(sumFile, outputFileHandle);
601 vector<SharedRAbundVector*> subset;
602 for (int k = start; k < end; k++) { // pass cdd each set of groups to compare
604 for (int l = 0; l < k; l++) {
606 outputFileHandle << thisLookup[0]->getLabel() << '\t';
608 subset.clear(); //clear out old pair of sharedrabunds
609 //add new pair of sharedrabunds
610 subset.push_back(thisLookup[k]); subset.push_back(thisLookup[l]);
612 //sort groups to be alphanumeric
613 if (thisLookup[k]->getGroup() > thisLookup[l]->getGroup()) {
614 outputFileHandle << (thisLookup[l]->getGroup() +'\t' + thisLookup[k]->getGroup()) << '\t'; //print out groups
616 outputFileHandle << (thisLookup[k]->getGroup() +'\t' + thisLookup[l]->getGroup()) << '\t'; //print out groups
619 for(int i=0;i<sumCalculators.size();i++) {
621 vector<double> tempdata = sumCalculators[i]->getValues(subset); //saves the calculator outputs
623 if (m->control_pressed) { outputFileHandle.close(); return 1; }
625 outputFileHandle << '\t';
626 sumCalculators[i]->print(outputFileHandle);
628 seqDist temp(l, k, tempdata[0]);
629 calcDists[i].push_back(temp);
631 outputFileHandle << endl;
635 outputFileHandle.close();
639 catch(exception& e) {
640 m->errorOut(e, "SummarySharedCommand", "driver");
644 /**************************************************************************************************/